In [37]:
#Open the json file
import json

def read_data(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    return data

In [38]:
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [39]:
class Dataset_en(torch.utils.data.Dataset):
    def __init__(self, path, tokenizer):
        self.data = read_data(path)
        self.tokenizer = tokenizer
        self.max_len = 512
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = self.data[idx]['category']
        if label == "CONSPIRACY":
            label = 1
        else:
            label = 0
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, padding='max_length', truncation=True)
        item = {key: inputs[key].squeeze(0) for key in inputs}
        item['labels'] = torch.tensor(label)
        
        return item
        

In [40]:
dataset = Dataset_en("dataset_en_train.json", tokenizer)

print(len(dataset))

4000


In [41]:
#Train test split the dataset
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

print(len(train_data))
print(len(test_data))

3200
800


In [42]:
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 3

train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [49]:
from tqdm import tqdm
from sklearn.metrics import f1_score, matthews_corrcoef

def train(model, train_loader, num_epochs, optimizer):

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        predictions = []
        ground_truth = []
        
        for batch in tqdm(train_loader):
            inputs = {key: batch[key].to(device) for key in batch if key != 'labels'}
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            predictions.extend(torch.argmax(outputs.logits, axis=1).tolist())
            ground_truth.extend(labels.tolist())
            
        avg_train_loss = total_loss / len(train_loader)
        
        print(f"Epoch {epoch+1}/{num_epochs} - Training loss: {avg_train_loss:.4f} - Training F1 score: {f1_score(ground_truth, predictions)} - Training MCC: {matthews_corrcoef(ground_truth, predictions)}")
        
    evaluate(model, test_loader)

def evaluate(model, test_loader):
    model.eval()
    total_val_loss = 0
    
    predictions = []
    ground_truth = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader):
            inputs = {key: batch[key].to(device) for key in batch if key != 'labels'}
            labels = batch['labels'].to(device)
            
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            
            total_val_loss += loss.item()
            
            predictions.extend(torch.argmax(outputs.logits, axis=1).tolist())
            ground_truth.extend(labels.tolist())
            
    avg_val_loss = total_val_loss / len(test_loader)
    
    print(f"Validation loss: {avg_val_loss:.4f} - Validation F1 score: {f1_score(ground_truth, predictions)} - Validation MCC: {matthews_corrcoef(ground_truth, predictions)}")
    
    return f1_score(ground_truth, predictions), matthews_corrcoef(ground_truth, predictions)
        

In [45]:
train(model, train_loader, num_epochs, optimizer)

100%|██████████| 400/400 [02:58<00:00,  2.24it/s]


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [50]:
f1_score, MCC = evaluate(model, test_loader)

#Save the results
results = {"f1_score": f1_score, "MCC": MCC}
with open("results_base.json", "w") as file:
    json.dump(results, file)

#Save the model
model.save_pretrained("model")

100%|██████████| 100/100 [00:14<00:00,  6.79it/s]

Validation loss: 0.2996 - Validation F1 score: 0.7822222222222223 - Validation MCC: 0.7085354813057592



