In [25]:
#Open the json file
import json

def read_data(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    return data

In [26]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [27]:
class Dataset_en(torch.utils.data.Dataset):
    def __init__(self, path, tokenizer):
        self.data = read_data(path)
        self.tokenizer = tokenizer
        self.max_len = 512

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = self.data[idx]['category']
        if label == "CONSPIRACY":
            label = 1
        else:
            label = 0
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, padding='max_length', truncation=True)
        item = {key: inputs[key].squeeze(0) for key in inputs}
        item['labels'] = torch.tensor(label)

        return item

In [28]:
dataset = Dataset_en("../dataset_en_train.json", tokenizer)

print(len(dataset))

4000


In [29]:
#Train test split the dataset
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

print(len(train_data))
print(len(test_data))

3200
800


In [30]:
#Create weights for the classes of the training data
from sklearn.utils.class_weight import compute_class_weight

labels = [data['labels'].item() for data in train_data]
class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)

print(class_weights)

tensor([0.7718, 1.4197])


In [31]:
from transformers import AutoModel

BERT = AutoModel.from_pretrained("bert-base-uncased")
BERT.config.output_hidden_states = True
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [32]:
class BertLSTMClassifier(torch.nn.Module):
    def __init__(self, bert, hidden_size, num_classes):
        super(BertLSTMClassifier, self).__init__()
        self.bert = bert
        self.lstm = torch.nn.LSTM(input_size=768, hidden_size=hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.linear = torch.nn.Linear(hidden_size*2, num_classes)
        self.dropout = torch.nn.Dropout(0.5)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states
        x = hidden_states[-1]
        #The lstm should process the sequence of hidden states and use the last hidden state as input to the linear layer
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [33]:
model = BertLSTMClassifier(BERT, hidden_size=256, num_classes=2)

print(model)

BertLSTMClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [34]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))

train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)

  criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))


In [35]:
from tqdm import tqdm

EPOCHS = 2

model.train()

model.to(device)

for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 400/400 [04:48<00:00,  1.38it/s, loss=0.412] 
Epoch 2: 100%|██████████| 400/400 [04:50<00:00,  1.38it/s, loss=0.816] 


In [37]:
#Test the model using the f1 score and the mathew correlation coefficient on the test data
from sklearn.metrics import f1_score, matthews_corrcoef

test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    loop = tqdm(test_loader, leave=True)
    for batch in loop:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

f1 = f1_score(all_labels, all_preds)
mcc = matthews_corrcoef(all_labels, all_preds)

print(f"F1 Score: {f1}")
print(f"Matthews Correlation Coefficient: {mcc}")

#Save the results
results = {
    "f1": f1,
    "mcc": mcc
}

with open("BERT_LSTM", 'w') as file:
    json.dump(results, file)

100%|██████████| 100/100 [01:21<00:00,  1.23it/s]

F1 Score: 0.8560460652591171
Matthews Correlation Coefficient: 0.7875684224774842



