In [1]:
#Open the json file
import json

def read_data(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    return data

In [2]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Dataset_en(torch.utils.data.Dataset):
    def __init__(self, path, tokenizer):
        self.data = read_data(path)
        self.tokenizer = tokenizer
        self.max_len = 512

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = self.data[idx]['category']
        if label == "CONSPIRACY":
            label = 1
        else:
            label = 0
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, padding='max_length', truncation=True)
        item = {key: inputs[key].squeeze(0) for key in inputs}
        item['labels'] = torch.tensor(label)

        return item

In [7]:
dataset = Dataset_en("../dataset_en_train.json", tokenizer)

print(len(dataset))

4000


In [8]:
#Train test split the dataset
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

print(len(train_data))
print(len(test_data))

3200
800


In [9]:
#Create weights for the classes of the training data
from sklearn.utils.class_weight import compute_class_weight

labels = [data['labels'].item() for data in train_data]
class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)

print(class_weights)

tensor([0.7718, 1.4197])


In [11]:
from transformers import AutoModel

BERT = AutoModel.from_pretrained("bert-base-uncased")
BERT.config.output_hidden_states = True
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
class BertClassifier(torch.nn.Module):
    def __init__(self, model, num_labels):
        super(BertClassifier, self).__init__()
        self.model = model
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask)
        hidden_states = outputs.hidden_states
        h1 = hidden_states[-1].mean(dim=1)
        h2 = hidden_states[-2].mean(dim=1)
        h3 = hidden_states[-3].mean(dim=1)
        hf = torch.cat((h1, h2, h3), dim=1)
        out = self.classifier(self.dropout(h1))

        return out, hf

In [14]:
model = BertClassifier(BERT, 2)

print(model)

BertClassifier(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [15]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))

train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)

  criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))


In [16]:
from tqdm import tqdm
EPOCHS = 2

model.train()

model.to(device)

for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs, _ = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 400/400 [03:13<00:00,  2.07it/s, loss=0.153] 
Epoch 2:  15%|█▌        | 60/400 [00:30<02:52,  1.97it/s, loss=0.0433]


KeyboardInterrupt: 

In [None]:
#Test the model using the f1 score and the mathew correlation coefficient on the test data
from sklearn.metrics import f1_score, matthews_corrcoef

test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs, _ = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

f1 = f1_score(all_labels, all_preds)
mcc = matthews_corrcoef(all_labels, all_preds)

print(f"F1 Score: {f1}")
print(f"Matthews Correlation Coefficient: {mcc}")

#Save the results
results = {
    "f1": f1,
    "mcc": mcc
}

with open("results_mean_concat", 'w') as file:
    json.dump(results, file)

F1 Score: 0.823529411764706
Matthews Correlation Coefficient: 0.7453826942296566


In [272]:
#Create a numpy array with the embeddings for the train and test data
import numpy as np
from tqdm import tqdm

def create_embeddings(data, model):
    model.eval()
    embeddings = []
    labels = []
    loader = torch.utils.data.DataLoader(data, batch_size=8)
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.append(batch['labels'])
            outputs, hf = model(input_ids, attention_mask)
            embeddings.append(hf)
    embeddings = torch.cat(embeddings)
    labels = torch.cat(labels)
    return embeddings, labels

train_embeddings, train_labels = create_embeddings(train_data, model)
test_embeddings, test_labels = create_embeddings(test_data, model)

train_embeddings = train_embeddings.cpu().numpy()
test_embeddings = test_embeddings.cpu().numpy()

print(train_embeddings.shape)
print(test_embeddings.shape)

100%|██████████| 400/400 [01:37<00:00,  4.08it/s]
100%|██████████| 100/100 [00:24<00:00,  4.08it/s]


(3200, 2304)
(800, 2304)


In [273]:
from lightgbm import LGBMClassifier

In [285]:
lgbm = LGBMClassifier(class_weight="balanced")

In [286]:
lgbm.fit(train_embeddings, train_labels)

[LightGBM] [Info] Number of positive: 1127, number of negative: 2073
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.375287 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 587520
[LightGBM] [Info] Number of data points in the train set: 3200, number of used features: 2304
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [287]:
#Test the modelo using the test data and the f1 score and mathews correlation coefficient
from sklearn.metrics import f1_score, matthews_corrcoef

preds = lgbm.predict(test_embeddings)
f1 = f1_score(test_labels, preds)
mcc = matthews_corrcoef(test_labels, preds)

#Save the results in a json file
results = {"f1": f1, "mcc": mcc}
with open("results_mean_concat_lgbm.json", 'w') as file:
    json.dump(results, file)

print(results)

{'f1': 0.8300395256916996, 'mcc': 0.7514432738601028}


In [288]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

In [289]:
clf.fit(train_embeddings, train_labels)

In [290]:
#Test the modelo using the test data and the f1 score and mathews correlation coefficient
from sklearn.metrics import f1_score, matthews_corrcoef

preds = clf.predict(test_embeddings)
f1 = f1_score(test_labels, preds)
mcc = matthews_corrcoef(test_labels, preds)

#Save the results in a json file
results = {"f1": f1, "mcc": mcc}
with open("results_mean_concat_gbc.json", 'w') as file:
    json.dump(results, file)

print(results)

{'f1': 0.7594433399602386, 'mcc': 0.6491458843846949}
