In [110]:
import pandas as pd

#Load the data
data = pd.read_csv('dataset_en_sentiment.csv')

In [111]:
data["logits"] = data["logits"].apply(lambda x: x.replace("[", "").replace("]", "").split(" "))

#Remove the empty strings
data["logits"] = data["logits"].apply(lambda x: [float(i) for i in x if i != ""])

data["logits"]

0       [-4.228206, -6.7481823, -4.9048667, -4.0775337...
1       [-5.1360297, -5.928312, -5.695694, -4.957018, ...
2       [-6.5882273, -6.330186, -5.984769, -4.4509463,...
3       [-5.051609, -7.2605085, -6.6109424, -4.9246716...
4       [-6.1479216, -6.5357866, -6.369592, -4.4938283...
                              ...                        
3995    [-7.1117854, -6.361661, -5.0720863, -3.2259457...
3996    [-5.45811, -6.722521, -5.838917, -4.4321647, -...
3997    [-6.074478, -6.4609656, -6.4848986, -5.039374,...
3998    [-5.615449, -5.9232645, -5.973674, -4.699154, ...
3999    [-6.0916843, -5.4664063, -4.9749045, -2.936697...
Name: logits, Length: 4000, dtype: object

In [112]:
data["text"] = data["text"].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", "").split(","))

data["text"]

0       [THIS IS MASSIVE Australian Senator Malcolm Ro...
1       [“ I ’m deeply concerned that the push to vacc...
2       [2021 They wanted to know your vaccination sta...
3       [Anthony Fauci once again defended brutal Chin...
4       [Proof has emerged showing that death from Wuh...
                              ...                        
3996    [I personally do n’t believe Putin would set o...
3997    [Pfizer lied,  We know that,  "There s no doub...
3998    [It is utterly bizarre and inexplicable Dr,  J...
3999    ["I do nt know about you but I m getting extre...
Name: text, Length: 4000, dtype: object

In [7]:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [8]:
class Dataset_en(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data["text"][idx]
        logits = self.data["logits"][idx]

In [9]:
dataset = Dataset_en("../dataset_en_train.json", tokenizer)

print(len(dataset))

4000


In [10]:
#Train test split the dataset
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

print(len(train_data))
print(len(test_data))

3200
800


In [11]:
#Create weights for the classes of the training data
from sklearn.utils.class_weight import compute_class_weight

labels = [data['labels'].item() for data in train_data]
class_weights = compute_class_weight('balanced', classes=[0, 1], y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)

print(class_weights)

tensor([0.7718, 1.4197])


In [12]:
from transformers import AutoModel

BERT = AutoModel.from_pretrained("bert-base-uncased")
BERT.config.output_hidden_states = True
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [20]:
class AttentionPooling(torch.nn.Module):
    def __init__(self, hidden_size):
        super(AttentionPooling, self).__init__()
        self.hidden_size = hidden_size
        self.query = torch.nn.Linear(hidden_size, hidden_size)
        self.key = torch.nn.Linear(hidden_size, hidden_size)
        self.value = torch.nn.Linear(hidden_size, hidden_size)
        
        self.f1 = torch.nn.Linear(hidden_size, hidden_size)
        
        self.softmax = torch.nn.Softmax(dim=1)
        
    def forward(self, inputs):
        #Inputs: (batch_size, seq_len, hidden_size)
        query = self.query(inputs)
        key = self.key(inputs)
        value = self.value(inputs)
        
        attention_scores = torch.bmm(query, key.transpose(1, 2)) / self.hidden_size**0.5
        attention_weights = self.softmax(attention_scores)
        
        context = torch.bmm(attention_weights, value)
        
        context = self.f1(context)
        
        pooled_output = context.mean(dim=1)
        
        return pooled_output

class BertClassifier(torch.nn.Module):
    def __init__(self, bert, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert
        self.pooling = AttentionPooling(hidden_size=bert.config.hidden_size)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(bert.config.hidden_size, num_classes)
        
        self.ffw = torch.nn.Sequential(
            torch.nn.Linear(bert.config.hidden_size, bert.config.hidden_size * 4),
            torch.nn.ReLU(),
            torch.nn.Linear(bert.config.hidden_size * 4, bert.config.hidden_size),
        )
        
    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        pooled_output = self.pooling(x)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        return logits
        

In [22]:
model = BertClassifier(BERT, num_classes=2)

print(model)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))

train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

  criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))


In [24]:
from tqdm import tqdm
from sklearn.metrics import f1_score, matthews_corrcoef

EPOCHS = 5

model.train()

model.to(device)

best_mcc = -1

for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())
        
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in tqdm(test_loader, leave=True):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    print(f"F1 Score: {f1}")
    print(f"MCC: {mcc}")
    
    if mcc > best_mcc:
        best_mcc = mcc
        torch.save(model.state_dict(), "model_BERT_Attention_Linear.pth")
        
    #Save the results each epoch
    with open(f"results_BERT_Attention_Linear_{epoch+1}.txt", 'w') as file:
        file.write(f"F1 Score: {f1}\n")
        file.write(f"MCC: {mcc}\n")

Epoch 1: 100%|██████████| 400/400 [03:11<00:00,  2.08it/s, loss=0.314] 
100%|██████████| 100/100 [00:15<00:00,  6.41it/s]


F1 Score: 0.8377358490566038
MCC: 0.7596927582631647


Epoch 2: 100%|██████████| 400/400 [03:12<00:00,  2.08it/s, loss=0.0569]
100%|██████████| 100/100 [00:16<00:00,  6.22it/s]


F1 Score: 0.8351648351648352
MCC: 0.7795147360812277


Epoch 3: 100%|██████████| 400/400 [03:14<00:00,  2.06it/s, loss=0.318]  
100%|██████████| 100/100 [00:16<00:00,  6.23it/s]


F1 Score: 0.8645640074211504
MCC: 0.8000260048539191


Epoch 4: 100%|██████████| 400/400 [03:15<00:00,  2.05it/s, loss=0.00119] 
100%|██████████| 100/100 [00:16<00:00,  6.19it/s]


F1 Score: 0.8765432098765431
MCC: 0.823954502802737


Epoch 5: 100%|██████████| 400/400 [03:15<00:00,  2.04it/s, loss=0.000788]
100%|██████████| 100/100 [00:16<00:00,  6.14it/s]

F1 Score: 0.8612836438923396
MCC: 0.8030201065650994





In [37]:
#Load the best model
model.load_state_dict(torch.load("model_BERT_Attention.pth"))

<All keys matched successfully>

In [38]:
#Test the model using the f1 score and the mathew correlation coefficient on the test data
from sklearn.metrics import f1_score, matthews_corrcoef

model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    loop = tqdm(test_loader, leave=True)
    for batch in loop:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

f1 = f1_score(all_labels, all_preds)
mcc = matthews_corrcoef(all_labels, all_preds)

print(f"F1 Score: {f1}")
print(f"Matthews Correlation Coefficient: {mcc}")

#Save the results
results = {
    "f1": f1,
    "mcc": mcc
}

with open("BERT_AttentionPooling.json", 'w') as file:
    json.dump(results, file)

100%|██████████| 100/100 [00:15<00:00,  6.40it/s]

F1 Score: 0.8702928870292888
Matthews Correlation Coefficient: 0.8177246968660349



