In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.stats import spearmanr, pearsonr
from scipy import spatial
import torch
from tqdm.notebook import tqdm
tqdm.pandas()


from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import torch
from torch import nn, optim
import scipy.stats

In [13]:
PATH = os.path.join("..", "data", "raw")

# Read the English training pairs; the "Text" column carries both sentences of a pair.
df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))

# Per row: turn "\n" into a space, cut the text at "\r", then collapse every run of
# non-alphanumeric characters inside each piece into a single space.
sentence_parts = df_train["Text"].apply(
    lambda text: [
        re.sub(r"[^a-zA-Z0-9]+", ' ', part)
        for part in text.replace("\n", " ").split("\r")
    ]
)

# The first two pieces become the two sentences of the pair.
df_train["sen_1"] = sentence_parts.apply(lambda parts: parts[0])
df_train["sen_2"] = sentence_parts.apply(lambda parts: parts[1])
display(df_train.head())

# For testing purposes:
# df_train = df_train.sample(n=500, random_state=42)

Unnamed: 0,PairID,Text,Score,sen_1,sen_2
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed


# Model Training

**Run with CUDA (a GPU is strongly recommended):**
- Training time on a good CPU (Ryzen 5800X): ~50 h
- Training time on Google Colab with a free GPU: ~25 minutes

In [31]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim
import lightning as L
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import scipy.stats
from tqdm import tqdm
from collections import OrderedDict


class BertClassifier(L.LightningModule):
    """Lightning wrapper around a single-output ``BertForSequenceClassification``.

    Batches are ``(input_ids, attention_mask, labels)`` tuples. Training uses
    the loss returned by the wrapped model when labels are supplied. Testing
    collects every batch's predictions and labels and, at the end of the test
    epoch, logs the MSE (``val_loss``) and the Spearman correlation
    (``val_spearman_corr``) computed over all collected samples.
    """

    def __init__(self):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
        self.loss_fn = torch.nn.MSELoss()

        # Per-batch buffers filled by test_step and drained by on_test_epoch_end.
        self.val_predictions = []
        self.val_labels = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; it is given ``labels`` only when provided."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        """Return the wrapped model's loss for one training batch."""
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        return outputs.loss

    def test_step(self, batch, batch_idx):
        """Predict one batch and buffer predictions/labels for epoch-end metrics.

        Returns:
            The ``(predictions, labels)`` pair: 1-D predictions and the
            labels exactly as they arrived in the batch.
        """
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask)

        # Flatten explicitly to 1-D. A bare ``squeeze()`` would turn a batch
        # holding a single sample into a 0-d tensor, which ``torch.cat`` in
        # on_test_epoch_end refuses to concatenate.
        predictions = outputs.logits.reshape(-1)

        # Append predictions and labels to the lists
        self.val_predictions.append(predictions)
        self.val_labels.append(labels.reshape(-1))

        return predictions, labels

    def on_test_epoch_end(self):
        """Compute and log MSE and Spearman correlation over the buffered batches."""
        # Concatenate predictions and labels at the end of each epoch
        predictions = torch.cat(self.val_predictions)
        labels = torch.cat(self.val_labels)

        mse = self.loss_fn(predictions, labels)
        spearman_corr, _ = scipy.stats.spearmanr(predictions.cpu().numpy(), labels.cpu().numpy())

        # Log the metrics
        self.log('val_loss', mse, prog_bar=True)
        self.log('val_spearman_corr', spearman_corr, prog_bar=True)

        # Clear the lists for the next epoch
        self.val_predictions = []
        self.val_labels = []

    def configure_optimizers(self):
        """Create, store on ``self.optimizer``, and return an AdamW optimizer (lr=5e-5)."""
        self.optimizer = optim.AdamW(self.model.parameters(), lr=5e-5)
        return self.optimizer



def prepare_data(df, tokenizer):
    """Tokenize the sentence pairs of ``df`` and bundle them with their scores.

    Args:
        df: Frame providing the 'sen_1', 'sen_2' and 'Score' columns.
        tokenizer: Callable invoked with the two sentence lists plus
            ``return_tensors``, ``padding``, ``truncation`` and ``max_length``
            keyword arguments; its result is indexed by 'input_ids' and
            'attention_mask'.

    Returns:
        A ``TensorDataset`` of (input_ids, attention_mask, labels), where
        labels are float32 with one trailing dimension of size 1.
    """
    first_sentences = df['sen_1'].tolist()
    second_sentences = df['sen_2'].tolist()

    tokenizer_options = dict(
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=256,
    )
    encoded = tokenizer(first_sentences, second_sentences, **tokenizer_options)

    # Scores become a column vector: shape (n,) -> (n, 1).
    scores = torch.tensor(df['Score'].tolist(), dtype=torch.float32)
    labels = scores[:, None]

    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One seed drives both the fold split and the per-fold training, so a
# Restart & Run All reproduces the same fold scores.
SEED = 42

# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

all_spearman_corrs = []
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train)):
    print(f'Fold {fold + 1}/{kf.get_n_splits()}')

    # Re-seed before each fold: the freshly initialised classifier head and the
    # shuffled training DataLoader are otherwise different on every run.
    L.seed_everything(SEED, workers=True)

    # Split data into train and validation sets
    train_df, val_df = df_train.iloc[train_idx], df_train.iloc[val_idx]

    train_data = prepare_data(train_df, tokenizer)
    val_data = prepare_data(val_df, tokenizer)

    train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_data, batch_size=8, shuffle=False)

    # Initialize the Lightning model (a fresh pretrained model for every fold)
    model = BertClassifier()

    # Trainer
    trainer = L.Trainer(accelerator="auto", max_epochs=3, num_sanity_val_steps=0)

    # Train the model
    trainer.fit(model, train_dataloader)

    # Evaluate on validation data
    trainer.test(model, val_dataloader)

    # The model logs one Spearman value for the whole held-out fold; convert it
    # to a plain float so printing and the final average do not depend on the
    # tensor type stored in callback_metrics.
    average_spearman_corr = float(trainer.callback_metrics['val_spearman_corr'])
    print(f'Average Spearman Correlation for Fold {fold + 1}: {average_spearman_corr}')
    all_spearman_corrs.append(average_spearman_corr)


# Calculate and print the overall average Spearman correlation
overall_average_spearman_corr = sum(all_spearman_corrs) / len(all_spearman_corrs)
print(f'Overall Average Spearman Correlation across all folds: {overall_average_spearman_corr}')

Fold 1/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                          | Params
----------------------------------------------------------
0 | model   | BertForSequenceClassification | 109 M 
1 | loss_fn | MSELoss                       | 0     
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 1: 0.6074076879484503
Fold 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                          | Params
----------------------------------------------------------
0 | model   | BertForSequenceClassification | 109 M 
1 | loss_fn | MSELoss                       | 0     
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 2: 0.743916268649307
Fold 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                          | Params
----------------------------------------------------------
0 | model   | BertForSequenceClassification | 109 M 
1 | loss_fn | MSELoss                       | 0     
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 3: 0.5957474328064633
Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                          | Params
----------------------------------------------------------
0 | model   | BertForSequenceClassification | 109 M 
1 | loss_fn | MSELoss                       | 0     
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 4: 0.35258521533443743
Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                          | Params
----------------------------------------------------------
0 | model   | BertForSequenceClassification | 109 M 
1 | loss_fn | MSELoss                       | 0     
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 5: 0.8389096502784891
Overall Average Spearman Correlation across all folds: 0.6277132510034293


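Achieves ~0.82 Spearman correlation. (Note: the run logged above reports an overall average of ~0.63 across the five folds, so the 0.82 figure presumably comes from a different run — verify.)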