In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.stats import spearmanr, pearsonr
from scipy import spatial
import torch
from tqdm.notebook import tqdm
tqdm.pandas()


from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import torch
from torch import nn, optim
import scipy.stats

In [None]:
PATH = os.path.join("")

df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
df_train["Split_Text"] = df_train["Text"].apply(lambda x: x.replace("\n", " "))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: x.split("\r"))
df_train['Split_Text'] = df_train['Split_Text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in x])

df_train["sen_1"] = df_train["Split_Text"].apply(lambda x: x[0])
df_train["sen_2"] = df_train["Split_Text"].apply(lambda x: x[1])
df_train.drop(["Split_Text"], axis=1, inplace=True)
display(df_train.head())

# For testing puposes:
# df_train = df_train.sample(n=1000, random_state=42)

# Model Training

RUN WITH CUDA
- training time on a good CPU (Ryzen 5800X) ~ 50h
- training time on Google Colab with free GPU ~ 25 minutes 

In [None]:
!nvidia-smi
import torch
torch.cuda.is_available()

In [None]:
# Tokenize and convert to BERT embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prepare_data(df):
    sentences = df['sen_1'] + ' [SEP] ' + df['sen_2']
    tokenized = tokenizer(sentences.tolist(), return_tensors='pt', padding=True, truncation=True)
    labels = torch.tensor(df['Score'].tolist(), dtype=torch.float32).unsqueeze(1)
    return tokenized, labels

# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

all_spearman_corrs = []

for fold, (train_idx, val_idx) in enumerate(kf.split(df_train)):
    print(f'Fold {fold + 1}/{kf.get_n_splits()}')

    # Split data into train and validation sets
    train_df, val_df = df_train.iloc[train_idx], df_train.iloc[val_idx]

    train_data, train_labels = prepare_data(train_df)
    val_data, val_labels = prepare_data(val_df)

    # Define a new BERT model for each fold
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
    model = model.to('cuda')  # Move model to GPU
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)

    # Train the model
    epochs = 3
    batch_size = 8

    train_dataset = TensorDataset(train_data['input_ids'], train_data['attention_mask'], train_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in tqdm(range(epochs)):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs, masks, labels = batch
            inputs, masks, labels = inputs.to('cuda'), masks.to('cuda'), labels.to('cuda')  # Move inputs and labels to GPU
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}')

    # Evaluate on validation data
    model.eval()
    with torch.no_grad():
        val_data = {key: value.to('cuda') for key, value in val_data.items()}  # Move inputs to GPU
        outputs = model(**val_data)
        predictions = outputs.logits.squeeze().cpu().numpy()

    # Calculate Mean Squared Error
    mse = mean_squared_error(val_labels, predictions)
    print(f'Validation Mean Squared Error: {mse}')

    # Calculate Spearman correlation
    spearman_corr, _ = scipy.stats.spearmanr(predictions, val_labels.squeeze().numpy())
    print(f'Validation Spearman Correlation: {spearman_corr}')

    all_spearman_corrs.append(spearman_corr)

# Calculate and print the average Spearman correlation
average_spearman_corr = sum(all_spearman_corrs) / len(all_spearman_corrs)
print(f'Average Spearman Correlation across all folds: {average_spearman_corr}')


Receives ~ 0.82 Spearman correlation