In [1]:
import os
import re
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from scipy.stats import spearmanr, pearsonr, stats
from scipy import spatial

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn, optim
import torch.optim as optim

# Register tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()

# Environment-dependent setup: decide where the raw CSV files are read from.
# On Google Colab, Google Drive is mounted and `lightning` (imported in a later cell) is installed.
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): this is a relative path; presumably it resolves against Colab's
    # default working directory (/content), where the drive was mounted above - confirm.
    PATH = os.path.join("drive", "MyDrive", "LMU", "AppliedDL", "data", "raw")
    
    !pip install lightning
else:
    # Outside Colab: the raw data is read from ../data/raw relative to the working directory.
    PATH = os.path.join("..", "data", "raw")

Get Data

In [2]:
def get_data(subset):
    """Load the English training pairs and split every text into two sentences.

    Reads ``eng_train.csv`` from ``PATH``. Each ``Text`` value is split on
    ``"\\r"`` after newlines were turned into spaces, and every run of
    non-alphanumeric characters in the pieces is collapsed to a single space.
    The first two pieces are stored in the new columns ``sen_1`` and ``sen_2``.
    The head of the prepared frame is displayed as a side effect.

    Args:
        subset: Number of rows to sample (with ``random_state=42``), or
            ``None`` to return every row.

    Returns:
        The prepared ``pandas.DataFrame`` (sampled when ``subset`` is given).

    Raises:
        IndexError: If a text yields fewer than two pieces.
    """

    def _clean_and_split(text):
        # Newlines become spaces first, which leaves "\r" as the only separator.
        pieces = text.replace("\n", " ").split("\r")
        return [re.sub(r"[^a-zA-Z0-9]+", ' ', piece) for piece in pieces]

    df_train = pd.read_csv(os.path.join(PATH, 'eng_train.csv'))
    df_train["Split_Text"] = df_train["Text"].apply(_clean_and_split)

    # Explicit indexing (not `.str[i]`) so a missing second sentence still raises.
    for position, column in enumerate(("sen_1", "sen_2")):
        df_train[column] = df_train["Split_Text"].apply(
            lambda pieces, i=position: pieces[i]
        )

    # The intermediate list column is only needed to derive the two sentences.
    df_train = df_train.drop(columns=["Split_Text"])
    display(df_train.head())

    if subset is None:
        return df_train
    return df_train.sample(n=subset, random_state=42)

Model Definition

In [3]:
import lightning as L

class Classifier(L.LightningModule):
    """Lightning wrapper that fine-tunes a sequence-classification model as a regressor.

    The wrapped Hugging Face model is created with ``num_labels=1``, i.e. it
    emits a single score per input pair. Training uses the loss returned by
    the wrapped model; testing collects all predictions and labels and logs
    the MSE (``val_loss``) and the Spearman correlation
    (``val_spearman_corr``) over the whole test set.

    Args:
        model_name: Name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=1
        )
        self.loss_fn = torch.nn.MSELoss()

        # Per-batch buffers filled by `test_step` and consumed by `on_test_epoch_end`.
        self.val_predictions = []
        self.val_labels = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; when ``labels`` is given the output carries a loss."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        """Return the loss computed by the wrapped model for one training batch."""
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        return outputs.loss

    def test_step(self, batch, batch_idx):
        """Predict one batch and buffer predictions/labels for the epoch-level metrics.

        Returns:
            Tuple of the flattened predictions and the labels as received.
        """
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask)

        # Flatten with reshape(-1) rather than a bare squeeze(): squeeze() turns a
        # batch holding a single sample into a 0-dim tensor, and torch.cat cannot
        # concatenate 0-dim tensors, so the epoch-end hook would fail on such a
        # (typically last) batch.
        predictions = outputs.logits.reshape(-1)
        flat_labels = labels.reshape(-1)

        # Append predictions and labels to the lists
        self.val_predictions.append(predictions)
        self.val_labels.append(flat_labels)

        return predictions, labels

    def on_test_epoch_end(self):
        """Compute and log MSE and Spearman correlation over all buffered batches."""
        # Concatenate predictions and labels at the end of each epoch
        predictions = torch.cat(self.val_predictions)
        labels = torch.cat(self.val_labels)

        mse = self.loss_fn(predictions, labels)
        spearman_corr, _ = spearmanr(
            predictions.cpu().numpy(), labels.cpu().numpy()
        )

        # Log the metrics (these keys are read back from `trainer.callback_metrics`)
        self.log("val_loss", mse, prog_bar=True)
        self.log("val_spearman_corr", spearman_corr, prog_bar=True)

        # Clear the lists for the next epoch
        self.val_predictions = []
        self.val_labels = []

    def configure_optimizers(self):
        """Create the AdamW optimizer (lr=5e-5) over the wrapped model's parameters."""
        self.optimizer = optim.AdamW(self.model.parameters(), lr=5e-5)
        return self.optimizer


def prepare_torch_dataset(df, tokenizer):
    """Tokenize the sentence pairs of ``df`` and bundle them with their scores.

    Args:
        df: Frame providing the columns ``sen_1``, ``sen_2`` and ``Score``.
        tokenizer: Callable taking two lists of strings plus encoding keyword
            arguments and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        ``TensorDataset`` of ``(input_ids, attention_mask, labels)`` where
        ``labels`` is a float32 tensor with a trailing dimension of size 1.
    """
    first_sentences = df["sen_1"].tolist()
    second_sentences = df["sen_2"].tolist()
    encoding_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 256,
    }
    encoded = tokenizer(first_sentences, second_sentences, **encoding_options)

    # One score per pair, kept as a column vector.
    scores = torch.tensor(df["Score"].tolist(), dtype=torch.float32)
    labels = scores[:, None]

    return TensorDataset(encoded["input_ids"], encoded["attention_mask"], labels)


def run_cv(df, model_name, n_splits=5, epochs=3, batch_size=8, test_run=False):
    """Cross-validate a freshly created ``Classifier`` per fold and report Spearman correlation.

    For every fold a new model and trainer are created, the model is fitted
    on the training part and tested on the held-out part. The fold's
    ``val_spearman_corr`` metric is read from the trainer and printed; the
    mean over all folds is printed and returned.

    Args:
        df: Frame with the columns required by ``prepare_torch_dataset``.
        model_name: Checkpoint name used for both the tokenizer and the model.
        n_splits: Number of folds (overridden to 2 when ``test_run`` is set).
        epochs: ``max_epochs`` passed to the trainer.
        batch_size: Batch size of both data loaders.
        test_run: When true, use two folds and pass ``fast_dev_run=True``
            to the trainer.

    Returns:
        The mean of the per-fold Spearman correlations.
    """
    # A test run always uses two folds, whatever `n_splits` says.
    if test_run:
        n_splits = 2

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _make_loader(frame, shuffle):
        # Tokenize one part of the fold and wrap it in a loader.
        return DataLoader(
            prepare_torch_dataset(frame, tokenizer),
            batch_size=batch_size,
            num_workers=2,
            shuffle=shuffle,
        )

    all_spearman_corrs = []
    for fold, (train_rows, val_rows) in enumerate(kf.split(df)):
        print(f"Fold {fold + 1}/{kf.get_n_splits()}")

        # Every fold starts from a new model instance and a new trainer.
        model = Classifier(model_name)
        trainer = L.Trainer(
            accelerator="auto",
            max_epochs=epochs,
            num_sanity_val_steps=0,
            fast_dev_run=test_run,
        )

        # Only the training batches are shuffled.
        train_dataloader = _make_loader(df.iloc[train_rows], shuffle=True)
        val_dataloader = _make_loader(df.iloc[val_rows], shuffle=False)

        trainer.fit(model, train_dataloader)
        trainer.test(model, val_dataloader)

        # The metric is logged by the model at the end of the test epoch.
        average_spearman_corr = trainer.callback_metrics["val_spearman_corr"].mean()
        print(
            f"Average Spearman Correlation for Fold {fold + 1}: {average_spearman_corr}"
        )
        all_spearman_corrs.append(average_spearman_corr)

    # Unweighted mean over the folds.
    overall_average_spearman_corr = sum(all_spearman_corrs) / len(all_spearman_corrs)
    print(
        f"Overall Average Spearman Correlation across all folds: {overall_average_spearman_corr}"
    )

    return overall_average_spearman_corr

### Model training and evaluation using CV

Run this with CUDA GPU acceleration if possible. Approximate runtimes:
- 8 h on a fast CPU
- XXX min on Colab (TODO: fill in the measured time)

In [4]:
# subset=None loads the full dataset; use e.g. subset=100 while prototyping.
df = get_data(subset=None)

# test_run=True only runs a quick test on one batch; set it to False for the real CV run.
score = run_cv(
    df=df,
    model_name="t5-large",
    n_splits=5,
    epochs=5,
    batch_size=10,
    test_run=True,
)

Unnamed: 0,PairID,Text,Score,sen_1,sen_2
0,ENG-train-0000,"It that happens, just pull the plug.\r\nif tha...",1.0,It that happens just pull the plug,if that ever happens just pull the plug
1,ENG-train-0001,A black dog running through water.\r\nA black ...,1.0,A black dog running through water,A black dog is running through some water
2,ENG-train-0002,I've been searchingthe entire abbey for you.\r...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality h...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Fold 1/2


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\dchro\.conda\envs\AppliedDL\lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install 

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
c:\Users\dchro\.conda\envs\AppliedDL\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing: |          | 0/? [00:00<?, ?it/s]



Average Spearman Correlation for Fold 1: nan
Fold 2/2


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.

  | Name    | Type                        | Params
--------------------------------------------------------
0 | model   | T5ForSequenceClassification | 738 M 
1 | loss_fn | MSELoss                     | 0     
--------------------------------------------------------
738 M     Trainable params
0         Non-trainable params
738 M     Total par

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

Average Spearman Correlation for Fold 2: nan
Overall Average Spearman Correlation across all folds: nan


Results (overall average Spearman correlation across folds, as returned by `run_cv`)
- t5-small, 3 epochs - 0.57, 5 epochs - 0.75, 9 epochs - 0.8
- t5-base, 3 epochs - 0.72, 9 epochs - 0.83