In [None]:
import os
import re
import sys
from collections import OrderedDict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from scipy.stats import spearmanr, pearsonr, stats
from scipy import spatial

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, BartForSequenceClassification
from torch import nn, optim
import torch.optim as optim

# Enable tqdm's pandas integration (adds the `progress_apply` family to pandas objects).
tqdm.pandas()

# For Google Colab
# When the notebook runs inside Colab, mount Google Drive and read the data
# from it; otherwise fall back to a repository-relative data folder.
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): this is a relative path, so it only points into the mount
    # above when the working directory is /content — presumably the Colab
    # default; confirm.
    PATH = os.path.join("drive", "MyDrive", "LMU", "Str24")

    # IPython shell escape: installs `lightning`, which a later cell imports as `L`.
    !pip install lightning
else:
    PATH = os.path.join("..", "data", "raw")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Get Data

In [None]:
def get_data(subset = None, train_test_path = 'eng_train.csv'):
    """Read a sentence-pair CSV from ``PATH`` and split ``Text`` into two columns.

    Every ``Text`` value is split on newlines. In each resulting piece, any run
    of characters other than ASCII letters and digits is replaced by a single
    space. The first piece becomes ``sen_1`` and the second becomes ``sen_2``.

    Args:
        subset: If not None, the number of rows to sample from the frame
            (sampling uses the fixed ``random_state=42``).
        train_test_path: CSV file name, joined onto the module-level ``PATH``.

    Returns:
        The loaded DataFrame with ``sen_1`` and ``sen_2`` appended, optionally
        reduced to ``subset`` sampled rows.

    Raises:
        IndexError: If a ``Text`` value contains no newline, so there is no
            second piece to use as ``sen_2``.
    """
    non_alnum = re.compile(r"[^a-zA-Z0-9]+")

    def _clean_pieces(text):
        # One cleaned string per newline-separated piece of the raw text.
        return [non_alnum.sub(' ', piece) for piece in text.split("\n")]

    frame = pd.read_csv(os.path.join(PATH, train_test_path))

    # Keep the split pieces in a local Series instead of a temporary column.
    pieces = frame['Text'].apply(_clean_pieces)
    frame["sen_1"] = pieces.apply(lambda parts: parts[0])
    frame["sen_2"] = pieces.apply(lambda parts: parts[1])

    # The preview is rendered before any subsampling takes place.
    display(frame.head())

    if subset is None:
        return frame
    return frame.sample(n=subset, random_state=42)

Model Definition

In [None]:
import lightning as L

class BartClassifier(L.LightningModule):
    """Lightning wrapper around ``BartForSequenceClassification`` with one output.

    The wrapped model is created with ``num_labels=1`` and the training loss is
    the ``loss`` field the wrapped model returns when labels are passed to it.

    Args:
        model_name: Name or path forwarded to
            ``BartForSequenceClassification.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = BartForSequenceClassification.from_pretrained(
            model_name, num_labels=1
        )
        # Not used by the steps below (they rely on the loss returned by the
        # wrapped model); kept so existing references to it keep working.
        self.loss_fn = torch.nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output object unchanged."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log loss and Spearman correlation.

        Args:
            batch: ``(input_ids, attention_mask, labels)`` as produced by the
                training ``TensorDataset``.
            batch_idx: Index of the batch (unused).

        Returns:
            The loss reported by the wrapped model for this batch.
        """
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

        # Flatten to 1-D float32 so the correlation is computed between two
        # plain vectors regardless of the (batch, 1) layout or half precision.
        preds = outputs.logits.detach().float().flatten().cpu().numpy()
        targets = labels.detach().float().flatten().cpu().numpy()

        # Spearman is undefined (NaN) for a single sample or a constant vector.
        # Logging that NaN would turn the epoch-level average into NaN, so such
        # batches are left out of the metric. A NaN inside either vector makes
        # the range check below False as well.
        if preds.size > 1 and np.ptp(preds) > 0 and np.ptp(targets) > 0:
            spearman = spearmanr(preds, targets).statistic
            self.log("train_spearman", spearman, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the raw logits for an unlabeled ``(input_ids, attention_mask)`` batch."""
        input_ids, attention_mask = batch
        outputs = self(input_ids, attention_mask)
        return outputs.logits

    def configure_optimizers(self):
        """Create the AdamW optimizer (lr=5e-5) over the wrapped model's parameters."""
        self.optimizer = optim.AdamW(self.model.parameters(), lr=5e-5)
        return self.optimizer


def prepare_torch_dataset(df, tokenizer, dev = False):
    """Tokenize the sentence pairs of ``df`` and pack them into a ``TensorDataset``.

    Args:
        df: Frame with ``sen_1`` and ``sen_2`` columns; a ``Score`` column is
            additionally read when ``dev`` is False.
        tokenizer: Callable invoked with the two sentence lists plus
            ``return_tensors="pt"``, ``padding=True``, ``truncation=True`` and
            ``max_length=256``; its result is indexed by ``"input_ids"`` and
            ``"attention_mask"``.
        dev: If True, build an unlabeled dataset (no ``Score`` access).

    Returns:
        ``TensorDataset(input_ids, attention_mask)`` when ``dev`` is True,
        otherwise ``TensorDataset(input_ids, attention_mask, labels)`` where
        ``labels`` is a float32 tensor with a trailing dimension of size 1.
    """
    first_sentences = df["sen_1"].tolist()
    second_sentences = df["sen_2"].tolist()

    encoding = tokenizer(
        first_sentences,
        second_sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )

    tensors = [encoding["input_ids"], encoding["attention_mask"]]
    if not dev:
        # One score per row, stored as a column vector.
        scores = torch.tensor(df["Score"].tolist(), dtype=torch.float32)
        tensors.append(scores.unsqueeze(1))

    return TensorDataset(*tensors)


def run_BART(df,df_dev, model_name = "facebook/bart-base", epochs=3, batch_size=8, test_run=False):
    """Fine-tune BART on ``df`` and return predictions for ``df_dev``.

    Args:
        df: Training frame; needs the ``sen_1``, ``sen_2`` and ``Score``
            columns read by ``prepare_torch_dataset``.
        df_dev: Frame to predict on; needs ``sen_1`` and ``sen_2``.
        model_name: Model id or path used for both the tokenizer and the
            model. The default is a valid Hub id (the previous default,
            ``"bart-base-uncased"``, did not name an existing model).
        epochs: Maximum number of training epochs.
        batch_size: Batch size of both the training and the prediction loader.
            (Previously ignored: the loaders were hard-coded to 4 and 2.)
        test_run: Forwarded to the Trainer as ``fast_dev_run``. When True,
            Lightning runs a single batch of training and of prediction, so
            the returned list then covers only the first dev batch.

    Returns:
        The list returned by ``Trainer.predict``: one logits tensor per
        prediction batch, in dev-set order (the dev loader is not shuffled).
    """
    # Load the BART tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Initialize the Lightning model
    model = BartClassifier(model_name)
    trainer = L.Trainer(
        accelerator="auto",
        max_epochs=epochs,
        num_sanity_val_steps=0,
        precision="16-mixed",
        enable_checkpointing=False,
        fast_dev_run=test_run,
    )

    # Tokenize both frames; the dev frame has no labels.
    train_data = prepare_torch_dataset(df, tokenizer)
    dev_data = prepare_torch_dataset(df_dev, tokenizer, dev=True)

    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    # shuffle=False keeps predictions aligned with the row order of df_dev.
    dev_dataloader = DataLoader(dev_data, batch_size=batch_size, shuffle=False, num_workers=2)

    # Train the model, then predict on the dev data
    trainer.fit(model, train_dataloader)
    predictions = trainer.predict(model, dev_dataloader)

    return predictions

### Model training and prediction on the dev set

In [None]:
# Load both splits. `subset=None` keeps every row; pass e.g. `subset=100`
# to prototype on a small sample.
df_train = get_data(train_test_path='eng_train.csv', subset=None)
df_dev = get_data(train_test_path='eng_dev.csv', subset=None)

# Fine-tune on the train split and collect the per-batch predictions for the
# dev split. Alternative model: facebook/bart-large.
# `test_run` is forwarded to run_BART (intended as a one-batch smoke test).
preds = run_BART(
    df_train,
    df_dev,
    model_name="facebook/bart-base",
    epochs=4,
    batch_size=8,
    test_run=False,
)

print(preds)

Unnamed: 0,PairID,Text,Score,sen_1,sen_2
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,It that happens just pull the plug,if that ever happens just pull the plug
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,A black dog running through water,A black dog is running through some water
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,I ve been searchingthe entire abbey for you,I m looking for you all over the abbey
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,If he is good looking and has a good personali...,If he s good looking and a good personality he...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,She does not hate you she is just annoyed with...,She doesn t hate you she is just annoyed


Unnamed: 0,PairID,Text,sen_1,sen_2
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...,The story is gripping and interesting,It s a brilliant compelling and heartfelt story
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,The majority of Southeast Alaska s area is par...,A lot of of the panhandle is part of the Tonga...
2,ENG-dev-0002,and from your post i think you are to young to...,and from your post i think you are to young to...,I think it will be very bad if he acquires her...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,The film s success also made Dreamworks Animat...,There have also been two sequels LRB follow up...
4,ENG-dev-0004,I am still confused about how I feel about thi...,I am still confused about how I feel about thi...,In this particular book Blue and Gansey are st...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=4` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Flatten the per-batch prediction tensors into one flat list of floats.
# reshape(-1) is used instead of squeeze(): squeeze() turns a one-row batch
# (e.g. the last batch of an odd-sized dev set) into a 0-d tensor whose
# .tolist() is a bare float, which extend() rejects with a TypeError.
preds_cpl = []

for batch in preds:
    preds_cpl.extend(batch.reshape(-1).tolist())

In [None]:
# Re-read the dev file for its PairID column and attach the flattened
# predictions positionally; preds_cpl must have exactly one value per row.
test_set = pd.read_csv(os.path.join(PATH, 'eng_dev.csv'))
submission = test_set[['PairID']].assign(Pred_Score=preds_cpl)

In [None]:
# Show the submission frame (PairID and Pred_Score columns) as the cell output.
submission

In [None]:
# Write the submission into the data folder: header row kept, row index omitted.
submission.to_csv(
    os.path.join(PATH, 'submission_file.csv'),
    index=False,
    header=True,
)

Results (Spearman correlation):
- `facebook/bart-base`: 0.85
- `facebook/bart-large`: 0.87