In [None]:
import time
import torch
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, cohen_kappa_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, AddedToken
from datasets import Dataset

In [None]:
# Load data
train_essays = pd.read_csv('data/train.csv')

# Optional subsample for quick experiments; set n_samples to -1 to use every essay.
# The draw is seeded so that the subset (and therefore every metric printed later) is
# identical on each Restart & Run All, and it is capped at the number of available rows
# so that asking for more essays than the file contains does not raise.
n_samples = 200
if n_samples > 0:
    train_essays = train_essays.sample(min(n_samples, len(train_essays)), random_state=1)

# Parse out escape characters: drop non-breaking spaces (\xa0) from the essay text.
# regex=False makes the literal match explicit instead of relying on the pandas default.
train_essays['full_text'] = train_essays['full_text'].str.replace("\xa0", "", regex=False)

# Split train and validation sets
# The validation set is only used to see if the model is overfitting during the training phase
# The split is stratified on the score so both parts keep the same label distribution.
t_size = 100
training_samples, validation_samples = train_test_split(
    train_essays,
    test_size=t_size,
    random_state=1,
    stratify=train_essays['score'],
)
# Re-number rows from 0 so positional lookups by index work later on.
# Plain assignment (rather than inplace=True) keeps the cell idempotent.
training_samples = training_samples.reset_index(drop=True)
validation_samples = validation_samples.reset_index(drop=True)

In [None]:
# Build model, tokenizer, and tf-idf transformers

# Checkpoint to fine-tune.
# Deberta is the current SOTA encoder-only model. The xsmall variant is chosen for speed;
# the base or large variants can be substituted for better results.
base_model = 'microsoft/deberta-v3-xsmall'

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the tokenizer that belongs to the checkpoint (re-using any cached download).
tokenizer = AutoTokenizer.from_pretrained(base_model, force_download=False)

# Register a newline token and a double-space token, in that order.
# Both are added with normalized=False.
for extra_token in ("\n", " "*2):
    tokenizer.add_tokens([AddedToken(extra_token, normalized=False)])
# Helper function to convert a text into a dictionary of token tensors
def get_tokens(text, tokenizer, device, max_length):
    """Tokenize `text` and move every resulting tensor to `device`.

    Args:
        text: The text handed to the tokenizer.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=max_length, return_tensors="pt")``; it must
            return a mapping whose values support ``.to(device)``.
        device: Target device passed to ``.to`` (e.g. ``'cpu'`` or ``'cuda'``).
        max_length: Length the tokenizer is asked to pad / truncate to.

    Returns:
        A plain dict with the tokenizer's keys, each value moved to `device`.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    on_device = {}
    for field, tensor in encoded.items():
        on_device[field] = tensor.to(device)
    return on_device

# Wrap both splits with the Datasets library (torch-formatted output)
# This allows easier loading & feature engineering later on
train_data = Dataset.from_pandas(training_samples, preserve_index=False).with_format("torch")
validation_data = Dataset.from_pandas(validation_samples, preserve_index=False).with_format("torch")

# Fit a tf-idf vectorizer on the training essays only
# Its vectors are used to evaluate the lexical similarity between essays
tf = TfidfVectorizer(max_df=0.5, min_df=0.05)
tf.fit(training_samples['full_text'])

# We change the maximum token length to 1024 (from 512) to avoid cutting off the longer essays
max_length = 1024


def _add_features(example):
    """Return the token tensors and the dense tf-idf vector for one essay row."""
    essay = example["full_text"]
    return {
        "tokens": get_tokens(essay, tokenizer, device, max_length),
        "sparse_embedding": tf.transform([essay]).toarray()[0],
    }


# Attach tokens and tf-idf vectors to every row of both splits
train_data = train_data.map(_add_features)
validation_data = validation_data.map(_add_features)

# Index the training tf-idf vectors so nearest neighbours can be looked up by name
train_data.add_faiss_index(column='sparse_embedding', index_name='lexical')

# Helper function that finds a lexical neighbour for an example - used in contrastive learning
def get_neighbour_index(embedding):
    """Return the row index in `training_samples` of the essay ranked second for `embedding`.

    The 'lexical' FAISS index on `train_data` is queried with the embedding and the
    hit at position 1 is taken. Presumably position 0 is the query essay itself when the
    embedding comes from the training set - NOTE(review): confirm, since essays with
    identical tf-idf vectors could change that ordering.

    Args:
        embedding: Tensor-like object exposing ``.numpy()``; used as the search query.

    Returns:
        int: Index label of the first `training_samples` row whose `essay_id` matches
        the selected neighbour.
    """
    _, retrieved = train_data.get_nearest_examples('lexical', embedding.numpy())
    neighbour_id = retrieved['essay_id'][1]
    is_neighbour = training_samples['essay_id'] == neighbour_id
    return int(training_samples[is_neighbour].index[0])

# Build a dataset of contrastive examples
def _attach_neighbour_index(example):
    """Look up the lexical neighbour of one training row from its tf-idf vector."""
    return {"neighbour_index": get_neighbour_index(example['sparse_embedding'])}


def _attach_neighbour_pair(example):
    """Copy the neighbour's tokens next to the row and record the score difference.

    `contrastive_data` is read at call time; while the second map below runs it is still
    bound to the output of the first map.
    """
    neighbour = contrastive_data[int(example['neighbour_index'])]
    return {
        "neighbour_tokens": neighbour['tokens'],
        "delta": example['score'] - neighbour['score'],
    }


contrastive_data = train_data.map(_attach_neighbour_index)
contrastive_data = contrastive_data.map(_attach_neighbour_pair)

# Batch size is dependent on how much memory you have available and what size of model you are using
# The contrastive loader loads two essays per sample so its batch size should be half the others
train_dataloader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=16, shuffle=False)
contrastive_dataloader = DataLoader(dataset=contrastive_data, batch_size=8, shuffle=True)

In [None]:
# Initialize model
# Start from the checkpoint's configuration and adapt it to single-output regression.
config = AutoConfig.from_pretrained(base_model)
config.num_labels = 1  # one output value: the predicted score
config.max_length = max_length ## 95% percentile is 787 tokens, 99% percentile is 1026
config.max_position_embeddings = config.max_length
config.attention_probs_dropout_prob = 0.0 # We remove dropout because it has been shown to perform poorly in regression
config.hidden_dropout_prob = 0.0

# Load the pretrained weights of `base_model` under the adapted configuration.
# `from_config` would only build the architecture with randomly initialised weights, so the
# fine-tuning below would start from scratch instead of from the checkpoint.
# ignore_mismatched_sizes=True lets loading proceed if a checkpoint tensor's shape differs
# from the adapted configuration (such tensors are freshly initialised and reported in the
# load warnings) - check those warnings after changing `base_model`.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    config=config,
    ignore_mismatched_sizes=True,
)

# Grow the embedding matrix to cover the tokens added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))

In [None]:
def _predict(batch, key='tokens'):
    """Run the global `model` on the tokens stored under `batch[key]`; return 1-D predictions.

    Axis 1 of every token tensor is squeezed away before the forward pass, and axis 1 of
    the logits is squeezed afterwards so the result lines up with the 1-D label tensor.
    """
    tokens = batch[key]
    outputs = model(
        tokens['input_ids'].squeeze(1).to(device),
        attention_mask=tokens['attention_mask'].squeeze(1).to(device),
        token_type_ids=tokens['token_type_ids'].squeeze(1).to(device),
    )
    return outputs['logits'].squeeze(1)


def _evaluate(validation_dataloader):
    """Predict every validation example and score the predictions.

    The model is switched to eval mode for the pass and put back into whatever mode it was
    in afterwards. Targets are read from the dataloader's own dataset, so the metrics always
    describe the data that was actually passed in.

    Returns:
        (test_loss, qwk, all_predictions): mean squared error, quadratic weighted kappa
        computed on predictions clipped to [1, 6] and rounded, and the raw predictions as a
        float NumPy array in dataloader order.
    """
    dataset = validation_dataloader.dataset
    all_predictions = np.zeros(len(dataset))
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            offset = 0
            for batch in validation_dataloader:
                predictions = _predict(batch).cpu().numpy()
                # A running offset keeps the slices contiguous whatever the batch sizes are.
                all_predictions[offset:offset + len(predictions)] = predictions
                offset += len(predictions)
    finally:
        model.train(was_training)

    # Release cached CUDA blocks held by the evaluation pass.
    torch.cuda.empty_cache()

    targets = dataset['score']
    test_loss = mean_squared_error(targets, all_predictions)
    qwk = cohen_kappa_score(targets, all_predictions.clip(1,6).round(0), weights='quadratic')
    return test_loss, qwk, all_predictions


def train_epoch_supervised(train_dataloader, validation_dataloader, optimizer):
    """Train the global `model` for one epoch on labelled essays, then report validation metrics.

    Args:
        train_dataloader: Yields batches with 'tokens' and 'score'.
        validation_dataloader: Batches with 'tokens'; its dataset must expose 'score'.
        optimizer: Optimizer over the model parameters; stepped once per batch.

    Prints one summary line (duration, train loss per training example, validation MSE,
    QWK and the range of validation predictions). Returns None.
    """
    start = time.time()
    loss_fn = torch.nn.MSELoss(reduction='sum')
    epoch_loss = 0.0
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        labels = batch['score'].float().to(device)
        loss = loss_fn(_predict(batch), labels)
        loss.backward()
        optimizer.step()
        # .item() stores a plain float; summing the tensors themselves would keep
        # autograd history alive for the whole epoch.
        epoch_loss += loss.item()

    # The loss is a sum over examples, so normalise by the number of TRAINING examples
    # (not the validation set size) to report a per-example figure.
    n_train_samples = max(len(train_dataloader.dataset), 1)
    test_loss, qwk, all_predictions = _evaluate(validation_dataloader)
    print(f"Labelled - Duration {(time.time()-start):.1f} - Train Loss {(epoch_loss/n_train_samples):.3f} - Test Loss {test_loss:.3f} - QWK {qwk:.3f} - Prediction Range {all_predictions.min():.3f}-{all_predictions.max():.3f}")


# Training loop for contrastive loss
def train_epoch_contrastive(contrastive_dataloader, validation_dataloader, optimizer, weight):
    """Train the global `model` for one epoch on essay pairs, then report validation metrics.

    Each batch holds an essay ('tokens', 'score'), its neighbour ('neighbour_tokens') and
    'delta', the score difference between the two. The loss blends two terms:
      * contrastive: MSE between (prediction_A - prediction_B) and sign(delta);
      * supervised: summed squared error between prediction_A and the essay's score.

    Args:
        contrastive_dataloader: Yields the paired batches described above.
        validation_dataloader: Batches with 'tokens'; its dataset must expose 'score'.
        optimizer: Optimizer over the model parameters; stepped once per batch.
        weight: Multiplier of the contrastive term; the supervised term gets (1 - weight).

    Prints one summary line in the same format as the supervised epoch. Returns None.
    """
    start = time.time()
    pair_loss_fn = torch.nn.MSELoss()
    supervised_loss_fn = torch.nn.MSELoss(reduction='sum')
    epoch_loss = 0.0
    model.train()
    for batch in contrastive_dataloader:
        optimizer.zero_grad()
        delta = batch['delta'].float().to(device)
        label_A = batch['score'].float().to(device)

        predictions_A = _predict(batch, 'tokens')
        predictions_B = _predict(batch, 'neighbour_tokens')

        # Only the direction of the score gap is used as the target (-1, 0 or +1).
        contrastive_loss = pair_loss_fn(predictions_A - predictions_B, delta.sign())

        # NOTE(review): the contrastive term is a batch MEAN while the supervised term is a
        # batch SUM, so `weight` does not balance them on equal footing - confirm intended.
        loss_A = supervised_loss_fn(predictions_A, label_A)
        loss = weight*contrastive_loss + (1-weight)*loss_A

        loss.backward()
        optimizer.step()
        # Accumulate a float, not the graph-attached tensor.
        epoch_loss += loss.item()

    # Normalise by the number of training pairs rather than the validation set size.
    n_train_samples = max(len(contrastive_dataloader.dataset), 1)
    test_loss, qwk, all_predictions = _evaluate(validation_dataloader)
    print(f"Contrastive - Duration {(time.time()-start):.1f} - Train Loss {(epoch_loss/n_train_samples):.3f} - Test Loss {test_loss:.3f} - QWK {qwk:.3f} - Prediction Range {all_predictions.min():.3f}-{all_predictions.max():.3f}")

In [None]:
# Move the model to the training device and set up optimisation.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Cosine schedule stepped once per epoch. T_max=10 is larger than the 7 epochs run below,
# so the learning rate does not reach the end of the cosine curve.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Training schedule as (epoch runner, number of epochs):
# 1 supervised epoch, then 4 contrastive epochs, then 2 supervised epochs.
training_phases = [
    (lambda: train_epoch_supervised(train_dataloader, validation_dataloader, optimizer), 1),
    (lambda: train_epoch_contrastive(contrastive_dataloader, validation_dataloader, optimizer, weight=0.5), 4),
    (lambda: train_epoch_supervised(train_dataloader, validation_dataloader, optimizer), 2),
]
for run_epoch, n_epochs in training_phases:
    for _ in range(n_epochs):
        run_epoch()
        scheduler.step()