# Transformer

I experimented with an LSTM and found that it is unable to outperform tree-based ensemble models. In fact, gradient boosting and random forest models achieve a test set weighted ROC AUC at least 0.2 to 0.3 higher than the LSTM. This leads me to believe that there are some relationships in the test data that are being captured by my custom features that the LSTM is struggling to capture. I am going to try using a 2-layer Transformer Encoder to see if I can get better performance.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
import torch.nn as nn
import torchtext
from torch.utils.data import TensorDataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from nltk.stem import SnowballStemmer
import re
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import math

# Hooking tqdm into pandas so that progress_apply becomes available
tqdm.pandas()

# Picking the GPU when PyTorch can see one, otherwise falling back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Reading the training and validation splits from their CSV files
train_data, valid_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/training-llm-competition/train.csv',
        '../input/training-llm-competition/validation.csv',
    )
)

## Preprocessing 

In [None]:
# Creating a dictionary for the contractions (contraction -> expanded form)
# Dictionary is from https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# NOTE: ambiguous contractions list every candidate expansion in a single string,
# separated by " / ", so the slashes are part of the stored value.
contractions = {
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
# Snowball stemmer configured for English
stemmer = SnowballStemmer('english')

# torchtext tokenizer backed by the spaCy 'en_core_web_sm' pipeline
tokenizer = get_tokenizer('spacy','en_core_web_sm')

In [None]:
# A function for preprocessing
def preprocess(essay:str) -> list:
    """Lower-case, expand contractions, tokenize and stem a single essay.

    Args:
        essay: Raw essay text.

    Returns:
        The stemmed, non-whitespace tokens of the essay, in order.
    """
    preprocessed_essay = essay.lower()

    # Expanding the contractions in ONE pass over the text:
    #  * longest keys come first in the alternation, so "can't've" is expanded
    #    as a whole instead of being broken up by the shorter "can't";
    #  * the look-arounds demand a non-word character on both sides, so "he's"
    #    no longer fires inside "she's" (which used to yield "she has / he is").
    expansions = {contraction.lower(): expansion.lower()
                  for contraction, expansion in contractions.items()}
    if expansions:
        alternation = "|".join(re.escape(key)
                               for key in sorted(expansions, key=len, reverse=True))
        # re caches compiled patterns, so rebuilding the same pattern per essay is cheap
        contraction_pattern = re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)")
        preprocessed_essay = contraction_pattern.sub(
            lambda match: expansions[match.group(0)], preprocessed_essay)

    # Subbing out \n and \t
    # NOTE(review): these are deleted rather than replaced by a space, which glues
    # together the words on either side of a line break. Kept as-is because the
    # saved vocabulary was presumably built with the same rule - confirm before changing.
    preprocessed_essay = preprocessed_essay.replace("\n","").replace("\t","")

    # Replacing /xa0 = non-breaking space in Latin1
    preprocessed_essay = preprocessed_essay.replace(u'\xa0', u' ')

    final_preprocessed_essay = []

    # Running through tokenizer and returning the non-whitespace tokens
    for token in tokenizer(preprocessed_essay):
        temp_token = token.strip(" ")

        if temp_token != "":
            # Stemming the stripped token, i.e. the same string that passed the filter
            final_preprocessed_essay.append(stemmer.stem(temp_token))

    return final_preprocessed_essay

In [None]:
# Preprocessing the 'essay' column of both splits (training first, then validation)
tokenized_essays_train, tokenized_essays_valid = (
    split['essay'].progress_apply(preprocess)
    for split in (train_data, valid_data)
)

In [None]:
# Deserializing the saved vocabulary object
# NOTE: torch.load unpickles the file, so only point it at files you trust
vocab_path = '../input/llm-competition-models/vocab.pt'
vocabulary = torch.load(vocab_path)

In [None]:
# Function to put each essay through the vocabulary
def put_through_vocab(essay:list) -> list:
    """Return whatever `vocabulary` yields when called on one tokenized essay.

    `essay` is the token list produced by `preprocess` (not the raw string).
    Presumably the result is one integer index per token - confirm against
    the type of the loaded vocabulary object.
    """
    return vocabulary(essay)

# Mapping every tokenized essay of both splits through the vocabulary
indexed_essays_train = list(map(put_through_vocab, tokenized_essays_train))
indexed_essays_valid = list(map(put_through_vocab, tokenized_essays_valid))

In [None]:
# Padding / truncating every essay at the end to exactly 512 indices,
# filling with the index of the '<pad>' token
pad_options = dict(maxlen=512,padding='post',truncating='post',value=vocabulary['<pad>'])
train_padded = tf.keras.utils.pad_sequences(indexed_essays_train,**pad_options)
valid_padded = tf.keras.utils.pad_sequences(indexed_essays_valid,**pad_options)

In [None]:
# Creating the dataloaders
# Inputs are the padded index matrices, targets the 'LLM_written' column
X_train_tensor = torch.from_numpy(train_padded)
y_train_tensor = torch.from_numpy(train_data['LLM_written'].values)
X_valid_tensor = torch.from_numpy(valid_padded)
y_valid_tensor = torch.from_numpy(valid_data['LLM_written'].values)
training_dataset = TensorDataset(X_train_tensor,y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor,y_valid_tensor)
training_loader = DataLoader(training_dataset,batch_size=128,shuffle=True)
# The validation loader has to wrap the held-out set: it previously wrapped
# training_dataset, so every "validation" metric was computed on training data.
# Shuffling is unnecessary for evaluation, so it is turned off.
valid_loader = DataLoader(valid_dataset,batch_size=128,shuffle=False)

## Model

In [None]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    Args:
        emb_size: Size of the embedding (last) dimension.
        dropout: Dropout probability applied after the addition.
        maxlen: Longest sequence length the precomputed table supports.
    """
    def __init__(self,emb_size:int, dropout:float, maxlen:int = 500):
        super().__init__()
        den = torch.exp(-torch.arange(0,emb_size,2)*math.log(10000) / emb_size)
        pos = torch.arange(0,maxlen).reshape(maxlen,1)
        pos_embedding = torch.zeros((maxlen,emb_size))
        # Even columns carry the sine, odd columns the cosine of the same angles
        pos_embedding[:,0::2] = torch.sin(pos * den)
        # For an odd emb_size there is one odd column fewer than even columns
        pos_embedding[:,1::2] = torch.cos(pos * den)[:,:emb_size // 2]
        # Leading axis of size 1 so the table broadcasts over the batch
        pos_embedding = pos_embedding.unsqueeze(0)

        self.dropout = nn.Dropout(dropout)

        # Saving the positional encoding in the model state dict, but making sure PyTorch doesn't "train"
        # these parameters because they don't need to be trained
        self.register_buffer('pos_embedding',pos_embedding)

    def forward(self,token_embedding):
        """Return dropout(token_embedding + positions).

        Args:
            token_embedding: Tensor of shape (batch, seq_len, emb_size) with
                seq_len <= maxlen.

        Raises:
            ValueError: If seq_len exceeds the precomputed table.
        """
        seq_len = token_embedding.size(1)
        if seq_len > self.pos_embedding.size(1):
            raise ValueError(
                f'Sequence length {seq_len} exceeds the maximum length '
                f'{self.pos_embedding.size(1)} of the positional encoding')
        # Only the first seq_len positions are used, so shorter batches work and a
        # length-1 sequence is no longer silently broadcast against the whole table
        return self.dropout(token_embedding + self.pos_embedding[:,:seq_len])

# Transformer Model
class Model(nn.Module):
    """Transformer-encoder binary classifier over sequences of token indices.

    Pipeline: embedding (scaled by sqrt(emb_size)) -> positional encoding ->
    stacked encoder layers -> mean over the sequence axis -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_size: Embedding / model dimension.
        nheads: Number of attention heads per encoder layer.
        dim_feedforward: Hidden size of the encoder feed-forward sub-layer.
        dropout: Dropout probability (positional encoding and encoder layers).
        num_layers: Number of stacked encoder layers.
        max_length: Longest sequence length supported by the positional encoding.
    """
    def __init__(self,vocab_size: int, emb_size:int,nheads:int,dim_feedforward:int,dropout:float,num_layers:int,max_length:int):
        super().__init__()
        self.embed_size = emb_size
        self.embedding = nn.Embedding(vocab_size,emb_size,padding_idx=vocabulary['<pad>'])
        self.positional_encoder = PositionalEncoding(emb_size,dropout,max_length)
        # nn.TransformerEncoder works on its own copies of this layer; the attribute is
        # kept so that the state_dict keys of existing checkpoints still load
        self.encoder_layer = nn.TransformerEncoderLayer(emb_size,nheads,dim_feedforward,dropout,batch_first=True)
        self.transformer = nn.TransformerEncoder(self.encoder_layer,num_layers)
        self.fc1 = nn.Linear(emb_size,1)

    # Forward Function
    def forward(self,X,src_key_padding_mask=None):
        """Return one probability per sequence, shape (batch, 1).

        Args:
            X: Token indices of shape (batch, seq_len).
            src_key_padding_mask: Optional (batch, seq_len) mask, True at padded
                positions. When given, padded positions are also left out of
                the mean pooling.
        """
        # Putting X through embedding
        output = self.embedding(X.long()) * math.sqrt(self.embed_size)
        output = self.positional_encoder(output)

        # Feeding through transformer encoder
        output = self.transformer(output,src_key_padding_mask=src_key_padding_mask)

        if src_key_padding_mask is None:
            pooled = torch.mean(output,dim=1)
        else:
            # Averaging over real tokens only. Encoder outputs at padded positions carry
            # no information (and may differ between PyTorch's train and eval execution
            # paths), so including them made the score depend on the amount of padding.
            is_pad = src_key_padding_mask.bool().unsqueeze(-1)
            token_count = (~is_pad).sum(dim=1).clamp(min=1).to(output.dtype)
            pooled = output.masked_fill(is_pad,0.0).sum(dim=1) / token_count
        return torch.sigmoid(self.fc1(pooled))

In [None]:
# Creating a mask to make sure the padding indices are masked
def create_padding_mask(X):
    """Return the element-wise comparison of X with the '<pad>' index (True at padding)."""
    pad_index = vocabulary['<pad>']
    is_padding = X == pad_index
    return is_padding

In [None]:
# Hyper-parameters of the encoder, collected in one place
model_config = dict(
    vocab_size=len(vocabulary),
    emb_size=512,
    nheads=8,
    dim_feedforward=2048,
    dropout=0.2,
    num_layers=2,
    max_length=512,
)
model = Model(**model_config)

# Moving the model to the selected device
model.to(device)

In [None]:
# Optimisation settings
EPOCHS, LEARNING_RATE = 100, 1e-5
LOSS = nn.BCELoss()
OPTIMIZER = torch.optim.Adam(model.parameters(),lr=LEARNING_RATE)

# Per-epoch metric log
history = []

# Early-stopping state
best_roc_auc = 0
current_count = 0
early_stopping_threshold = 5

In [None]:
# Training loop with early stopping on the validation ROC AUC.
# The early-stopping counter is `current_count` (initialised next to best_roc_auc);
# the loop used to touch an undefined `count`, which raised a NameError as soon as
# the very first check was not an improvement.
for epoch in range(EPOCHS):
    # ---------------- Training pass ----------------
    model.train()
    train_loss = 0
    # Batches are collected in lists and joined once per epoch, instead of
    # re-copying the whole array with np.append on every batch
    train_pred_batches = []
    train_target_batches = []
    for X,y in training_loader:
        # Putting the tensors on the right device (BCELoss needs float targets)
        X = X.to(device)
        y = y.to(device=device,dtype=torch.float32)

        # Putting input through model
        pred = model(X,src_key_padding_mask=create_padding_mask(X))

        # Getting the loss
        loss = LOSS(pred,y.view(-1,1))

        # Clearing old gradients, calculating the new ones and taking a step
        OPTIMIZER.zero_grad()
        loss.backward()
        OPTIMIZER.step()

        # Adding the loss
        train_loss += loss.item()

        train_pred_batches.append(pred.detach().cpu().numpy())
        train_target_batches.append(y.cpu().numpy())

    train_preds = np.concatenate(train_pred_batches,axis=0)
    train_targets = np.concatenate(train_target_batches,axis=0)

    # ---------------- Validation pass ----------------
    model.eval()
    val_loss = 0
    valid_pred_batches = []
    valid_target_batches = []
    with torch.no_grad():
        for X,y in valid_loader:
            X = X.to(device)
            y = y.to(device=device,dtype=torch.float32)

            pred = model(X,src_key_padding_mask=create_padding_mask(X))
            loss = LOSS(pred,y.view(-1,1))

            val_loss += loss.item()
            valid_pred_batches.append(pred.cpu().numpy())
            valid_target_batches.append(y.cpu().numpy())

    valid_preds = np.concatenate(valid_pred_batches,axis=0)
    valid_targets = np.concatenate(valid_target_batches,axis=0)

    # Epoch metrics, computed once and reused below
    avg_train_loss = train_loss / len(training_loader)
    avg_val_loss = val_loss / len(valid_loader)
    train_roc_auc = roc_auc_score(train_targets,train_preds)
    valid_roc_auc = roc_auc_score(valid_targets,valid_preds)

    # Early Stopping: only an improvement larger than 1e-3 resets the counter
    if valid_roc_auc - best_roc_auc > 1e-3:
        best_roc_auc = valid_roc_auc
        current_count = 0

        # Saving the best model & best embeddings
        torch.save(model.state_dict(),'2-layer-transformer-encoder.pt')
    else:
        current_count += 1

    # Appending the average batch loss and the ROC AUCs to the history
    print(f'----------EPOCH {epoch} loss----------')
    print(f'Train Loss: {avg_train_loss}')
    print(f'Valid Loss: {avg_val_loss}')
    print(f'Training ROC AUC: {train_roc_auc}')
    print(f'Validation ROC AUC: {valid_roc_auc}')
    history.append([avg_train_loss,avg_val_loss,train_roc_auc,valid_roc_auc])
    print('--------------------------------------')
    print()

    # Stopping the loop
    if current_count >= early_stopping_threshold:
        print('Found no improvement!')
        break

In [None]:
# One row per epoch, columns in the order the training loop appends them
history_columns = ['Training Loss','Validation Loss','Training ROC AUC','Validation ROC AUC']
history_df = pd.DataFrame(data=history,columns=history_columns)

In [None]:
# Drawing the training and validation loss curves in one figure
loss_curves = history_df[['Training Loss','Validation Loss']]
loss_curves.plot(title='Loss vs. Epochs',xlabel='Epochs',ylabel='Loss')
plt.show()

In [None]:
# Drawing the training and validation ROC AUC curves in one figure
roc_auc_curves = history_df[['Training ROC AUC','Validation ROC AUC']]
roc_auc_curves.plot(title='ROC AUC vs. Epochs',xlabel='Epochs',ylabel='ROC AUC')
plt.show()

In [None]:
# Setting up the model
model = Model(vocab_size=vocabulary.__len__(),emb_size=512,nheads=8,dim_feedforward=2048,dropout=0.2,num_layers=2,max_length=512)
# Reading the checkpoint from the same relative path the training loop saves it to
# ('2-layer-transformer-encoder.pt'); '../working/...' only resolved to that file when
# the current directory happened to be named "working".
# map_location lets a checkpoint saved from the GPU load on a CPU-only machine too.
model.load_state_dict(torch.load('2-layer-transformer-encoder.pt',map_location=device))

# Putting all on GPU
model.to(device)

In [None]:
# Inference loop: predictions of the current model on both loaders.
# Fresh accumulators are used for BOTH splits. The validation accumulators used to be
# reset under the names val_preds / val_targets while the loop appended to
# valid_preds / valid_targets, so results left over from the last training epoch
# were mixed into the final validation score.
model.eval()
train_pred_batches, train_target_batches = [], []
valid_pred_batches, valid_target_batches = [], []
with torch.no_grad():
    for X,y in training_loader:
        # Making predictions
        X = X.to(device)
        pred = model(X,src_key_padding_mask=create_padding_mask(X))
        train_pred_batches.append(pred.cpu().numpy())

        # Getting the targets
        train_target_batches.append(y.cpu().numpy())
    for X,y in valid_loader:
        # Making predictions
        X = X.to(device)
        pred = model(X,src_key_padding_mask=create_padding_mask(X))
        valid_pred_batches.append(pred.cpu().numpy())

        # Getting the targets
        valid_target_batches.append(y.cpu().numpy())

# Joining the batches once (cheaper than growing an array with np.append per batch)
train_preds = np.concatenate(train_pred_batches,axis=0)
train_targets = np.concatenate(train_target_batches,axis=0)
valid_preds = np.concatenate(valid_pred_batches,axis=0)
valid_targets = np.concatenate(valid_target_batches,axis=0)

In [None]:
# Scoring the collected predictions of both splits with ROC AUC
print('Predictions for Transformer')
train_score, valid_score = (
    roc_auc_score(targets,preds)
    for targets,preds in ((train_targets,train_preds),(valid_targets,valid_preds))
)
print(f'Training ROC AUC: {train_score}')
print(f'Validation ROC AUC: {valid_score}')