# Model Training Notebook

Model used: a Sequence-to-Sequence (Seq2Seq) model whose encoder and decoder are both LSTM (Long Short-Term Memory) networks.

## Necessary Imports

In [1]:
import os
import sys

# Put the parent directory on the import path so the `src` package used by
# the following cells can be imported from this notebook.
sys.path.append(os.path.abspath("../"))

## Load Necessary Data:

### Load Vocabularies

In [2]:
from src.data.load_preprocessed_data import load_vocabs

# load_vocabs() returns two vocabularies. Later cells call len(), .get(),
# .items(), .keys() and .values() on them and index them with special tokens
# such as '<pad>', i.e. they are used as token -> index mappings.
reference_vocab, translation_vocab = load_vocabs()

### Load preprocessed data

In [3]:
from src.data.load_preprocessed_data import get_dataframe

# sample_ratio=0.5 is forwarded to get_dataframe — presumably it loads half of
# the preprocessed rows; confirm in src/data/load_preprocessed_data.py.
df = get_dataframe(sample_ratio=0.5)

In [4]:
# Preview the first rows: each row holds a numericalized reference sentence
# and its numericalized translation.
df.head()

Unnamed: 0,reference_numericalized,translation_numericalized
57809,"[1, 2510, 2815, 623, 10, 62476, 92, 290, 991, ...","[1, 1344, 79, 22, 46220, 88, 113, 257, 52, 2]"
132693,"[1, 600, 10, 601, 620, 373, 236, 1998, 86, 0, 2]","[1, 436, 10, 62, 125, 36, 585, 74, 3993, 2]"
254505,"[1, 944, 1266, 186, 143, 572, 572, 157579, 2]","[1, 72, 20, 619, 45, 2]"
451186,"[1, 629, 352, 10, 116913, 581, 2]","[1, 303, 61, 109, 4119, 437, 2]"
191213,"[1, 95, 199, 182, 97, 67, 86, 4025, 1674, 791,...","[1, 130, 261, 25, 227, 22, 70525, 130, 5313, 2..."


### Split Data into Training & Evaluation Sets

In [5]:
from sklearn.model_selection import train_test_split

# Train/Test Split: 80% of the rows go to `train`, 20% to `eval`.
# shuffle=False keeps the DataFrame order, so the split is deterministic and
# `train` starts with the same rows as `df`.
# NOTE(review): `eval` shadows Python's built-in eval() for the rest of the
# notebook. Later cells read this name, so a rename has to be applied to all
# of them at once.
train, eval = train_test_split(df, test_size=0.2, shuffle=False)

In [6]:
# Preview the training split (same leading rows as `df`, since the split was not shuffled).
train.head()

Unnamed: 0,reference_numericalized,translation_numericalized
57809,"[1, 2510, 2815, 623, 10, 62476, 92, 290, 991, ...","[1, 1344, 79, 22, 46220, 88, 113, 257, 52, 2]"
132693,"[1, 600, 10, 601, 620, 373, 236, 1998, 86, 0, 2]","[1, 436, 10, 62, 125, 36, 585, 74, 3993, 2]"
254505,"[1, 944, 1266, 186, 143, 572, 572, 157579, 2]","[1, 72, 20, 619, 45, 2]"
451186,"[1, 629, 352, 10, 116913, 581, 2]","[1, 303, 61, 109, 4119, 437, 2]"
191213,"[1, 95, 199, 182, 97, 67, 86, 4025, 1674, 791,...","[1, 130, 261, 25, 227, 22, 70525, 130, 5313, 2..."


### Setup Device

In [7]:
import torch

# Pick the compute device in order of preference: Apple MPS, then CUDA,
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='mps')

### Create Dataset

In [8]:
import torch
from src.data.text_detox_data import TextDetoxDataset

# Wrap each split in the project's dataset class. The collate function used by
# the dataloaders reads item["reference"] and item["translation"], so dataset
# items are expected to expose those two keys.
train_dataset = TextDetoxDataset(train)
eval_dataset = TextDetoxDataset(eval)

### Create Dataloaders

In [9]:
from src.data.text_detox_data import TextDetoxDataLoader
from torch.nn.utils.rnn import pad_sequence


def text_detox_collate_fn(batch):
    """Collate a list of dataset items into padded index tensors on ``device``.

    Args:
        batch: Sequence of items, each providing the index sequences
            ``item["reference"]`` and ``item["translation"]``.

    Returns:
        dict with keys ``"reference"`` and ``"translation"``. Each value is a
        ``torch.long`` tensor laid out batch-first, right-padded with 0 to the
        longest sequence in the batch, and moved to the notebook-level ``device``.
    """
    def _pad(field):
        # Convert every sequence of the given field to a tensor, then pad with 0.
        sequences = [torch.tensor(sample[field], dtype=torch.long) for sample in batch]
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return {field: _pad(field).to(device) for field in ("reference", "translation")}

# Training batches are drawn in shuffled order; evaluation keeps the dataset order.
# Both loaders use batches of 4 items and build them with text_detox_collate_fn,
# which pads the sequences and moves them to `device`.
train_dataloader = TextDetoxDataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=text_detox_collate_fn)
eval_dataloader = TextDetoxDataLoader(eval_dataset, batch_size=4, shuffle=False, collate_fn=text_detox_collate_fn)

## Define the Model:

In [10]:
from src.models.model_utils import TextDetoxEncoder, TextDetoxDecoder, TextDetoxSeq2SeqModel

  from .autonotebook import tqdm as notebook_tqdm


### Create & Initialize the model

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

# Vocabulary sizes: passed as the first constructor argument of the encoder
# (source side) and of the decoder (target side).
INPUT_DIM = len(reference_vocab)
OUTPUT_DIM = len(translation_vocab)
# Embedding sizes, hidden size, number of layers and dropout rates handed to
# the encoder/decoder constructors. A saved checkpoint can only be reloaded
# into a model built with the same values.
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = TextDetoxEncoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = TextDetoxDecoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# don't forget to put the model to the right device
model = TextDetoxSeq2SeqModel(enc, dec, device).to(device)

# Define loss function and optimizer
# The collate function pads batches with 0, so ignore_index=0 keeps padded
# positions out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the PAD token
# Adam with PyTorch's default hyperparameters (no learning rate is passed).
optimizer = optim.Adam(model.parameters())


## Train:

In [12]:
import math
import random
from src.models.model_utils import train_detox_model, evaluate_detox_model

# Number of iterations of the training loop below.
N_EPOCHS = 1
# Forwarded to train_detox_model; presumably the gradient-clipping threshold —
# confirm in src/models/model_utils.py.
CLIP = 1

for epoch in range(N_EPOCHS):
    # One call to the training routine, then one call to the evaluation routine;
    # both return a loss value that is reported below.
    train_loss = train_detox_model(model, train_dataloader, optimizer, criterion, CLIP)
    eval_loss = evaluate_detox_model(model, eval_dataloader, criterion)

    # Keep the most recent training loss; the "Save" cell stores it in the
    # checkpoint under the 'loss' key.
    last_loss = train_loss

    # Report each loss together with exp(loss), labelled PPL (perplexity).
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {eval_loss:.3f} |  Val. PPL: {math.exp(eval_loss):7.3f}')


                                                                               

Epoch: 01
	Train Loss: 5.950 | Train PPL: 383.619
	 Val. Loss: 6.201 |  Val. PPL: 493.276




## Saving & Loading the Model:

In [13]:
# Checkpoint file written by the "Save" cell and read by the "Load" cell.
# The path is relative to the notebook's working directory.
MODEL_PATH = '../models/SEQ2SEQ_LSTMs.pth'

### Save:
Run the next cell to save the model checkpoint to `MODEL_PATH`

In [14]:
import os

# torch.save does not create missing directories, so make sure the target
# folder exists first (e.g. on a fresh clone where ../models/ is absent).
checkpoint_dir = os.path.dirname(MODEL_PATH)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# Checkpoint contents:
#   'epoch'                - index of the last completed epoch (0-based loop variable)
#   'model_state_dict'     - model weights
#   'optimizer_state_dict' - optimizer state, needed to resume training
#   'loss'                 - training loss of the last epoch
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': last_loss,
}, MODEL_PATH)

### Load:
Uncomment and run the next cell to load the model

In [15]:
# NOTE: the model built here must use the same classes and hyperparameters as
# the model that was saved (see "Create & Initialize the model" above),
# otherwise load_state_dict() fails with missing keys or size mismatches.
# NOTE: only unpickle vocabulary files you created yourself; pickle.load can
# execute arbitrary code from an untrusted file.

# import pickle

# VOCABS_DIR = '../data/interim/'
# REFERENCE_VOCAB_PATH = VOCABS_DIR + 'reference_vocab.pkl'
# TRANSLATION_VOCAB_PATH = VOCABS_DIR + 'translation_vocab.pkl'

# # Load vocabularies
# with open(REFERENCE_VOCAB_PATH, 'rb') as f:
#     reference_vocab = pickle.load(f)
# with open(TRANSLATION_VOCAB_PATH, 'rb') as f:
#     translation_vocab = pickle.load(f)

# # Define model (same values as in the training section)
# INPUT_DIM = len(reference_vocab)
# OUTPUT_DIM = len(translation_vocab)
# ENC_EMB_DIM = 128
# DEC_EMB_DIM = 128
# HID_DIM = 512
# N_LAYERS = 2
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

# encoder = TextDetoxEncoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
# decoder = TextDetoxDecoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# # don't forget to put the model to the right device
# model = TextDetoxSeq2SeqModel(encoder, decoder, device).to(device)

# # Define loss function and optimizer
# criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the PAD token
# optimizer = optim.Adam(model.parameters())

# # Restore weights and optimizer state into the objects created above.
# # Do not rebuild the model after creating the optimizer: the optimizer would
# # keep pointing at the parameters of the discarded model.
# checkpoint = torch.load(MODEL_PATH, map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']

## Predict

In [18]:
import pickle

def detoxify_sentence(sentence, reference_vocab, translation_vocab, model, device, max_len=50):
    """Greedily decode a detoxified version of ``sentence`` with a seq2seq model.

    Args:
        sentence: Raw input text. It is split on whitespace only; no
            lowercasing or other normalisation is applied here.
        reference_vocab: token -> index mapping for the source side. Must
            contain ``'<pad>'``, ``'<sos>'`` and ``'<eos>'``.
        translation_vocab: token -> index mapping for the target side. Must
            contain ``'<sos>'`` and ``'<eos>'``.
        model: Object exposing ``eval()``, ``encoder(src) -> (hidden, cell)``
            and ``decoder(token, hidden, cell) -> (output, hidden, cell)``,
            where ``output.argmax(1)`` yields the predicted token index.
        device: Device the input tensors are moved to.
        max_len: Maximum number of tokens to generate.

    Returns:
        list[str]: The generated target tokens, without the leading ``<sos>``
        and without the trailing ``<eos>`` when one was produced. If decoding
        stops because ``max_len`` was reached, all generated tokens are kept.

    Raises:
        KeyError: If a required special token is missing from a vocabulary or
            a predicted index has no entry in ``translation_vocab``.
    """
    # Define special tokens that will be used in the data preprocessing
    PAD_TOKEN = '<pad>'  # Token used for padding sentences to the same length
    SOS_TOKEN = '<sos>'  # Start-of-sentence token
    EOS_TOKEN = '<eos>'  # End-of-sentence token

    model.eval()

    # Tokenize the sentence, add the <sos> and <eos> tokens, and numericalize.
    # Words missing from the vocabulary fall back to the <pad> index.
    pad_index = reference_vocab[PAD_TOKEN]
    tokens = [reference_vocab.get(token, pad_index) for token in sentence.split()]
    numericalized_tokens = [reference_vocab[SOS_TOKEN]] + tokens + [reference_vocab[EOS_TOKEN]]

    # Convert to Tensor and add a batch dimension
    src_tensor = torch.LongTensor(numericalized_tokens).unsqueeze(0).to(device)

    eos_index = translation_vocab[EOS_TOKEN]
    trg_indexes = [translation_vocab[SOS_TOKEN]]

    # Predict the target sequence one token at a time (greedy decoding).
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)

        for _ in range(max_len):
            # Feed the most recently produced token back into the decoder.
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

            # Get the predicted next token (the one with the highest probability)
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)

            # If the <eos> token is predicted, stop
            if pred_token == eos_index:
                break

    # Build the index -> token map once per call instead of materialising two
    # vocabulary-sized lists for every generated token. setdefault keeps the
    # first token for an index, matching list.index() semantics.
    index_to_token = {}
    for token, index in translation_vocab.items():
        index_to_token.setdefault(index, token)

    # Drop the leading <sos>. Drop the final token only when it really is
    # <eos>; when max_len was hit it is a regular word and must be kept.
    generated = trg_indexes[1:]
    if generated and generated[-1] == eos_index:
        generated = generated[:-1]

    # Convert the predicted numerical tokens to words
    return [index_to_token[index] for index in generated]

In [20]:
# Change this sentence if you want to make another prediction
# NOTE(review): detoxify_sentence splits on whitespace without lowercasing, so a
# capitalised word is only recognised if the vocabulary contains that exact form.
src_sentence = "This assignment is fucking difficult"
# detoxify_sentence returns a list of tokens; join them with spaces for display.
detoxified_tokens = detoxify_sentence(src_sentence, reference_vocab, translation_vocab, model, device)
print(" ".join(detoxified_tokens))

this is a criminal.


## Evaluation

In [30]:
import torch
import ast
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import pandas as pd
from tqdm import tqdm



# Index -> token lookup tables, used to turn the numericalized evaluation rows
# back into text.
inverse_reference_vocab = dict(zip(reference_vocab.values(), reference_vocab.keys()))
inverse_translation_vocab = dict(zip(translation_vocab.values(), translation_vocab.keys()))

# Thin adapter between the evaluation loop and the decoding routine.
def predict(sentence, reference_vocab, translation_vocab, model, device):
    """Return the predicted token list for ``sentence``.

    Delegates to ``detoxify_sentence`` with the same arguments, so decoding
    uses that function's default maximum length.
    """
    predicted_tokens = detoxify_sentence(
        sentence,
        reference_vocab,
        translation_vocab,
        model,
        device,
    )
    return predicted_tokens

# Evaluation
model.eval()  # Make sure the model is in evaluation mode
bleu_scores = []
# Despite its name, `rouge_scores` is the scorer object; the per-sentence
# values are collected in rouge1_scores / rougeL_scores below.
rouge_scores = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# The smoothing function does not depend on the row, so create it once
# instead of once per evaluated sentence.
bleu_smoothing = SmoothingFunction().method1

rouge1_scores = []
rougeL_scores = []

# Indices excluded when turning a numericalized row back into text.
SPECIAL_TOKEN_IDS = (0, 1, 2)  # pad, sos, eos

for index, row in tqdm(eval.iterrows(), total=eval.shape[0]):
    # The numericalized columns are stored as strings such as "[1, 5, 2]".
    reference_numericalized = ast.literal_eval(row['reference_numericalized'])
    translation_numericalized = ast.literal_eval(row['translation_numericalized'])

    # Rebuild plain-text sentences without the special tokens.
    reference_text = ' '.join(
        inverse_reference_vocab[token] for token in reference_numericalized if token not in SPECIAL_TOKEN_IDS
    )
    ground_truth_text = ' '.join(
        inverse_translation_vocab[token] for token in translation_numericalized if token not in SPECIAL_TOKEN_IDS
    )

    predicted_text_tokens = predict(reference_text, reference_vocab, translation_vocab, model, device)
    predicted_text = ' '.join(predicted_text_tokens)

    # BLEU score (single reference, smoothed so short hypotheses do not collapse to 0)
    bleu_score = sentence_bleu([ground_truth_text.split()], predicted_text.split(), smoothing_function=bleu_smoothing)
    bleu_scores.append(bleu_score)

    # ROUGE scores (F-measure of ROUGE-1 and ROUGE-L)
    scores = rouge_scores.score(ground_truth_text, predicted_text)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Calculate average scores; an empty evaluation set yields NaN instead of
# raising ZeroDivisionError.
average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float('nan')
average_rouge1 = sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else float('nan')
average_rougeL = sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else float('nan')


print(f'Average BLEU score: {average_bleu}')
print(f"Average ROUGE-1 score: {average_rouge1}")
print(f"Average ROUGE-L score: {average_rougeL}")


100%|██████████| 57778/57778 [35:36<00:00, 27.04it/s] 

Average BLEU score: 0.039391276526298424
Average ROUGE-1 score: 0.22633753009957291
Average ROUGE-L score: 0.21869081536854712



