Load Raw Data

In [18]:
import os, sys
sys.path.append(os.path.abspath("../"))
from src.data.make_dataset import get_dataset
import pandas as pd

DATASET_PATH = '../data/raw/filtered_paranmt/filtered.tsv'
VOCABS_DIR = '../data/interim/'
REFERENCE_VOCAB_PATH = VOCABS_DIR + 'reference_vocab.pkl'
TRANSLATION_VOCAB_PATH = VOCABS_DIR + 'translation_vocab.pkl'

# Load data
get_dataset()
df = pd.read_csv(DATASET_PATH, delimiter='\t')

Dataset already exists at /Users/hamadasalhab/Library/CloudStorage/OneDrive-АНОВОУниверситетИннополис/Disk D/Innopolis Study Materials/F23/PMLDL/Assignments/Assignment#01/text-detoxification/src/data/../../data/raw/filtered_paranmt.zip
All set. The dataset can be found in project_root_dir/data/raw.


Build tokens

In [19]:
import pandas as pd
from collections import Counter
from itertools import chain

# Define special tokens
PAD_TOKEN = '<pad>'
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'

# Build vocabularies
def build_vocab(texts, min_freq=2):
    # Count the frequencies of tokens in the texts
    counter = Counter(chain.from_iterable(texts))
    # Create the vocabulary mapping each token to a unique index
    vocab = {token: idx for idx, (token, freq) in enumerate(counter.items()) if freq >= min_freq}
    # Add special tokens to the beginning of the dictionary
    vocab = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2, **vocab}
    return vocab

# Tokenization and numericalization
def tokenize_and_numericalize(text, vocab):
    tokens = text.split()
    numericalized = [vocab[SOS_TOKEN]] + [vocab.get(token, vocab[PAD_TOKEN]) for token in tokens] + [vocab[EOS_TOKEN]]
    return numericalized

# Tokenize texts
reference_tokenized = df['reference'].str.split().tolist()
translation_tokenized = df['translation'].str.split().tolist()

# Build vocabularies
reference_vocab = build_vocab(reference_tokenized)
translation_vocab = build_vocab(translation_tokenized)

Load preprocessed data

In [20]:
import os, sys
sys.path.append(os.path.abspath("../"))
from src.data.make_dataset import get_dataset

import pandas as pd

INTERIM_DATASET_PATH = '../data/interim/preprocessed_new.tsv'

df = pd.read_csv(INTERIM_DATASET_PATH, delimiter='\t')
df = df.sample(frac=0.1, random_state=42)

In [21]:
df.head()

Unnamed: 0,reference_numericalized,translation_numericalized
57809,"[1, 2510, 2815, 623, 10, 62476, 92, 290, 991, ...","[1, 1344, 79, 22, 46220, 88, 113, 257, 52, 2]"
132693,"[1, 600, 10, 601, 620, 373, 236, 1998, 86, 0, 2]","[1, 436, 10, 62, 125, 36, 585, 74, 3993, 2]"
254505,"[1, 944, 1266, 186, 143, 572, 572, 157579, 2]","[1, 72, 20, 619, 45, 2]"
451186,"[1, 629, 352, 10, 116913, 581, 2]","[1, 303, 61, 109, 4119, 437, 2]"
191213,"[1, 95, 199, 182, 97, 67, 86, 4025, 1674, 791,...","[1, 130, 261, 25, 227, 22, 70525, 130, 5313, 2..."


In [22]:
from sklearn.model_selection import train_test_split

# Train/Test Split
train, eval = train_test_split(df, test_size=0.2, shuffle=False)

In [23]:
train.head()

Unnamed: 0,reference_numericalized,translation_numericalized
57809,"[1, 2510, 2815, 623, 10, 62476, 92, 290, 991, ...","[1, 1344, 79, 22, 46220, 88, 113, 257, 52, 2]"
132693,"[1, 600, 10, 601, 620, 373, 236, 1998, 86, 0, 2]","[1, 436, 10, 62, 125, 36, 585, 74, 3993, 2]"
254505,"[1, 944, 1266, 186, 143, 572, 572, 157579, 2]","[1, 72, 20, 619, 45, 2]"
451186,"[1, 629, 352, 10, 116913, 581, 2]","[1, 303, 61, 109, 4119, 437, 2]"
191213,"[1, 95, 199, 182, 97, 67, 86, 4025, 1674, 791,...","[1, 130, 261, 25, 227, 22, 70525, 130, 5313, 2..."


## Create Dataloader:

In [24]:
import torch

# Set up device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
device

device(type='mps')

In [25]:
import json
import torch
from torch.utils.data import Dataset, DataLoader

def string_to_list(s):
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        return [int(item) for item in s.strip('[]').split(', ')]

class TextDetoxDataset(Dataset):
    def __init__(self, dataframe):
        self.reference_numericalized = dataframe['reference_numericalized'].apply(string_to_list)
        self.translation_numericalized = dataframe['translation_numericalized'].apply(string_to_list)
        
    def __len__(self):
        return len(self.reference_numericalized)

    def __getitem__(self, idx):
        input_ids = self.reference_numericalized.iloc[idx]
        labels = self.translation_numericalized.iloc[idx]

        return {
            "reference": input_ids,
            "translation": labels
        }

train_dataset = TextDetoxDataset(train)
eval_dataset = TextDetoxDataset(eval)

In [26]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    reference = pad_sequence([torch.tensor(item["reference"], dtype=torch.long) for item in batch],
                                   batch_first=True, padding_value=0)
    translation = pad_sequence([torch.tensor(item["translation"], dtype=torch.long) for item in batch],
                                batch_first=True, padding_value=0)
    return {
        "reference": reference.to(device),
        "translation": translation.to(device)
    }

NUM_WORKERS = 8

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)



## Define the Model:

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import random


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

# Adjustments in the Seq2Seq model to accommodate GRUs
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        hidden = self.encoder(src)
        
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if random.random() < teacher_forcing_ratio else top1
        
        return outputs

## Training Loop:

In [28]:
INPUT_DIM = len(reference_vocab)
OUTPUT_DIM = len(translation_vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LR = 0.01

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# don't forget to put the model to the right device
model = Seq2Seq(enc, dec, device).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the PAD token
optimizer = optim.Adam(model.parameters(), lr=LR)




In [29]:
from tqdm.auto import tqdm

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(iterator, desc='Training', leave=False)
    for i, batch in enumerate(progress_bar):
        src = batch['reference']
        trg = batch['translation']

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
        

    return epoch_loss / len(iterator)


In [30]:
from tqdm.auto import tqdm

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    progress_bar = tqdm(iterator, desc='Evaluating', leave=False)
    with torch.no_grad():
        for i, batch in enumerate(progress_bar):
            src = batch['reference']
            trg = batch['translation']

            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

    return epoch_loss / len(iterator)


In [31]:
import math
N_EPOCHS = 1
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    eval_loss = evaluate(model, eval_dataloader, criterion)
    
    last_loss = train_loss
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {eval_loss:.3f} |  Val. PPL: {math.exp(eval_loss):7.3f}')


Training:   5%|▍         | 269/5778 [01:21<24:15,  3.79it/s, loss=nan]

In [None]:
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': last_loss,  # Now 'last_loss' is defined
}, 'checkpoint.pth')

# Load everything (assumes model and optimizer are already instantiated)
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']  # This will now have the value from the last batch of the last epoch


In [None]:
# # Load the entire model
# model = torch.load('model.pth')

# # Load only the model state dictionary
# # First, instantiate your model architecture
# model = Seq2Seq(encoder, decoder, device)
# # Then load the state dictionary
# model.load_state_dict(torch.load('model_state_dict.pth'))
