# Model training pipeline

In [58]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from collections import Counter
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from torchinfo import summary
import torchmetrics as tm
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from torch.cuda.amp import autocast, GradScaler

In [22]:
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

device

2.5.1+cu121
12.1
True
NVIDIA GeForce MX350


'cuda'

In [23]:
DATA_PATH = Path("../data")
file_path = DATA_PATH / "processed/cleaned_sentiment_data.csv"
df = pd.read_csv(file_path)

In [62]:
df

Unnamed: 0,cleaned_review,sentiment,review_length
0,teenager martha moxley maggie grace move high ...,1,125
1,ok really like kris kristofferson usual easy g...,0,111
2,spoiler read think watching movie although wou...,0,147
3,hi people seen wonderful movie im sure thet wo...,1,36
4,recently bought dvd forgetting much hated movi...,0,61
...,...,...,...
49577,ok let start best building although hard belie...,0,110
49578,british heritage film industry control nothing...,0,147
49579,even know begin one family worst line dialogue...,0,65
49580,richard tyler little boy scared everything lik...,0,50


In [68]:
df = df[~(df['review_length'] < 3) & ~(df['review_length'] > 180)] #Por temas de memoria durante el entrenamiento

In [70]:
BATCH_SIZE = 8  # smaller batch to stay within 2GB GPUs

class CustomDataset(Dataset):
    def __init__(self, sents, labels):
        self.sents = sents

        self.PAD_TOKEN = "<PAD>"
        self.UNK_TOKEN = "<UNK>"
        self.PAD_IDX = 0
        self.UNK_IDX = 1

        self.vocab, self.index_word = self.get_vocab(self.sents)

        self.label_mapping = {value: key for key, value in enumerate(np.unique(labels))}
        self.labels = [self.label_mapping[l] for l in labels]

    def get_vocab(self, sents):
        tokenized_sents = list(map(lambda x: simple_preprocess(x), list(sents)))
        corpus = [token for s in tokenized_sents for token in s]

        word_count = Counter(corpus)
        common_words = word_count.most_common()

        vocab = {self.PAD_TOKEN: self.PAD_IDX, self.UNK_TOKEN: self.UNK_IDX}

        for token, _ in common_words:
            if token not in vocab:
                vocab[token] = len(vocab)

        index_to_vocab = dict(zip(vocab.values(), vocab.keys()))

        return vocab, index_to_vocab

    def vocab_size(self):
        return len(self.vocab)

    def vectorize(self, sent):
        tokens = simple_preprocess(sent)
        return [self.vocab.get(token, self.UNK_IDX) for token in tokens]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.vectorize(self.sents[idx])
        input_ids = torch.tensor(input_ids)

        label = torch.tensor(self.labels[idx])

        return input_ids, label

In [71]:
ds = CustomDataset(
    sents = df['cleaned_review'].tolist(),
    labels = df['sentiment'].tolist()
)


train_size = round(0.8 * ds.__len__()) 
val_size = round(0.2 * ds.__len__())

train_set , val_set = random_split(ds , [train_size , val_size])


In [72]:
print(f"the size of the dataset is : {ds.__len__()}")
print(f"The size of the training set is : {train_size}")
print(f"The size of the validation set is : {val_size}")

the size of the dataset is : 41295
The size of the training set is : 33036
The size of the validation set is : 8259


In [73]:

def COLLATE_FN(batch):
    inputs , labels = zip(*batch)

    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels = torch.stack(labels)

    return padded_inputs, labels


In [76]:
loader_kwargs = {
    "batch_size": BATCH_SIZE,
    "pin_memory": device == "cuda",
    "num_workers": 0,
    "collate_fn": COLLATE_FN,
}

torch.manual_seed(42)
train_ds = DataLoader(
    train_set,
    shuffle=True,
    **loader_kwargs,
)

torch.manual_seed(42)
val_ds = DataLoader(
    val_set,
    shuffle=False,
    **loader_kwargs,
)

In [77]:
EMBEDDING_DIM = 256

word2vec_data = list(map(lambda x: simple_preprocess(x), df['cleaned_review'].tolist()))

word2vec = Word2Vec(
    sentences=word2vec_data,
    window=10,
    min_count=1,
    vector_size=EMBEDDING_DIM,
    workers=4,
)

In [78]:
VOCAB_SIZE = ds.vocab_size()
word2vec_embedding_matrix = np.zeros((VOCAB_SIZE , EMBEDDING_DIM))

In [79]:
pretrained_embedding_matrix = torch.from_numpy(word2vec_embedding_matrix).float()

In [80]:
class LSTM(nn.Module) : 
    def __init__(
        self , 
        vocab_size , 
        embedding_dim , 
        hidden_dim , 
        num_layers , 
        fc_dim , 
        n_classes , 
        bidirectional = False , 
        dropout_rate = None
    ) : 
        super(LSTM , self).__init__() 

        self.embedding = nn.Embedding.from_pretrained(
            pretrained_embedding_matrix , 
            padding_idx = 0 , 
            freeze = True
        )

        self.num_directional = 2 if bidirectional else 1
        self.n_layers = num_layers 
        self.h_dim = hidden_dim

        self.lstm1 = nn.LSTM(
            embedding_dim , 
            hidden_dim , 
            num_layers , 
            dropout = dropout_rate , 
            bidirectional = bidirectional , 
            batch_first = True 
        )

        self.lstm2 = nn.LSTM(
            hidden_dim * self.num_directional , 
            hidden_dim // 2 , 
            num_layers , 
            dropout = dropout_rate , 
            bidirectional = bidirectional , 
            batch_first = True 
        )

        self.dropout = nn.Dropout(0.5)

        self.fc1 = nn.Linear((hidden_dim // 2) * self.num_directional , fc_dim)
        self.fc2 = nn.Linear(fc_dim , n_classes)

    def forward(self , x) : 
        batch_size = x.size(0)

        embed = self.embedding(x) #[batch_size,seq_len,embedding_dim]
        
        out , _ = self.lstm1(embed , None) #[batch_size,seq_len,n_directional*h_dim]
        
        _ , (hn , _) = self.lstm2(out ,None) #[n_directional*n_layers,batch_size,h_dim]

        # We concatenate the final forward state [-2] and final backward state [-1] of the LAST layer (Layer 2)
        hn = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)

        fc_out = nn.functional.relu(self.fc1(hn))
        fc_out = self.dropout(fc_out)
        logits = self.fc2(fc_out)

        return logits

In [81]:
HIDDEN_DIM = 256
NUM_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT_RATE = 0.5
FC_DIM = 64
N_CLASSES = len(np.unique(df['sentiment']))

In [82]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())

2.5.1+cu121
True
NVIDIA GeForce MX350


In [83]:
torch.cuda.empty_cache()
model = LSTM(
    vocab_size = VOCAB_SIZE , 
    embedding_dim = EMBEDDING_DIM , 
    hidden_dim = HIDDEN_DIM , 
    num_layers = NUM_LAYERS , 
    fc_dim = FC_DIM , 
    n_classes = N_CLASSES , 
    bidirectional = BIDIRECTIONAL , 
    dropout_rate = DROPOUT_RATE
).to(device)

In [84]:
summary(model, input_size=(BATCH_SIZE, 148), dtypes=[torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
LSTM                                     [8, 2]                    --
├─Embedding: 1-1                         [8, 148, 256]             (18,083,840)
├─LSTM: 1-2                              [8, 148, 512]             2,629,632
├─LSTM: 1-3                              [8, 148, 256]             1,052,672
├─Linear: 1-4                            [8, 64]                   16,448
├─Dropout: 1-5                           [8, 64]                   --
├─Linear: 1-6                            [8, 2]                    130
Total params: 21,782,722
Trainable params: 3,698,882
Non-trainable params: 18,083,840
Total mult-adds (Units.GIGABYTES): 4.50
Input size (MB): 0.01
Forward/backward pass size (MB): 9.70
Params size (MB): 87.13
Estimated Total Size (MB): 96.84

In [85]:


def accuracy_fn(y_pred , y_true) : 
    correct = torch.eq(y_true , y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100 

    return acc



In [86]:
class EvaluationMetrics : 
    def __init__(self , n_classes) : 
        self.metrics = {
            'accuracy' : tm.Accuracy(
                task = "multiclass" , 
                num_classes = N_CLASSES , 
                average = "macro"
            ) ,
    
            'precision' : tm.Precision(
                task = "multiclass" , 
                num_classes = N_CLASSES , 
                average = "macro"
            ) ,
    
            'recall' : tm.Recall(
                task = "multiclass" , 
                num_classes = N_CLASSES , 
                average = "macro"
            ) ,
    
            'f1' : tm.F1Score(
                task = "multiclass" , 
                num_classes = N_CLASSES , 
                average = "macro"
            ) 
        }

        for metric in self.metrics.values() : 
            metric.to(device)

    def update(self , y_preds , y_true) : 
        for metric in self.metrics.values() : 
            metric.update(y_preds , y_true)

    def compute(self) : 
        return {name : metric.compute().item() for name , metric in self.metrics.items()}

    def reset(self) : 
        for metric in self.metrics.values() : 
            metric.reset()

In [87]:
def ConfMatPlot(cm , confmat) :     
    fig, ax = cm.plot()
    ax.set_title("Validation Confusion Matrix" , fontsize = 15 , fontweight = "bold")
    ax.set_xlabel("Predicted", fontsize=10, fontweight="bold")
    ax.set_ylabel("True", fontsize=10, fontweight="bold")

    ax.set_xticks(np.arange(len(label_map)))
    ax.set_xticklabels(list(label_map.values()), rotation=45, ha='right', fontsize=8)
    ax.set_yticks(np.arange(len(label_map)))
    ax.set_yticklabels(list(label_map.values()), fontsize=8)


In [88]:


loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()) , lr = 1e-3)



In [89]:


model_metric = {
        'Train_Loss' : [] , 
        'Train_Accuracy' : [] ,
        'Validation_Loss' : [] , 
        'Validation_Accuracy' : []
    }



In [90]:
EPOCHS = 5

def train() : 
    for epoch in tqdm(range(EPOCHS)) : 
        # TRAIN 
        model.train()
        train_loss , train_acc = 0.0 , 0.0 

        for X , y in train_ds : 
            X, y = X.to(device), y.to(device)

            optim.zero_grad()
            logits = model(X)
            y_preds = torch.argmax(logits , dim = 1)

            loss = loss_fn(logits , y)
            loss.backward()
            optim.step()

            train_loss += loss.item()
            train_acc += accuracy_fn(y_preds , y)

        train_loss /= len(train_ds)
        train_acc /= len(train_ds)

        # VALIDATE
        model.eval()
        val_loss , val_acc = 0.0 , 0.0

        with torch.inference_mode() : 
            for X , y in val_ds : 
                X, y = X.to(device), y.to(device)

                logits = model(X)
                y_preds = torch.argmax(logits , dim = 1)

                val_loss += loss_fn(logits, y).item()
                val_acc += accuracy_fn(y_preds , y)


        val_loss /= len(val_ds)
        val_acc /= len(val_ds)

        print(
            f"""[Epoch {epoch+1}/{EPOCHS}]
            [Train Loss: {train_loss:0.5f}] [Train Accuracy: {train_acc:0.2f}%]
            [Validation Loss: {val_loss:0.5f}] [Validation Accuracy: {val_acc:0.2f}%]
            """
        )

        model_metric["Train_Loss"].append(train_loss)
        model_metric["Train_Accuracy"].append(train_acc)
        model_metric["Validation_Loss"].append(val_loss)
        model_metric["Validation_Accuracy"].append(val_acc)

    return model_metric

In [91]:
model_metric = train()

 20%|██        | 1/5 [06:20<25:22, 380.74s/it]

[Epoch 1/5]
            [Train Loss: 0.69374] [Train Accuracy: 50.35%]
            [Validation Loss: 0.69347] [Validation Accuracy: 49.33%]
            


 40%|████      | 2/5 [12:35<18:52, 377.37s/it]

[Epoch 2/5]
            [Train Loss: 0.69335] [Train Accuracy: 50.01%]
            [Validation Loss: 0.69306] [Validation Accuracy: 50.67%]
            


 60%|██████    | 3/5 [19:32<13:10, 395.33s/it]

[Epoch 3/5]
            [Train Loss: 0.69335] [Train Accuracy: 50.27%]
            [Validation Loss: 0.69313] [Validation Accuracy: 50.67%]
            


 80%|████████  | 4/5 [26:07<06:35, 395.18s/it]

[Epoch 4/5]
            [Train Loss: 0.69322] [Train Accuracy: 50.27%]
            [Validation Loss: 0.69315] [Validation Accuracy: 49.33%]
            


100%|██████████| 5/5 [32:30<00:00, 390.16s/it]

[Epoch 5/5]
            [Train Loss: 0.69343] [Train Accuracy: 49.70%]
            [Validation Loss: 0.69311] [Validation Accuracy: 50.67%]
            





In [92]:
# Guardar el modelo entrenado
MODEL_PATH = Path("../models/sentiment_lstm_model.pth")
torch.save(model.state_dict(), MODEL_PATH)