In [1]:
import os
import json

from dataclasses import dataclass
from collections import Counter

import gensim
import nltk
import torch
import torchmetrics

import numpy as np
import pandas as pd
import seaborn as sns

from torch import nn
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# Setup

In [2]:
data_dir = os.path.join(os.curdir, "data")
vocab_path = os.path.join(data_dir, "word-level-vocab.json")
dataset_path = "https://nlp-slides.vercel.app/clean-tweets.tsv"

with open(vocab_path, "rt") as f:
    vocab = json.load(f)
    
dataset = pd.read_csv(filepath_or_buffer=dataset_path, sep="\t")

In [3]:
tweets = dataset["clean_text"].tolist()

In [4]:
OOV_TOKEN = "[OOV]"
PAD_TOKEN = "[PAD]"

OOV_INDEX = vocab.get(OOV_TOKEN)

print(f"Vocab Size = {len(vocab)}")

Vocab Size = 961


In [5]:
tokenized_tweets = [[vocab.get(token) for token in tweet.split(" ") if token in vocab] for tweet in tweets]

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# Dataset

In [7]:
# Tweets that are empty (all tokens appear less than the minimum frequency)
len([len(tweet) for tweet in tokenized_tweets if len(tweet) == 0])

36

In [8]:
# Max Tweet length
max([len(tweet) for tweet in tokenized_tweets])

27

In [9]:
# 99th percentile of tweets' lengths
np.percentile([len(tweet) for tweet in  tokenized_tweets], q=99)

13.0

In [10]:
# Filtered Tweets indices by length
MIN_LENGTH = 0
MAX_LENGTH = np.percentile([len(tweet) for tweet in  tokenized_tweets], q=99)
filtered_indices = [index for index, tweet in enumerate(tokenized_tweets) if len(tweet) < MAX_LENGTH and len(tweet) > MIN_LENGTH]

In [11]:
X = nn.utils.rnn.pad_sequence([torch.tensor(tweet) for tweet in tokenized_tweets],
                              batch_first=True,
                              padding_value=vocab.get(PAD_TOKEN))

In [12]:
y = torch.tensor(dataset["Sentiment"].replace({"negative": 0, "neutral": 1, "positive": 2}).to_numpy())

In [13]:
X, y = X[filtered_indices], y[filtered_indices]

In [14]:
BATCH_SIZE = 16

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

In [16]:
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Pretrained Embeddings

In [31]:
word2vec = gensim.models.Word2Vec(sentences=tokenized_tweets)

In [19]:
embedding_layer = nn.Embedding.from_pretrained(embeddings=torch.tensor(word2vec.wv.vectors)).to(device)

In [20]:
embedding_layer

Embedding(959, 100)

# Model

In [24]:
class SentimentAnalyzer(nn.Module):
    def __init__(self, vocab_size: int, embedding_layer: nn.Embedding, latent_dim: int, padding_idx: int = 0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = embedding_layer
        self.embedding_dim = embedding_layer.embedding_dim
#         self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.latent_dim = latent_dim
        
#         self.embedding = nn.Embedding(num_embeddings=self.vocab_size, 
#                                       embedding_dim=self.embedding_dim, 
#                                       padding_idx=self.padding_idx, 
#                                       max_norm=1.0)
        
        self.rnn = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.latent_dim, 
                          batch_first=True, num_layers=2, bidirectional=True)
        
       
        
        self.fc1 = nn.Linear(in_features=self.latent_dim * 2, out_features=self.latent_dim)
        self.fc2 = nn.Linear(in_features=self.latent_dim, out_features=3)
        
        
    
    def forward(self, x):
        # x = (batch_size, sequence_length) 
        
        embeddings = embedding_layer(x)       
        
        # embeddings = (batch_size, sequence_length, embedding_dim)
        
        hidden_states, (last_hidden_state, last_cell_state) = self.rnn(embeddings)
        
        output = torch.concat([last_hidden_state[-1,:,:], last_hidden_state[-2,:,:]], dim=-1) 
        
        output = torch.relu(self.fc1(output))
        
        output = self.fc2(output)
        
        
        
        return output
        
    

In [25]:
EMBEDDING_DIM = 100
LATENT_DIM = 32

In [26]:
model = SentimentAnalyzer(vocab_size=len(vocab), embedding_layer=embedding_layer, latent_dim=LATENT_DIM).to(device)

# Training

In [27]:
EPOCHS = 50

optim = torch.optim.Adam(params=model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

train_metric = torchmetrics.F1Score(num_classes=3).to(device)
val_metric = torchmetrics.F1Score(num_classes=3).to(device)

In [28]:
# A dataclass object defined to store the loss and metric results of each epoch
# REFERENCE: https://realpython.com/python-data-classes/
@dataclass
class Epoch:
    epoch: int
    training_loss: float
    validation_loss: float
    training_acc: float
    validation_acc: float
        
    
    def log(self) -> None:
        print(f"Epoch {self.epoch + 1}: Training Loss: {self.training_loss}\tValidation Loss: {self.validation_loss} || Training F1: {self.training_acc}\tValidation F1: {self.validation_acc}\n----------------------------------------------------")

In [29]:
history: list[Epoch] = []

for epoch in range(EPOCHS):
    # TODO: Define epoch train and test: loss and metrics
    # TODO: Make sure to reset the metric function
    epoch_train_loss = 0
    epoch_val_loss = 0
    
    train_metric.reset()
    val_metric.reset()
    
    # TODO: Set the model to training mode
    model.train()
    # Iterate train batches
    for batch in train_dataloader:
        
        # REMEMBER: you need to reset the optimizer in order to avoid incorrect accumulation of gradientrs 
        optim.zero_grad()
        
        # TODO: Unpack the batch and move it to the correct device
        inputs, target = batch
        inputs, target = inputs.to(device), target.to(device)

        # TODO: get prediction from model
        predictions = model(inputs)
        
        # TODO: Calculate loss and metric
        
        loss = criterion(predictions, target)
        train_metric(predictions, target)
        
        # TODO: Backward propagataion
        loss.backward()
        optim.step()
        
        # Accumulate train loss
        epoch_train_loss += loss.item()
    
    # Calculate epoch training metric
    epoch_train_f1 = round(train_metric.compute().item(), 3)

    
    # TODO: set the model to test mode
    model.eval()
    # Reset the metric function
    # Make sure PyTorch will run inference without tracking gradients for enhancing performance
    with torch.no_grad():
        # TODO: Load batches from `test_loader`
        for batch in test_dataloader:
            
            # TODO: Unpack the batch and move it to the correct device
            inputs, target = batch
            inputs, target = inputs.to(device), target.to(device)

            # TODO: get prediction from model
            predictions = model(inputs)
            # TODO: Calculate loss and metric values
            loss = criterion(predictions, target)
            val_metric(predictions, target)
            
            # TODO: Accumulate validation loss
            epoch_val_loss += loss.item()
            
        # TODO: Calculate epoch validation metric
        epoch_val_f1 = round(val_metric.compute().item(), 3)
      

    # Calculate epoch training loss
    epoch_train_loss = round(epoch_train_loss / len(train_dataloader), 3)
    # TODO: Calculate epoch validation loss
    epoch_val_loss = round(epoch_val_loss / len(test_dataloader), 3)
    
    # TODO: Create `Epoch` instance with the results
    epoch_result = Epoch(epoch=epoch, 
                         training_loss=epoch_train_loss, 
                         validation_loss=epoch_val_loss,
                         training_acc=epoch_train_f1,
                         validation_acc=epoch_val_f1,
                        )
    
    # Add to history list
    history.append(epoch_result)
    # Log epoch output
    epoch_result.log()

../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [67,0,0], thread: [6,0,0

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

In [None]:
train_loss = list(map(lambda x: x.training_loss ,history))
val_loss = list(map(lambda x: x.validation_loss ,history))
train_acc = list(map(lambda x: x.training_acc, history))
val_acc = list(map(lambda x: x.validation_acc, history))


fig, (ax1, ax2) = plt.subplots(figsize=(15, 12), ncols=2)

ax1.plot(range(EPOCHS), train_loss, label="Training")
ax1.plot(range(EPOCHS), val_loss, label="Validation")

ax2.plot(range(EPOCHS), train_acc, label="Training")
ax2.plot(range(EPOCHS), val_acc, label="Validation")

ax1.set_title("Loss", fontdict=dict(size=15), pad=15)
ax2.set_title("F1", fontdict=dict(size=15), pad=15)

ax1.set_xticks(range(EPOCHS))
ax2.set_xticks(range(EPOCHS))

ax1.set_xlabel("Epoch")
ax2.set_xlabel("Epoch")

plt.legend()
plt.show()