In [1]:
import os
import json

import torch

import numpy as np
import pandas as pd

from torch import nn

# Setup

In [33]:
# Locations of the local vocabulary file and the remote tab-separated tweet dataset.
data_dir = os.path.join(os.curdir, "data")
vocab_path = os.path.join(data_dir, "word-level-vocab.json")
dataset_path = "https://shai-nlp-course.netlify.app/clean-tweets.tsv"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII vocabulary entries.
with open(vocab_path, "rt", encoding="utf-8") as f:
    vocab = json.load(f)

# The dataset is fetched over HTTP on every run of this cell.
dataset = pd.read_csv(filepath_or_buffer=dataset_path, sep="\t")

In [34]:
# Plain Python list holding the values of the "clean_text" column.
tweets = dataset.loc[:, "clean_text"].tolist()

In [35]:
# Special vocabulary entries: padding filler and the out-of-vocabulary marker.
PAD_TOKEN = "[PAD]"
OOV_TOKEN = "[OOV]"

# Index of the OOV marker; None when the vocabulary has no such entry.
OOV_INDEX = vocab.get(OOV_TOKEN)

print(f"Vocab Size = {len(vocab)}")

Vocab Size = 10998


In [36]:
# Encode each tweet as a list of vocabulary indices, splitting on single spaces.
# Words missing from the vocabulary are dropped rather than mapped to OOV_INDEX.
tokenized_tweets = [
    [vocab[word] for word in text.split(" ") if word in vocab]
    for text in tweets
]

In [37]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# Dataset

In [38]:
# Longest encoded tweet, measured in tokens.
max(map(len, tokenized_tweets))

72

In [66]:
# Right-pad every encoded tweet with the [PAD] index so all rows share the
# length of the longest tweet; rows are batch-major because batch_first=True.
X = torch.nn.utils.rnn.pad_sequence(
    sequences=[torch.tensor(token_ids) for token_ids in tokenized_tweets],
    padding_value=vocab.get(PAD_TOKEN),
    batch_first=True,
)

In [67]:
# One indicator column per distinct "Sentiment" value, converted to a tensor.
sentiment_one_hot = pd.get_dummies(dataset["Sentiment"])
y = torch.tensor(sentiment_one_hot.to_numpy())

In [68]:
# Sanity check: inputs and targets must agree on the number of rows.
X.size(), y.size()

(torch.Size([4000, 72]), torch.Size([4000, 3]))

In [69]:
# Number of examples per mini-batch served by the DataLoader.
BATCH_SIZE: int = 32

In [122]:
# Pair each padded tweet with its one-hot label and serve shuffled mini-batches.
data = torch.utils.data.TensorDataset(X, y)
dataloader = torch.utils.data.DataLoader(
    data,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

# Model

In [75]:
# Slicing the dataset returns a tuple; element 0 holds the input rows.
data[:16][0].size()

torch.Size([16, 72])

In [153]:
class SentimentAnalyzer(nn.Module):
    """Embedding -> RNN -> linear classifier over batches of padded token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding vector.
        latent_dim: Size of the RNN hidden state.
        padding_idx: Token id used to pad sequences. Its embedding row is
            excluded from gradient updates, and trailing positions holding
            this id are skipped by the RNN.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, latent_dim: int, padding_idx: int = 0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.latent_dim = latent_dim

        # max_norm=1.0 renormalizes looked-up embedding rows in place so their
        # L2 norm never exceeds 1.
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.padding_idx,
                                      max_norm=1.0)

        self.rnn = nn.RNN(input_size=self.embedding_dim, hidden_size=self.latent_dim,
                          batch_first=True)

        # Three output logits, one per sentiment class.
        self.fc = nn.Linear(in_features=self.latent_dim, out_features=3)

    def forward(self, x):
        """Return class logits for a batch of right-padded token-id sequences.

        Args:
            x: Integer tensor of shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, 3) holding unnormalized class scores.
        """
        # embeddings = (batch_size, sequence_length, embedding_dim)
        embeddings = self.embedding(x)

        # True length of each row = 1-based position of its last non-pad token.
        # Rows made only of padding are clamped to length 1 because packing
        # rejects zero-length sequences.
        is_token = x != self.padding_idx
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = (is_token * positions).max(dim=1).values.clamp(min=1)

        # Pack so the RNN stops at each sequence's real end; otherwise the
        # final hidden state would be computed after consuming the padding.
        # pack_padded_sequence requires the lengths tensor on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(embeddings, lengths.cpu(),
                                                   batch_first=True,
                                                   enforce_sorted=False)

        # nn.RNN returns (per-step outputs, final hidden state); only the final
        # hidden state, shaped (num_layers, batch_size, latent_dim), is used.
        _, final_hidden = self.rnn(packed)

        # Last layer's final hidden state -> (batch_size, latent_dim) -> logits.
        return self.fc(final_hidden[-1, :, :])
        
    

In [154]:
# Model hyperparameters: token embedding width and RNN hidden-state width.
EMBEDDING_DIM, LATENT_DIM = 300, 128

In [155]:
# Pass the real [PAD] index instead of relying on the default padding_idx=0,
# so the model treats as padding the same id that was used to pad X.
model = SentimentAnalyzer(
    vocab_size=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    latent_dim=LATENT_DIM,
    padding_idx=vocab[PAD_TOKEN],
).to(device)

In [156]:
# Inputs of the first BATCH_SIZE examples, moved to the model's device.
sample = data[:BATCH_SIZE][0].to(device=device)

In [157]:
# Run the sample batch through the model (forward pass only; no loss or optimizer step here).
output = model(sample)

In [158]:
# One row of three class scores per example in the batch.
output.size()

torch.Size([32, 3])