In [1]:
import numpy as np
import pandas as pd
import torch
import torchtext
import zipfile
import pathlib
from pathlib import Path
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import numericalize_tokens_from_iterator
from torch.nn.utils.rnn import pad_sequence

import logging
import os
import warnings
warnings.filterwarnings("ignore")

# Due to warning when initializing the "spacy" tokenizer
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable tensorflow logging
logging.getLogger('tensorflow').disabled = True  # disable tensorflow warning messages



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:

df = pd.read_csv('artifacts/train_cleaned.csv')

# limit df to 5000
df = df[:5000]
df

Unnamed: 0,tweet,sentiment,label
0,im getting on borderlands and i will murder yo...,Positive,3
1,I am coming to the borders and I will kill you...,Positive,3
2,im getting on borderlands and i will kill you ...,Positive,3
3,im coming on borderlands and i will murder you...,Positive,3
4,im getting on borderlands 2 and i will murder ...,Positive,3
...,...,...,...
4995,"Now now, don't mistake my niceness for an open...",Negative,1
4996,"1 Now now, don't mistake all my niceness somet...",Negative,1
4997,"Now now, don't mistake my reply for being open...",Negative,1
4998,When the good girl meets that good black dick!...,Neutral,2


In [4]:
df['label'].unique()

array([3, 2, 1, 0])

In [5]:
df_valid = pd.read_csv('artifacts/valid_cleaned.csv')

df_valid = df_valid[:500]
df_valid

Unnamed: 0,tweet,sentiment,label
0,I mentioned on Facebook that I was struggling ...,Irrelevant,0
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,2
2,@Microsoft Why do I pay for WORD when it funct...,Negative,1
3,"CSGO matchmaking is so full of closet hacking,...",Negative,1
4,Now the President is slapping Americans in the...,Neutral,2
...,...,...,...
495,special shoutouts to microsoft excel 2013,Positive,3
496,Dumb Lucky☘️ (Fortnite Montage) youtu.be/psW...,Irrelevant,0
497,Dang there goes my birthday present but maybe ...,Positive,3
498,It was ab fab seeing the 6 bungalows built in ...,Irrelevant,0


In [6]:
df_valid['label'].unique()

array([0, 2, 1, 3])

In [7]:
tokenizer = get_tokenizer("spacy")

def token_gen(text):
    """
    Tokenizes each sentence in a given text and yields the resulting tokens.

    Args:
        text (list[str]): A list of sentences to tokenize.

    Yields:
        list[str]: The resulting tokens from each sentence.
    """
    for sent in text:
        tokens = tokenizer(sent)
        yield tokens



In [8]:
vocab = build_vocab_from_iterator(token_gen(df['tweet']),specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])  ## to handel OOV problem



In [9]:
#print(vocab.get_stoi())

In [10]:
# numericalize tokens from iterator using vocab
sequence = numericalize_tokens_from_iterator(vocab=vocab,iterator=token_gen(df['tweet']))

print('data type is:',type(sequence))

count=0
for ids in sequence:
    print([num for num in ids])
    count+=1
    if count==10:
        break
    


data type is: <class 'generator'>
[48, 147, 211, 16, 47, 7, 48, 98, 1139, 23, 37, 5]
[4, 105, 265, 6, 2, 6086, 7, 4, 98, 651, 23, 37, 5]
[48, 147, 211, 16, 47, 7, 48, 98, 651, 23, 37, 5]
[48, 147, 265, 16, 47, 7, 48, 98, 1139, 23, 37, 5]
[48, 147, 211, 16, 47, 41, 7, 48, 98, 1139, 23, 50, 37, 5]
[48, 147, 211, 181, 47, 7, 48, 88, 1139, 23, 37, 5]
[97, 4, 576, 8, 479, 235, 425, 231, 12, 82, 1, 1, 1, 153, 23, 49, 36, 160, 4, 105, 8, 1421, 91, 498, 7, 639, 10, 79, 11, 20, 163, 370, 1, 97, 4, 693, 6, 178, 320, 8, 3770, 12, 20, 230, 1, 1, 755, 10, 2, 959, 1032, 3763, 2, 3474, 4, 201, 561, 3898, 3, 4186]
[97, 4, 576, 8, 900, 11, 235, 335, 231, 12, 82, 18, 153, 23, 49, 36, 160, 24, 4, 54, 8, 434, 26, 13, 498, 7, 639, 10, 79, 11, 20, 163, 370, 5, 4, 693, 6, 178, 8, 3770, 12, 20, 230, 21, 755, 33, 2, 959, 3656, 1452, 6, 2, 3474, 4, 201, 561, 408, 82, 3, 386, 17, 6670]
[97, 4, 576, 8, 479, 235, 335, 231, 12, 82, 18, 153, 23, 49, 36, 160, 4, 54, 8, 1421, 26, 13, 498, 7, 639, 10, 79, 11, 20, 163, 

In [11]:
## check how "numericalize_tokens_from_iterator" works

from torchtext.data.functional import numericalize_tokens_from_iterator

sequence = numericalize_tokens_from_iterator(vocab,["hi how are you", "what is your name?"])
list(next(sequence))

[4737, 48, 38, 4737, 6749, 1393, 38, 8, 966, 646, 38, 489, 6749, 301]

In [12]:
# numericalize tokens from iterator using vocab
sequence = numericalize_tokens_from_iterator(vocab=vocab,iterator=token_gen(df['tweet']))

# create a list to store tokenized sequences
text = []
for i in range(len(df)):
    x = list(next(sequence))
    text.append(x)

# Pad the sequences to the same length along dimension 0
padded_text = pad_sequence([torch.tensor(x) for x in text], batch_first=True, padding_value=0)

# restrict the length of every sequence in the padded_text
MAX_LENGTH = 100
padded_text = padded_text[:,:MAX_LENGTH]

print(padded_text.shape)
print(padded_text)

torch.Size([5000, 100])
tensor([[ 48, 147, 211,  ...,   0,   0,   0],
        [  4, 105, 265,  ...,   0,   0,   0],
        [ 48, 147, 211,  ...,   0,   0,   0],
        ...,
        [393,  75,   5,  ...,   0,   0,   0],
        [595,   2,  60,  ...,   0,   0,   0],
        [595,   2,  60,  ...,   0,   0,   0]])


In [13]:
len(padded_text)

5000

In [14]:
padded_text[0]

tensor([  48,  147,  211,   16,   47,    7,   48,   98, 1139,   23,   37,    5,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])

In [15]:
tokenizer('I will see you!')

['I', 'will', 'see', 'you', '!']

In [16]:
## vocab([tokens])

vocab(tokenizer('I will see you!')) ## similar to the 'fit_on_texts()' in tensorflow

[4, 98, 74, 23, 3]

In [17]:
torch.tensor(vocab(tokenizer('I will see you')))

tensor([ 4, 98, 74, 23])

In [18]:
#len(vocab.get_stoi())
len(vocab)

7350

In [19]:
# ?torch.nn.Embedding

In [20]:
embedd = torch.nn.Embedding(num_embeddings=len(vocab),embedding_dim=5,padding_idx=0) 

## if we want to add embedding for a text, we should assign the value to "num_embeddings" according to the 
## max 'integer_id' that is present in the tokenized text


In [21]:
test_input = embedd(torch.tensor(vocab(tokenizer('I will see you nonsense!'))))
test_input

tensor([[-5.6123e-01, -1.3822e+00,  6.2183e-01, -1.7639e+00,  9.9752e-01],
        [ 1.5047e+00,  1.4646e+00, -2.0749e+00, -8.0589e-01, -5.6431e-01],
        [-5.3838e-04,  4.6979e-02, -1.8258e+00,  1.9002e+00, -3.6944e-01],
        [ 1.6585e+00,  1.7957e+00, -1.2168e+00, -5.4170e-01, -3.9997e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 1.2981e+00, -1.0127e+00, -1.0710e+00, -2.8276e-03,  1.1104e+00]],
       grad_fn=<EmbeddingBackward0>)

In [22]:
test_input.shape

torch.Size([6, 5])

In [23]:
# Create the Embedding module with the correct weight matrix size
embedd = torch.nn.Embedding(len(vocab), 5, padding_idx=0)

# Check the shape of the padded_text and compare it to the expected input shape of the Embedding module
print(padded_text.shape)
# should be: torch.Size([batch_size, sequence_length])

# Use the Embedding module with the padded_text
input_text = embedd(padded_text)
print(input_text)

torch.Size([5000, 100])
tensor([[[ 1.1493, -1.1865, -0.0466,  0.3617, -0.7821],
         [ 0.7771, -0.9686, -1.6340,  0.2386, -0.9245],
         [ 2.3880, -0.5202,  0.9542,  1.8797, -0.6081],
         ...,
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-1.8303,  1.3537, -1.0909,  1.3556, -0.5178],
         [ 0.8772,  0.9927, -0.6076, -1.6259, -0.4072],
         [-0.4508, -0.7191, -0.6209, -1.0954,  0.1068],
         ...,
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.1493, -1.1865, -0.0466,  0.3617, -0.7821],
         [ 0.7771, -0.9686, -1.6340,  0.2386, -0.9245],
         [ 2.3880, -0.5202,  0.9542,  1.8797, -0.6081],
         ...,
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.000

In [24]:
# Check the shape of the padded_text and compare it to the expected input shape of the Embedding module
print(padded_text[0].shape)
# should be: torch.Size([batch_size, sequence_length])

# Use the Embedding module with the padded_text
embedd(padded_text[0]).shape


torch.Size([100])


torch.Size([100, 5])

In [25]:
df

Unnamed: 0,tweet,sentiment,label
0,im getting on borderlands and i will murder yo...,Positive,3
1,I am coming to the borders and I will kill you...,Positive,3
2,im getting on borderlands and i will kill you ...,Positive,3
3,im coming on borderlands and i will murder you...,Positive,3
4,im getting on borderlands 2 and i will murder ...,Positive,3
...,...,...,...
4995,"Now now, don't mistake my niceness for an open...",Negative,1
4996,"1 Now now, don't mistake all my niceness somet...",Negative,1
4997,"Now now, don't mistake my reply for being open...",Negative,1
4998,When the good girl meets that good black dick!...,Neutral,2


In [26]:
print(padded_text.shape)
padded_text

torch.Size([5000, 100])


tensor([[ 48, 147, 211,  ...,   0,   0,   0],
        [  4, 105, 265,  ...,   0,   0,   0],
        [ 48, 147, 211,  ...,   0,   0,   0],
        ...,
        [393,  75,   5,  ...,   0,   0,   0],
        [595,   2,  60,  ...,   0,   0,   0],
        [595,   2,  60,  ...,   0,   0,   0]])

In [27]:
label =df['label'].to_list()

#label

In [28]:
label= torch.tensor(label)
print(label.shape)
label

torch.Size([5000])


tensor([3, 3, 3,  ..., 1, 2, 2])

In [29]:
len(label.unique())

4

In [30]:
# Import required libraries
import torch.nn as nn

# Determine the number of classes
num_classes = len(label.unique())

# Define the RNNClassify module
class RNNClassify(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        
        # Define the embedding layer
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Define the RNN layer
        self.rnn = nn.RNN(embed_dim, hidden_size,batch_first=True)
        
        # Define the linear layer
        self.linear = nn.Linear(hidden_size, num_classes)
        
        # Initialize the weights of the module
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.rnn.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.rnn.weight_hh_l0.data.uniform_(-initrange, initrange)
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        
    def forward(self, input):
        # Embed the input
        embedded = self.embed(input)
        #print('embedded shape:',embedded.shape)
        
        # Pass the embedded input through the RNN layer
        output, hidden = self.rnn(embedded)
        #print('rnn output shape:',output.shape)
        #print('rnn hidden shape:',hidden.shape)
        
        output = output[:, -1, :]  # taking last output of RNN
        #print('rnn last output shape:',output.shape)
        
        # Pass the output through the linear layer
        output = self.linear(output)
        
        # Return the output
        return output


In [31]:
VOCAB_SIZE = len(vocab)
VOCAB_SIZE

7350

In [32]:
model = RNNClassify(vocab_size=VOCAB_SIZE,embed_dim=100,hidden_size=32).to(device)

In [33]:
padded_text[0]

tensor([  48,  147,  211,   16,   47,    7,   48,   98, 1139,   23,   37,    5,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])

In [34]:
padded_text[0].shape

torch.Size([100])

In [35]:
padded_text[0].unsqueeze(0).shape

torch.Size([1, 100])

In [36]:
model(padded_text[0].unsqueeze(0))

tensor([[-0.0631,  1.1285, -0.5048,  0.2922]], grad_fn=<AddmmBackward0>)

In [37]:
model(padded_text[0].unsqueeze(0)).shape

torch.Size([1, 4])

In [38]:
padded_text

tensor([[ 48, 147, 211,  ...,   0,   0,   0],
        [  4, 105, 265,  ...,   0,   0,   0],
        [ 48, 147, 211,  ...,   0,   0,   0],
        ...,
        [393,  75,   5,  ...,   0,   0,   0],
        [595,   2,  60,  ...,   0,   0,   0],
        [595,   2,  60,  ...,   0,   0,   0]])

In [40]:
padded_text.shape

torch.Size([5000, 100])

In [41]:
model(padded_text)  

tensor([[-0.0631,  1.1285, -0.5048,  0.2922],
        [-0.1203,  0.4269, -0.4568, -1.0651],
        [-0.0742,  0.6605,  0.0694, -0.7276],
        ...,
        [ 1.3229,  1.3524,  0.3528,  1.5246],
        [-0.3698,  1.1526, -0.3615,  0.1575],
        [ 0.3327,  0.9626,  0.5979,  0.3315]], grad_fn=<AddmmBackward0>)

In [42]:
model(padded_text).shape            

torch.Size([5000, 4])

### will try `Batch Gradient Descent`

#### first of all, let me fix `(X_train,y_train)` and `(X_test,y_test)`

In [43]:
# X_train,y_train

X_train,y_train = padded_text,label

In [44]:
df_valid

Unnamed: 0,tweet,sentiment,label
0,I mentioned on Facebook that I was struggling ...,Irrelevant,0
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,2
2,@Microsoft Why do I pay for WORD when it funct...,Negative,1
3,"CSGO matchmaking is so full of closet hacking,...",Negative,1
4,Now the President is slapping Americans in the...,Neutral,2
...,...,...,...
495,special shoutouts to microsoft excel 2013,Positive,3
496,Dumb Lucky☘️ (Fortnite Montage) youtu.be/psW...,Irrelevant,0
497,Dang there goes my birthday present but maybe ...,Positive,3
498,It was ab fab seeing the 6 bungalows built in ...,Irrelevant,0


In [45]:
len(df_valid)

500

In [46]:
valid_token_ids = []
for i in range(len(df_valid)):
    token_id = vocab(tokenizer(df_valid['tweet'][i]))
    valid_token_ids.append(token_id)
    
    
#valid_token_ids

In [47]:
# valid_token_ids = torch.tensor(valid_token_ids) # this will throw an error, because all sequence are not of same length

# Pad the sequences to the same length along dimension 0
padded_text_valid = pad_sequence([torch.tensor(x) for x in valid_token_ids], batch_first=True, padding_value=0)
# here look, <UNK> will be assign to 0 and padding_idx will be assign also 0

padded_text_valid = padded_text_valid[:,:MAX_LENGTH]

print(padded_text_valid.shape)
print(padded_text_valid)

torch.Size([500, 67])
tensor([[   4, 4137,   16,  ...,    0,    0,    0],
        [5115,    0,   25,  ...,    0,    0,    0],
        [   0,  245,   49,  ...,    0,    0,    0],
        ...,
        [   0,  122, 2122,  ...,    0,    0,    0],
        [  55,   32,    0,  ...,    0,    0,    0],
        [   0, 1246,  977,  ...,    0,    0,    0]])


In [48]:
label_valid = df_valid['label'].to_list()

In [49]:
label_valid = torch.tensor(label_valid)
#label_valid

In [50]:
# X_test,y_test

X_test, y_test = padded_text_valid, label_valid
X_test, y_test

(tensor([[   4, 4137,   16,  ...,    0,    0,    0],
         [5115,    0,   25,  ...,    0,    0,    0],
         [   0,  245,   49,  ...,    0,    0,    0],
         ...,
         [   0,  122, 2122,  ...,    0,    0,    0],
         [  55,   32,    0,  ...,    0,    0,    0],
         [   0, 1246,  977,  ...,    0,    0,    0]]),
 tensor([0, 2, 1, 1, 2, 1, 3, 3, 3, 1, 3, 3, 1, 2, 1, 3, 3, 1, 3, 1, 1, 2, 0, 1,
         2, 2, 1, 0, 0, 1, 3, 3, 1, 3, 1, 2, 2, 0, 3, 2, 3, 2, 2, 2, 3, 2, 1, 1,
         1, 2, 3, 1, 1, 3, 3, 3, 3, 3, 1, 0, 1, 3, 3, 0, 1, 2, 1, 0, 2, 1, 3, 1,
         1, 3, 3, 0, 3, 0, 2, 2, 2, 3, 3, 2, 3, 2, 1, 0, 1, 2, 2, 1, 3, 0, 0, 1,
         1, 1, 2, 3, 2, 1, 3, 3, 2, 3, 2, 3, 1, 2, 2, 2, 1, 2, 1, 2, 2, 3, 3, 2,
         1, 1, 3, 1, 2, 1, 3, 2, 1, 2, 0, 3, 2, 3, 3, 0, 2, 2, 0, 0, 0, 2, 2, 0,
         0, 0, 3, 2, 3, 0, 3, 1, 2, 2, 2, 0, 2, 1, 2, 3, 1, 2, 1, 0, 0, 0, 2, 1,
         1, 1, 3, 3, 3, 2, 2, 3, 0, 2, 2, 2, 3, 2, 1, 1, 2, 3, 3, 0, 0, 2, 3, 3,
         2, 0, 2, 

#### Now, write the train and test loop for `Batch Gradient Descent`

In [51]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()  # remember it gives logits (row outputs)

In [52]:
# Here, we will use Batch Gradient Descent, BUT, generally we prefer Mini-Batch Gradient Descent

epochs = 50


for epoch in range(epochs):
    train_loss,train_acc = 0,0
    
    # Set model to training mode
    model.train()
    
    X_train,y_train = X_train.to(device), y_train.to(device)
    
    y_logits = model(X_train)
    #print('shape of y_logits:',y_logits.shape)

    
    # Compute loss with one-hot encoded targets
    loss = loss_fn(y_logits, y_train)
    
    train_loss += loss
    train_acc += (y_logits.argmax(1) == y_train).sum().item() / len(y_train)
    
    optimizer.zero_grad()
    loss.backward()
    
    optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
    
    
    with torch.inference_mode():
        
        model.eval()
        
        test_loss,test_acc = 0,0
    
        X_test, y_test = X_test.to(device), y_test.to(device)
        
        y_logits = model(X_test)

        # Compute loss with one-hot encoded targets
        loss = loss_fn(y_logits, y_test)

        test_loss += loss.item()
            
        # Compute accuracy
        test_preds = y_logits.argmax(dim=1)
        test_acc += (test_preds == y_test).sum().item() / len(y_test)

        
        print(f'Epoch {epoch+1}/{epochs}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')
        
        print('--'*50)

Epoch 1/50, Train Loss: 1.6379, Train Accuracy: 0.2374
Epoch 1/50, Test Loss: 1.5334, Test Accuracy: 0.2780
----------------------------------------------------------------------------------------------------
Epoch 2/50, Train Loss: 1.5692, Train Accuracy: 0.2484
Epoch 2/50, Test Loss: 1.5297, Test Accuracy: 0.2840
----------------------------------------------------------------------------------------------------
Epoch 3/50, Train Loss: 1.5449, Train Accuracy: 0.2578
Epoch 3/50, Test Loss: 1.5032, Test Accuracy: 0.2940
----------------------------------------------------------------------------------------------------
Epoch 4/50, Train Loss: 1.5004, Train Accuracy: 0.2616
Epoch 4/50, Test Loss: 1.4785, Test Accuracy: 0.2840
----------------------------------------------------------------------------------------------------
Epoch 5/50, Train Loss: 1.4694, Train Accuracy: 0.2836
Epoch 5/50, Test Loss: 1.4399, Test Accuracy: 0.2900
--------------------------------------------------------