In [1]:
import numpy as np
import pandas as pd
import torch
import torchtext
import zipfile
import pathlib
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import numericalize_tokens_from_iterator
from torch.nn.utils.rnn import pad_sequence

import logging
import os
import warnings
warnings.filterwarnings("ignore")

# Due to warning when initializing the "spacy" tokenizer
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # disable tensorflow logging
logging.getLogger('tensorflow').disabled = True  # disable tensorflow warning messages



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:

df = pd.read_csv('data/train_cleaned.csv')

# limit df to 1000
df = df[:1000]
df

Unnamed: 0,tweet,sentiment,label
0,im getting on borderlands and i will murder yo...,Positive,3
1,I am coming to the borders and I will kill you...,Positive,3
2,im getting on borderlands and i will kill you ...,Positive,3
3,im coming on borderlands and i will murder you...,Positive,3
4,im getting on borderlands 2 and i will murder ...,Positive,3
...,...,...,...
995,Who's down with some @Borderlands on,Positive,3
996,Who't s someone down for some @Borderlands on,Positive,3
997,Who's down for the @Borderlands on,Positive,3
998,@EpicGames @2K @Steam why add crossplay for @B...,Negative,1


In [4]:
df['label'].unique()

array([3, 2, 1, 0])

In [5]:
df_valid = pd.read_csv('data/valid_cleaned.csv')

df_valid = df_valid[:100]
df_valid

Unnamed: 0,tweet,sentiment,label
0,I mentioned on Facebook that I was struggling ...,Irrelevant,0
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,2
2,@Microsoft Why do I pay for WORD when it funct...,Negative,1
3,"CSGO matchmaking is so full of closet hacking,...",Negative,1
4,Now the President is slapping Americans in the...,Neutral,2
...,...,...,...
95,@BlizzardCS so when i try to buy overwatch wit...,Negative,1
96,@verizon Can you waive some data overage charg...,Negative,1
97,No one buy battlefield 3 on steam! It doesn’t ...,Negative,1
98,Our #HISAPerth #OBIawards ceremony is taking p...,Neutral,2


In [6]:
df_valid['label'].unique()

array([0, 2, 1, 3])

In [7]:
tokenizer = get_tokenizer("spacy")

def token_gen(text):
    """
    Tokenizes each sentence in a given text and yields the resulting tokens.

    Args:
        text (list[str]): A list of sentences to tokenize.

    Yields:
        list[str]: The resulting tokens from each sentence.
    """
    for sent in text:
        tokens = tokenizer(sent)
        yield tokens



In [8]:
vocab = build_vocab_from_iterator(token_gen(df['tweet']),specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])  ## to handel OOV problem
print(vocab.get_stoi())

{'•': 2606, '½': 2605, '|': 2604, 'youtu.be/0SKu6Vr4iXU': 2603, 'yo': 2602, 'years': 2600, 'ya': 2599, 'x': 2597, "won'the": 2594, 'white': 2593, 'whilst': 2592, 'whether': 2591, 'whenever': 2590, 'whatsoever': 2588, 'whatever': 2587, 'weapon': 2586, "we'D": 2585, 'war': 2584, 'walks': 2582, 'view': 2578, 'vibrating': 2577, 'vcgNcpMAvu': 2575, 'various': 2574, 'usually': 2573, 'ur': 2572, 'unsorted.co': 2570, 'unsorted': 2569, 'wheels': 2589, 'unsorteco': 2568, 'unk><unk': 2567, 'uninstall': 2566, 'underpaying': 2564, 'ultimately': 2563, 'uk.googlehits.com': 2562, 'tw1ztedpr1ncess': 2558, 'trip': 2556, 'tram': 2553, 'trainer': 2552, 'tracks': 2551, 'touch': 2547, 'tits': 2546, 'three': 2543, "that'yer": 2540, 'temptation': 2538, 'tears': 2536, 'tearing': 2535, 'talking': 2534, 'tales': 2533, 'taken': 2532, 't7RJPcJ7DT': 2530, 'swaggiedeals.com': 2529, 'swaggiedeals': 2528, 'subbids': 2518, 'struggle': 2516, 'track': 2550, 'striptease': 2515, 'street': 2514, 'stoked': 2513, 'stay': 2511

In [9]:
# numericalize tokens from iterator using vocab
sequence = numericalize_tokens_from_iterator(vocab=vocab.get_stoi(),iterator=token_gen(df['tweet']))

for ids in sequence:
    print([num for num in ids])

[25, 123, 238, 20, 21, 7, 25, 126, 1005, 19, 29, 6]
[2, 86, 263, 5, 3, 2125, 7, 2, 126, 429, 19, 29, 6]
[25, 123, 238, 20, 21, 7, 25, 126, 429, 19, 29, 6]
[25, 123, 263, 20, 21, 7, 25, 126, 1005, 19, 29, 6]
[25, 123, 238, 20, 21, 31, 7, 25, 126, 1005, 19, 40, 29, 6]
[25, 123, 238, 190, 21, 7, 25, 73, 1005, 19, 29, 6]
[72, 2, 347, 9, 961, 146, 492, 151, 13, 55, 1, 1, 1, 195, 19, 44, 36, 154, 2, 86, 9, 1145, 53, 139, 7, 259, 10, 90, 12, 15, 78, 368, 1, 72, 2, 939, 5, 142, 1444, 9, 1324, 13, 15, 857, 1, 1, 827, 10, 3, 1008, 489, 1500, 3, 1209, 2, 172, 248, 1366, 4, 1466]
[72, 2, 347, 9, 2172, 12, 146, 1415, 151, 13, 55, 14, 195, 19, 44, 36, 154, 23, 2, 47, 9, 207, 22, 8, 139, 7, 259, 10, 90, 12, 15, 78, 368, 6, 2, 939, 5, 142, 9, 1324, 13, 15, 857, 35, 827, 46, 3, 1008, 1471, 2163, 5, 3, 1209, 2, 172, 248, 452, 55, 4, 239, 28, 2345]
[72, 2, 347, 9, 961, 146, 1415, 151, 13, 55, 14, 195, 19, 44, 36, 154, 2, 47, 9, 1145, 22, 8, 139, 7, 259, 10, 90, 12, 15, 78, 368, 1]
[72, 2, 347, 9, 961, 14

[220, 2431, 8, 20, 3, 348, 4, 4, 115, 18, 500, 7, 77, 36, 233, 5, 41, 1603, 3, 125, 4]
[220, 332, 8, 20, 3, 348, 4, 4, 57, 18, 500, 7, 77, 36, 233, 5, 41, 20, 3, 189, 4]
[220, 332, 8, 20, 3, 348, 4, 4, 57, 23, 500, 7, 77, 150, 233, 5, 41, 20, 5, 189, 4]
[111, 220, 332, 8, 20, 3, 348, 4, 4, 144, 57, 17, 23, 500, 7, 77, 150, 233, 5, 41, 20, 3, 1478, 189, 4]
[220, 332, 8, 37, 3, 348, 4, 4, 57, 23, 500, 7, 77, 150, 208, 5, 41, 20, 3, 343, 4]
[53, 2, 77, 36, 41, 21, 11, 20, 15, 196, 444, 17, 652, 73, 19, 167, 18, 1016]
[22, 8, 2, 77, 36, 41, 8, 11, 20, 15, 196, 444, 17, 652, 1]
[22, 8, 2, 77, 36, 41, 21, 11, 20, 15, 196, 444, 17, 652, 73, 19, 167, 18, 1016]
[53, 127, 143, 96, 77, 36, 41, 21, 11, 20, 15, 2531, 444, 17, 652, 73, 19, 167, 18, 1016]
[80, 53, 2, 77, 36, 41, 21, 11, 20, 15, 196, 444, 17, 927, 652, 33, 73, 19, 1255, 167, 18, 1016]
[493, 2, 2140, 41, 21, 11, 20, 204, 196, 444, 17, 652, 73, 19, 167, 18, 1016]
[118, 667, 34, 38, 514, 109, 21, 496, 10, 18, 515, 186, 23, 1003, 342, 38,

In [10]:
## check how "numericalize_tokens_from_iterator" works

from torchtext.data.functional import numericalize_tokens_from_iterator

sequence = numericalize_tokens_from_iterator(vocab,["hi how are you", "what is your name?"])
list(next(sequence))

[0, 25, 45, 0, 2382, 1077, 45, 9, 2439, 1417, 45, 353, 2382, 232]

In [11]:
# numericalize tokens from iterator using vocab
sequence = numericalize_tokens_from_iterator(vocab=vocab.get_stoi(),iterator=token_gen(df['tweet']))

# create a list to store tokenized sequences
text = []
for i in range(len(df)):
    x = list(next(sequence))
    text.append(x)

# Pad the sequences to the same length along dimension 0
padded_text = pad_sequence([torch.tensor(x) for x in text], batch_first=True, padding_value=0)


print(padded_text.shape)
print(padded_text)

torch.Size([1000, 137])
tensor([[  25,  123,  238,  ...,    0,    0,    0],
        [   2,   86,  263,  ...,    0,    0,    0],
        [  25,  123,  238,  ...,    0,    0,    0],
        ...,
        [ 328,   46,  252,  ...,    0,    0,    0],
        [1099, 1091,  386,  ...,    0,    0,    0],
        [  22, 1367,   22,  ...,    0,    0,    0]])


In [12]:
padded_text[0]

tensor([  25,  123,  238,   20,   21,    7,   25,  126, 1005,   19,   29,    6,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])

In [13]:
## vocab([tokens])

vocab(tokenizer('I will see you!')) ## similar to the 'fit_on_texts()' in tf

[2, 126, 83, 19, 4]

In [14]:
torch.tensor(vocab(tokenizer('I will see you')))

tensor([  2, 126,  83,  19])

In [15]:
len(padded_text)

1000

In [16]:
# ?torch.nn.Embedding

In [17]:
## vocab([tokens])

vocab(tokenizer('I will see you!')) ## similar to the 'fit_on_texts()' in tf

[2, 126, 83, 19, 4]

In [18]:
embedd = torch.nn.Embedding(num_embeddings=len(vocab.get_stoi()),embedding_dim=5,padding_idx=0) 

## if we want to add embedding for a text, we should assign the value to "num_embeddings" according to the 
## max 'integer_id' that is present in the tokenized text


In [19]:
input = embedd(torch.tensor(vocab(tokenizer('I will see you nonsense!'))))
input

tensor([[-0.3571, -0.4032,  0.9592, -0.4115, -1.1382],
        [ 0.4083,  0.0797, -0.2803, -1.6861,  0.9552],
        [ 0.0401,  0.9522, -0.9773, -0.8119, -0.4003],
        [-0.0648, -0.7802,  0.2634,  1.6336, -0.4806],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-1.6911,  0.3955, -1.5170,  1.4016, -0.8760]],
       grad_fn=<EmbeddingBackward0>)

In [20]:
input.shape

torch.Size([6, 5])

In [21]:
len(vocab.get_stoi())

2608

In [22]:
# numericalize tokens from iterator using vocab
sequence = numericalize_tokens_from_iterator(vocab=vocab.get_stoi(), iterator=token_gen(df['tweet']))

# create a list to store tokenized sequences
text = []
for i in range(len(df)):
    x = list(next(sequence))
    text.append(x)

# Pad the sequences to the same length along dimension 0
padded_text = pad_sequence([torch.tensor(x) for x in text], batch_first=True, padding_value=0)

# Create the Embedding module with the correct weight matrix size
embedd = torch.nn.Embedding(len(vocab.get_stoi()), 5, padding_idx=0)

# Check the shape of the padded_text and compare it to the expected input shape of the Embedding module
print(padded_text.shape)
# should be: torch.Size([batch_size, sequence_length])

# Use the Embedding module with the padded_text
input = embedd(padded_text)
print(input)


torch.Size([1000, 137])
tensor([[[-0.6555, -0.3766,  0.8659,  0.5110, -1.6438],
         [ 0.4780,  0.8237,  0.9176, -0.5199, -1.0795],
         [ 0.8093, -0.0148, -0.2834, -0.0786,  0.3546],
         ...,
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.6306, -0.8548,  1.3830, -0.1126,  1.4427],
         [-0.0295,  1.0474,  0.6779,  0.6523,  0.4205],
         [-0.2795,  0.9198,  0.6044,  1.7222,  0.6262],
         ...,
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.6555, -0.3766,  0.8659,  0.5110, -1.6438],
         [ 0.4780,  0.8237,  0.9176, -0.5199, -1.0795],
         [ 0.8093, -0.0148, -0.2834, -0.0786,  0.3546],
         ...,
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.000

In [23]:
# numericalize tokens from iterator using vocab
sequence = numericalize_tokens_from_iterator(vocab=vocab.get_stoi(), iterator=token_gen(df['tweet']))

# create a list to store tokenized sequences
text = []
for i in range(len(df)):
    x = list(next(sequence))
    text.append(x)

# Pad the sequences to the same length along dimension 0
padded_text = pad_sequence([torch.tensor(x) for x in text], batch_first=True, padding_value=0)

# Create the Embedding module with the correct weight matrix size
embedd = torch.nn.Embedding(len(vocab.get_stoi()), 5, padding_idx=0)

# Check the shape of the padded_text and compare it to the expected input shape of the Embedding module
print(padded_text[0].shape)
# should be: torch.Size([batch_size, sequence_length])

# Use the Embedding module with the padded_text
input = embedd(padded_text[0])
print(input)


torch.Size([137])
tensor([[ 0.1940, -0.4496, -0.3837, -0.0237,  0.2262],
        [-1.2530,  0.9247,  0.3665,  2.2365,  1.6596],
        [ 0.8095, -1.4800,  0.2387, -1.2856,  0.7340],
        [ 0.6913, -0.7684,  0.1224,  0.6262, -1.8186],
        [ 1.7514, -1.4499,  0.7342, -0.0062, -0.5063],
        [-1.0421,  0.4839,  1.9202, -0.6021,  0.0416],
        [ 0.1940, -0.4496, -0.3837, -0.0237,  0.2262],
        [ 1.8969, -1.7971,  0.0827,  0.8236, -0.9634],
        [ 1.9041, -0.5768,  0.4792,  0.1585,  0.3965],
        [ 0.8535,  0.3713,  2.0798, -0.0761,  0.4754],
        [ 0.8035,  0.0107, -0.7748,  1.4706, -0.4778],
        [ 0.3280, -0.2969,  0.9380,  0.1781, -1.1407],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0

In [24]:
df

Unnamed: 0,tweet,sentiment,label
0,im getting on borderlands and i will murder yo...,Positive,3
1,I am coming to the borders and I will kill you...,Positive,3
2,im getting on borderlands and i will kill you ...,Positive,3
3,im coming on borderlands and i will murder you...,Positive,3
4,im getting on borderlands 2 and i will murder ...,Positive,3
...,...,...,...
995,Who's down with some @Borderlands on,Positive,3
996,Who't s someone down for some @Borderlands on,Positive,3
997,Who's down for the @Borderlands on,Positive,3
998,@EpicGames @2K @Steam why add crossplay for @B...,Negative,1


In [25]:
print(padded_text.shape)
padded_text

torch.Size([1000, 137])


tensor([[  25,  123,  238,  ...,    0,    0,    0],
        [   2,   86,  263,  ...,    0,    0,    0],
        [  25,  123,  238,  ...,    0,    0,    0],
        ...,
        [ 328,   46,  252,  ...,    0,    0,    0],
        [1099, 1091,  386,  ...,    0,    0,    0],
        [  22, 1367,   22,  ...,    0,    0,    0]])

In [26]:
label =df['label'].to_list()
label

[3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 3,
 3,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,


In [27]:
label= torch.tensor(label)
print(label.shape)
label

torch.Size([1000])


tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,
        3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2,
        2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
        2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,

In [28]:
len(label.unique())

4

In [29]:
# Import required libraries
import torch.nn as nn

# Determine the number of classes
num_classes = len(label.unique())

# Define the RNNClassify module
class RNNClassify(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        
        # Define the embedding layer
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Define the RNN layer
        self.rnn = nn.RNN(embed_dim, hidden_size,batch_first=True)
        
        # Define the linear layer
        self.linear = nn.Linear(hidden_size, num_classes)
        
        # Initialize the weights of the module
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.rnn.weight_ih_l0.data.uniform_(-initrange, initrange)
        self.rnn.weight_hh_l0.data.uniform_(-initrange, initrange)
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        
    def forward(self, input):
        # Embed the input
        embedded = self.embed(input)
        #print('embedded shape:',embedded.shape)
        
        # Pass the embedded input through the RNN layer
        output, hidden = self.rnn(embedded)
        #print('rnn output shape:',output.shape)
        #print('rnn hidden shape:',hidden.shape)
        
        output = output[:, -1, :]  # taking last output of RNN
        #print('rnn last output shape:',output.shape)
        
        # Pass the output through the linear layer
        output = self.linear(output)
        
        # Return the output
        return output


In [30]:
VOCAB_SIZE = len(vocab.get_stoi())

In [31]:
model = RNNClassify(vocab_size=VOCAB_SIZE,embed_dim=100,hidden_size=32).to(device)

In [32]:
padded_text[0]

tensor([  25,  123,  238,   20,   21,    7,   25,  126, 1005,   19,   29,    6,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])

In [33]:
padded_text[0].shape

torch.Size([137])

In [34]:
padded_text[0].unsqueeze(0).shape

torch.Size([1, 137])

In [35]:
model(padded_text[0].unsqueeze(0))

tensor([[-0.0204,  0.2515, -2.0407,  1.1971]], grad_fn=<AddmmBackward0>)

In [36]:
model(padded_text[0].unsqueeze(0)).shape

torch.Size([1, 4])

In [37]:
padded_text

tensor([[  25,  123,  238,  ...,    0,    0,    0],
        [   2,   86,  263,  ...,    0,    0,    0],
        [  25,  123,  238,  ...,    0,    0,    0],
        ...,
        [ 328,   46,  252,  ...,    0,    0,    0],
        [1099, 1091,  386,  ...,    0,    0,    0],
        [  22, 1367,   22,  ...,    0,    0,    0]])

In [38]:
padded_text.shape

torch.Size([1000, 137])

In [39]:
model(padded_text)  

tensor([[-0.0204,  0.2515, -2.0407,  1.1971],
        [-0.1059,  0.2541, -2.0365,  1.0953],
        [-0.1866,  0.2991, -2.1340,  0.9955],
        ...,
        [ 0.2618,  0.1544, -1.7033,  1.4860],
        [ 0.1290,  0.1948, -1.9411,  1.4386],
        [-0.0311,  0.2527, -2.0405,  1.1715]], grad_fn=<AddmmBackward0>)

In [40]:
model(padded_text).shape              # why?? we should get [1000, 4]

torch.Size([1000, 4])

### will try `Batch Gradient Descent`

#### first of all, let me fix `(X_train,y_train)` and `(X_test,y_test)`

In [41]:
# X_train,y_train

X_train,y_train = padded_text,label

In [42]:
df_valid

Unnamed: 0,tweet,sentiment,label
0,I mentioned on Facebook that I was struggling ...,Irrelevant,0
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,2
2,@Microsoft Why do I pay for WORD when it funct...,Negative,1
3,"CSGO matchmaking is so full of closet hacking,...",Negative,1
4,Now the President is slapping Americans in the...,Neutral,2
...,...,...,...
95,@BlizzardCS so when i try to buy overwatch wit...,Negative,1
96,@verizon Can you waive some data overage charg...,Negative,1
97,No one buy battlefield 3 on steam! It doesn’t ...,Negative,1
98,Our #HISAPerth #OBIawards ceremony is taking p...,Neutral,2


In [43]:
len(df_valid)

100

In [44]:
valid_token_ids = []
for i in range(len(df_valid)):
    token_id = vocab(tokenizer(df_valid['tweet'][i]))
    valid_token_ids.append(token_id)
    
valid_token_ids

[[2,
  0,
  20,
  0,
  23,
  2,
  34,
  0,
  13,
  0,
  5,
  189,
  13,
  9,
  313,
  3,
  130,
  334,
  6,
  382,
  170,
  63,
  0,
  198,
  0,
  354,
  169,
  0,
  50,
  775,
  0,
  77,
  308,
  64,
  51,
  12,
  465,
  152,
  7,
  0,
  5,
  303,
  0,
  6,
  76,
  68,
  0,
  2,
  410,
  9,
  0,
  6,
  1484,
  0,
  0],
 [1787,
  0,
  30,
  0,
  367,
  562,
  0,
  2453,
  0,
  2161,
  0,
  38,
  9,
  1507,
  0,
  0,
  1507,
  0,
  194],
 [0, 1595, 44, 2, 1678, 13, 0, 102, 17, 0, 33, 0, 20, 15, 0, 0, 32, 0],
 [0, 0, 10, 33, 425, 12, 0, 0, 6, 17, 46, 9, 0, 0, 39, 1],
 [1159,
  3,
  1992,
  10,
  0,
  0,
  16,
  3,
  1627,
  23,
  254,
  62,
  199,
  0,
  135,
  0,
  0,
  419,
  303,
  45,
  0,
  4,
  1878,
  0,
  20,
  0,
  0,
  194],
 [1906,
  0,
  2,
  411,
  65,
  0,
  0,
  16,
  15,
  0,
  13,
  3,
  2398,
  412,
  2600,
  7,
  3,
  396,
  0,
  505,
  52,
  0,
  2592,
  2,
  34,
  994,
  69,
  58,
  0,
  0,
  6,
  2482,
  507,
  15,
  2142,
  7,
  2,
  410,
  428,
  5,
  1317,
  15,


In [45]:
# valid_token_ids = torch.tensor(valid_token_ids) # this will through

# Pad the sequences to the same length along dimension 0
padded_text_valid = pad_sequence([torch.tensor(x) for x in valid_token_ids], batch_first=True, padding_value=0)

# here look, <UNK> will be assign to 0 and padding_idx will be assign also 0

print(padded_text_valid.shape)
print(padded_text_valid)

torch.Size([100, 64])
tensor([[   2,    0,   20,  ...,    0,    0,    0],
        [1787,    0,   30,  ...,    0,    0,    0],
        [   0, 1595,   44,  ...,    0,    0,    0],
        ...,
        [1563,   90,  467,  ...,    0,    0,    0],
        [ 854,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]])


In [46]:
label_valid = df_valid['label'].to_list()

In [47]:
label_valid = torch.tensor(label_valid)
label_valid

tensor([0, 2, 1, 1, 2, 1, 3, 3, 3, 1, 3, 3, 1, 2, 1, 3, 3, 1, 3, 1, 1, 2, 0, 1,
        2, 2, 1, 0, 0, 1, 3, 3, 1, 3, 1, 2, 2, 0, 3, 2, 3, 2, 2, 2, 3, 2, 1, 1,
        1, 2, 3, 1, 1, 3, 3, 3, 3, 3, 1, 0, 1, 3, 3, 0, 1, 2, 1, 0, 2, 1, 3, 1,
        1, 3, 3, 0, 3, 0, 2, 2, 2, 3, 3, 2, 3, 2, 1, 0, 1, 2, 2, 1, 3, 0, 0, 1,
        1, 1, 2, 3])

In [48]:
# X_test,y_test

X_test, y_test = padded_text_valid, label_valid
X_test, y_test

(tensor([[   2,    0,   20,  ...,    0,    0,    0],
         [1787,    0,   30,  ...,    0,    0,    0],
         [   0, 1595,   44,  ...,    0,    0,    0],
         ...,
         [1563,   90,  467,  ...,    0,    0,    0],
         [ 854,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]]),
 tensor([0, 2, 1, 1, 2, 1, 3, 3, 3, 1, 3, 3, 1, 2, 1, 3, 3, 1, 3, 1, 1, 2, 0, 1,
         2, 2, 1, 0, 0, 1, 3, 3, 1, 3, 1, 2, 2, 0, 3, 2, 3, 2, 2, 2, 3, 2, 1, 1,
         1, 2, 3, 1, 1, 3, 3, 3, 3, 3, 1, 0, 1, 3, 3, 0, 1, 2, 1, 0, 2, 1, 3, 1,
         1, 3, 3, 0, 3, 0, 2, 2, 2, 3, 3, 2, 3, 2, 1, 0, 1, 2, 2, 1, 3, 0, 0, 1,
         1, 1, 2, 3]))

#### Now, write the train and test loop for `Batch Gradient Descent`

In [49]:
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()  # remember it gives logits (row outputs)

In [50]:
# Batch Gradient Descent

epochs = 60


for epoch in range(epochs):
    train_loss,train_acc = 0,0
    
    # Set model to training mode
    model.train()
    
    X_train,y_train = X_train.to(device), y_train.to(device)
    
    y_logits = model(X_train)
    #print('shape of y_logits:',y_logits.shape)

    
    # Compute loss with one-hot encoded targets
    loss = loss_fn(y_logits, y_train)
    
    train_loss += loss
    train_acc += (y_logits.argmax(1) == y_train).sum().item() / len(y_train)
    
    optimizer.zero_grad()
    loss.backward()
    
    optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
    
    
    with torch.inference_mode():
        
        model.eval()
        
        test_loss,test_acc = 0,0
    
        X_test, y_test = X_test.to(device), y_test.to(device)
        
        y_logits = model(X_test)

        # Compute loss with one-hot encoded targets
        loss = loss_fn(y_logits, y_test)

        test_loss += loss.item()
            
        # Compute accuracy
        test_preds = y_logits.argmax(dim=1)
        test_acc += (test_preds == y_test).sum().item() / len(y_test)

        
        print(f'Epoch {epoch+1}/{epochs}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')
        
        print('--'*50)

Epoch 1/60, Train Loss: 1.7308, Train Accuracy: 0.4250
Epoch 1/60, Test Loss: 1.6580, Test Accuracy: 0.3000
----------------------------------------------------------------------------------------------------
Epoch 2/60, Train Loss: 1.6423, Train Accuracy: 0.4250
Epoch 2/60, Test Loss: 1.5763, Test Accuracy: 0.2900
----------------------------------------------------------------------------------------------------
Epoch 3/60, Train Loss: 1.5874, Train Accuracy: 0.4250
Epoch 3/60, Test Loss: 1.5316, Test Accuracy: 0.3200
----------------------------------------------------------------------------------------------------
Epoch 4/60, Train Loss: 1.5489, Train Accuracy: 0.3590
Epoch 4/60, Test Loss: 1.4828, Test Accuracy: 0.3200
----------------------------------------------------------------------------------------------------
Epoch 5/60, Train Loss: 1.4919, Train Accuracy: 0.3340
Epoch 5/60, Test Loss: 1.4862, Test Accuracy: 0.3100
--------------------------------------------------------

Epoch 40/60, Train Loss: 1.2419, Train Accuracy: 0.4630
Epoch 40/60, Test Loss: 1.4314, Test Accuracy: 0.3200
----------------------------------------------------------------------------------------------------
Epoch 41/60, Train Loss: 1.2376, Train Accuracy: 0.4660
Epoch 41/60, Test Loss: 1.4303, Test Accuracy: 0.3200
----------------------------------------------------------------------------------------------------
Epoch 42/60, Train Loss: 1.2340, Train Accuracy: 0.4710
Epoch 42/60, Test Loss: 1.4290, Test Accuracy: 0.3200
----------------------------------------------------------------------------------------------------
Epoch 43/60, Train Loss: 1.2353, Train Accuracy: 0.4710
Epoch 43/60, Test Loss: 1.4223, Test Accuracy: 0.3200
----------------------------------------------------------------------------------------------------
Epoch 44/60, Train Loss: 1.2426, Train Accuracy: 0.4670
Epoch 44/60, Test Loss: 1.4197, Test Accuracy: 0.3300
----------------------------------------------