In [52]:
import numpy as np
import pandas as pd
import torch
import string
import re
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F

In [5]:
training_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
training_df.shape

(74682, 4)

In [6]:
training_df.sample(5)

Unnamed: 0,0,1,2,3
65189,7964,MaddenNFL,Positive,Happy Birthday day!.
68100,3663,Cyberpunk2077,Irrelevant,I’m honestly not that bothered that @Cyberpunk...
18773,12417,WorldOfCraft,Positive,how do I tell a youtube ads not not a GamerTM ...
47857,5815,HomeDepot,Neutral,If you want to inform people in the toilet at ...
73598,9007,Nvidia,Positive,Holy crap is killing him


In [13]:
print(f'Shape before dropping nulls {training_df.shape}')
training_df = training_df.dropna()
print(f'Shape after dropping nulls {training_df.shape}')

Shape before dropping nulls (74682, 4)
Shape after dropping nulls (73996, 4)


In [23]:
training_df.loc[:, 3] = training_df.loc[:, 3].str.replace(re.escape(string.punctuation), '', regex=True)

In [5]:
# input_texts = training_df[3].tolist()
# print(len(input_texts))

tokenizer = get_tokenizer('basic_english')
# tokenized_texts = [tokenizer(text) for text in input_texts]

In [46]:
input_x = ['I loved the movie', 'The product exceeded my expectations', 'The service was terrible', 'The performance of the device is disappointing']
output_y = [1, 1, 0, 0]

input_x = list(map(str.lower, input_x))
tokenized_inputs = [tokenizer(text) for text in input_x]
flattened_list = sum(tokenized_inputs, [])
print(flattened_list)
vocab = sorted(list(set(flattened_list)))
word_to_id = {word:i+1 for i, word in enumerate(vocab)}
id_to_word = {i+1:word for i, word in enumerate(vocab)}

['i', 'loved', 'the', 'movie', 'the', 'product', 'exceeded', 'my', 'expectations', 'the', 'service', 'was', 'terrible', 'the', 'performance', 'of', 'the', 'device', 'is', 'disappointing']


In [47]:
encode_text = lambda x: [word_to_id[_] for _ in x]

encoded_inputs = list(map(encode_text, tokenized_inputs))
padded = pad_sequence(list(map(torch.tensor, encoded_inputs)), batch_first=True)
output_y = torch.tensor(output_y, dtype=torch.float32).unsqueeze(-1)
padded.shape, output_y.shape

[[5, 7, 15, 8], [15, 12, 3, 9, 4], [15, 13, 16, 14], [15, 11, 10, 15, 1, 6, 2]]


(torch.Size([4, 7]), torch.Size([4, 1]))

In [54]:
padded

tensor([[ 5,  7, 15,  8,  0,  0,  0],
        [15, 12,  3,  9,  4,  0,  0],
        [15, 13, 16, 14,  0,  0,  0],
        [15, 11, 10, 15,  1,  6,  2]])

In [58]:
B, T = padded.shape
mask = torch.eq(padded, 0).to(torch.float32)
mask = mask * -1e9
masked_reshape = mask.reshape(B, 1, timesteps)

In [10]:
class Embedding(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.embedding_layer = nn.Embedding(n_vocab, n_embed)
        
    def forward(self, x):
        return self.embedding_layer(x)

In [77]:
class Head(nn.Module):
    def __init__(self, head_size=16):
        super().__init__()
        self.query = nn.Linear(n_embed, head_size)
        self.key = nn.Linear(n_embed, head_size)
        self.value = nn.Linear(n_embed, head_size)
        
    def forward(self, x, mask):
        B, T, C = x.shape
        
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        
        wei = query @ key.transpose(-2, -1)
        
        if mask is not None:
            wei = wei + mask
        
        wei = F.softmax(wei, dim=-1)
        out = wei @ value # (B, T, head_size)
        
        return out

In [84]:
class Encoder(nn.Module):
    def __init__(self, n_vocab, n_embed, timesteps, head_size):
        super().__init__() # What happens if I pass the class name in super?
        self.embedding = Embedding(n_vocab, n_embed)
        self.sa = Head(head_size)
        self.inter1_layer = nn.Linear(head_size, timesteps)
        self.output = nn.Linear(timesteps**2, 1)
        
    def forward(self, x, mask):
        B, T = x.shape
        embedding = self.embedding(x) # (B, timesteps, n_embed)
        sa_out = self.sa(embedding, mask) # (B, timesteps, head_size)
        inter1 = self.inter1_layer(sa_out) # (B, timesteps, head_size) @ (head_size, timesteps) --> (B, timesteps, timesteps)
        output = self.output(inter1.view(B, -1))
        output = torch.sigmoid(output)
        
        return output

In [85]:
n_embed = 32
timesteps = padded.shape[-1]
model = Encoder(len(vocab) + 1, n_embed, timesteps, head_size=16)

loss_function = nn.BCELoss()
learning_rate = 0.001 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(50):
    model.train()
    inputs, targets, masked_reshape = padded.to(device), output_y.to(device), masked_reshape.to(device)
    optimizer.zero_grad()
    output = model(inputs, masked_reshape)
    loss = loss_function(output, targets)
    print(f'Epoch {epoch}: {loss}')
    loss.backward()
    optimizer.step()

Epoch 0: 0.7115222215652466
Epoch 1: 0.6946083307266235
Epoch 2: 0.6778351068496704
Epoch 3: 0.6611291170120239
Epoch 4: 0.6444856524467468
Epoch 5: 0.6278667449951172
Epoch 6: 0.6111794710159302
Epoch 7: 0.5942966938018799
Epoch 8: 0.5770798921585083
Epoch 9: 0.5594139695167542
Epoch 10: 0.5412314534187317
Epoch 11: 0.5225162506103516
Epoch 12: 0.5032942295074463
Epoch 13: 0.48361754417419434
Epoch 14: 0.4635489881038666
Epoch 15: 0.44315290451049805
Epoch 16: 0.4224931001663208
Epoch 17: 0.4016406536102295
Epoch 18: 0.3806852102279663
Epoch 19: 0.35974377393722534
Epoch 20: 0.3389568328857422
Epoch 21: 0.31847116351127625
Epoch 22: 0.2984168529510498
Epoch 23: 0.2788921594619751
Epoch 24: 0.2599628269672394
Epoch 25: 0.24167224764823914
Epoch 26: 0.22405298054218292
Epoch 27: 0.20713363587856293
Epoch 28: 0.19094163179397583
Epoch 29: 0.17550377547740936
Epoch 30: 0.1608477532863617
Epoch 31: 0.14700400829315186
Epoch 32: 0.1340065896511078
Epoch 33: 0.12188851088285446
Epoch 34: 0.1

In [34]:
nested_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened_list = sum(nested_list, [])
print(flattened_list)


[1, 2, 3, 4, 5, 6, 7, 8, 9]
