In [2]:
import numpy as np
import pandas as pd
import torch
import string
import re
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from sklearn.preprocessing import LabelEncoder
from torchtext.vocab import build_vocab_from_iterator
import torchtext.vocab as vocab

In [3]:
training_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
training_df.shape

(74682, 4)

In [None]:
training_df.sample(5)

In [4]:
print(f'Shape before dropping nulls {training_df.shape}')
training_df = training_df.dropna()
print(f'Shape after dropping nulls {training_df.shape}')

Shape before dropping nulls (74682, 4)
Shape after dropping nulls (73996, 4)


In [5]:
training_df.loc[:, 3] = training_df.loc[:, 3].str.replace(re.escape(string.punctuation), ' ', regex=True)
training_df.loc[:, 3] = training_df.loc[:, 3].str.replace(',', ' ', regex=True)

In [6]:
input_texts = training_df[3].tolist()
print(len(input_texts))

tokenizer = get_tokenizer('basic_english')

tokenized_texts = [tokenizer(text) for text in input_texts]
# tokenized_texts = tokenized_texts



# flattened_list = sum(tokenized_texts, [])
flattened_list = [token for tokens in tokenized_texts for token in tokens]

vocab = sorted(list(set(flattened_list)))
word_to_id = {word:i+1 for i, word in enumerate(vocab)}
id_to_word = {i+1:word for i, word in enumerate(vocab)}

le = LabelEncoder()
training_df['Labels']  = le.fit_transform(training_df[2])
output_y = training_df['Labels'].tolist()

73996


In [None]:
# Read glove embeddings
glove_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'

with open(glove_path, 'r') as file:
     lines = file.readlines()

print(len(lines))

In [None]:
word_to_vec = dict()
for line in lines:
    word_and_vec = line.split(' ', maxsplit=1)
    word, vec = word_and_vec[0], word_and_vec[1]
    vec_array = np.fromstring(vec, sep=' ').astype('float32')
    word_to_vec[word] = vec_array


In [None]:
input_x = ['I loved the movie', 'The product exceeded my expectations', 'The service was terrible', 'The performance of the device is disappointing']
output_y = [1, 1, 0, 0]

input_x = list(map(str.lower, input_x))
tokenized_inputs = [tokenizer(text) for text in input_x]
flattened_list = sum(tokenized_inputs, [])
print(flattened_list)
vocab = sorted(list(set(flattened_list)))
word_to_id = {word:i+1 for i, word in enumerate(vocab)}
id_to_word = {i+1:word for i, word in enumerate(vocab)}

In [16]:
encode_text = lambda x: [word_to_id[_] for _ in x]

encoded_inputs = list(map(encode_text, tokenized_texts))
padded = pad_sequence(list(map(torch.tensor, encoded_inputs)), batch_first=True)
output_y = torch.tensor(output_y, dtype=torch.float32).unsqueeze(-1)
output_y = output_y.type(torch.LongTensor)
padded.shape, output_y.shape

  output_y = torch.tensor(output_y, dtype=torch.float32).unsqueeze(-1)


(torch.Size([73996, 311]), torch.Size([73996, 1, 1]))

In [8]:
B, T = padded.shape
mask = torch.eq(padded, 0).to(torch.float32)
mask = mask * -1e9
masked_reshape = mask.reshape(B, 1, T)

In [9]:
class Embedding(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.embedding_layer = nn.Embedding(n_vocab, n_embed)
        
    def forward(self, x):
        return self.embedding_layer(x)

In [10]:
class Head(nn.Module):
    def __init__(self, head_size=16):
        super().__init__()
        self.query = nn.Linear(n_embed, head_size)
        self.key = nn.Linear(n_embed, head_size)
        self.value = nn.Linear(n_embed, head_size)
        
    def forward(self, x, mask):
        B, T, C = x.shape
        
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        
        wei = query @ key.transpose(-2, -1)
        
        if mask is not None:
            wei = wei + mask
        
        wei = F.softmax(wei, dim=-1)
        out = wei @ value # (B, T, head_size)
        
        return out

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [12]:
class Encoder(nn.Module):
    def __init__(self, n_vocab, n_embed, timesteps, head_size, output):
        super().__init__() # What happens if I pass the class name in super?
        self.embedding = Embedding(n_vocab, n_embed)
        self.positional_encodings = self.get_positional_embeddings(self.get_angle(timesteps, n_embed)).to(device)
        self.sa = Head(head_size)
        self.inter1_layer = nn.Linear(head_size, timesteps)
        self.output = nn.Linear(timesteps**2, output)
        
    def forward(self, x, mask):
        B, T = x.shape
        embedding = self.embedding(x) + self.positional_encodings # (B, timesteps, n_embed)
        sa_out = self.sa(embedding, mask) # (B, timesteps, head_size)
        inter1 = self.inter1_layer(sa_out) # (B, timesteps, head_size) @ (head_size, timesteps) --> (B, timesteps, timesteps)
        output = self.output(inter1.view(B, -1))
#         output = F.softmax(output, dim=-1)
        
        return output
    
    def get_angle(self, timesteps, dim):
        k = np.arange(dim)[np.newaxis, :]
        i = k // 2
        
        positions = np.arange(timesteps)[:, np.newaxis]
        angles = positions / (10000 ** (2*i/dim))
        
        return angles
    
    def get_positional_embeddings(self, angles):
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        
        return torch.tensor(angles, dtype=torch.float32)

In [17]:
split = 0.8
train_size = int(split * padded.shape[0])

x_train, x_val, y_train, y_val = padded[:train_size], padded[train_size:], output_y[:train_size], output_y[train_size:]
x_train, x_val, y_train, y_val = x_train.to(device), x_val.to(device), y_train.to(device), y_val.to(device)
mask_train, mask_val = masked_reshape[:train_size], masked_reshape[train_size:]
mask_train, mask_val = mask_train.to(device), mask_val.to(device)
list(map(lambda x: x.shape, [x_train, x_val, y_train, y_val, mask_train, mask_val]))

[torch.Size([59196, 311]),
 torch.Size([14800, 311]),
 torch.Size([59196, 1, 1]),
 torch.Size([14800, 1, 1]),
 torch.Size([59196, 1, 311]),
 torch.Size([14800, 1, 311])]

In [None]:
padded.shape[0] // 2028

In [18]:
n_embed = 64
timesteps = padded.shape[-1]
model = Encoder(len(vocab) + 1, n_embed, timesteps, head_size=16, output=4)
batch_size = 64
batch_per_epoch = x_train.shape[0] // batch_size

# loss_function = nn.BCELoss()
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# output_y = output_y.type(torch.LongTensor)
# inputs, targets, masked_reshape = padded.to(device), output_y.to(device), masked_reshape.to(device)

for epoch in range(10):
    train_loss, val_loss = 0, 0
    for i in range(batch_per_epoch):
        start = i * batch_size
        x_batch, y_batch, mask = x_train[start:start+batch_size], y_train[start:start+batch_size], mask_train[start:start+batch_size]

        model.train(True)

        optimizer.zero_grad()
        output = model(x_batch, mask)
        loss = loss_function(output, y_batch.view(-1))
        train_loss += loss
        
        loss.backward()
        optimizer.step()
        
    print(f'Epoch {epoch} Loss {i}: {train_loss / (i+1)}')   
    model.eval()
    with torch.no_grad():
        output_val = model(x_val, mask_val)
        loss_val = loss_function(output_val, y_val.view(-1))
        print(f'Epoch {epoch} Val loss: {loss_val}')

Epoch 0 Loss 923: 1.9859893321990967
Epoch 0 Val loss: 1.5022014379501343
Epoch 1 Loss 923: 1.3363515138626099
Epoch 1 Val loss: 1.45757257938385
Epoch 2 Loss 923: 1.1962958574295044
Epoch 2 Val loss: 1.5224149227142334
Epoch 3 Loss 923: 1.076869010925293
Epoch 3 Val loss: 1.9083342552185059
Epoch 4 Loss 923: 0.9610947966575623
Epoch 4 Val loss: 2.1496798992156982
Epoch 5 Loss 923: 0.8442988395690918
Epoch 5 Val loss: 2.8225908279418945
Epoch 6 Loss 923: 0.7202357649803162
Epoch 6 Val loss: 2.80661940574646
Epoch 7 Loss 923: 0.6065431237220764
Epoch 7 Val loss: 2.5435023307800293
Epoch 8 Loss 923: 0.5160390734672546
Epoch 8 Val loss: 2.1813507080078125
Epoch 9 Loss 923: 0.4337286949157715
Epoch 9 Val loss: 2.438356637954712


In [None]:
print(start, start + batch_size)
x_train[start : start + batch_size], x_train.shape

In [None]:
batch_per_epoch = x_train.shape[0] // batch_size
print(batch_per_epoch)

for i in range(batch_per_epoch+1):
    start = i * batch_size
    print(i, start, start+batch_size)
    print(x_train[start : start+batch_size].shape)
    print()

In [None]:
nested_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened_list = sum(nested_list, [])
print(flattened_list)
