In [1]:
import numpy as np
import pandas as pd
import torch
import string
import re
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from sklearn.preprocessing import LabelEncoder
from torchtext.vocab import build_vocab_from_iterator

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
training_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
validation_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', header=None)
training_df.shape, validation_df.shape

((74682, 4), (1000, 4))

In [4]:
print(f'Shape before dropping nulls {training_df.shape}')
training_df = training_df.dropna()
validation_df = validation_df.dropna()
print(f'Shape after dropping nulls {training_df.shape}')

Shape before dropping nulls (74682, 4)
Shape after dropping nulls (73996, 4)


In [5]:
validation_df.loc[:, 3] = validation_df.loc[:, 3].str.replace(re.escape(string.punctuation), ' ', regex=True)
validation_df.loc[:, 3] = validation_df.loc[:, 3].str.replace(',', ' ', regex=True)

validation_df.loc[:, 3] = validation_df.loc[:, 3].str.replace(re.escape(string.punctuation), ' ', regex=True)
validation_df.loc[:, 3] = validation_df.loc[:, 3].str.replace(',', ' ', regex=True)

In [6]:
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data):
    for text in data:
        yield tokenizer(text)

In [7]:
training_texts = training_df[3].tolist()
val_texts = validation_df[3].tolist()

vocab = build_vocab_from_iterator(yield_tokens(training_texts+val_texts), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
text_pipeline = lambda x: vocab(tokenizer(x))
(text_pipeline('im happy'))



[313, 189]

In [8]:
int_training_texts = list(map(text_pipeline, training_texts))
int_val_texts = list(map(text_pipeline, val_texts))

print(list(map(len, [int_training_texts, int_val_texts])))

[73996, 1000]


In [9]:
le = LabelEncoder()
le.fit(training_df[2])

training_df['Labels']  = le.transform(training_df[2])
training_output_y = training_df['Labels'].tolist()

validation_df['Labels']  = le.transform(validation_df[2])
validation_output_y = validation_df['Labels'].tolist()

In [10]:
# # Read glove embeddings
# glove_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'

# with open(glove_path, 'r') as file:
#      lines = file.readlines()

# print(len(lines))

In [11]:
# word_to_vec = dict()
# for line in lines:
#     word_and_vec = line.split(' ', maxsplit=1)
#     word, vec = word_and_vec[0], word_and_vec[1]
#     vec_array = np.fromstring(vec, sep=' ').astype('float32')
#     word_to_vec[word] = vec_array


In [12]:
def padded_and_convert(tokens, output):
#     encode_text = lambda x: [word_to_id[_] for _ in x]

#     encoded_inputs = list(map(encode_text, tokens))
    padded = pad_sequence(list(map(torch.tensor, tokens)), batch_first=True)
    output_y = torch.tensor(output, dtype=torch.float32).unsqueeze(-1)
    output_y = output_y.type(torch.LongTensor)
    print(padded.shape, output_y.shape)
    
    return padded, output_y

train_padded, train_y = padded_and_convert(int_training_texts, training_output_y)
val_padded, val_y = padded_and_convert(int_val_texts, validation_output_y )

max_sequence_length = max(max(len(seq) for seq in train_padded), max(len(seq) for seq in val_padded))
print(max_sequence_length)

train_padded = pad_sequence([torch.cat([seq, torch.zeros(max_sequence_length - len(seq))]) for seq in train_padded], batch_first=True)
val_padded = pad_sequence([torch.cat([seq, torch.zeros(max_sequence_length - len(seq))]) for seq in val_padded], batch_first=True)
print(train_padded.shape, val_padded.shape)

torch.Size([73996, 311]) torch.Size([73996, 1])
torch.Size([1000, 73]) torch.Size([1000, 1])
311
torch.Size([73996, 311]) torch.Size([1000, 311])


In [13]:
def get_masks(padded):
    B, T = padded.shape
    mask = torch.eq(padded, 0).to(torch.float32)
    mask = mask * -1e9
    masked_reshape = mask.reshape(B, 1, T)
    return masked_reshape

train_mask = get_masks(train_padded)
val_mask = get_masks(val_padded)
print(train_mask.shape, val_mask.shape)

torch.Size([73996, 1, 311]) torch.Size([1000, 1, 311])


In [14]:
def get_angle(timesteps, dim):
    k = np.arange(dim)[np.newaxis, :]
    i = k // 2

    positions = np.arange(timesteps)[:, np.newaxis]
    angles = positions / (10000 ** (2*i/dim))

    return angles

def get_positional_embeddings(angles):
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])

    return torch.tensor(angles, dtype=torch.float32)
n_embed = 64
timesteps = train_padded.shape[-1]
print(timesteps)


311


In [15]:
class Embedding(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.embedding_layer = nn.Embedding(n_vocab, n_embed)
        
    def forward(self, x):
        return self.embedding_layer(x)

In [16]:
class Head(nn.Module):
    def __init__(self, head_size=16):
        super().__init__()
        self.query = nn.Linear(n_embed, head_size)
        self.key = nn.Linear(n_embed, head_size)
        self.value = nn.Linear(n_embed, head_size)
        
    def forward(self, x, mask):
        B, T, C = x.shape
        
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        
        wei = query @ key.transpose(-2, -1)
        
        if mask is not None:
            wei = wei + mask
        
        wei = F.softmax(wei, dim=-1)
        out = wei @ value # (B, T, head_size)
        
        return out

In [17]:
# I expect n_embed output from Multi Head Attention
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(n_embed, n_embed*4)
        self.layer2 = nn.Linear(n_embed*4, n_embed)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        
        return out

In [18]:
# word embedding and output dimension from multihead attention are same.  
# If I have 8 heads, the dimension of query, key and value are, let's say, 800
# then the head_size for each head will be 800 // 8 = 100
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, n_embed):
        super().__init__()
        head_size = n_embed // num_heads
        print(f'Size of embedding is {n_embed}, number of heads is {num_heads}, so head_size is {head_size}')
        self.mha = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        
    def forward(self, x, mask):
        out = torch.cat([h(x, mask) for h in self.mha], dim=-1)
#         print(out.shape)
        out = self.proj(out)
        return out

In [19]:
class Block(nn.Module):
    def __init__(self, ):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=4, n_embed=64)
        self.ffw = FeedForward()
        
    def forward(self, x, mask):
        out = self.mha(x, mask)
        out = self.ffw(out)
        
        return out

In [20]:
n_vocab = vocab.get_itos().__len__()
e = Embedding(n_vocab, n_embed)
x = e(train_padded[:2].to(torch.long))
out = nn.Linear(311*64, 4)
print(x.shape)
b = Block()
out(b(x, train_mask[:2]).view(2, -1))

torch.Size([2, 311, 64])
Size of embedding is 64, number of heads is 4, so head_size is 16


tensor([[ 0.0271,  0.0075, -0.0280, -0.0075],
        [ 0.0276, -0.0112, -0.0006, -0.0252]], grad_fn=<AddmmBackward0>)

In [21]:
class Encoder(nn.Module):
    def __init__(self, n_vocab, n_embed, timesteps, head_size, output):
        super().__init__() # What happens if I pass the class name in super?
        self.embedding = Embedding(n_vocab, n_embed)
#         self.sa = Head(head_size)
        self.block = Block()
        self.output = nn.Linear(timesteps*head_size, output)
        
    def forward(self, x, mask, positional_encoding):
        B, T = x.shape # validation shape --> (1000, 73)
#         print(self.embedding(x).shape, positional_encoding.shape)
        embedding = self.embedding(x) + positional_encoding # (B, timesteps, n_embed) # validation: (1000, 73, 64)
        sa_out = self.block(embedding, mask) # (B, timesteps, head_size) # validation: (1000, 73, 16)
#         inter1 = self.inter1_layer(sa_out) # (B, timesteps, head_size) @ (head_size, timesteps) --> (B, timesteps, timesteps)
        output = self.output(sa_out.view(B, -1))
#         output = F.softmax(output, dim=-1)
        
        return output

In [22]:
def train_epoch(x_batch, mask, y_batch, positional_encoding):
    optimizer.zero_grad()
    output = model(x_batch, mask, positional_encoding)
    outputs = F.softmax(output, dim=-1)
    
    loss = loss_function(output, y_batch.view(-1))
    
    correct = 0
    correct += (torch.argmax(outputs, dim=-1, keepdims=True) == y_batch).float().sum()
    accuracy = correct / y_batch.shape[0]

    loss.backward()
    optimizer.step()
    
    return loss, accuracy

In [23]:
train_padded, train_y = train_padded.to(device).long(), train_y.to(device)
val_padded, val_y = val_padded.to(device).long(), val_y.to(device) 
train_mask, val_mask = train_mask.to(device), val_mask.to(device)
train_positional_encoding = get_positional_embeddings(get_angle(timesteps, n_embed)).to(device)
val_positional_encoding = get_positional_embeddings(get_angle(val_padded.shape[-1], n_embed)).to(device)
print('Shape of encodings are', train_positional_encoding.shape, val_positional_encoding.shape)

n_embed = 64
timesteps = train_padded.shape[-1]
model = Encoder(vocab.get_itos().__len__(), n_embed, timesteps, head_size=64, output=4)
batch_size = 64
batch_per_epoch = train_padded.shape[0] // batch_size

# loss_function = nn.BCELoss()
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.001 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Set the device (CPU or GPU)
model.to(device)

Shape of encodings are torch.Size([311, 64]) torch.Size([311, 64])
Size of embedding is 64, number of heads is 4, so head_size is 16


Encoder(
  (embedding): Embedding(
    (embedding_layer): Embedding(43587, 64)
  )
  (block): Block(
    (mha): MultiHeadAttention(
      (mha): ModuleList(
        (0-3): 4 x Head(
          (query): Linear(in_features=64, out_features=16, bias=True)
          (key): Linear(in_features=64, out_features=16, bias=True)
          (value): Linear(in_features=64, out_features=16, bias=True)
        )
      )
      (proj): Linear(in_features=64, out_features=64, bias=True)
    )
    (ffw): FeedForward(
      (layer1): Linear(in_features=64, out_features=256, bias=True)
      (layer2): Linear(in_features=256, out_features=64, bias=True)
      (relu): ReLU()
    )
  )
  (output): Linear(in_features=19904, out_features=4, bias=True)
)

In [24]:
def calculate_accuracy(outputs, labels):
    correct = 0
    correct += (torch.argmax(F.softmax(outputs, dim=-1), dim=-1, keepdims=True) == labels).float().sum()
    return correct / labels.shape[0]

In [None]:

# model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)
# output_y = output_y.type(torch.LongTensor)
# inputs, targets, masked_reshape = padded.to(device), output_y.to(device), masked_reshape.to(device)

for epoch in range(21, 41):
    train_loss, val_loss = 0, 0
    training_accuracy = 0
    for i in range(batch_per_epoch):
        start = i * batch_size
        x_batch, y_batch, mask = train_padded[start:start+batch_size], train_y[start:start+batch_size], train_mask[start:start+batch_size]
#         x_batch, y_batch, mask = x_batch.to(device).long(), y_batch.to(device).long(), mask.to(device).long()

        model.train(True)
        loss, accuracy = train_epoch(x_batch, mask, y_batch, train_positional_encoding)
        train_loss += loss
        training_accuracy += accuracy
        
    print(f'Epoch {epoch} Loss: {train_loss / (i+1)}')
    print(f'Accuracy at Epoch {epoch} is {training_accuracy / (batch_per_epoch)}')
    print('\n')
    
    model.eval()
    with torch.no_grad():
        output_val = model(val_padded, val_mask, val_positional_encoding)
        loss_val = loss_function(output_val, val_y.view(-1))
        
        outputs_val = F.softmax(output_val, dim=-1)
        accuracy = calculate_accuracy(outputs_val, val_y)
        
        print(f'Epoch {epoch} Val loss: {loss_val}')
        print(f'Accuracy at Epoch {epoch} is {accuracy}')
        
    print()

Epoch 21 Loss: 0.17153874039649963
Accuracy at Epoch 21 is 0.9347696900367737


Epoch 21 Val loss: 1.3473796844482422
Accuracy at Epoch 21 is 0.7800000309944153

Epoch 22 Loss: 0.16457676887512207
Accuracy at Epoch 22 is 0.9370675086975098


Epoch 22 Val loss: 1.3463139533996582
Accuracy at Epoch 22 is 0.8100000619888306

Epoch 23 Loss: 0.16395629942417145
Accuracy at Epoch 23 is 0.9374054074287415


Epoch 23 Val loss: 1.325027585029602
Accuracy at Epoch 23 is 0.8110000491142273

Epoch 24 Loss: 0.15521062910556793
Accuracy at Epoch 24 is 0.9395139813423157


Epoch 24 Val loss: 1.477818489074707
Accuracy at Epoch 24 is 0.8060000538825989

Epoch 25 Loss: 0.14966003596782684
Accuracy at Epoch 25 is 0.942636251449585


Epoch 25 Val loss: 1.273161768913269
Accuracy at Epoch 25 is 0.8160000443458557

Epoch 26 Loss: 0.1410703808069229
Accuracy at Epoch 26 is 0.9464073777198792


Epoch 26 Val loss: 1.315171480178833
Accuracy at Epoch 26 is 0.8330000638961792

Epoch 27 Loss: 0.14596439898014069

In [26]:
output = model(x_batch, mask, train_positional_encoding)
print(F.softmax(output[10], dim=-1))
torch.argmax(F.softmax(output[10], dim=-1)), y_batch[10]

tensor([1.1495e-05, 6.0458e-09, 2.8058e-04, 9.9971e-01], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)


(tensor(3, device='cuda:0'), tensor([3], device='cuda:0'))

In [27]:
calculate_accuracy(output, y_batch)

tensor(0.9219, device='cuda:0')

In [28]:
nested_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flattened_list = sum(nested_list, [])
print(flattened_list)


[1, 2, 3, 4, 5, 6, 7, 8, 9]
