## This notebook describes the step to manually develop an encoder model and use it to train on a dataset from kaggle for sentiment analysis.

In [445]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter



In [446]:
class EncoderBlock(nn.Module):
    def __init__(self, emb_dim=4, nheads=1, ffn_mult=4):
        super().__init__()
        #layer norm before attention
        self.ln1=nn.LayerNorm(emb_dim)
        # multi-head attention 
        self.mha=nn.MultiheadAttention(emb_dim, nheads, batch_first=True)
        #layer nnorm for the FF layers
        self.ln2=nn.LayerNorm(emb_dim)
        # FF layer
        hidden =emb_dim * ffn_mult
        # Two linear layers provide richer transformation than a single one
        self.ffn=nn.Sequential(
            nn.Linear(emb_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, emb_dim)
        )
    def forward(self,x):
        # self attention layer
        x_norm=self.ln1(x)
        attention_op,_=self.mha(x_norm, x_norm, x_norm)# representing Q, K, V vectors
        # o/p will be added with the input to build the
        # residual connection
        x=x+attention_op

        # FF layer
        x_norm=self.ln2(x)
        ff_op=self.ffn(x_norm)
        x=x+ff_op
        return x
        
        

In [447]:
enc=EncoderBlock(emb_dim=4, nheads=1,ffn_mult=4)
sample_ip=torch.randn(2,3,4)
print(sample_ip.shape)
op=enc(sample_ip)
print(op.shape)

torch.Size([2, 3, 4])
torch.Size([2, 3, 4])


In [448]:
print(sample_ip, op)

tensor([[[ 0.0255,  1.1561, -0.3986, -0.3131],
         [ 0.7852,  0.4384, -0.7380,  0.3271],
         [-0.2821,  0.2723, -0.9541,  0.3321]],

        [[ 0.0157, -0.4823, -0.3852, -1.4038],
         [-1.5332,  0.3876, -0.5351,  0.5146],
         [ 0.0317,  0.0847,  0.7276, -0.4101]]]) tensor([[[ 0.0529,  0.7691, -0.3429,  0.7733],
         [ 0.7472, -0.0769, -0.4051,  1.2703],
         [-0.1978, -0.1688, -0.6576,  1.3997]],

        [[ 0.5068, -0.5339, -0.2660, -1.5183],
         [-1.0082,  0.6158, -0.2217,  0.5962],
         [ 0.7361, -0.0350,  0.9410, -0.5245]]], grad_fn=<AddBackward0>)


In [449]:
# Sinusoidal positional embeddings. They can generalize well to longer sequences not 
# seen during training
class PositionalEmbedding(nn.Module):
    def __init__(self, max_seq_len, emb_dimension):
        super().__init__()
        self.max_seq_len=max_seq_len
        self.emb_dimension=emb_dimension
        # create posinional embedding matrix
        pem=torch.zeros(max_seq_len,emb_dimension)
        position=torch.arange(0,max_seq_len).unsqueeze(1).float()
        # 10000^{-2i/d_model}
        even_indices=torch.arange(0, emb_dimension,2) #2i values
        # −ln(10000)/dmodel
        scaler_term=torch.log(torch.tensor(10000.0))/emb_dimension  
        # 10000 −2i/dmodel
        div_term=torch.exp(even_indices.float() * -(scaler_term))
        # Fill even dimensions with sine values
        pem[:,0::2]=torch.sin(position*div_term)
        # Fill odd dimensions with cosine values
        pem[:,1::2]=torch.cos(position*div_term)
        # register as non trainable buffer
        self.register_buffer('pem', pem.unsqueeze(0))
    def forward(self, x):
            # get the seq length
            seq_len=x.size(1)
            return self.pem[:,:seq_len,:]


In [450]:
# test
emb_dimension=128
max_len=100
pos_enc=PositionalEmbedding(max_len,emb_dimension)
batch_size=2
seq_len=10
dummy_emb=torch.randn(batch_size, seq_len,emb_dimension)

In [451]:
print(dummy_emb.shape)
print(dummy_emb)
pos_enc(dummy_emb)

torch.Size([2, 10, 128])
tensor([[[-0.9516, -0.2559,  1.5300,  ..., -0.2252, -0.4155, -0.2399],
         [-0.7829, -0.6417,  0.3801,  ...,  2.4412,  1.9696, -1.1654],
         [ 0.7047,  0.2306,  0.3696,  ...,  1.1410, -0.8331,  1.0201],
         ...,
         [-0.9909, -0.8743,  1.3671,  ..., -1.0081,  0.8707, -0.8133],
         [ 0.5122,  1.1875,  1.1939,  ...,  0.5370, -0.9782,  1.1669],
         [ 1.1159, -1.7818,  0.0869,  ..., -0.8459,  0.2155,  0.8763]],

        [[-0.0923, -1.0994, -0.9169,  ..., -0.8463,  0.2483,  0.4395],
         [ 0.1922, -0.5567,  0.6774,  ...,  0.2837, -1.6245,  0.5813],
         [ 1.4495, -0.2102, -0.0809,  ..., -0.7944, -1.6144, -0.4606],
         ...,
         [-0.4427, -0.3114, -0.3017,  ...,  1.3911, -0.1953,  0.0790],
         [ 1.2849, -0.5876,  1.4305,  ...,  1.2620, -0.4614,  0.4424],
         [ 0.0968,  0.2780,  0.4359,  ..., -2.0978, -0.5354,  0.2515]]])


tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
           0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  7.6172e-01,  ...,  1.0000e+00,
           1.1548e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  9.8705e-01,  ...,  1.0000e+00,
           2.3096e-04,  1.0000e+00],
         ...,
         [ 6.5699e-01,  7.5390e-01, -2.1963e-01,  ...,  1.0000e+00,
           8.0835e-04,  1.0000e+00],
         [ 9.8936e-01, -1.4550e-01,  6.0082e-01,  ...,  1.0000e+00,
           9.2383e-04,  1.0000e+00],
         [ 4.1212e-01, -9.1113e-01,  9.9818e-01,  ...,  1.0000e+00,
           1.0393e-03,  1.0000e+00]]])

In [452]:
# used to pad the sentences with varying length to the same size.
def padding_mask(seq, padding_idx=0):    
    return seq==padding_idx

In [453]:
import os
BASE_DIR='/kaggle/input/datasets/datatattle/covid-19-nlp-text-classification'
TR_FILENAME='Corona_NLP_train.csv'
TST_FILENAME='Corona_NLP_test.csv'
TR_PATH=os.path.join(BASE_DIR, TR_FILENAME)
TST_PATH=os.path.join(BASE_DIR, TST_FILENAME)

In [454]:
import pandas as pd

tr_df=pd.read_csv(TR_PATH, encoding='latin1')
tr_df=tr_df[['OriginalTweet','Sentiment']][:4000]
tr_df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
3995,Giving shout out to workers at grocery &amp; r...,Positive
3996,"@Publix Whatever happens, remain open as we al...",Extremely Positive
3997,LOOK: Residents line up to buy basic goods at ...,Neutral
3998,@kerrimpr Is there COVID-19 risk in fresh prod...,Positive


In [455]:
test_df=pd.read_csv(TST_PATH, encoding='latin1')
test_df=test_df[['OriginalTweet','Sentiment']][:1500]
test_df

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
1495,Travelzoo offering holidays for Â£33? sad thin...,Negative
1496,"Yes, I blame Communist China &amp; their Wuhan...",Positive
1497,How many Retail store CEOs can I get to compli...,Extremely Positive
1498,Elbow Bump is the new way to streamline your d...,Neutral


In [474]:
tr_df.Sentiment.value_counts()

Sentiment
Positive              1092
Negative              1008
Neutral                696
Extremely Negative     618
Extremely Positive     586
Name: count, dtype: int64

## 5 classes of Sentiments

In [456]:
def check_data(ds):
    # length of dataset
    print(len(ds))
    # number of classes
    print(ds.Sentiment.unique())
    inp=ds.OriginalTweet
    ds['Tweet_length']=ds.OriginalTweet.apply(lambda x: len(x.split()))
    print(ds['Tweet_length'].describe())
   
    
        

In [457]:
check_data(tr_df)

4000
['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive']
count    4000.000000
mean       32.029000
std        11.661182
min         4.000000
25%        23.000000
50%        34.000000
75%        41.000000
max        61.000000
Name: Tweet_length, dtype: float64


In [458]:
check_data(test_df)

1500
['Extremely Negative' 'Positive' 'Extremely Positive' 'Negative' 'Neutral']
count    1500.000000
mean       32.630667
std        11.678407
min         6.000000
25%        24.000000
50%        34.000000
75%        42.000000
max        62.000000
Name: Tweet_length, dtype: float64


In [459]:
tr_df.Sentiment.unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [460]:
import re
from collections import Counter

class CovidTweetTokenizer:
    def __init__(self,vocabulary_size=5000):
        self.counter=Counter()
        # define the 4 reserved tokens
        self.word2idx={'<PAD>':0,'<SOS>':1,'<EOS>':2,'<UNK>':3 }
        self.idx2word={0:'<PAD>',1:'<SOS>',2:'<EOS>',3:'<UNK>'}
        self.vocabulary_size=vocabulary_size
    def tokenize(self,text):
        #print(text)
        text=text.lower()
        text=re.sub(r'^a-z\s', '', text)
        words=text.split()
        return words

    def build_vocab(self,texts, min_freq=2):
        for text in texts:
            words=self.tokenize(text)
            self.counter.update(words)
        # find the most common words in the corpus
        common_words=self.counter.most_common(self.vocabulary_size-4)
        #print(common_words)
        idx=4 # starting word indexing after the reserved indices
        for word, freq in common_words:
            if freq>min_freq:
                self.word2idx[word]=idx
                self.idx2word[idx]=word
                idx+=1

    def encode(self,text, max_length=128):
         tokens=self.tokenize(text)
         #print(tokens)
         numerical_ids=[1] # sequence start token
         for token in tokens:
             if token in self.word2idx:
                 numerical_ids.append(self.word2idx[token])
             else:
                 # append unknown token
                 numerical_ids.append(3)
         # end the sequence
         numerical_ids.append(2)
         while len(numerical_ids) < max_length:
                numerical_ids.append(0)
         return numerical_ids[:max_length]
     
        

In [461]:

tk=CovidTweetTokenizer()
tk.build_vocab(tr_df.OriginalTweet)

In [462]:
# test
tk.encode("This is hopeless",20)

[1, 21, 12, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [463]:
class CovidTweetDataset(Dataset):
    def __init__(self, tweets, labels, tokenizer, max_length):
        super().__init__()
        self.tokenizer=tokenizer
        self.inputs=[]
        self.labels=[]
        self.max_length=max_length

        # get the numerical ids for every tweet
        for tweet, label in zip(tweets, labels):            
            encoded_tweet=self.tokenizer.encode(tweet,self.max_length)
            self.inputs.append(encoded_tweet)
            self.labels.append(label)
        self.inputs=torch.tensor(self.inputs, dtype=torch.long)
        self.labels=torch.tensor(self.labels, dtype=torch.long)
        
        
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

In [464]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
tr_df['encoded_labels']=le.fit_transform(tr_df['Sentiment'])
test_df['encoded_labels']=le.transform(test_df['Sentiment'])

In [465]:
train_tweets=tr_df.OriginalTweet
train_labels=tr_df.encoded_labels

test_tweets=test_df.OriginalTweet
test_label=test_df.encoded_labels

In [466]:
train_ds=CovidTweetDataset(train_tweets,train_labels, tk, max_length=128)
test_ds=CovidTweetDataset(test_tweets,test_label, tk, max_length=128)

In [467]:
batch=next(iter(train_ds))
batch

(tensor([1, 3, 3, 3, 3, 6, 3, 6, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(3))

In [468]:
class CovidTweetEncoderModel(nn.Module):
    def __init__(self, vocab_size, embedding_dimension=128, n_layers=5, max_seq_len=256, dropout=.01):
        super().__init__()
        self.embedding_dimension=embedding_dimension
        self.embedding=nn.Embedding(vocab_size,embedding_dimension,0 )
        self.POE=PositionalEmbedding(max_seq_len,embedding_dimension)
        self.dropout=nn.Dropout(dropout)
        self.encoder_layers=nn.ModuleList([EncoderBlock(embedding_dimension, nheads=4,ffn_mult=4 ) for _ in range(n_layers)])
        self.classifier=nn.Linear(embedding_dimension, 5)
    def forward(self,x):
        x=self.embedding(x)
        x_poe=self.POE(x)
        x=x+x_poe
        x=self.dropout(x)
        for enc in self.encoder_layers:
            x=enc(x)
        x=x.mean(dim=1)
        output=self.classifier(x)
        return output

In [469]:
vocab_size=len(tk.word2idx)

In [470]:
model=CovidTweetEncoderModel(vocab_size,128,5)
loss_fn=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(), lr=.01)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [471]:
train_dl=DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl=DataLoader(test_ds, batch_size=32, shuffle=False)

In [472]:
def train_model(model,train_ds, test_ds,loss_fn,optimizer,epochs=20,device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model=model.to(device)
    history={"train_acc":[], "test_acc":[],"train_loss":[],"test_loss":[]}
    
    for epoch in range(epochs):
        print(f"epoch: {epoch+1}")
        model.train()
        tr_loss=0
        total_train_size=0
        tr_correct=0
        for inputs, labels in train_ds:
            optimizer.zero_grad()
            inputs=inputs.to(device)
            labels=labels.to(device)
    
            outputs=model(inputs)
            loss=loss_fn(outputs, labels)
            # print("loss.requires_grad:", loss.requires_grad)
            # print("loss.grad_fn:", getattr(loss, "grad_fn", None))
            # print("any param requires_grad?:", any(p.requires_grad for p in model.parameters()))
            # print("model training mode:", model.training)
            
            loss.backward()
            optimizer.step()
    
            _, predicted=torch.max(outputs,1)
            total_train_size+=len(labels)
            tr_correct+=(predicted==labels).sum().item()
            tr_loss+=loss.item()* len(labels)
    
            train_acc=100* tr_correct/total_train_size
            tr_avg_loss=tr_loss/total_train_size
        print(f"train acc: {train_acc} train loss: {tr_avg_loss}")
            
        model.eval()
        test_loss=0
        total_test_size=0
        test_correct=0
        with torch.no_grad():
            for inputs, labels in test_dl:
                inputs=inputs.to(device)
                labels=labels.to(device)
    
                outputs=model(inputs)
                loss=loss_fn(outputs, labels)          
                
                _, predicted=torch.max(outputs,1)
                total_test_size+=len(labels)
                test_correct+=(predicted==labels).sum().item()
                test_loss+=loss.item()* len(labels)
        
                test_acc=100* test_correct/total_test_size
                test_avg_loss=test_loss/total_test_size
            print(f"test acc: {test_acc} test loss: {test_avg_loss}")
            
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)
        history['train_loss'].append(tr_avg_loss)
        history['test_loss'].append(test_avg_loss)
    return history
        

        

In [473]:
history=train_model(model,train_dl, test_dl,loss_fn,optimizer,epochs=10,device=device)

epoch: 1
train acc: 23.975 train loss: 2.804992370605469
test acc: 23.466666666666665 test loss: 1.576015175819397
epoch: 2
train acc: 30.05 train loss: 1.5311879196166993
test acc: 27.4 test loss: 1.6575156456629436
epoch: 3
train acc: 34.6 train loss: 1.4766562576293945
test acc: 32.13333333333333 test loss: 1.5302965078353883
epoch: 4
train acc: 36.5 train loss: 1.461296238899231
test acc: 31.4 test loss: 1.5350473219553629
epoch: 5
train acc: 41.025 train loss: 1.3653128747940064
test acc: 31.733333333333334 test loss: 1.5539536774953207
epoch: 6
train acc: 45.475 train loss: 1.2980395460128784
test acc: 36.266666666666666 test loss: 1.4837761917114258
epoch: 7
train acc: 48.6 train loss: 1.2083226451873779
test acc: 37.666666666666664 test loss: 1.5403889430363973
epoch: 8
train acc: 46.65 train loss: 1.271181414604187
test acc: 30.733333333333334 test loss: 1.640172616004944
epoch: 9
train acc: 37.425 train loss: 12.440426169395447
test acc: 22.6 test loss: 198.07689298502603
epo

## Best Model Accuracy: 37.6%

## Future Improvements

     More epochs
    
     Learning rate tuning
    
     Class imbalance handling
    
     Pretrained encoder instead of training from scratch