In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 kaggle.json
!kaggle datasets download  'kazanova/sentiment140'
!unzip sentiment140.zip

Downloading sentiment140.zip to /content
 85% 69.0M/80.9M [00:00<00:00, 170MB/s]
100% 80.9M/80.9M [00:00<00:00, 158MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [2]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/493.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m327.7/493.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-

In [3]:
import re
import bz2
import tqdm
import pandas as pd


import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [4]:
def clean_text(text):
    text=text.lower()
    text= re.sub(r'(.)1+', r'1', text) #REPEATING CHARS
    text=re.sub('((www.[^s]+)|(https?://[^s]+))',' ',text) #URLS
    text=re.sub('[0-9]+', '', text) #NUMBERS
    text=" ".join(filter(lambda x:x[0]!='@', text.split())) #REPLY
    return text

In [14]:
df=pd.read_csv('./training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',names=['label','ids','date','flag','user','text']).sample(frac = 1).reset_index(drop=True)
df['text']=df['text'].apply(clean_text)

In [15]:
train_data=df.iloc[:50000]
test_data=df.iloc[50000:75000]
del df

In [16]:
max_length=64
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = max_length)

In [17]:
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        input_ids=torch.tensor(tokenizer.encode(row['text'],padding='max_length',max_length=max_length,truncation=True))
        attention_mask=torch.where(input_ids!=1,False,True)
        return {'input_ids':input_ids,
         'attention_mask': attention_mask,
         'label':torch.tensor(0 if row['label']==0 else 1)}

In [18]:
train_p=SentimentDataset(train_data)
test_p=SentimentDataset(test_data)

In [19]:
class Sentiment_Model(torch.nn.Module):
    def __init__(self, embed_dim=64,max_seq_len=max_length,dim1=32):
        super(Sentiment_Model, self).__init__()
        self.input_embeddings = nn.Embedding(len(tokenizer), embed_dim)
        self.dense1 = nn.Linear(embed_dim, dim1)
        self.dense = nn.Linear(dim1, 2)

    def forward(self, input_ids):
        input_embeddings = self.input_embeddings(input_ids)
        dense1_output = F.relu(self.dense1(input_embeddings))
        mean_output = dense1_output.mean(dim=1)

        outputs = self.dense(mean_output)

        return outputs

In [20]:
train_loader=DataLoader(train_p,batch_size=32,shuffle=True)
test_loader=DataLoader(test_p,batch_size=32,shuffle=True)

In [21]:
def create_mini_batch(samples):
    input_ids = [s['input_ids'] for s in samples]
    attention_mask = [(s['attention_mask']) for s in samples]
    label = [s['label'] for s in samples]
    l=max_length
    input_ids=torch.stack(input_ids)[:,:l]
    attention_mask=torch.stack(attention_mask)[:,:l]
    label=torch.stack(label)
    return input_ids, attention_mask, label

In [22]:
history=[]

In [32]:
def train(num_epochs=10,learning_rate=1e-3,dim1=32,embed_dim=32, optimizer_name='adam'):
    criterion= nn.CrossEntropyLoss()
    if optimizer_name=='adam':
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if optimizer_name =='sgd':
      optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    if optimizer_name =='rmsprop':
      optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_corrects=0.0
        train_total=0.0
        test_corrects=0.0
        test_total=0.0
        for batch in tqdm.tqdm(train_loader):
            input_ids=batch['input_ids']
            attention_mask=batch['attention_mask'].T
            labels=batch['label']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(input_ids)
            loss = criterion(outputs,labels)
            train_loss += loss.item()
            train_corrects+=torch.sum(outputs.argmax(dim=-1)==labels).item()
            train_total+=outputs.size(0)
            loss.backward()
            optimizer.step()
        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                input_ids=batch['input_ids']
                attention_mask=batch['attention_mask'].T
                labels=batch['label']

                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outputs = model(input_ids)
                loss = criterion(outputs,labels)
                test_corrects+=torch.sum(outputs.argmax(dim=-1)==labels).item()
                test_total+=outputs.size(0)
                valid_loss += loss.item()
        avg_valid_loss = valid_loss / len(test_loader)

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "best_model.pt")
        history.append([epoch,learning_rate,dim1,embed_dim,optimizer,avg_train_loss,train_corrects/train_total,avg_valid_loss,test_corrects/test_total])
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f},Train acc: {train_corrects/train_total}, Valid Loss: {avg_valid_loss:.4f},Valid acc: {test_corrects/test_total}")

In [33]:
for dim1 in [4,8,16,32,64]:
  for learning_rate in[1e-1,1e-2,1e-3,1e-4]:
    for embed_dim in [4,8,16,32,64]:
      for optimizer_name in ['sgd','adam','rmsprop']:
        model=Sentiment_Model(embed_dim=embed_dim,dim1=dim1)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # device='cpu'
        model.to(device)
        train(100,learning_rate,dim1,embed_dim,optimizer_name)

 63%|██████▎   | 99/157 [00:03<00:02, 25.06it/s]


KeyboardInterrupt: ignored

In [28]:
history=pd.DataFrame(history,columns=['epoch','learning_rate','dim1','embed_dim','optimizer','avg_train_loss','train_acc','avg_valid_loss','test_acc'])

In [29]:
history

Unnamed: 0,epoch,learning_rate,dim1,embed_dim,optimizer,avg_train_loss,train_acc,avg_valid_loss,test_acc
0,0,0.1,4,4,SGD (\nParameter Group 0\n dampening: 0\n ...,0.694887,0.5062,0.692421,0.5212
1,0,0.1,4,4,Adam (\nParameter Group 0\n amsgrad: False\...,0.644763,0.6206,0.610482,0.6744
2,0,0.1,4,4,SGD (\nParameter Group 0\n dampening: 0\n ...,0.698759,0.49,0.70517,0.5036
3,0,0.1,4,4,Adam (\nParameter Group 0\n amsgrad: False\...,0.65755,0.6128,0.627662,0.6468
4,0,0.1,4,4,RMSprop (\nParameter Group 0\n alpha: 0.99\...,0.724291,0.5862,0.648509,0.6308
5,0,0.1,4,8,SGD (\nParameter Group 0\n dampening: 0\n ...,0.696523,0.5196,0.693158,0.5036
6,0,0.1,4,8,Adam (\nParameter Group 0\n amsgrad: False\...,0.669041,0.5794,0.626839,0.6564
7,0,0.1,4,8,RMSprop (\nParameter Group 0\n alpha: 0.99\...,0.684473,0.5894,0.669059,0.6172
8,0,0.1,4,16,SGD (\nParameter Group 0\n dampening: 0\n ...,0.694091,0.5132,0.693633,0.5064
9,0,0.1,4,16,Adam (\nParameter Group 0\n amsgrad: False\...,0.661028,0.5998,0.708619,0.6472


In [None]:
history.to_csv('history.csv')

List 'history' has been pickled and saved to history.pkl


In [None]:
def run_pipeline(input_):
  input_ids=torch.tensor([tokenizer.encode(input_,padding='max_length',max_length=max_length,truncation=True)]).to(device)
  # print(input_ids)
  outputs=model(input_ids)[0].argmax(dim=-1)
  return outputs

In [None]:
model.eval()

Sentiment_Model(
  (word_embedding): Embedding(50265, 64)
  (pos_embedding): Embedding(64, 64)
  (mha1): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
  )
  (dense): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
print(run_pipeline('I feel so good'))
print(run_pipeline('I lost my mother today. I miss her. I wish I could have her back'))