In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 kaggle.json
!kaggle datasets download  'kazanova/sentiment140'
!unzip sentiment140.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  sentiment140.zip
replace training.1600000.processed.noemoticon.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: training.1600000.processed.noemoticon.csv  


In [2]:
!pip install datasets
!pip install transformers



In [3]:
import re
import bz2
import tqdm
import pandas as pd


import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [4]:
def clean_text(text):
    text=text.lower()
    text= re.sub(r'(.)1+', r'1', text) #REPEATING CHARS
    text=re.sub('((www.[^s]+)|(https?://[^s]+))',' ',text) #URLS
    text=re.sub('[0-9]+', '', text) #NUMBERS
    text=" ".join(filter(lambda x:x[0]!='@', text.split())) #REPLY
    return text

In [5]:
df=pd.read_csv('./training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',names=['label','ids','date','flag','user','text']).sample(frac = 1).reset_index(drop=True)
df['text']=df['text'].apply(clean_text)

In [10]:
train_data=df.iloc[:200000]
test_data=df.iloc[200000:250000]
del df

In [39]:
max_length=64
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = max_length)

In [40]:
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        input_ids=torch.tensor(tokenizer.encode(row['text'],padding='max_length',max_length=max_length,truncation=True))
        attention_mask=torch.where(input_ids!=1,False,True)
        return {'input_ids':input_ids,
         'attention_mask': attention_mask,
         'label':torch.tensor(0 if row['label']==0 else 1)}

In [41]:
train_p=SentimentDataset(train_data)
test_p=SentimentDataset(test_data)

In [42]:
class Sentiment_Model(torch.nn.Module):
    def __init__(self, embed_dim=64,max_seq_len=max_length):
        super(Sentiment_Model, self).__init__()
        self.word_embedding = nn.Embedding(len(tokenizer), embed_dim)
        self.pos_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.mha1 = nn.MultiheadAttention(embed_dim, 4, 0.2, kdim=embed_dim, vdim=embed_dim)
        self.dense = nn.Linear(embed_dim, 2)

    def forward(self, input_ids):
        word_embeddings = self.word_embedding(input_ids)
        positional_embeddings = self.pos_embedding(torch.arange(input_ids.size(1)).to(device))

        input_embeddings = word_embeddings + positional_embeddings

        attn_output1, attn_output_weights = self.mha1(input_embeddings, input_embeddings, input_embeddings)
        # attn_output2, attn_output_weights = self.mha2(attn_output1, attn_output1, attn_output1)
        mean_output = attn_output1.mean(dim=1)

        outputs = self.dense(mean_output)

        return outputs

In [43]:
model=Sentiment_Model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device='cpu'
model.to(device)

Sentiment_Model(
  (word_embedding): Embedding(50265, 64)
  (pos_embedding): Embedding(64, 64)
  (mha1): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
  )
  (dense): Linear(in_features=64, out_features=2, bias=True)
)

In [44]:
train_loader=DataLoader(train_p,batch_size=32,shuffle=True)
test_loader=DataLoader(test_p,batch_size=32,shuffle=True)

In [45]:
def create_mini_batch(samples):
    input_ids = [s['input_ids'] for s in samples]
    attention_mask = [(s['attention_mask']) for s in samples]
    label = [s['label'] for s in samples]
    l=max_length
    input_ids=torch.stack(input_ids)[:,:l]
    attention_mask=torch.stack(attention_mask)[:,:l]
    label=torch.stack(label)
    return input_ids, attention_mask, label

In [46]:
def train(num_epochs=30):
    criterion= nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_corrects=0.0
        train_total=0.0
        test_corrects=0.0
        test_total=0.0
        for batch in tqdm.tqdm(train_loader):
            input_ids=batch['input_ids']
            attention_mask=batch['attention_mask'].T
            labels=batch['label']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(input_ids)
            loss = criterion(outputs,labels)
            train_loss += loss.item()
            train_corrects+=torch.sum(outputs.argmax(dim=-1)==labels).item()
            train_total+=outputs.size(0)
            loss.backward()
            optimizer.step()
        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                input_ids=batch['input_ids']
                attention_mask=batch['attention_mask'].T
                labels=batch['label']

                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outputs = model(input_ids)
                loss = criterion(outputs,labels)
                test_corrects+=torch.sum(outputs.argmax(dim=-1)==labels).item()
                test_total+=outputs.size(0)
                valid_loss += loss.item()
        avg_valid_loss = valid_loss / len(test_loader)

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "best_model.pt")
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f},Train acc: {train_corrects/train_total}, Valid Loss: {avg_valid_loss:.4f},Valid acc: {test_corrects/test_total}")

In [47]:
train()

100%|██████████| 6250/6250 [01:48<00:00, 57.76it/s]


Epoch 1/30: Train Loss: 0.5960,Train acc: 0.674715, Valid Loss: 0.5332,Valid acc: 0.74396


100%|██████████| 6250/6250 [01:45<00:00, 59.34it/s]


Epoch 2/30: Train Loss: 0.5181,Train acc: 0.752435, Valid Loss: 0.5094,Valid acc: 0.7562


100%|██████████| 6250/6250 [01:48<00:00, 57.63it/s]


Epoch 3/30: Train Loss: 0.4919,Train acc: 0.770765, Valid Loss: 0.5063,Valid acc: 0.75692


100%|██████████| 6250/6250 [01:47<00:00, 58.12it/s]


Epoch 4/30: Train Loss: 0.4762,Train acc: 0.78184, Valid Loss: 0.5065,Valid acc: 0.76656


100%|██████████| 6250/6250 [01:51<00:00, 56.25it/s]


Epoch 5/30: Train Loss: 0.4665,Train acc: 0.78746, Valid Loss: 0.4886,Valid acc: 0.77332


100%|██████████| 6250/6250 [01:47<00:00, 58.32it/s]


Epoch 6/30: Train Loss: 0.4598,Train acc: 0.791745, Valid Loss: 0.4915,Valid acc: 0.77226


100%|██████████| 6250/6250 [01:46<00:00, 58.92it/s]


Epoch 7/30: Train Loss: 0.4549,Train acc: 0.79563, Valid Loss: 0.4927,Valid acc: 0.77402


100%|██████████| 6250/6250 [01:46<00:00, 58.68it/s]


Epoch 8/30: Train Loss: 0.4504,Train acc: 0.797955, Valid Loss: 0.4900,Valid acc: 0.77426


100%|██████████| 6250/6250 [01:47<00:00, 58.40it/s]


Epoch 9/30: Train Loss: 0.4459,Train acc: 0.80081, Valid Loss: 0.4915,Valid acc: 0.77506


100%|██████████| 6250/6250 [01:49<00:00, 57.34it/s]


Epoch 10/30: Train Loss: 0.4423,Train acc: 0.80302, Valid Loss: 0.4948,Valid acc: 0.77584


100%|██████████| 6250/6250 [01:45<00:00, 59.25it/s]


Epoch 11/30: Train Loss: 0.4388,Train acc: 0.803975, Valid Loss: 0.4962,Valid acc: 0.76784


100%|██████████| 6250/6250 [01:47<00:00, 58.23it/s]


Epoch 12/30: Train Loss: 0.4351,Train acc: 0.807275, Valid Loss: 0.5040,Valid acc: 0.77198


100%|██████████| 6250/6250 [01:45<00:00, 59.32it/s]


Epoch 13/30: Train Loss: 0.4324,Train acc: 0.808205, Valid Loss: 0.4952,Valid acc: 0.7757


100%|██████████| 6250/6250 [01:47<00:00, 58.30it/s]


Epoch 14/30: Train Loss: 0.4281,Train acc: 0.81152, Valid Loss: 0.4997,Valid acc: 0.77382


100%|██████████| 6250/6250 [01:45<00:00, 59.22it/s]


Epoch 15/30: Train Loss: 0.4255,Train acc: 0.81289, Valid Loss: 0.5017,Valid acc: 0.77044


100%|██████████| 6250/6250 [01:45<00:00, 59.09it/s]


Epoch 16/30: Train Loss: 0.4228,Train acc: 0.81435, Valid Loss: 0.5039,Valid acc: 0.77202


100%|██████████| 6250/6250 [01:45<00:00, 59.32it/s]


Epoch 17/30: Train Loss: 0.4197,Train acc: 0.81601, Valid Loss: 0.5112,Valid acc: 0.77174


100%|██████████| 6250/6250 [01:44<00:00, 59.58it/s]


Epoch 18/30: Train Loss: 0.4163,Train acc: 0.81757, Valid Loss: 0.5105,Valid acc: 0.77236


100%|██████████| 6250/6250 [01:47<00:00, 58.02it/s]


Epoch 19/30: Train Loss: 0.4132,Train acc: 0.81947, Valid Loss: 0.5081,Valid acc: 0.77208


100%|██████████| 6250/6250 [01:47<00:00, 58.40it/s]


Epoch 20/30: Train Loss: 0.4106,Train acc: 0.821325, Valid Loss: 0.5104,Valid acc: 0.76824


100%|██████████| 6250/6250 [01:46<00:00, 58.69it/s]


Epoch 21/30: Train Loss: 0.4079,Train acc: 0.822235, Valid Loss: 0.5192,Valid acc: 0.77014


100%|██████████| 6250/6250 [01:46<00:00, 58.65it/s]


Epoch 22/30: Train Loss: 0.4049,Train acc: 0.823825, Valid Loss: 0.5113,Valid acc: 0.76962


 79%|███████▉  | 4935/6250 [01:24<00:22, 58.68it/s]


KeyboardInterrupt: ignored

In [48]:
def run_pipeline(input_):
  input_ids=torch.tensor([tokenizer.encode(input_,padding='max_length',max_length=max_length,truncation=True)]).to(device)
  # print(input_ids)
  outputs=model(input_ids)[0].argmax(dim=-1)
  return outputs

In [61]:
model.eval()

Sentiment_Model(
  (word_embedding): Embedding(50265, 64)
  (pos_embedding): Embedding(64, 64)
  (mha1): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
  )
  (dense): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
print(run_pipeline('I feel so good'))
print(run_pipeline('I lost my mother today. I miss her. I wish I could have her back'))