In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 kaggle.json
!kaggle datasets download  'kazanova/sentiment140'
!unzip sentiment140.zip

Downloading sentiment140.zip to /content
 80% 65.0M/80.9M [00:00<00:00, 127MB/s]
100% 80.9M/80.9M [00:00<00:00, 121MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [2]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.14.

In [3]:
import re
import bz2
import tqdm
import pandas as pd


import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [4]:
def clean_text(text):
    text=text.lower()
    text= re.sub(r'(.)1+', r'1', text) #REPEATING CHARS
    text=re.sub('((www.[^s]+)|(https?://[^s]+))',' ',text) #URLS
    text=re.sub('[0-9]+', '', text) #NUMBERS
    text=" ".join(filter(lambda x:x[0]!='@', text.split())) #REPLY
    return text

In [5]:
df=pd.read_csv('./training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',names=['label','ids','date','flag','user','text']).sample(frac = 1).reset_index(drop=True)
df['text']=df['text'].apply(clean_text)

In [6]:
train_data=df.iloc[:200000]
test_data=df.iloc[200000:250000]
del df

In [7]:
max_length=64
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = max_length)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        input_ids=torch.tensor(tokenizer.encode(row['text'],padding='max_length',max_length=max_length,truncation=True))
        attention_mask=torch.where(input_ids!=1,False,True)
        return {'input_ids':input_ids,
         'attention_mask': attention_mask,
         'label':torch.tensor(0 if row['label']==0 else 1)}

In [9]:
train_p=SentimentDataset(train_data)
test_p=SentimentDataset(test_data)

In [46]:
class Sentiment_Model(torch.nn.Module):
    def __init__(self, embed_dim=64,max_seq_len=max_length):
        super(Sentiment_Model, self).__init__()
        self.input_embeddings = nn.Embedding(len(tokenizer), embed_dim)
        self.lstm1 = nn.LSTM(embed_dim, embed_dim//2,batch_first=True, num_layers=1,  dropout=0.1, bidirectional=True)
        self.dense = nn.Linear(embed_dim, 2)

    def forward(self, input_ids):
        input_embeddings = self.input_embeddings(input_ids)
        lstm_output, (hn, cn) = self.lstm1(input_embeddings)
        mean_output = lstm_output.mean(dim=1)

        outputs = self.dense(mean_output)

        return outputs

In [47]:
model=Sentiment_Model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device='cpu'
model.to(device)

Sentiment_Model(
  (input_embeddings): Embedding(50265, 64)
  (lstm1): LSTM(64, 32, batch_first=True, dropout=0.1, bidirectional=True)
  (dense): Linear(in_features=64, out_features=2, bias=True)
)

In [48]:
train_loader=DataLoader(train_p,batch_size=32,shuffle=True)
test_loader=DataLoader(test_p,batch_size=32,shuffle=True)

In [49]:
def create_mini_batch(samples):
    input_ids = [s['input_ids'] for s in samples]
    attention_mask = [(s['attention_mask']) for s in samples]
    label = [s['label'] for s in samples]
    l=max_length
    input_ids=torch.stack(input_ids)[:,:l]
    attention_mask=torch.stack(attention_mask)[:,:l]
    label=torch.stack(label)
    return input_ids, attention_mask, label

In [50]:
def train(num_epochs=30):
    criterion= nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_corrects=0.0
        train_total=0.0
        test_corrects=0.0
        test_total=0.0
        for batch in tqdm.tqdm(train_loader):
            input_ids=batch['input_ids']
            attention_mask=batch['attention_mask'].T
            labels=batch['label']
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(input_ids)
            loss = criterion(outputs,labels)
            train_loss += loss.item()
            train_corrects+=torch.sum(outputs.argmax(dim=-1)==labels).item()
            train_total+=outputs.size(0)
            loss.backward()
            optimizer.step()
        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                input_ids=batch['input_ids']
                attention_mask=batch['attention_mask'].T
                labels=batch['label']

                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                outputs = model(input_ids)
                loss = criterion(outputs,labels)
                test_corrects+=torch.sum(outputs.argmax(dim=-1)==labels).item()
                test_total+=outputs.size(0)
                valid_loss += loss.item()
        avg_valid_loss = valid_loss / len(test_loader)

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "best_model.pt")
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f},Train acc: {train_corrects/train_total}, Valid Loss: {avg_valid_loss:.4f},Valid acc: {test_corrects/test_total}")

In [None]:
train()

100%|██████████| 6250/6250 [01:44<00:00, 60.05it/s]


Epoch 1/30: Train Loss: 0.4981,Train acc: 0.754375, Valid Loss: 0.4511,Valid acc: 0.78964


100%|██████████| 6250/6250 [01:41<00:00, 61.63it/s]


Epoch 2/30: Train Loss: 0.4143,Train acc: 0.8101, Valid Loss: 0.4422,Valid acc: 0.79598


100%|██████████| 6250/6250 [01:42<00:00, 61.23it/s]


Epoch 3/30: Train Loss: 0.3791,Train acc: 0.83085, Valid Loss: 0.4317,Valid acc: 0.80302


100%|██████████| 6250/6250 [01:40<00:00, 62.06it/s]


Epoch 4/30: Train Loss: 0.3488,Train acc: 0.84683, Valid Loss: 0.4386,Valid acc: 0.80094


100%|██████████| 6250/6250 [01:40<00:00, 62.09it/s]


Epoch 5/30: Train Loss: 0.3165,Train acc: 0.86414, Valid Loss: 0.4540,Valid acc: 0.80116


100%|██████████| 6250/6250 [01:39<00:00, 62.55it/s]


Epoch 6/30: Train Loss: 0.2845,Train acc: 0.880765, Valid Loss: 0.4790,Valid acc: 0.79816


100%|██████████| 6250/6250 [01:42<00:00, 61.20it/s]


Epoch 7/30: Train Loss: 0.2529,Train acc: 0.896825, Valid Loss: 0.5097,Valid acc: 0.7953


100%|██████████| 6250/6250 [01:40<00:00, 62.22it/s]


Epoch 8/30: Train Loss: 0.2219,Train acc: 0.911045, Valid Loss: 0.5537,Valid acc: 0.7876


100%|██████████| 6250/6250 [01:41<00:00, 61.40it/s]


Epoch 9/30: Train Loss: 0.1944,Train acc: 0.923235, Valid Loss: 0.5997,Valid acc: 0.7853


100%|██████████| 6250/6250 [01:41<00:00, 61.62it/s]


Epoch 10/30: Train Loss: 0.1701,Train acc: 0.934715, Valid Loss: 0.6591,Valid acc: 0.78322


100%|██████████| 6250/6250 [01:40<00:00, 62.13it/s]


Epoch 11/30: Train Loss: 0.1475,Train acc: 0.944625, Valid Loss: 0.6825,Valid acc: 0.77958


100%|██████████| 6250/6250 [01:41<00:00, 61.85it/s]


Epoch 12/30: Train Loss: 0.1298,Train acc: 0.951885, Valid Loss: 0.7550,Valid acc: 0.77856


100%|██████████| 6250/6250 [01:40<00:00, 62.41it/s]


Epoch 13/30: Train Loss: 0.1154,Train acc: 0.957775, Valid Loss: 0.8191,Valid acc: 0.77656


100%|██████████| 6250/6250 [01:40<00:00, 62.17it/s]


Epoch 14/30: Train Loss: 0.1020,Train acc: 0.963405, Valid Loss: 0.8753,Valid acc: 0.77592


100%|██████████| 6250/6250 [01:40<00:00, 61.96it/s]


Epoch 15/30: Train Loss: 0.0909,Train acc: 0.967495, Valid Loss: 0.9823,Valid acc: 0.77296


100%|██████████| 6250/6250 [01:38<00:00, 63.13it/s]


Epoch 16/30: Train Loss: 0.0829,Train acc: 0.970345, Valid Loss: 0.9827,Valid acc: 0.77154


100%|██████████| 6250/6250 [01:39<00:00, 62.74it/s]


Epoch 17/30: Train Loss: 0.0760,Train acc: 0.972855, Valid Loss: 1.0287,Valid acc: 0.76876


100%|██████████| 6250/6250 [01:39<00:00, 62.68it/s]


Epoch 18/30: Train Loss: 0.0708,Train acc: 0.97484, Valid Loss: 1.0686,Valid acc: 0.7674


100%|██████████| 6250/6250 [01:43<00:00, 60.21it/s]


Epoch 19/30: Train Loss: 0.0643,Train acc: 0.97714, Valid Loss: 1.1152,Valid acc: 0.7705


100%|██████████| 6250/6250 [01:40<00:00, 62.13it/s]


In [None]:
def run_pipeline(input_):
  input_ids=torch.tensor([tokenizer.encode(input_,padding='max_length',max_length=max_length,truncation=True)]).to(device)
  # print(input_ids)
  outputs=model(input_ids)[0].argmax(dim=-1)
  return outputs

In [None]:
model.eval()

Sentiment_Model(
  (word_embedding): Embedding(50265, 64)
  (pos_embedding): Embedding(64, 64)
  (mha1): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
  )
  (dense): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
print(run_pipeline('I feel so good'))
print(run_pipeline('I lost my mother today. I miss her. I wish I could have her back'))