In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 kaggle.json
!kaggle datasets download  'kazanova/sentiment140'
!unzip sentiment140.zip

In [None]:
!pip install datasets
!pip install transformers

In [None]:
import re
import bz2
import tqdm
import pandas as pd


import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [None]:
def clean_text(text):
    """Normalize one tweet before tokenization.

    Steps, in order: lowercase, drop URLs, drop digits, drop whitespace-separated
    tokens that start with '@' (replies/mentions), then collapse every run of a
    repeated character to a single occurrence.

    Args:
        text: raw tweet text.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    text = text.lower()
    # URLs go first: collapsing repeats beforehand would turn "www" into "w" and
    # "http://" into "htp:/", and the URL pattern would no longer match.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)  # URLS
    text = re.sub(r'[0-9]+', '', text)  # NUMBERS
    text = " ".join(tok for tok in text.split() if not tok.startswith('@'))  # REPLY
    # \1 is a backreference to the captured character, so "sooooo" becomes "so".
    # NOTE(review): this also shortens legitimate double letters ("good" -> "god");
    # confirm that is the intended normalization.
    text = re.sub(r'(.)\1+', r'\1', text)  # REPEATING CHARS
    return text

In [None]:
# Column labels are supplied through names=, so every line of the CSV is read as data.
# The fixed random_state makes the shuffle reproducible across kernel restarts, and
# with it any split taken from this frame by position.
df = (
    pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1',
                names=['label', 'ids', 'date', 'flag', 'user', 'text'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df['text'] = df['text'].apply(clean_text)

In [None]:
# First 200,000 shuffled rows train the model; the next 50,000 evaluate it.
# .copy() detaches each split from the full frame so that `del df` can release it;
# a bare iloc slice may keep referencing the parent frame's data.
train_data = df.iloc[:200000].copy()
test_data = df.iloc[200000:250000].copy()
del df

In [None]:
# Fixed sequence length (in tokens) used for padding/truncation throughout the notebook.
max_length = 64
# `model_max_length` is the tokenizer's documented length-limit keyword.
tokenizer = RobertaTokenizerFast.from_pretrained(
    'roberta-base', model_max_length=max_length)

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: frame with a 'text' column (string to tokenize) and a 'label' column.
        tok: object exposing `encode(...)` and `pad_token_id`; defaults to the
            notebook-level `tokenizer`.
        max_len: fixed length every sequence is padded/truncated to; defaults to
            the notebook-level `max_length`.
    """

    def __init__(self, df, tok=None, max_len=None):
        self.df = df
        # The notebook globals are only looked up when no explicit value is given.
        self.tokenizer = tokenizer if tok is None else tok
        self.max_length = max_length if max_len is None else max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors for row `idx`."""
        row = self.df.iloc[idx]
        input_ids = torch.tensor(self.tokenizer.encode(
            row['text'], padding='max_length', max_length=self.max_length, truncation=True))
        # NOTE: despite its name this mask is True at *padding* positions and False
        # at real tokens (the polarity of a key-padding mask).
        attention_mask = input_ids == self.tokenizer.pad_token_id
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                # Binary float target: label 0 -> 0.0, any other label -> 1.0.
                'label': torch.tensor(0.0 if row['label'] == 0 else 1.0)}

In [None]:
# Wrap both splits in the tokenizing dataset; rows are tokenized on access.
train_p, test_p = (SentimentDataset(split) for split in (train_data, test_data))

In [None]:
class Sentiment_Model(torch.nn.Module):
    """Single-layer self-attention classifier emitting one logit per sequence.

    Token and learned position embeddings are summed, passed through one
    multi-head self-attention layer, mean-pooled over the non-padding positions
    and projected to a single logit.

    Args:
        embed_dim: width of the embeddings and of the attention layer.
        max_seq_len: number of rows in the position-embedding table; inputs must
            not be longer than this.
    """

    def __init__(self, embed_dim=64, max_seq_len=max_length):
        super(Sentiment_Model, self).__init__()
        self.word_embedding = nn.Embedding(len(tokenizer), embed_dim)
        self.pos_embedding = nn.Embedding(max_seq_len, embed_dim)
        # batch_first=True because forward() feeds (batch, seq, embed) tensors.
        # With the default (False) the layer treats dim 0 as the sequence axis,
        # i.e. it would attend across the examples of a batch instead of across
        # the tokens of each example.
        self.mha1 = nn.MultiheadAttention(
            embed_dim, 4, 0.2, kdim=embed_dim, vdim=embed_dim, batch_first=True)
        self.dense = nn.Linear(embed_dim, 1)
        # Plain attribute (not a parameter/buffer), so state_dict keys are unaffected.
        self.pad_token_id = tokenizer.pad_token_id

    def forward(self, input_ids):
        """Map a (batch, seq) tensor of token ids to a (batch, 1) tensor of logits."""
        # Build the position ids on the input's own device rather than relying on
        # a notebook-level `device` variable.
        positions = torch.arange(input_ids.size(1), device=input_ids.device)
        input_embeddings = self.word_embedding(input_ids) + self.pos_embedding(positions)

        if self.pad_token_id is None:
            padding_mask = None
        else:
            # True where the token is padding; such keys are ignored by attention.
            padding_mask = input_ids == self.pad_token_id

        attn_output1, _ = self.mha1(
            input_embeddings, input_embeddings, input_embeddings,
            key_padding_mask=padding_mask, need_weights=False)

        if padding_mask is None:
            mean_output = attn_output1.mean(dim=1)
        else:
            # Average over real tokens only, so the amount of padding does not
            # change the pooled vector. Assumes each row has at least one
            # non-padding token; the clamp only guards the division.
            keep = (~padding_mask).unsqueeze(-1).to(attn_output1.dtype)
            token_counts = keep.sum(dim=1).clamp(min=1.0)
            mean_output = (attn_output1 * keep).sum(dim=1) / token_counts

        return self.dense(mean_output)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = Sentiment_Model()
# Last expression of the cell: moves the weights and displays the module.
model.to(device)

In [None]:
# Only the training stream is reshuffled each epoch; the evaluation loader keeps a
# fixed order so every validation pass sees identical batches.
train_loader = DataLoader(train_p, batch_size=32, shuffle=True)
test_loader = DataLoader(test_p, batch_size=32, shuffle=False)

In [None]:
def _count_correct(logits, labels):
    """Count predictions that agree with their 0.0/1.0 labels."""
    # The loss is BCEWithLogitsLoss, so the model emits raw logits: a probability
    # of 0.5 corresponds to a logit of 0, which is the decision boundary here.
    return torch.sum((logits > 0).float() == labels).item()


def _run_epoch(loader, criterion, optimizer=None, show_progress=False):
    """Make one pass over `loader`; weights are updated only if `optimizer` is given.

    The caller is responsible for the model mode (train/eval) and for wrapping
    evaluation passes in `torch.no_grad()`.

    Returns:
        (mean of the per-batch losses, fraction of correctly classified samples).
    """
    total_loss = 0.0
    corrects = 0
    seen = 0
    batches = tqdm.tqdm(loader) if show_progress else loader
    for batch in batches:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        if optimizer is not None:
            optimizer.zero_grad()

        outputs = model(input_ids).view(-1)
        loss = criterion(outputs, labels)

        if optimizer is not None:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        corrects += _count_correct(outputs, labels)
        seen += outputs.size(0)
    return total_loss / len(loader), corrects / seen


def train(num_epochs=30):
    """Train the notebook-level `model` and checkpoint the best epoch.

    Each epoch runs one optimisation pass over `train_loader` and one evaluation
    pass over `test_loader`, prints loss/accuracy for both, and saves the weights
    to "best_model.pt" whenever the validation loss improves.

    Args:
        num_epochs: number of passes over the training data.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        avg_train_loss, train_acc = _run_epoch(
            train_loader, criterion, optimizer, show_progress=True)

        model.eval()
        with torch.no_grad():
            avg_valid_loss, valid_acc = _run_epoch(test_loader, criterion)

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "best_model.pt")
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f},Train acc: {train_acc}, Valid Loss: {avg_valid_loss:.4f},Valid acc: {valid_acc}")

In [None]:
# Run training with the default num_epochs=30; the weights of the epoch with the
# lowest validation loss are written to best_model.pt.
train()

In [None]:
def run_pipeline(input_):
    """Classify one raw string with the notebook-level `model`.

    Args:
        input_: raw text. It is passed through `clean_text` first, the same
            preprocessing the training text received.

    Returns:
        A 0-dim integer tensor: 1 when the model's logit is positive (the class
        trained with target 1.0), otherwise 0.
    """
    input_ids = torch.tensor([tokenizer.encode(
        clean_text(input_), padding='max_length', max_length=max_length, truncation=True)]).to(device)
    with torch.no_grad():
        logit = model(input_ids).view(-1)[0]
    # The model emits a single logit per input, so an argmax over it would always
    # be 0; the sign of the logit (probability above 0.5) is the prediction.
    return (logit > 0).long()

In [None]:
# Put the model in evaluation mode before running predictions.
model.eval()

In [None]:
# Two hand-written probes; each prediction is printed on its own line.
for sample in ('I feel so good',
               'I lost my mother today. I miss her. I wish I could have her back'):
    print(run_pipeline(sample))