In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 kaggle.json
!kaggle datasets download  'kazanova/sentiment140'
!unzip sentiment140.zip

In [None]:
!pip install datasets
!pip install transformers

In [None]:
import re
import bz2
import tqdm
import pandas as pd


import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [None]:
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Steps, in order: lowercase; remove URLs; remove digit runs; drop
    whitespace-separated tokens that start with ``@``; collapse every run of
    a repeated character to a single occurrence.

    Args:
        text: Raw tweet text (``str``).

    Returns:
        The cleaned text, with the remaining tokens joined by single spaces.
    """
    text = text.lower()
    # URLs are removed first: collapsing repeated characters beforehand would
    # turn "http://" into "htp:/" and "www" into "w", hiding them from this
    # pattern. `\S` stops at whitespace and `\.` matches a literal dot.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)  # URLS
    text = re.sub(r'[0-9]+', '', text)  # NUMBERS
    # split() never yields empty strings, so startswith is safe here.
    text = " ".join(tok for tok in text.split() if not tok.startswith('@'))  # REPLY
    # `\1` is a back-reference to the captured character.
    # NOTE(review): this also shortens legitimate double letters
    # ("happy" -> "hapy"); use r'(.)\1{2,}' -> r'\1\1' if that is unwanted.
    text = re.sub(r'(.)\1+', r'\1', text)  # REPEATING CHARS
    return text

In [None]:
# Load the Sentiment140 CSV (it has no header row, so column names are given
# explicitly) and shuffle it. The shuffle is seeded so the positional
# train/test slices taken later are the same after a kernel restart.
df=pd.read_csv('./training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1',names=['label','ids','date','flag','user','text']).sample(frac = 1, random_state=42).reset_index(drop=True)
# Normalize every tweet with clean_text.
df['text']=df['text'].apply(clean_text)

In [None]:
# Positional slices of the shuffled frame: the first 200k rows are used for
# training and rows 250k..300k for evaluation (rows 200k..250k are unused).
train_data = df.iloc[0:200_000]
test_data = df.iloc[250_000:300_000]
# Drop the name of the full frame; only the two slices are referenced below.
del df

In [None]:
# Fixed sequence length (in tokens) used for padding/truncation below.
max_length = 64
# Pretrained RoBERTa tokenizer; only its vocabulary and encoding are used.
tokenizer = RobertaTokenizerFast.from_pretrained(
    'roberta-base',
    max_length=max_length,
)

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame with a ``'text'`` column and a ``'label'`` column.
        tok: Optional tokenizer exposing ``encode(...)`` and ``pad_token_id``.
            When omitted, the module-level ``tokenizer`` is used.
        max_len: Optional padded/truncated sequence length. When omitted, the
            module-level ``max_length`` is used.
    """

    def __init__(self, df, tok=None, max_len=None):
        self.df = df
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``.

        ``attention_mask`` is a bool tensor that is True on real tokens and
        False on padding. ``label`` is 0.0 when the row label equals 0 and
        1.0 otherwise.
        """
        # Globals are resolved lazily so that the notebook's existing
        # `SentimentDataset(frame)` calls keep working unchanged.
        tok = self.tok if self.tok is not None else tokenizer
        max_len = self.max_len if self.max_len is not None else max_length

        row = self.df.iloc[idx]
        input_ids = torch.tensor(
            tok.encode(row['text'], padding='max_length', max_length=max_len, truncation=True)
        )
        # Compare against the tokenizer's own pad id instead of a hard-coded 1,
        # and mark real tokens (not padding) as True.
        attention_mask = input_ids != tok.pad_token_id
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'label': torch.tensor(0.0 if row['label'] == 0 else 1.0)}

In [None]:
# Wrap both splits so they can be fed to DataLoaders.
train_p, test_p = SentimentDataset(train_data), SentimentDataset(test_data)

In [None]:
class Sentiment_Model(torch.nn.Module):
    """Bag-of-embeddings binary classifier that outputs one logit per sequence.

    Pipeline: token embedding -> linear -> batch norm -> dropout -> ReLU ->
    mean over the sequence axis -> linear to a single logit.

    Args:
        embed_dim: Size of each token embedding.
        max_seq_len: Length of the input sequences; it sizes the batch-norm
            layer (see the note in ``__init__``).
        dim1: Width of the hidden linear layer.
    """

    def __init__(self, embed_dim=64,max_seq_len=max_length,dim1=32):
        super(Sentiment_Model, self).__init__()
        self.input_embeddings = nn.Embedding(len(tokenizer), embed_dim)
        self.dense1 = nn.Linear(embed_dim, dim1)
        # BatchNorm1d normalizes over dim 1 of a 3-D input. Here it receives
        # (batch, seq_len, dim1), so its feature count must equal the sequence
        # length. It was hard-coded to 64, which only worked when max_length
        # happened to be 64.
        # NOTE(review): this normalizes per position, not per hidden feature;
        # confirm that is intended.
        self.bn1=nn.BatchNorm1d(max_seq_len)
        self.dp1=nn.Dropout(0.4)
        self.dense = nn.Linear(dim1, 1)

    def forward(self, input_ids):
        """Return raw logits of shape ``(batch, 1)`` for integer ``input_ids``."""
        input_embeddings = self.input_embeddings(input_ids)
        dense1_output = F.relu(self.dp1(self.bn1(self.dense1(input_embeddings))))
        # Average over the sequence axis; padded positions are included.
        mean_output = dense1_output.mean(dim=1)
        outputs = self.dense(mean_output)

        return outputs

In [None]:
# Training batches are reshuffled every epoch.
train_loader=DataLoader(train_p,batch_size=32,shuffle=True)
# Evaluation keeps a fixed order: the validation loss is a mean of per-batch
# means, so with a smaller final batch a shuffled order makes it vary slightly
# between runs for the same weights.
test_loader=DataLoader(test_p,batch_size=32,shuffle=False)

In [None]:
def create_mini_batch(samples, l=None):
    """Collate a list of dataset items into batched tensors.

    Args:
        samples: List of dicts with ``'input_ids'``, ``'attention_mask'`` and
            ``'label'`` tensors, where all sequences have the same length.
        l: Optional number of leading positions to keep along the sequence
            axis. ``None`` (the default) keeps the full length. The original
            code read an undefined global ``l`` and raised ``NameError``.

    Returns:
        Tuple ``(input_ids, attention_mask, label)`` of stacked tensors.
    """
    # A slice end of None means "to the end", so no branch is needed.
    input_ids = torch.stack([s['input_ids'] for s in samples])[:, :l]
    attention_mask = torch.stack([s['attention_mask'] for s in samples])[:, :l]
    label = torch.stack([s['label'] for s in samples])
    return input_ids, attention_mask, label

In [None]:
# Per-epoch metric rows; train() appends one list per epoch.
history = list()

In [None]:
def train(num_epochs=10,learning_rate=1e-3,dim1=32,embed_dim=32, optimizer_name='adam'):
    """Train the global ``model`` and record per-epoch metrics.

    Reads the module-level ``model``, ``device``, ``train_loader`` and
    ``test_loader``. Appends one row per epoch to the module-level ``history``
    list. Saves the weights with the lowest validation loss to
    ``best_model.pt``.

    Args:
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to the optimizer.
        dim1: Recorded in ``history`` only; the model is built by the caller.
        embed_dim: Recorded in ``history`` only; the model is built by the caller.
        optimizer_name: One of ``'adam'``, ``'sgd'`` or ``'rmsprop'``.

    Raises:
        ValueError: If ``optimizer_name`` is not one of the supported names.
    """
    optimizer_classes = {
        'adam': torch.optim.Adam,
        'sgd': torch.optim.SGD,
        'rmsprop': torch.optim.RMSprop,
    }
    if optimizer_name not in optimizer_classes:
        # Fail up front instead of hitting an unbound `optimizer` later.
        raise ValueError(
            f"Unknown optimizer_name {optimizer_name!r}; expected one of {sorted(optimizer_classes)}"
        )

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optimizer_classes[optimizer_name](model.parameters(), lr=learning_rate)
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_corrects = 0.0
        train_total = 0.0
        test_corrects = 0.0
        test_total = 0.0
        for batch in tqdm.tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids).view(-1,)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            # `outputs` are raw logits (the sigmoid lives inside
            # BCEWithLogitsLoss), so probability 0.5 corresponds to logit 0.
            train_corrects += torch.sum((outputs > 0).float() == labels).item()
            train_total += outputs.size(0)
        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids).view(-1,)
                loss = criterion(outputs, labels)
                test_corrects += torch.sum((outputs > 0).float() == labels).item()
                test_total += outputs.size(0)
                valid_loss += loss.item()
        avg_valid_loss = valid_loss / len(test_loader)

        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            torch.save(model.state_dict(), "best_model.pt")
        # Store the optimizer's name, not the optimizer object: the object
        # keeps references to the model parameters and its own state.
        history.append([epoch,learning_rate,dim1,embed_dim,optimizer_name,avg_train_loss,train_corrects/train_total,avg_valid_loss,test_corrects/test_total])
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f},Train acc: {train_corrects/train_total}, Valid Loss: {avg_valid_loss:.4f},Valid acc: {test_corrects/test_total}")

In [None]:
# Hyper-parameter sweep, currently pinned to a single configuration.
# A wider grid is kept here for reference (disabled):
#   dim1 in [16,4,8,32,64], learning_rate in [1e-1,1e-2,1e-3,1e-4],
#   embed_dim in [16,8,4,32,64]
# The device does not depend on the configuration, so it is chosen once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for dim1 in [32]:
  for learning_rate in [1e-2]:
    for embed_dim in [128]:
      for optimizer_name in ['rmsprop']:
        # train() reads `model` and `device` from module scope.
        # Module.to() returns the module itself, so chaining is equivalent.
        model = Sentiment_Model(embed_dim=embed_dim, dim1=dim1).to(device)
        train(100, learning_rate, dim1, embed_dim, optimizer_name)

In [None]:
# Turn the per-epoch rows into a labelled frame for plotting.
# NOTE(review): this rebinds `history` from a list to a DataFrame, so train()
# should not be run again without first re-running the `history=[]` cell.
history = pd.DataFrame(
    history,
    columns=[
        'epoch', 'learning_rate', 'dim1', 'embed_dim', 'optimizer',
        'avg_train_loss', 'train_acc', 'avg_valid_loss', 'test_acc',
    ],
)

In [None]:
import matplotlib.pyplot as plt

# One x position per row recorded in `history`.
epochs = list(range(history.shape[0]))
train_acc = history['train_acc']
test_acc = history['test_acc']

# Single axes holding both the train and the test accuracy curves.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(epochs, train_acc, label='Train Acc', color='blue', marker='o', linestyle='-')
ax.plot(epochs, test_acc, label='Test Acc', color='red', marker='s', linestyle='--')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Train and Test Accuracy History')
ax.grid(True, alpha=0.2)
ax.legend()

# NOTE(review): the file name says "loss" although this figure shows accuracy.
fig.savefig('combined_loss_figure.svg', format='svg')
plt.show()

In [None]:
import plotly.graph_objects as go

# One x value per recorded row. This was hard-coded to 1..10, which does not
# match `history` whenever the number of recorded epochs differs from 10.
epochs = list(range(1, len(history) + 1))
train_loss = history['avg_train_loss']
test_loss = history['avg_valid_loss']

# Figure for the training loss.
train_loss_fig = go.Figure()
train_loss_fig.add_trace(go.Scatter(x=epochs, y=train_loss, mode='lines+markers', name='Train Loss'))
train_loss_fig.update_layout(
    title="Train Loss History",
    xaxis_title="Epoch",
    yaxis_title="Loss",
)

# Figure for the validation ("test") loss.
test_loss_fig = go.Figure()
test_loss_fig.add_trace(go.Scatter(x=epochs, y=test_loss, mode='lines+markers', name='Test Loss'))
test_loss_fig.update_layout(
    title="Test Loss History",
    xaxis_title="Epoch",
    yaxis_title="Loss",
)

# Render both figures.
train_loss_fig.show()
test_loss_fig.show()

In [None]:
import matplotlib.pyplot as plt

# One x position per row recorded in `history`.
epochs = list(range(0, history.shape[0]))
# Read the loss columns: this cell previously plotted the accuracy columns
# under "Loss" labels, title and file name.
train_loss = history['avg_train_loss']
test_loss = history['avg_valid_loss']

# Single figure with both the train and the test loss curves.
plt.figure(figsize=(8, 6))
plt.plot(epochs, train_loss, label='Train Loss', color='blue', marker='o', linestyle='-')
plt.plot(epochs, test_loss, label='Test Loss', color='red', marker='s', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Train and Test Loss History')
plt.grid(True, alpha=0.2)
plt.legend()

# Save the figure to disk, then display it.
plt.savefig('combined_loss_figure.png')
plt.show()