In [1]:
import glob
import numpy as np
import os
import pandas as pd
import re
import requests
import tarfile

from collections import Counter
from tqdm import tqdm

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
train = pd.read_csv("data/processed/train.csv")
test = pd.read_csv("data/processed/test.csv")

In [3]:
# Separating features and labels
# convert sentiment to binary?
X_train = train['text']
y_train = train['sentiment']

X_test = test['text']
y_test = test['sentiment'] 

In [3]:
df1 = train[['text', 'sentiment']]

In [4]:
df1['sentiment'] = df1['sentiment'].map({'positive': 1, 'negative': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['sentiment'] = df1['sentiment'].map({'positive': 1, 'negative': 0})


In [5]:
import re
import os
import transformers

from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer

config = AutoConfig.from_pretrained("roberta-base")

MAX_LEN=512
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.comment_text = dataframe["text"].tolist()
        self.targets = dataframe["sentiment"].tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)
    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True,
            return_tensors='pt'
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            #'targets': torch.tensor(targets, dtype=torch.float)
            'targets': torch.tensor(self.targets[index], dtype=torch.long).to(device)
             }

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
device = 'mps'

In [7]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        config = RobertaConfig()
        config.max_position_embeddings = 512
        self.l1 = transformers.RobertaModel(config).from_pretrained("roberta-base")
        self.l2 = torch.nn.Dropout(0.2)
        self.l3 = torch.nn.Linear(768,1)

    def forward(self, ids, mask, token_type_ids):
        output_1=self.l1(ids,token_type_ids=token_type_ids)
        #print(out1.shape)
        output_2 = self.l2(output_1[1])
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [8]:
def loss_fn(outputs, targets):
    loss=torch.nn.BCEWithLogitsLoss()(outputs, targets)
    return loss

In [9]:
LEARNING_RATE=1e-5
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [10]:
train_size = 0.8
train_dataset=df1.sample(frac=train_size,random_state=42)
test_dataset=df1.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df1.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (25000, 2)
TRAIN Dataset: (20000, 2)
TEST Dataset: (5000, 2)


In [11]:
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

TRAIN_BATCH_SIZE=5
VALID_BATCH_SIZE=1

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

train_dataloader = DataLoader(training_set, **train_params)#**passing multile parameters by dic
testing_loader = DataLoader(testing_set, **test_params)

In [13]:
def train(epoch):
    model.train()
    average_loss=0
    for i,data in enumerate(train_dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        #print('targets.shape',targets.shape)
        #outputs = model(ids, mask, token_type_ids)
        outputs = model(ids.squeeze(), mask.squeeze(), token_type_ids.squeeze())
        #print(outputs)
        optimizer.zero_grad()
        # print ()
        loss = loss_fn(outputs.squeeze(), targets)
        average_loss+=loss
        if i%50==0:
            print(f'Epoch: {epoch}, Loss:  {average_loss/(i+1)}')
        loss.backward()# caculate the derivates
        optimizer.step()#upadte weight

In [None]:
# As after each epochs i save the mdole,so previous epochs results are not listed below, the total number of epoch thats model run is 25.
model_path = "model/"

EPOCHS=5
for epoch in range(EPOCHS):
    train(epoch)
torch.save(model.state_dict(), model_path)

  'ids': torch.tensor(ids, dtype=torch.long),
  'mask': torch.tensor(mask, dtype=torch.long),
  'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch: 0, Loss:  0.7091418504714966
Epoch: 0, Loss:  0.6928136348724365
Epoch: 0, Loss:  0.693176805973053
Epoch: 0, Loss:  0.6941467523574829
Epoch: 0, Loss:  0.6962456107139587
Epoch: 0, Loss:  0.6950148940086365
Epoch: 0, Loss:  0.6652579307556152
Epoch: 0, Loss:  0.6196178793907166
Epoch: 0, Loss:  0.5738733410835266
Epoch: 0, Loss:  0.5373698472976685
Epoch: 0, Loss:  0.510400116443634
Epoch: 0, Loss:  0.490872859954834
Epoch: 0, Loss:  0.47778740525245667
Epoch: 0, Loss:  0.462095707654953
Epoch: 0, Loss:  0.4427928924560547
Epoch: 0, Loss:  0.43010213971138
Epoch: 0, Loss:  0.41652747988700867
Epoch: 0, Loss:  0.4058190584182739
Epoch: 0, Loss:  0.3942759931087494
Epoch: 0, Loss:  0.3821165859699249
Epoch: 0, Loss:  0.3724372982978821
Epoch: 0, Loss:  0.36300548911094666
Epoch: 0, Loss:  0.35685163736343384
Epoch: 0, Loss:  0.35006871819496155
Epoch: 0, Loss:  0.3438358008861542
Epoch: 0, Loss:  0.3391912579536438
Epoch: 0, Loss:  0.3332423269748688
Epoch: 0, Loss:  0.3273794651

In [None]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    #softm = torch.nn.sigmoid(dim=1)
    sigmoid_v=torch.nn.Sigmoid()
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            #print(ids.shape, mask.shape,token_type_ids.shape,targets.shape)
            ids=ids.squeeze()
            mask=mask.squeeze()
            token_type_ids=token_type_ids.squeeze()
            outputs = model(ids.unsqueeze(0), mask.unsqueeze(0), token_type_ids.unsqueeze(0))
            #print(targets.shape)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(sigmoid_v(outputs))
    return fin_outputs, fin_targets

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
for epoch in range(1):
    outputs, targets = validation(epoch)
    #targets=targets
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    precision = precision_score(targets, outputs)
    recall = recall_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(confusion_matrix(targets, outputs))