[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github.com/ntl2222/HackathonAI/blob/nikos/sentiment_classifier.ipynb)

# Review Classifier based on Sentiment Analysis
---

# Table of Contents

- [Dataframe & text preprocess](#Dataframe-&-text-preprocess)
- [Setup Configurations](#Setup-Configurations)
- [Creating the Model](#Creating-the-Model)
- [Creating Datasets and Dataloaders](#Creating-Datasets-and-Dataloaders)
- [Training](#Training)

In [246]:
# for colab
# !pip install torchinfo
# !pip install torchmetrics

# Dataframe & text preprocess

In [39]:
import pandas as pd
from pathlib import Path

dataset_dir = Path('./data')
csv_dir = dataset_dir / 'raw' / 'Restaurant reviews.csv'

df = pd.read_csv(csv_dir, usecols=['Review', 'Rating']).dropna()
df

Unnamed: 0,Review,Rating
0,"The ambience was good, food was quite good . h...",5
1,Ambience is too good for a pleasant evening. S...,5
2,A must try.. great food great ambience. Thnx f...,5
3,Soumen das and Arun was a great guy. Only beca...,5
4,Food is good.we ordered Kodi drumsticks and ba...,5
...,...,...
9995,Madhumathi Mahajan Well to start with nice cou...,3
9996,This place has never disappointed us.. The foo...,4.5
9997,"Bad rating is mainly because of ""Chicken Bone ...",1.5
9998,I personally love and prefer Chinese Food. Had...,4


In [40]:
from src.utils.df_setup import clean_dataframe

df = clean_dataframe(df)

* turn ratings to binary labels for the sentiment analysis

In [41]:
df.loc[:, 'Rating'] = df['Rating'].astype(float)
df.loc[:, 'Rating'] = df['Rating'].map(lambda rating: 1 if rating > 3 else 0)

In [42]:
# check for duplicates or missing values
duplicate_index = df.index[df.index.duplicated()]
print('Duplicates:')
print(len(duplicate_index))

print('\nMissing indexes:')
missing_index = set(range(len(df))) - set(df.index)
print(len(missing_index))

Duplicates:
0

Missing indexes:
57


In [43]:
df = df.reindex(range(len(df)))
missing_index = set(range(len(df))) - set(df.index)
df = df.dropna()
print('Missing indexes:')
print(len(missing_index))

Missing indexes:
0


In [44]:
# clean reviews but keep the verbs to help in sentiment analysis
from src.utils.text_setup import TextCleaner

f = TextCleaner(remove_verbs=False)
# df['cleaned-reviews'] = df['Review'].map(lambda review: f.clean(review))
# df.to_csv(dataset_dir / 'processed' /'clean_with_verbs.csv', index=False)

In [1]:
import pandas as pd
df = pd.read_csv('./data/processed/clean_with_verbs.csv')
df

Unnamed: 0,Review,Rating,cleaned-reviews
0,"The ambience was good, food was quite good . h...",1,ambience good food quite good saturday lunch c...
1,Ambience is too good for a pleasant evening. S...,1,ambience good pleasant evening service prompt ...
2,A must try.. great food great ambience. Thnx f...,1,must try great food great ambience thnx servic...
3,Soumen das and Arun was a great guy. Only beca...,1,soumen das arun great guy behavior sincerety g...
4,Food is good.we ordered Kodi drumsticks and ba...,1,food good.we order kodi drumstick basket mutto...
...,...,...,...
9881,Been looking for Chinese food around gachibowl...,1,look chinese food around gachibowli find place...
9882,I am amazed at the quality of food and service...,1,amazed quality food service place provide opul...
9883,The food was amazing. Do not forget to try 'Mo...,1,food amazing forget try mou chi kay amazing si...
9884,We ordered from here via swiggy:\n\nWe ordered...,1,order via swiggy order stuff mushroom little s...


# Setup Configurations

In [14]:
import torch
from torch import nn
from torchmetrics import Accuracy

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [103]:
class conf:
    # training
    EPOCHS = 1
    LOSS_FN = nn.CrossEntropyLoss()
    LR = 0.001
    BATCH_SIZE = 32
    ACC_FN = Accuracy(task='binary').to(device)
    # roberta
    EMBEDDING_DIM = 120
    FFN_DIMENSION = 150
    ATT_HEADS = 3
    ENCODERS = 2
    DROPOUT = 0.4
    # classifier
    HIDDEN_UNITS = 50
    

# Creating the Model

In [21]:
df['cleaned-reviews'] = df['cleaned-reviews'].astype(str)
df['Rating'] = df['Rating'].astype(int)
reviews = df['cleaned-reviews'].values.tolist()
labels = df['Rating'].values.tolist()
reviews[:2]

['ambience good food quite good saturday lunch cost effective good place sate brunch one also chill friend parent waiter soumen das really courteous helpful',
 'ambience good pleasant evening service prompt food good good experience soumen das kudo service']

In [22]:
from src.utils.vocab_setup import CustomVocab

vocab = CustomVocab(reviews)
vocab_size = vocab.size
vocab_size

14486

In [23]:
max_seq_length = 0

for review in reviews:
    if len(review) > max_seq_length: max_seq_length = len(review)

max_seq_length

3656

In [107]:
# %%writefile /src/models/roberta_model.py

import torchtext

def model_init(vocab_size: int,
                 embedding_dim: int,
                 ffn_dimension: int,
                 num_attention_heads: int,
                 num_encoder_layers: int,
                 max_seq_len: int,
                 padding_idx: int = 1,
                 dropout: float = 0.1,
                 scaling = None,
                 normalize_before: bool = False):
    '''Initialize a distilled RoBERTa encoder and returns the
       model and the transformations it was trained.'''

    base = torchtext.models.ROBERTA_DISTILLED_ENCODER
    config = base.encoderConf

    config.vocab_size=vocab_size
    config.embedding_dim=embedding_dim
    config.ffn_dimension=ffn_dimension
    config.padding_idx=padding_idx
    config.max_seq_len=max_seq_len
    config.num_attention_heads=num_attention_heads
    config.num_encoder_layers=num_encoder_layers
    config.dropout=dropout
    config.scaling=scaling
    config.normalize_before=normalize_before

    model = base.build_model(encoder_conf=config)
    transforms = base.transform()

    transformer_encoder = model.encoder.transformer.layers.layers

    for i in range(num_encoder_layers):
        transformer_encoder.get_submodule(f'{i}').linear1 = nn.Linear(in_features=embedding_dim, out_features=ffn_dimension, bias=True)
        transformer_encoder.get_submodule(f'{i}').linear2 = nn.Linear(in_features=ffn_dimension, out_features=embedding_dim, bias=True)
    
    print(config)
    return model, transforms

In [105]:
roberta, transforms = model_init(vocab_size = 50265,
                                 embedding_dim = conf.EMBEDDING_DIM,
                                 ffn_dimension = conf.FFN_DIMENSION,
                                 num_attention_heads = conf.ATT_HEADS,
                                 num_encoder_layers = conf.ENCODERS,
                                 dropout=conf.DROPOUT,
                                 max_seq_len = max_seq_length)  



RobertaEncoderConf(vocab_size=50265, embedding_dim=120, ffn_dimension=150, padding_idx=1, max_seq_len=3656, num_attention_heads=3, num_encoder_layers=2, dropout=0.4, scaling=None, normalize_before=False)


In [106]:
print(transforms, '\n')
print(roberta)

Sequential(
  (0): GPT2BPETokenizer()
  (1): VocabTransform(
    (vocab): Vocab()
  )
  (2): Truncate()
  (3): AddToken()
  (4): AddToken()
) 

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(50265, 120, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0-1): 2 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=120, out_features=120, bias=True)
            )
            (linear1): Linear(in_features=120, out_features=150, bias=True)
            (dropout): Dropout(p=0.4, inplace=False)
            (linear2): Linear(in_features=150, out_features=120, bias=True)
            (norm1): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((120,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.4, inplace=False)
            (dropout2): Drop

In [27]:
from torchtext.functional import to_tensor

transformed_reviews = to_tensor(transforms(reviews[:2]), padding_value=1)

print(transformed_reviews, '\n')
print(transformed_reviews.shape)

tensor([[    0,  3146, 11465,   205,   689,  1341,   205,   579, 47035,  4592,
           701,  2375,   205,   317,   579,   877, 25003,    65,    67, 13146,
          1441,  4095, 38233, 22272,  2262,   385,   281,   269, 17867,   859,
          1827,  7163,     2],
        [    0,  3146, 11465,   205, 16219,  1559,   544, 14302,   689,   205,
           205,   676, 22272,  2262,   385,   281,   449, 23259,   544,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1]]) 

torch.Size([2, 33])


In [28]:
y = roberta(transformed_reviews)
y.shape

torch.Size([2, 33, 120])

In [29]:
import torch
from torch import nn

class SentimentClassifier(nn.Module):
    def __init__(self, embedding_layer, embedding_dim, hidden_units):
        super(SentimentClassifier, self).__init__()

        self.embedding_layer = embedding_layer
        self.classifier = nn.Sequential(nn.Linear(in_features=embedding_dim, out_features=hidden_units),
                                        nn.ReLU(),
                                        nn.Linear(in_features=hidden_units, out_features=2))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # get token embeddings from a pretrained model
        embeddings = self.embedding_layer(x)

        # Pooling to get a single vector representation for each sequence in the batch
        pooling = torch.mean(embeddings, dim=1)

        logits = self.classifier(pooling)
    
        return logits

In [30]:
model = SentimentClassifier(embedding_layer=roberta, embedding_dim=conf.EMBEDDING_DIM, hidden_units=conf.HIDDEN_UNITS)

In [31]:
model.eval()
y = model(transformed_reviews)
probabilities = torch.softmax(y, dim=-1)
predictions = torch.argmax(probabilities, dim=-1)

print(f'probabilities:\n{probabilities}\n')
print(f'predictions:\n{predictions}')

probabilities:
tensor([[0.4717, 0.5283],
        [0.4578, 0.5422]], grad_fn=<SoftmaxBackward0>)

predictions:
tensor([1, 1])


# Creating Dataset and Dataloaders

In [102]:
# %%writefile src/utils/data_setup.py

from typing import List, Optional
import os

from torch.utils.data import Dataset, DataLoader
from torchtext.functional import to_tensor

class CustomDataset(Dataset):
    '''Creates a torch.utils.data.Dataset and applies given transformations.'''
    def __init__(self, data: List[str], labels: list, transforms: Optional = None):
        super().__init__()

        self.labels = labels
        self.transforms = transforms

        if transforms:
            self.data = to_tensor(self.transforms(data), padding_value=1)
        else:
            self.data = to_tensor(data, padding_value=1)
       

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]
        

def train_test_split(data: List[str], labels: list, split_point: float = 0.7):
    '''Splits the data to train and test segments according to split_point.'''
    split = int(split_point * len(data))
    
    train_data = data[0 : split]
    train_labels = labels[0 : split]

    test_data = data[split : len(data) -1]
    test_labels = labels[split : len(labels) -1]

    return train_data, train_labels, test_data, test_labels

def create_dataloaders(train_dataset, test_dataset, batch_size: int):
    '''Creates train and test dataloaders of given batch size.'''
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  num_workers=os.cpu_count(),
                                  shuffle=True)

    test_dataloader = DataLoader(dataset=test_dataset,
                                  batch_size=batch_size,
                                  num_workers=os.cpu_count(),
                                  shuffle=False)

    return train_dataloader, test_dataloader

In [33]:
train_data, train_labels, test_data, test_labels = train_test_split(reviews, labels)

train_dataset = CustomDataset(data=train_data, labels=train_labels, transforms=transforms)
test_dataset = CustomDataset(data=test_data, labels=test_labels, transforms=transforms)

In [34]:
len(train_dataset), len(test_dataset)

(7908, 1977)

In [35]:
train_dataloader, test_dataloader = create_dataloaders(train_dataset, test_dataset, conf.BATCH_SIZE)

# Training

In [36]:
def train_step(model: nn.Module,
               loss_fn: nn.Module,
               optimizer: torch.optim,
               acc_fn,
               dataloader: torch.utils.data,
               device: torch.device):

    model.train()
    model.to(device)

    train_loss, train_acc = 0, 0

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        logits = model(X)
        probs = torch.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)
        
        loss = loss_fn(logits, y)
        acc = acc_fn(preds, y)

        train_loss += loss.item()
        train_acc += acc.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 30 == 0:
            print(f'train loss: {loss:.4f} train acc: {acc:.2f}%')
            

    train_loss /= len(dataloader)
    train_acc = (train_acc / len(dataloader)) * 100

    return train_loss, train_acc

In [37]:
def test_step(model: nn.Module,
              loss_fn: nn.Module,
              acc_fn,
              dataloader: torch.utils.data,
              device: torch.device):

    model.eval()
    model.to(device)

    test_loss, test_acc = 0, 0

    with torch.inference_mode():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
    
            test_logits = model(X)
            test_preds = torch.argmax(test_logits, dim=1)
            
            test_loss += loss_fn(test_logits, y).item()
            test_acc += acc_fn(test_preds, y).item()

            if batch % 30 == 0:
                print(f'test loss: {test_loss:.4f} test acc: {test_acc:.2f}%')

        test_loss /= len(dataloader)
        test_acc = (test_acc / len(dataloader)) * 100

    return test_loss, test_acc

In [38]:
def training(model: nn.Module,
             loss_fn: nn.Module,
             optimizer: torch.optim,
             train_dataloader: torch.utils.data,
             test_dataloader: torch.utils.data,
             acc_fn,
             epochs: int,
             device: torch.device):

    results = {
        'train_loss' : [],
        'train_acc' : [],
        'test_loss' : [],
        'test_acc' : []
    }

    for epoch in range(epochs):
        print(f'\nEpoch: {epoch}\n--------')

        train_loss, train_acc = train_step(model=model, loss_fn=loss_fn, optimizer=optimizer, acc_fn=acc_fn, dataloader=train_dataloader, device=device)
        results['train_loss'].append(train_loss)
        results['train_acc'].append(train_acc)

        print()
        
        test_loss, test_acc = test_step(model=model, loss_fn=loss_fn, acc_fn=acc_fn, dataloader=test_dataloader, device=device)
        results['test_loss'].append(test_loss)
        results['test_acc'].append(test_acc)

        
        print(f'train loss: {train_loss:.4f} | test loss: {test_loss:.4f}')
        print(f'train acc: {train_acc:.2f}% | test acc: {test_acc:.2f}%')


    return results


In [40]:
model = SentimentClassifier(embedding_layer=roberta, embedding_dim=conf.EMBEDDING_DIM, hidden_units=conf.HIDDEN_UNITS)
optimizer = torch.optim.Adam(model.parameters(), lr=conf.LR)

In [41]:
demo_set = CustomDataset(reviews[:2], labels[:2], transforms)
demo_dataloader = DataLoader(dataset=demo_set, batch_size=1)

In [None]:
results = training(model=model,
                   loss_fn=conf.LOSS_FN,
                   optimizer=optimizer,
                   train_dataloader=demo_dataloader,
                   test_dataloader=demo_dataloader,
                   acc_fn=conf.ACC_FN,
                   epochs=conf.EPOCHS,
                   device=device)