# Review Classification

## Torchtext Processing and Bidirectional LSTM

### Import Libraries

In [1]:
import pandas as pd # Loading data
import numpy as np
import warnings
from sklearn.model_selection import train_test_split # train test splits

warnings.filterwarnings('ignore')

### Data Loading and Processing

We will first do all the necessary pre-processing before starting to create batches and training the model. All the steps are explained in the notebook named `Text Cleaning.ipynb`

In [2]:
# Read dataset
data = pd.read_csv("Reviews.csv")
# Drop unnecesary columns and duplicates
new_data = data.drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'])
# Get useful columns
useful_data = new_data[['Text', 'Score']]
# Calculate length of each sentence without tokenizer
useful_data['sudo_length'] = useful_data.Text.str.split().str.len()
# Filter examples by length
useful_data = useful_data[(useful_data.sudo_length > 20) & (useful_data.sudo_length < 100)]
# Remove length column
useful_data = useful_data.drop(['sudo_length'], axis = 1)
# print 5 rows
useful_data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


#### Create Train and Test sets

In [3]:
train, test = train_test_split(useful_data, test_size = 0.2)
train.to_csv("./train_test_data/train.csv", index=False)
test.to_csv("./train_test_data/test.csv", index=False)

In [4]:
import torchtext
from torchtext.data import TabularDataset, Field, BucketIterator
import spacy

In [5]:
tok = spacy.load('en_core_web_sm')

In [6]:
def tokenize_en(sent):
    sent = sent.lower()
    return [item.text for item in tok.tokenizer(sent)]

In [7]:
sent = "hello their, why don't u have a seat?"
tokenize_en(sent)

['hello', 'their', ',', 'why', 'do', "n't", 'u', 'have', 'a', 'seat', '?']

In [8]:
SENT_FIELD = Field(sequential=True, tokenize=tokenize_en)
LABEL_FIELD = Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

data_fields = [
    ('Text', SENT_FIELD),
    ('Score', LABEL_FIELD)
]

In [9]:
train, val = TabularDataset.splits(
    path='./train_test_data',
    train='train.csv',
    validation = 'test.csv',
    format='csv',
    skip_header=True,
    fields=data_fields
)

In [10]:
SENT_FIELD.build_vocab(train)
SENT_FIELD.vocab.load_vectors('glove.6B.300d')

print("Number of words : {}".format(len(SENT_FIELD.vocab)))

Number of words : 101815


In [11]:
BATCH_SIZE = 32
dev = 'cuda'

train_iter, val_iter = BucketIterator.splits(
    (train, val), 
    batch_sizes=(BATCH_SIZE, BATCH_SIZE), 
    sort_key=lambda x: len(x.Text), 
    shuffle=True, 
    sort_within_batch=True,
    repeat=False,
    device = dev
)

In [12]:
import torch
import torch.nn as nn
import torch.optim as opt
from sklearn.metrics import confusion_matrix

In [13]:
class ClassificationMetrics:
    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.classes = list(range(num_classes))
        self.epsilon = 1e-12
        self.cmatrix = np.zeros((num_classes, num_classes), dtype = np.int64) + self.epsilon
        
        self.total_correct = 0
        self.total_examples = 0
        
    def update(self, pred, truth):
        pred = pred.cpu()
        truth = truth.cpu()
        
        _, idx = pred.topk(1)
        truth = truth.view(-1, 1)
        
        self.total_examples += len(truth)
        self.total_correct += sum(idx == truth).item()
        
        val = confusion_matrix(truth, idx, labels=self.classes)
        
        self.cmatrix = self.cmatrix + val
        
        
    def precision_score(self):
        scores = {}
        for i in range(self.num_classes):
            scores[i] = self.cmatrix[i, i] / (sum(self.cmatrix[:, i]) + self.epsilon)
        
        return scores
    
    def recall_score(self):
        scores = {}
        for i in range(self.num_classes):
            scores[i] = self.cmatrix[i, i] / (sum(self.cmatrix[i, :]) + self.epsilon)
        
        return scores
    
    def scores(self, return_type = 'f1'):
        pscores = self.precision_score()
        rscores = self.recall_score()
        scores = {}
        for i in range(self.num_classes):
            if(pscores[i] == 0 and rscores[i] == 0):
                scores[i] = 0
            else:
                scores[i] = 2 * ((pscores[i] * rscores[i]) / (pscores[i] + rscores[i])  + self.epsilon)
            
        if return_type == 'f1':
            return scores
        elif return_type == 'all':
            all_scores = list(zip(pscores.values(), rscores.values(), scores.values()))
            t = {}
            for i in range(self.num_classes):
                t[i] = all_scores[i]
                
            return t
        else:
            raise Exception("Invalid argument for return type")
            
    def accuracy_score(self):
        return self.total_correct / self.total_examples
    
    def reset(self):
        self.total_correct = 0
        self.total_examples = 0
        self.cmatrix = np.zeros((self.num_classes, self.num_classes))
            
    def print_report(self):
        all_scores = self.scores('all')
        print("{:^15}\t{:^15}\t{:^15}\t{:^15}".format("Class", "Precision", "Recall", "F1-score"))
        for c, values in all_scores.items():
            print("{:^15}\t{:^15.3f}\t{:^15.3f}\t{:^15.3f}".format(c, values[0], values[1], values[2]))
            
        print("Accuracy : {:.5f} %".format(self.accuracy_score()))

In [14]:
class BiDirectionalLstm(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(BiDirectionalLstm, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_size = hidden_size
        self.cell = nn.LSTM(embedding_dim, hidden_size, bidirectional = True, dropout = 0.2)
        self.linear = nn.Linear(hidden_size * 2, num_classes)
        self.soft = nn.Softmax(dim=1)
        
    def forward(self, x, hstate = None):
        if hstate is None:
            hstate = self.init_hidden(self.hidden_size, x.shape[-1])
            
        cell_out, _ = self.cell(self.embedding(x), hstate)
        
        temp = torch.cat([cell_out[-1, :, :self.hidden_size], cell_out[0, :, self.hidden_size:]], axis = -1)
        
        out = self.linear(temp)
        
        return self.soft(out)
            
    def init_hidden(self, hidden_size, bs):
        return (torch.zeros(2, bs, hidden_size, device=dev), torch.zeros(2, bs, hidden_size, device=dev))
    
    def load_embeddings(self, embeddings):
        self.embedding.weight.data.copy_(embeddings)

In [15]:
VOCAB_SIZE = len(SENT_FIELD.vocab)
EMBEDDING_DIM = 300
HIDDEN_SIZE = 128
NUM_CLASSES = 5

In [16]:
import pandas as pd
from sklearn.utils import class_weight

train = pd.read_csv("./train_test_data/train.csv")
weight_array = class_weight.compute_class_weight('balanced', sorted(train.Score.unique()), train.Score)
del(train)

In [17]:
net = BiDirectionalLstm(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_CLASSES)
net.load_embeddings(SENT_FIELD.vocab.vectors)
net = net.cuda()
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(weight_array).to(dev))

optimizer = opt.Adagrad(net.parameters())

scheduler = opt.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

In [18]:
train_metrics = ClassificationMetrics(NUM_CLASSES)
val_metrics = ClassificationMetrics(NUM_CLASSES)

In [19]:
from tqdm import tqdm
import copy
N_EPOCH = 20

tloss = []
tacc = []
vloss = []
vacc = []

best_val_acc = 0
best_model = copy.deepcopy(net)

for epoch in range(N_EPOCH):
    train_metrics.reset()
    losses = []
    net.train()
    for batch in tqdm(train_iter):
        optimizer.zero_grad()
        labels = batch.Score - 1
        pred = net(batch.Text)

        loss = criterion(pred, labels)
        loss.backward()
        losses.append(loss.item())
        optimizer.step()

        train_metrics.update(pred, labels)
    
    print("Training Run\nEpoch : {} Loss : {:.5f}".format(epoch + 1, sum(losses) / len(losses)))
    train_metrics.print_report()
    tloss.append(sum(losses) / len(losses))
    tacc.append(train_metrics.accuracy_score() * 100)
    
    val_metrics.reset()
    val_losses = []
    net.eval()
    for batch in tqdm(val_iter):
        labels = batch.Score - 1
        pred = net(batch.Text)
        loss = criterion(pred, labels)
        val_losses.append(loss.item())
        val_metrics.update(pred, labels)
        
    print("Validation Run\nEpoch : {} Loss : {:.5f}".format(epoch + 1, sum(val_losses) / len(val_losses)))
    val_metrics.print_report()
    acc = val_metrics.accuracy_score() * 100
    vloss.append(sum(val_losses) / len(val_losses))
    vacc.append(acc)
    
    if(acc > best_val_acc):
        print("##\nNew Best Accuracy : {:.5f}\n##".format(acc))
        best_val_acc = acc
        best_model = copy.deepcopy(net)
        torch.save(best_model.state_dict(), "./models/network-val-acc-{:.2f}.pt".format(best_val_acc))
        print("Best model saved.")
        
    scheduler.step()

100%|██████████████████████████████████████████████████████████████████████████████| 7011/7011 [04:35<00:00, 25.46it/s]
  1%|▍                                                                                | 9/1753 [00:00<00:19, 88.47it/s]

Training Run
Epoch : 1 Loss : 1.56462
     Class     	   Precision   	    Recall     	   F1-score    
       0       	     0.217     	     0.437     	     0.290     
       1       	     0.095     	     0.011     	     0.020     
       2       	     0.141     	     0.007     	     0.014     
       3       	     0.226     	     0.005     	     0.011     
       4       	     0.727     	     0.882     	     0.797     
Accuracy : 0.62441 %


100%|█████████████████████████████████████████████████████████████████████████████| 1753/1753 [00:14<00:00, 121.82it/s]


Validation Run
Epoch : 1 Loss : 1.52646
     Class     	   Precision   	    Recall     	   F1-score    
       0       	     0.213     	     0.772     	     0.334     
       1       	     0.117     	     0.007     	     0.013     
       2       	     0.127     	     0.012     	     0.023     
       3       	     0.216     	     0.016     	     0.030     
       4       	     0.808     	     0.797     	     0.803     
Accuracy : 0.60052 %
##
New Best Accuracy : 60.05242
##


  0%|                                                                                         | 0/7011 [00:00<?, ?it/s]

Best model saved.


  1%|█                                                                               | 92/7011 [00:03<04:42, 24.49it/s]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(list(range(len(tloss))), tloss, label="Training")
plt.plot(list(range(len(vloss))), vloss, label = "Testing")
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Loss Value")
plt.title("Loss Curves")

In [None]:
plt.plot(list(range(len(tacc))), tacc, label="Training")
plt.plot(list(range(len(vacc))), vacc, label = "Testing")
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Accuracy Value")

In [None]:
pos_sent = "This food was good but i didn't like the place. Although, love the desert!"

pos_rev = SENT_FIELD.process([SENT_FIELD.preprocess(pos_sent)])

net.eval()
pred = net(pos_rev.to(dev))

print("Review Rating as predicted : {}".format(pred.topk(1)[1].item() + 1))

In [None]:
pos_sent = "I just hated that food. Not recommended at all."

pos_rev = SENT_FIELD.process([SENT_FIELD.preprocess(pos_sent)])

net.eval()
pred = net(pos_rev.to(dev))

print("Review Rating as predicted : {}".format(pred.topk(1)[1].item() + 1))

In [None]:
vocab_dict = {}
for z in range(len(SENT_FIELD.vocab)):
    vocab_dict[SENT_FIELD.vocab.itos[z]] = z

import pickle

with open("./vocabdict.pkl", "wb") as f:
    pickle.dump([vocab_dict], f)