In [36]:
import pandas as pd
import numpy as np

import torch 
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd

import torchtext
from torchtext import data
from torchtext import datasets

import spacy

import re 
import time
import pickle
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
reviewdf = pd.read_csv("./data/Reviews.csv", usecols=["Score","Text"])

#Make all review scores equal distribution by undersampling
balanced = None
for i in reviewdf["Score"].unique():
    balanced = pd.concat([balanced, reviewdf[reviewdf["Score"] == i][0:29500]])
balanced = balanced.sample(frac=1).reset_index(drop=True)
del reviewdf
balanced.head()

Unnamed: 0,Score,Text
0,1,Probably because it doesn't taste like coffee....
1,4,Stash have tried very hard to get quality prod...
2,4,this a great snack... the spicy taste is a kic...
3,3,One of the biggest challenge to mix more than ...
4,2,"I liked the tree, but when it arrived, I felt ..."


In [7]:
def clean_review(x):
    x = re.sub('\\\\|<[^>]+>', '', x) #remove <br>
    x = re.sub(r'\([^)]*\)', '', x) #remove (in-between parenthesis)
    x = x.replace('"',"")
    x = x.replace("'","")
    return x 

balanced["Text"] = balanced["Text"].apply(lambda x: clean_review(x))
balanced[0:29500*4].to_csv('amazon_train.csv', index=False)
balanced[29500*4:].to_csv('amazon_test.csv', index=False)

In [28]:
SCORE = data.LabelField(dtype=torch.long)
REVIEW = data.Field(tokenize='spacy', lower=True, include_lengths=True)
print('Finished loading spacy.')

fields = [('score', SCORE), ('review', REVIEW)]

train_data, test_data = data.TabularDataset.splits(
                                        path = '.',
                                        train = 'amazon_train.csv',
                                        test = 'amazon_test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)
train_data, valid_data = train_data.split(split_ratio=0.8)
print('Finished loading data.')

Finished loading spacy.
Finished loading data.


In [29]:
vocab_size = 20000
batch_size = 100

REVIEW.build_vocab(train_data, max_size = vocab_size, vectors="glove.6B.100d", unk_init = torch.Tensor.normal_)
SCORE.build_vocab(train_data)
print('Finished building vocab')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = batch_size,
    device = device,
    sort_within_batch = True,
    sort_key=lambda x: len(x.review)
)

test_iterator = data.Iterator(
    test_data, 
    batch_size = batch_size, 
    device = device, 
    sort_within_batch = True, 
    sort_key=lambda x: len(x.review)
)

In [30]:
class BiLSTM_RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout, padding_idx):
        super(BiLSTM_RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, review, review_length):        
        embedded = self.dropout(self.embedding(review))
        padded = nn.utils.rnn.pack_padded_sequence(embedded, review_length)
        hidden = self.lstm(padded)[1][0]
        cat = torch.cat((hidden[-2], hidden[-1]), dim = 1)
        return self.fc(cat.squeeze(0))

In [39]:
vocab_size = len(REVIEW.vocab)
embedding_dim = 100
hidden_dim = 100
output_dim = 5
num_layers = 1
dropout = 0.5
padding_index = REVIEW.vocab.stoi['<pad>']
unknown_index = REVIEW.vocab.stoi['<unk>']

model = BiLSTM_RNN(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout, padding_index)

#Load pretrained embeddings
pretrained_embeddings = REVIEW.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

#Reset unknown and padding vectors
model.embedding.weight.data[unknown_index] = torch.zeros(embedding_dim, device=device)
model.embedding.weight.data[padding_index] = torch.zeros(embedding_dim, device=device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [40]:
def accuracy(preds, y):
    num_correct = torch.sum(torch.argmax(preds, dim=1, keepdim=False) == y)
    return num_correct.item()/len(y)

def train(model, iterator, optimizer, criterion):
    loss, acc = 0, 0
    model.train()

    for batch in iterator:
        optimizer.zero_grad()   
        preds = model(*batch.review)

        batch_loss = criterion(preds, batch.score)
        batch_loss.backward()
        optimizer.step()
        
        loss += batch_loss.item()
        acc += accuracy(preds, batch.score)
        
    return loss/len(iterator), acc/len(iterator)

def evaluate(model, iterator, criterion):
    loss, acc = 0, 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            preds = model(*batch.review)
            batch_loss = criterion(preds, batch.score)
            loss += batch_loss.item()
            acc += accuracy(preds, batch.score)
        
    return loss/len(iterator), acc/len(iterator)

We only train for 5 epochs here, but I am able to get around 67% validation accuracy with 20 epochs. However, training accuracy surges beyond 80% which implies heavy over-fitting. 

In [41]:
epochs = 5
best_loss = np.inf

for epoch in np.arange(epochs):
    start = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end = time.time()

    duration = time.strftime('%H:%M:%S', time.gmtime(end - start))

    if valid_loss < best_loss:
        best_loss = valid_loss
        with open('model.pkl', 'wb') as f:
            pickle.dump(model, f)

    print(f'\nEpoch {epoch + 1} at {duration}')
    print(f'Train Loss: {train_loss:.3f} - Validation Loss: {valid_loss:.3f}')
    print(f'Train Acc: {train_acc:.2f} - Validation Acc: {valid_acc:.2f}')


Epoch 1 at 00:00:34
Train Loss: 1.341 - Validation Loss: 1.126
Train Acc: 0.40 - Validation Acc: 0.51

Epoch 2 at 00:00:34
Train Loss: 1.119 - Validation Loss: 1.062
Train Acc: 0.52 - Validation Acc: 0.54

Epoch 3 at 00:00:34
Train Loss: 1.028 - Validation Loss: 0.993
Train Acc: 0.56 - Validation Acc: 0.58

Epoch 4 at 00:00:34
Train Loss: 0.970 - Validation Loss: 0.971
Train Acc: 0.59 - Validation Acc: 0.59

Epoch 5 at 00:00:34
Train Loss: 0.923 - Validation Loss: 0.933
Train Acc: 0.61 - Validation Acc: 0.61


In [44]:
with open('model.pkl','rb') as f:
    model = pickle.load(f)
model.to(device)

test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}')
print(f'Test Acc: {test_acc:.2f}%')

Test Loss: 0.941
Test Acc: 0.60%


In [47]:
nlp = spacy.load('en')

In [131]:
def predict(review, tokenizer, review_vocab, score_vocab, print_prob=False):
    tokens = [t.text for t in tokenizer(review)]
    indices = [review_vocab.stoi[t] for t in tokens]
    
    with torch.no_grad():
        input = torch.LongTensor(indices).to(device).unsqueeze(1)
        likelihood = model(input, torch.tensor([len(tokens)]))
        probs = torch.softmax(likelihood, 0)
        pred_index = probs.argmax().item()
        scores = np.array(score_vocab.itos[0:5]).astype(int)
        prediction = score_vocab.itos[pred_index]
        
        if print_prob:
            print('Probability distribution: ', probs[np.argsort(scores)].cpu().numpy())
        
    return prediction

In [132]:
prediction = predict("The way it tastes is great. Absolutely amazing.", nlp.tokenizer, REVIEW.vocab, SCORE.vocab, print_prob=True)
print(f'Predicted score: {prediction}')

Probability distribution:  [0.00427912 0.00304541 0.01441015 0.19444345 0.7838218 ]
Predicted score: 5


In [134]:
prediction = predict("It was okay, but nothing fancy.", nlp.tokenizer, REVIEW.vocab, SCORE.vocab, print_prob=True)
print(f'Predicted score: {prediction}')

Probability distribution:  [0.03027532 0.15796894 0.72844136 0.07762711 0.00568723]
Predicted score: 3


In [133]:
prediction = predict("Very disappointing from such a reputable company", nlp.tokenizer, REVIEW.vocab, SCORE.vocab, print_prob=True)
print(f'Predicted score: {prediction}')

Probability distribution:  [0.8194858  0.15229692 0.02525269 0.00197759 0.00098703]
Predicted score: 1


### Original dataset

In [94]:
reviewdf = pd.read_csv("./data/Reviews.csv", usecols=["Score","Text"])
reviewdf["Predicted_score"] = reviewdf["Text"].apply(lambda x: predict(x, SCORE.vocab))
reviewdf["Predicted_score"] = pd.to_numeric(reviewdf["Predicted_score"])

The confusion matrix indicates that our model is really adept at predicting 1-star ratings and 5-star ratings, but starts to get ambiguous at the 2, 3, 4 star ratings. This makes sense as there is a lot of room for slack. How different is a 2 from a 3? A 4 is also really close to a 5 and can be really difficult to pickup on. 

In [115]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, mean_absolute_error

cm = confusion_matrix(reviewdf["Score"], reviewdf["Predicted_score"])
cm

array([[ 44957,   2393,   1541,    379,   2998],
       [ 15492,   6595,   4143,   1039,   2500],
       [  9449,   4380,  15588,   6870,   6353],
       [  5317,    935,   9056,  27402,  37945],
       [ 16185,   1032,   6567,  32231, 307107]])

In [108]:
accuracy_score(reviewdf["Score"], reviewdf["Predicted_score"])

0.7065637676927244

In [110]:
f1_score(reviewdf["Score"], reviewdf["Predicted_score"], average="weighted")

0.6995135110714451

On average, this is how much we would be off by if we predicted the mean every time

In [119]:
mean_absolute_error(reviewdf["Score"], np.repeat(np.mean(reviewdf["Score"]), len(reviewdf)))

1.0435269810958099

Our model predictions are off by less than 0.5 stars on average. This is really good!

In [116]:
mean_absolute_error(reviewdf["Score"], reviewdf["Predicted_score"])

0.472675009763323

Original distribution of the dataset

In [128]:
reviewdf.groupby("Score").count()[["Text"]]/len(reviewdf)

Unnamed: 0_level_0,Text
Score,Unnamed: 1_level_1
1,0.091948
2,0.052368
3,0.07501
4,0.141885
5,0.638789
