In [1]:
import os
import torch
import torchtext
import pandas as pd
import numpy as np

import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader
from torchtext.data import (Dataset, Example, Field, 
                            LabelField, BucketIterator, Iterator)
from argparse import Namespace
from tqdm.notebook import tqdm as tqdm_notebook

In [44]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Args

In [2]:
args = Namespace(
    review_csv = "data/yelp/reviews_with_splits_lite.csv",
    model_state_file='model.pth',
    save_dir='model_storage/',
    label_field_name = "rating",
    data_field_name = "review",
    frequency_cutoff=25,
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
    cuda=False
)

##  Global Configurations

In [3]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [4]:
if args.expand_filepaths_to_save_dir:
    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

print("Using CUDA: {}".format(args.cuda))

args.device = torch.device("cuda" if args.cuda else "cpu")

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage/model.pth
Using CUDA: False


# Load Data

In [5]:
data = pd.read_csv(args.review_csv)

In [6]:
data.head()

Unnamed: 0,rating,review,split
0,negative,"on a recent visit to las vegas , my friends an...",train
1,positive,"excellent food ! we had the pompedoro , chicke...",train
2,positive,a great little glimpse back into old vegas . t...,train
3,positive,i was in phoenix for a couple of days for a co...,train
4,positive,what a treasure ! i have been doing yoga for y...,train


# Define Fields

In [7]:
import string
def tokenizer(review):
    """Simple tokenizer"""
    return [word for word in review.split(" ") 
            if word not in string.punctuation]

In [8]:
REVIEW = Field(tokenize=tokenizer, sequential=True, lower=True)
RATING = LabelField(dtype=torch.float32)

## Create Datasets TorchText

In [9]:
class ReviewDataset(Dataset):
    def __init__(self, review_df, fields):
        examples = []
        for i, row in tqdm_notebook(review_df.iterrows(), total=review_df.shape[0]):
            rating = row.rating
            review = row.review
            examples.append(Example.fromlist([review, rating], fields))
        super().__init__(examples, fields)
        
    @staticmethod
    def sort_key(ex):
        return len(ex.review)
    
    @classmethod
    def splits(cls, fields, review_df):
        train_data = cls(review_df[review_df.split == "train"], fields)
        val_data = cls(review_df[review_df.split == "val"], fields)
        test_data = cls(review_df[review_df.split == "test"], fields)
        
        return train_data, val_data, test_data

In [10]:
fields = [('review', REVIEW), ('rating', RATING)]
train_data, val_data, test_data = ReviewDataset.splits(fields, review_df=data)

HBox(children=(FloatProgress(value=0.0, max=39200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8400.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8400.0), HTML(value='')))




In [11]:
print("REVIEW\n",vars(train_data[0])["review"])
print("RATING\n",vars(train_data[0])["rating"])

REVIEW
 ['on', 'a', 'recent', 'visit', 'to', 'las', 'vegas', 'my', 'friends', 'and', 'i', 'decided', 'to', 'stay', 'at', 'the', 'monte', 'carlo', 'because', 'it', 'had', 'been', 'recommended', 'to', 'us', 'and', 'we', 'like', 'the', 'location', 'i', 'would', 'say', 'overall', 'that', 'we', 'had', 'a', 'nice', 'vacation', 'but', 'we', 'experienced', 'a', 'problem', 'at', 'the', 'end', 'of', 'our', 'stay', 'n', 'nafter', 'we', 'had', 'packed', 'our', 'bags', 'and', 'just', 'before', 'checking', 'out', 'i', 'called', 'for', 'a', 'bellman', 'to', 'take', 'our', 'bags', 'down', 'for', 'us', 'after', 'checking', 'out', 'and', 'driving', 'home', 'we', 'discovered', 'that', 'a', 'bottle', 'of', 'perfume', 'had', 'been', 'broken', 'in', 'one', 'of', 'our', 'bags', 'as', 'all', 'we', 'did', 'was', 'go', 'straight', 'from', 'the', 'hotel', 'back', 'to', 'our', 'house', 'we', 'concluded', 'the', 'bag', 'had', 'most', 'likely', 'been', 'mishandled', 'by', 'the', 'bellman', 'n', 'nwe', 'immediately'

## Build Vocab

In [12]:
REVIEW.build_vocab(train_data, min_freq=args.frequency_cutoff)

In [13]:
RATING.build_vocab(train_data)

In [14]:
vars(RATING.vocab)

{'freqs': Counter({'negative': 19600, 'positive': 19600}),
 'itos': ['negative', 'positive'],
 'unk_index': None,
 'stoi': defaultdict(None, {'negative': 0, 'positive': 1}),
 'vectors': None}

In [15]:
print("TOTAL_WORDS = ", len(REVIEW.vocab.itos))

TOTAL_WORDS =  7712


In [16]:
print(REVIEW.vocab.itos[:2])

['<unk>', '<pad>']


In [17]:
args.total_words = len(REVIEW.vocab.itos)

## Create One-Hot-Vectorizer

In [18]:
class Vectorizer(object):
    def __init__(self, review_field):
        self._review_field = review_field
        self._slicing = torch.tensor([i for i in # Exclude <pad> token
                                      range(len(self._review_field.vocab.itos)) if i !=1])
        self._dimension = len(self._slicing)
    
    def vectorize(self, batch_matrix):
        batch_size = batch_matrix.shape[1]
        one_hot = torch.zeros((batch_size, len(self._review_field.vocab.itos)), dtype=torch.long)
        indices = batch_matrix.T
        source = torch.ones_like(indices)
        # source = (indices != 1).long() # Exclude <pad> token
        one_hot.scatter_(1, indices, source)
        return one_hot[:, self._slicing] 

In [19]:
oh_vectorize = Vectorizer(REVIEW)

## Define Iterators

In [21]:
train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data), 
    batch_size = args.batch_size,
    sort_within_batch = True,
    device = args.device,
)

test_iterator = Iterator(test_data, batch_size=args.batch_size,
                        device=args.device, sort=False, sort_within_batch=False)

In [22]:
for batch in train_iterator:
    v_code = oh_vectorize.vectorize(batch.review)
    print(v_code)
    first = batch.review[:,0].sort().values
    decode = ' '.join([REVIEW.vocab.itos[idx] for idx in first if idx != 1])
    print(decode)
    first = v_code[0]
    decode = ' '.join([REVIEW.vocab.itos[idx] for idx, val in enumerate(first, 1)
                      if val!= 0])
    print("\n")
    print(decode)
    break

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
<unk> <unk> <unk> <unk> the the the the the the and and and and i i to to a a a a a a a was was was it it of of for for in in n n is that that this with t have have s had at at at at at were so there out if if get would back their an what only us been because ni ni also well over better best come chicken minutes room think bad table table took wasn long hour why nwe least breakfast waiting waiting waiting tasted served shrimp sat may rather sitting under rest eggs eggs spend simple mediocre hope happened nservice cocktail heat smile nfood shit ham dude nwell golden gate pig explanation lamp blackjack laying


<pad> the and i to a was it of for in n is that this with t have s had at were so there out if get would back their an what only us been because ni also well over be

# Model

In [23]:
class ReviewClassifier(nn.Module):
    """ a simple perceptron based classifier """
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, 
                             out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): a flag for the sigmoid activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch,)
        """
        y_out = self.fc1(x_in.squeeze()).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

## Training Loop

### Utilities

In [24]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'].format(train_state['epoch_index']))
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

## Initilization

In [25]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

In [26]:
classifier = ReviewClassifier(num_features=oh_vectorize._dimension)
classifier

ReviewClassifier(
  (fc1): Linear(in_features=7711, out_features=1, bias=True)
)

In [27]:
def train(data_loaders, classifier, optimizer, loss_func, train_bar):
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(data_loaders["train"]):
        # the training routine is these 5 steps:

        # --------------------------------------
        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output

        x_in = oh_vectorize.vectorize(batch_dict.review)
        y_pred = classifier(x_in=x_in.float())

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict.rating)
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        # -----------------------------------------
        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict.rating)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

        # update bar
        train_bar.set_postfix(loss=running_loss, 
                              acc=running_acc, 
                              epoch=epoch_index)
        train_bar.update()

    return running_loss, running_acc

In [28]:
def val(data_loaders, classifier, loss_func, val_bar):
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    with torch.no_grad():
        for batch_index, batch_dict in enumerate(data_loaders["val"]):

            # compute the output
            x_in = oh_vectorize.vectorize(batch_dict.review)
            y_pred = classifier(x_in=x_in.float())

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict.rating)
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict.rating)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            val_bar.set_postfix(loss=running_loss, 
                                acc=running_acc, 
                                epoch=epoch_index)
            val_bar.update()
    return running_loss, running_acc

In [29]:
data_loaders = {"train": train_iterator, "val": val_iterator, "test": test_iterator}
datasets = {"train": train_data, "val": val_data, "test": test_data}

In [30]:
classifier = classifier.to(args.device)

loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

train_bar = tqdm_notebook(desc='split=train',
                          total=len(data_loaders["train"]), 
                          position=1, 
                          leave=True)

val_bar = tqdm_notebook(desc='split=val',
                        total=len(data_loaders["val"]), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset
        running_loss, running_acc = train(data_loaders, classifier, 
                                          optimizer, loss_func, train_bar)
        
        writer.add_scalar('Loss/train', running_loss, epoch_index)
        writer.add_scalar('Accuracy/train', running_acc, epoch_index)
        
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        running_loss, running_acc = val(data_loaders, classifier,
                                       loss_func, val_bar)
        
        writer.add_scalar('Loss/val', running_loss, epoch_index)
        writer.add_scalar('Accuracy/val', running_acc, epoch_index)
        
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        if train_state['stop_early']:
            break

except KeyboardInterrupt:
    print("Exiting loop")

HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=307.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=66.0, style=ProgressStyle(description_wid…

In [45]:
%tensorboard --logdir runs

Reusing TensorBoard on port 6007 (pid 43615), started 0:00:36 ago. (Use '!kill 43615' to kill it.)

In [31]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)


running_loss = 0.
running_acc = 0.
classifier.eval()

with torch.no_grad():
    for batch_index, batch_dict in tqdm_notebook(enumerate(data_loaders["test"]), 
                                                total = len(data_loaders["test"])):
        # compute the output
        x_in = oh_vectorize.vectorize(batch_dict.review)
        y_pred = classifier(x_in=x_in.float())

        # compute the loss
        loss = loss_func(y_pred, batch_dict.rating.float())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # compute the accuracy
        acc_t = compute_accuracy(y_pred, batch_dict.rating)
        running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

HBox(children=(FloatProgress(value=0.0, max=66.0), HTML(value='')))




In [32]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.207
Test Accuracy: 92.03


## Inference

In [33]:
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [34]:
def predict_rating(review, classifier, review_field, rating_field,
                   oh_vectorizer, decision_threshold=0.5):
    """Predict the rating of a review
    
    Args:
        review (str): the text of the review
        classifier (ReviewClassifier): the trained model
        vectorizer (ReviewVectorizer): the corresponding vectorizer
        decision_threshold (float): The numerical boundary which separates the rating classes
    """
    review = preprocess_text(review)
    review = review_field.tokenize(review)
    review = review_field.numericalize([review])
    vectorized_review = oh_vectorizer.vectorize(review)
    
    result = classifier(vectorized_review.float())
    
    probability_value = torch.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0
        
    return rating_field.itos[index]

In [35]:
test_review = "This book is meh"

classifier = classifier.cpu()
prediction = predict_rating(test_review, classifier, REVIEW, RATING.vocab, oh_vectorize, decision_threshold=0.5)
print("{} -> {}".format(test_review, prediction))

This book is meh -> negative


## Interpretability

In [36]:
classifier.fc1.weight.shape

torch.Size([1, 7711])

In [37]:
lookup_index = ['<unk>'] + REVIEW.vocab.itos[2:]

In [38]:
# Sort weights
fc1_weights = classifier.fc1.weight.detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# Top 20 words
print("Influential words in Positive Reviews:")
print("--------------------------------------")
for i in range(20):
    print(lookup_index[indices[i]])
    
print("====\n\n\n")

# Top 20 negative words
print("Influential words in Negative Reviews:")
print("--------------------------------------")
indices.reverse()
for i in range(20):
    print(lookup_index[indices[i]])

Influential words in Positive Reviews:
--------------------------------------
excellent
delicious
amazing
disappoint
outstanding
perfect
awesome
perfection
yum
incredible
fantastic
ngreat
great
downside
superb
heaven
love
hooked
wonderful
perfectly
====



Influential words in Negative Reviews:
--------------------------------------
worst
horrible
terrible
bland
awful
meh
mediocre
tasteless
poisoning
disgusting
poor
eh
overpriced
disappointment
inedible
rude
disappointing
lacked
sucks
overrated
