# Loading data

Dataset available in the [original author's Drive](https://drive.google.com/file/d/1Lmv4rsJiCWVs1nzs4ywA9YI-ADsTf6WB/view).

In [2]:
import pandas as pd
# Read the reviews CSV; the path is relative, so the file must sit in the working directory.
data = pd.read_csv("reviews_with_splits_lite.csv")
# Bare last expression: the notebook renders a (truncated) preview of the frame.
data

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train
...,...,...,...
55995,positive,"great food . wonderful , friendly service . i ...",test
55996,positive,charlotte should be the new standard for moder...,test
55997,positive,get the encore sandwich ! ! make sure to get i...,test
55998,positive,i m a pretty big ice cream gelato fan . pretty...,test


# Creating a training set

This is today's main focus: understanding the training pipeline and how to use torch's `Dataset` and `DataLoader` classes.

## Vocabulary
We will start by building the vocabulary class.

In [3]:
class Vocabulary:
    '''
    Two-way mapping between tokens and integer indices.

    New tokens receive the next free index (the current vocabulary size).
    Optionally, an "unknown" token is registered so that looking up a token
    that was never added returns that token's index instead of failing.
    '''
    def __init__(self, token_to_idx = None, add_unk = True, unk_token = '<UNK>'):
        '''
        token_to_idx is a dictionary that maps each token to an integer.
        It is copied, so the caller's dictionary is never modified.
        If add_unk is True, `unk_token` is added to the vocabulary and tokens
        that have never been seen are mapped to its index by `lookup_token`.
        '''
        # Copy instead of aliasing: add_token (called below for the unknown
        # token) must not silently mutate the dictionary the caller passed in.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        # Creates a dictionary with pointers in the inverse direction
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 is a sentinel meaning "no unknown token registered";
        # it is replaced by the real index when add_unk is True.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    # The original implementation does this for serialization,
    # but again, I'm not too worried about caching for now
    def to_dictionary(self):
        '''
        Returns both mappings in a dictionary with the keys
        'token_to_idx' and 'idx_to_token'.
        '''
        return {
            'token_to_idx': self._token_to_idx,
            'idx_to_token': self._idx_to_token
        }

    def add_token(self, token):
        '''
        Adds one new token and returns its index.
        If the token already exists, its existing index is returned.
        '''
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens are appended: the next index is the current size
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        '''
        Add many tokens in a single run.
        Returns the list of their indices, in the same order.
        '''
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        '''
        Returns the token's index.
        If an unknown token was registered, unseen tokens map to its index;
        otherwise an unseen token raises KeyError.
        '''
        if self.unk_index >= 0:
            # An unknown token exists: .get(key, default) returns its index
            # instead of raising when the token is not in the dictionary.
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        '''
        Get the token associated to some index.
        Raises KeyError if the index is not in the vocabulary.
        '''
        if index not in self._idx_to_token:
            # %s (not %d) so that a non-integer key still produces the intended
            # KeyError instead of a TypeError from the string formatting.
            raise KeyError('The index (%s) is not yet in the vocabulary' % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary (size = %d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)



Let us play a little with the class to understand it.

In [4]:
# With the default add_unk=True the constructor already registers '<UNK>',
# so a fresh vocabulary reports size 1.
voc = Vocabulary()
print(voc)

<Vocabulary (size = 1)>


We can add words.

In [5]:
# New tokens get consecutive indices after '<UNK>' (which holds index 0).
words = ['i', 'love', 'you']
voc.add_many(words)
print(voc)
# Show both directions of the mapping
print(voc.to_dictionary())

<Vocabulary (size = 4)>
{'token_to_idx': {'<UNK>': 0, 'i': 1, 'love': 2, 'you': 3}, 'idx_to_token': {0: '<UNK>', 1: 'i', 2: 'love', 3: 'you'}}


And check the indices and words.

In [6]:
print(voc.lookup_token('i'))    # known token -> its index
print(voc.lookup_token('her'))  # unseen token -> falls back to the '<UNK>' index
print(voc.lookup_index(1))      # reverse lookup: index -> token

1
0
i


## Vectorizer

Now we move on to construct the vectorizer.

In [7]:
import string
from collections import Counter
import numpy as np

class ReviewVectorizer:
    '''
    Turns review text into fixed-size vectors using a review vocabulary,
    and carries the rating vocabulary used to map labels to integers.
    '''
    def __init__(self, review_vocab, rating_vocab):
        '''
        review_vocab maps review tokens to indices; rating_vocab maps
        the class labels (ratings) to integers.
        '''
        self.review_vocab, self.rating_vocab = review_vocab, rating_vocab

    def vectorize(self, review):
        '''
        Collapsed one-hot encoding of a review: a float32 vector with one
        slot per vocabulary entry, set to 1 when the token occurs.
        '''
        encoding = np.zeros(len(self.review_vocab), dtype = np.float32)

        # Punctuation tokens are left out of the encoding
        kept_tokens = (piece for piece in review.split(" ")
                       if piece not in string.punctuation)
        for piece in kept_tokens:
            position = self.review_vocab.lookup_token(piece)
            encoding[position] = 1
        return encoding

    @classmethod
    def from_dataframe(cls, review_df, cutoff = 25):
        '''
        Builds both vocabularies from a pd DataFrame and returns a vectorizer.
        Only words that occur more than `cutoff` times are added to the
        review vocabulary.
        '''
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk= False)

        # Sorted so that the label -> index assignment is deterministic
        distinct_ratings = sorted(set(review_df.rating))
        for label in distinct_ratings:
            rating_vocab.add_token(label)

        # Frequency of every non-punctuation token across all reviews
        word_counts = Counter(
            piece
            for text in review_df['review']
            for piece in text.split(" ")
            if piece not in string.punctuation
        )

        # Keep only the words that are frequent enough
        for piece, frequency in word_counts.items():
            if frequency > cutoff:
                review_vocab.add_token(piece)

        return cls(review_vocab, rating_vocab)

Let us again play a little with this.

In [8]:
# Only the review vocabulary is used by vectorize, so None is passed for the rating vocabulary.
vectorizer = ReviewVectorizer(voc, None)
# 'hate', 'so' and 'much' are not in `voc`, so they all land on the '<UNK>' slot; '.' is skipped as punctuation.
vectorizer.vectorize('i hate you so much .')

array([1., 1., 0., 1.], dtype=float32)

Makes sense as

In [9]:
# Print the mapping again to compare the vector positions with the token indices.
print(voc.to_dictionary())

{'token_to_idx': {'<UNK>': 0, 'i': 1, 'love': 2, 'you': 3}, 'idx_to_token': {0: '<UNK>', 1: 'i', 2: 'love', 3: 'you'}}


## Dataset and DataLoader

Now we make the class that will generate the whole dataset. We will use Torch's Dataset for that.

In [10]:
from torch.utils.data import Dataset
class ReviewDataset(Dataset):
    '''
    Torch Dataset over a reviews DataFrame that carries a 'split' column.
    One split ('train', 'val' or 'test') is active at a time; `len` and
    indexing always refer to the active split.
    '''
    def __init__(self, review_df, vectorizer):
        self.review_df = review_df
        # Kept public on purpose, so no accessor method is needed
        self.vectorizer = vectorizer

        # For every split, store its rows and its size both as attributes
        # (train_df / train_size, val_df / val_size, test_df / test_size)
        # and in the lookup table used by set_split.
        self._lookup_dict = {}
        for split_name in ('train', 'val', 'test'):
            subset = self.review_df[self.review_df['split'] == split_name]
            subset_size = len(subset)
            setattr(self, split_name + '_df', subset)
            setattr(self, split_name + '_size', subset_size)
            self._lookup_dict[split_name] = (subset, subset_size)

        # __len__ and __getitem__ work on the active split, which is train by default
        self.set_split('train')

    def set_split(self, split):
        '''
        Selects which split the dataset currently exposes.
        '''
        frame, size = self._lookup_dict[split]
        self._target_split = split
        self._target_df = frame
        self._target_size = size

    # Convenience constructor: build the dataset straight from the csv path
    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        '''
        Reads the csv and creates the vectorizer from it.
        The vectorizer is built from the rows of the train split only.
        '''
        review_df = pd.read_csv(review_csv)
        is_train = review_df['split'] == 'train'
        vectorizer = ReviewVectorizer.from_dataframe(review_df[is_train])
        return cls(review_df, vectorizer)

    # The two methods below are what the DataLoader relies on
    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        '''
        Returns the row at position `index` of the active split as a dict
        holding the vectorized review and the integer rating.
        '''
        record = self._target_df.iloc[index]  # positional access on the pd DataFrame
        vectorizer = self.vectorizer
        features = vectorizer.vectorize(record['review'])
        label = vectorizer.rating_vocab.lookup_token(record['rating'])
        return {'x_data': features, 'y_data': label}

    def get_num_batches(self, batch_size):
        '''
        Number of full batches of size `batch_size` in the active split.
        '''
        n_rows = len(self)
        return n_rows // batch_size

Let us try creating the class, just to check if everything is fine.

In [11]:
# Quick smoke test. NOTE: here the vocabulary is built from the whole frame
# (train, val and test rows alike), which is fine for a demo only.
vectorizer = ReviewVectorizer.from_dataframe(data)
print(vectorizer.vectorize('i hate this restaurant .'))
review = ReviewDataset(data, vectorizer)
print(review.train_df.head())
# Indexing goes through __getitem__ on the active split ('train' by default)
print(review[0])

[0. 0. 0. ... 0. 0. 0.]
     rating                                             review  split
0  negative  terrible place to work for i just heard a stor...  train
1  negative   hours , minutes total time for an extremely s...  train
2  negative  my less than stellar review is for service . w...  train
3  negative  i m granting one star because there s no way t...  train
4  negative  the food here is mediocre at best . i went aft...  train
{'x_data': array([1., 1., 1., ..., 0., 0., 0.], shape=(8945,), dtype=float32), 'y_data': 0}


We can also load directly from the csv path.

In [12]:
# This rebinds `review`; its vectorizer is now built from the train split only,
# so the vocabulary (and therefore x_data) is smaller than in the previous cell.
review = ReviewDataset.load_dataset_and_make_vectorizer("reviews_with_splits_lite.csv")
print(review.train_df.head())
print(review[0])

     rating                                             review  split
0  negative  terrible place to work for i just heard a stor...  train
1  negative   hours , minutes total time for an extremely s...  train
2  negative  my less than stellar review is for service . w...  train
3  negative  i m granting one star because there s no way t...  train
4  negative  the food here is mediocre at best . i went aft...  train
{'x_data': array([1., 1., 1., ..., 0., 0., 0.], shape=(7326,), dtype=float32), 'y_data': 0}


Finally, we create the DataLoader. Its objective is to create the batches for training.

In [13]:
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle = True, drop_last = True):
    '''
    Wraps `dataset` in a torch DataLoader and returns it.
    `batch_size`, `shuffle` and `drop_last` are forwarded unchanged.
    '''
    # Everything runs on the CPU here, so the batches are not moved to any device.
    return DataLoader(dataset,
                      batch_size = batch_size,
                      shuffle = shuffle,
                      drop_last = drop_last)

# Training

We now define training. Instead of using the more complicated training loop (this will be the topic of tomorrow), I'll just use the training loop of yesterday's implementation.

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    '''
    Multilayer perceptron with one hidden layer and a single output logit:
    Linear(input_size, hidden_dimension) -> ReLU -> Linear(hidden_dimension, 1).
    In general, we like to have models as classes in Torch.
    '''
    def __init__(self, input_size, hidden_dimension):
        '''
        input_size is the number of input features;
        hidden_dimension is the number of hidden units.
        '''
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_dimension)
        self.fc2 = nn.Linear(hidden_dimension, 1)
        self.input_size = input_size

    def forward(self, x_in, apply_sigmoid = False):
        '''
        Computes the output for `x_in`, whose last dimension must equal
        `input_size`. The output has the same leading dimensions as `x_in`
        and a last dimension of size 1.

        Returns raw logits by default, and probabilities when
        `apply_sigmoid` is True.
        Raises ValueError when the last dimension of `x_in` is wrong.
        '''
        # Check the feature (last) dimension: nn.Linear accepts any number of
        # leading dimensions, so 1-D inputs are valid as well.
        if x_in.shape[-1] != self.input_size:
            raise ValueError("Input dimension of the object must be equal to the model's expected diemension!")
        intermediate = F.relu(self.fc1(x_in))
        y_out = self.fc2(intermediate)
        # BCEWithLogitsLoss applies the sigmoid itself, so during training the
        # raw logits are returned; the sigmoid is only applied on request.
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

This is for computing the accuracy. I basically copied the original implementation, removing the device stuff.

In [15]:
def compute_accuracy(y_pred, y_target):
    '''
    Accuracy, in percent, of the logits `y_pred` against the 0/1 labels
    `y_target`. A logit is counted as class 1 when its sigmoid exceeds 0.5.
    Both inputs must hold the same number of elements.
    '''
    # Flatten both tensors: (N, 1) logits against (N,) targets would otherwise
    # broadcast to an (N, N) comparison and inflate the count, and a 0-d
    # prediction (batch of one after squeeze) has no len().
    y_pred_indices = (torch.sigmoid(y_pred.reshape(-1)) > 0.5).long()
    y_target = y_target.reshape(-1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [16]:
import torch.optim as optim
import torch

# Hyperparameters
hidden_dimension = 5
lr = 0.001
n_epochs = 10
# BCEWithLogitsLoss takes raw logits, which is why the classifier is called
# without apply_sigmoid below.
loss_func = nn.BCEWithLogitsLoss()
batch_size = 128

# The model input size is the size of the review vocabulary
vectorizer = review.vectorizer
classifier = MLP(len(vectorizer.review_vocab), hidden_dimension)

optimizer = optim.Adam(classifier.parameters(), lr = lr)

for epoch in range(n_epochs):
    # ---- Training pass ----
    review.set_split('train')
    batch_generator = generate_batches(review, batch_size = batch_size)
    running_loss = 0.
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # 1. Zero the gradient
        optimizer.zero_grad()

        # 2. Prediction
        # squeeze only the output dimension, (batch, 1) -> (batch,), so that a
        # batch of size one is never collapsed to a 0-d tensor
        y_pred = classifier(x_in = batch_dict['x_data'].float()).squeeze(dim = 1)

        # 3. Compute loss
        loss = loss_func(y_pred, batch_dict['y_data'].float())
        loss_t = loss.item()
        # Incremental mean of the batch losses seen so far in this epoch
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # 4. Backpropagate
        loss.backward()

        # 5. Optimize
        optimizer.step()

    # ---- Evaluation pass: the parameters must not change ----
    classifier.eval()
    review.set_split('val')
    # No need to shuffle for evaluation. drop_last keeps its default (True) so
    # that all batches have the same size and the running mean of the
    # per-batch accuracies is an exact mean over the evaluated examples.
    batch_generator = generate_batches(review, batch_size = batch_size, shuffle = False)
    running_acc = 0.
    # no_grad: no autograd graph is built, which saves memory and time
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred = classifier(x_in = batch_dict['x_data'].float()).squeeze(dim = 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_data'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

    print('Epoch: ', epoch)
    print('Training loss', running_loss)
    print('Validation accuracy: ', running_acc)

Epoch:  0
Training loss 0.4013539999236466
Validation accuracy:  91.6826923076923
Epoch:  1
Training loss 0.22029797099774182
Validation accuracy:  92.13942307692305
Epoch:  2
Training loss 0.17796104303956814
Validation accuracy:  92.47596153846156
Epoch:  3
Training loss 0.15636351838513138
Validation accuracy:  92.35576923076921
Epoch:  4
Training loss 0.1415344056448126
Validation accuracy:  92.12740384615385
Epoch:  5
Training loss 0.13060202868350987
Validation accuracy:  91.88701923076923
Epoch:  6
Training loss 0.12188972649621027
Validation accuracy:  91.82692307692311
Epoch:  7
Training loss 0.11521302539782183
Validation accuracy:  91.63461538461539
Epoch:  8
Training loss 0.10985134475030538
Validation accuracy:  91.35817307692305
Epoch:  9
Training loss 0.10437196455408936
Validation accuracy:  91.45432692307692


We see some very strong overfitting, but again, this will be treated tomorrow. For now, let us follow the original implementation in doing some qualitative tests.

In [48]:
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
    '''
    Predicts the rating of a single review text.

    review is the raw review string, classifier maps a (1, n_features)
    tensor to one logit, and vectorizer provides `vectorize` and
    `rating_vocab`. The classifier is switched to eval mode.

    Returns a tuple (rating label, probability), where the probability is
    the sigmoid of the logit. The label is the one stored at index 1 of the
    rating vocabulary when the probability is at least `decision_threshold`,
    and the one at index 0 otherwise.
    '''
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    classifier.eval()
    # Pure inference: skip building the autograd graph
    with torch.no_grad():
        # view(1, -1) turns the vector into a batch holding a single example
        result = classifier(vectorized_review.view(1, -1))
        probability_value = torch.sigmoid(result).item()

    index = 0 if probability_value < decision_threshold else 1

    return vectorizer.rating_vocab.lookup_index(index), probability_value

In [53]:
# Qualitative check on a hand-written review
test_review = "this is a pretty amazing restaurant"

# `prediction` is the rating label, `probability` the sigmoid of the classifier's logit
prediction, probability = predict_rating(test_review, classifier, vectorizer, decision_threshold=0.5)
print("The review '{}' is {}".format(test_review, prediction.upper()), 'with probability', probability)


The review 'this is a pretty amazing restaurant' is POSITIVE with probability 0.8731887936592102


In [56]:
# We inspect fc1 (the input layer): its input weights line up one-to-one with
# the vocabulary indices, so this is where single words can be read off.
# Only one hidden unit is examined here.
hidden_unit = 0
fc1_weights = classifier.fc1.weight.detach()[hidden_unit]

# Whether a large input weight pushes towards "positive" depends on the sign
# with which this hidden unit enters the output logit (fc2). Multiplying by
# that sign keeps the labels below correct for any initialization, instead of
# relying on fc2's weight happening to be negative.
output_sign = torch.sign(classifier.fc2.weight.detach()[0, hidden_unit])
_, indices = torch.sort(fc1_weights * output_sign, dim=0, descending=True)
indices = indices.numpy().tolist()

print("Influential words in positive reviews:")
print("--------------------------------------")
# Slicing (rather than range(20)) also works for vocabularies with fewer than 20 words
for index in indices[:20]:
    print(vectorizer.review_vocab.lookup_index(index))

print("====\n\n\n")

print("Influential words in negative reviews:")
print("--------------------------------------")
# Reversed order: the words pushing hardest towards the negative class come first
indices.reverse()
for index in indices[:20]:
    print(vectorizer.review_vocab.lookup_index(index))


Influential words in positive reviews:
--------------------------------------
pleasantly
chinatown
fantastic
deliciousness
amazed
hooked
nthank
delicious
jokes
ngreat
delightful
superb
perfection
bomb
lawn
drawback
notch
mmmm
maker
frills
====



Influential words in negative reviews:
--------------------------------------
worst
meh
nmaybe
slowest
mediocre
bland
awful
unacceptable
underwhelmed
horrible
cancelled
tasteless
rude
embarrassing
unimpressed
blah
disgusting
terrible
horrendous
roach


In [72]:
# Show which integer label corresponds to which rating
print(vectorizer.rating_vocab.to_dictionary())

{'token_to_idx': {'negative': 0, 'positive': 1}, 'idx_to_token': {0: 'negative', 1: 'positive'}}


In [80]:
# Shape of the output layer's weight matrix
print(classifier.fc2.weight.shape)

torch.Size([1, 5])


In [98]:
# Combine both layers into one weight per vocabulary word.
W1 = classifier.fc1.weight.detach()      # weights of the input layer
W2 = classifier.fc2.weight.detach()      # weights of the output layer

# Effective weights: class × vocab
# This is the plain product of the two weight matrices, so the ReLU between
# the layers and both biases are ignored: it is a linear approximation.
W_eff = W2 @ W1                          

# Largest effective weights push the logit up, i.e. towards rating index 1
topk = torch.topk(W_eff, k=20)
print("Influential words in positive reviews:")
print("--------------------------------------")
for i in topk.indices[0]:
        print(vectorizer.review_vocab.lookup_index(i.item()))

print("Influential words in negative reviews:")
print("--------------------------------------")
# Negating the weights turns "top-k" into the k most negative effective weights
topk = torch.topk(-W_eff, k=20)
for i in topk.indices[0]:
        print(vectorizer.review_vocab.lookup_index(i.item()))

Influential words in positive reviews:
--------------------------------------
pleasantly
fantastic
chinatown
delicious
deliciousness
hooked
amazed
nthank
ngreat
jokes
bomb
superb
notch
mmmm
delightful
lawn
perfection
drawback
boba
frills
Influential words in negative reviews:
--------------------------------------
worst
meh
nmaybe
slowest
mediocre
bland
awful
unacceptable
underwhelmed
horrible
tasteless
cancelled
rude
blah
embarrassing
disgusting
unimpressed
terrible
unfriendly
horrendous
