# Using Convolutional Neural Network in Sentiment Analysis

This notebook follows the Week 4 lecture about Recurrent Neural Networks. The dataset we use in this notebook is from this kaggle competition: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

The first cell is the code that we looked at in the lecture on how to build a bag-of-words model.

In [None]:
# import our tokenizer and counter
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter
# Declare a sentence 
sent = '''I am taking the MCS class CS274P 
          and I am learning how to teach a 
          machine to understand language'''
# Let's tokenize the lower-cased sentence
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sent.lower())
# The bag of words is a count of how many times each token occurs
bow = Counter(tokens)
# Display the three most frequent tokens together with their counts
bow.most_common(3)

## Convolutional Neural Network for Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torchtext import data as torch_data
import random
import torch.nn.functional as F

# Seed Python, NumPy and PyTorch with the same value so runs are repeatable
random.seed(84848)
np.random.seed(84848)
torch.manual_seed(84848)
# Ask cuDNN to restrict itself to deterministic algorithms
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Step 1: Let's download our data from Kaggle
Look at one of the previous notebooks to see how to set up Kaggle on your machine.

In [None]:
!kaggle competitions download -c sentiment-analysis-on-movie-reviews

In [None]:
!ls

In [None]:
!unzip sentiment-analysis-on-movie-reviews.zip

In [None]:
! rm sentiment-analysis-on-movie-reviews.zip && mkdir ../../data/sentiment

In [None]:
!rm *csv && mv *.zip ../../data/sentiment

In [None]:
!ls ../../data/sentiment

### Step 2: Load the data into pandas

In [None]:
# Read the tab-separated train/test files that were moved into ../../data/sentiment
train = pd.read_csv('../../data/sentiment/train.tsv.zip', sep="\t")
test = pd.read_csv('../../data/sentiment/test.tsv.zip', sep="\t")

In [None]:
train.shape, test.shape

In [None]:
train.head()

#### Let's create a custom Dataset object to load and split our data

In [None]:
class MovieReviewDataset(torch_data.Dataset):
    """torchtext dataset built from the movie-review DataFrame.

    Every row contributes one example: the ``Phrase`` column is the text and
    the ``Sentiment`` column is the label. The splitting helpers are
    implemented as methods of this class, so ``split`` does not depend on any
    module-level function being in scope.
    """

    name = 'movie_reviews'
    # Class-level fields; the methods below only use the ``fields`` argument
    # handed to ``__init__``, not these attributes.
    text_field = torch_data.Field(tokenize = 'spacy', batch_first = True)
    label_field = torch_data.LabelField(dtype = torch.float)
    path = ''

    def __init__(self, data, fields, **kwargs):
        """Build one example per row of ``data``.

        Args:
            data: DataFrame with a ``Phrase`` and a ``Sentiment`` column.
            fields: list of ``(name, field)`` pairs, text first, label second.
            **kwargs: forwarded unchanged to ``torch_data.Dataset``.
        """
        # Walk the two columns in lockstep instead of using iterrows(), which
        # builds a fresh Series object for every single row.
        examples = [
            torch_data.Example.fromlist([text, label], fields)
            for text, label in zip(data['Phrase'], data['Sentiment'])
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort examples by the length of their text."""
        return len(ex.text)

    def split(self, split_ratio=0.7, stratified=False, strata_field='label',
              random_state=None):
        """Create train-test(-valid?) splits from the instance's examples.

        Args:
            split_ratio: a float in (0, 1) giving the train share, or a
                list/tuple of 2 or 3 relative sizes ordered train, test,
                validation.
            stratified: when True, split each group of examples sharing the
                same ``strata_field`` value separately and merge the results.
            strata_field: name of the field used to form the groups.
            random_state: state handed to ``RandomShuffler``.

        Returns:
            Tuple[Dataset]: Datasets for train, validation, and
            test splits in that order, if the splits are provided.

        Raises:
            ValueError: if ``stratified`` is set and ``strata_field`` is not
                one of the dataset's fields, or ``split_ratio`` has an
                unsupported type.
        """
        train_ratio, test_ratio, val_ratio = self.check_split_ratio(split_ratio)

        # For the permutations
        rnd = torch_data.utils.RandomShuffler(random_state)
        if not stratified:
            train_data, test_data, val_data = self.rationed_split(
                self.examples, train_ratio, test_ratio, val_ratio, rnd)
        else:
            if strata_field not in self.fields:
                raise ValueError("Invalid field name for strata_field {}"
                                 .format(strata_field))
            train_data, test_data, val_data = [], [], []
            for group in self._stratify(self.examples, strata_field):
                # Split each group on its own and add the pieces together.
                group_train, group_test, group_val = self.rationed_split(
                    group, train_ratio, test_ratio, val_ratio, rnd)
                train_data += group_train
                test_data += group_test
                val_data += group_val

        # Empty splits (e.g. no validation share requested) are dropped.
        splits = tuple(torch_data.Dataset(d, self.fields)
                       for d in (train_data, val_data, test_data) if d)

        # In case the parent sort key isn't none
        if self.sort_key:
            for subset in splits:
                subset.sort_key = self.sort_key
        return splits

    @staticmethod
    def _stratify(examples, strata_field):
        """Group ``examples`` by the (hashable) value of ``strata_field``."""
        groups = {}
        for example in examples:
            groups.setdefault(getattr(example, strata_field), []).append(example)
        return list(groups.values())

    def check_split_ratio(self, split_ratio):
        """Check that the split ratio argument is not malformed.

        Returns:
            A ``(train, test, validation)`` tuple of ratios; the validation
            ratio is ``0.`` when it was not requested.
        """
        valid_ratio = 0.
        if isinstance(split_ratio, float):
            # Only the train set relative ratio is provided
            # Assert in bounds, validation size is zero
            assert 0. < split_ratio < 1., (
                "Split ratio {} not between 0 and 1".format(split_ratio))

            test_ratio = 1. - split_ratio
            return (split_ratio, test_ratio, valid_ratio)
        elif isinstance(split_ratio, (list, tuple)):
            # A sequence of relative ratios is provided
            split_ratio = list(split_ratio)
            length = len(split_ratio)
            assert length == 2 or length == 3, (
                "Length of split ratio list should be 2 or 3, got {}".format(split_ratio))

            # Normalize if necessary
            ratio_sum = sum(split_ratio)
            if ratio_sum != 1.:
                split_ratio = [float(ratio) / ratio_sum for ratio in split_ratio]

            if length == 2:
                return tuple(split_ratio + [valid_ratio])
            return tuple(split_ratio)
        else:
            raise ValueError('Split ratio must be float or a list, got {}'
                             .format(type(split_ratio)))

    def rationed_split(self, examples, train_ratio, test_ratio, val_ratio, rnd):
        """Shuffle ``examples`` with ``rnd`` and cut them into three lists.

        Returns:
            A ``(train, test, validation)`` tuple of example lists; the
            validation list may be empty.
        """
        N = len(examples)
        randperm = rnd(range(N))
        train_len = int(round(train_ratio * N))

        # Due to possible rounding problems
        if not val_ratio:
            test_len = N - train_len
        else:
            test_len = int(round(test_ratio * N))

        indices = (randperm[:train_len],  # Train
                   randperm[train_len:train_len + test_len],  # Test
                   randperm[train_len + test_len:])  # Validation

        # There's a possibly empty list for the validation set
        return tuple([examples[i] for i in index] for index in indices)

### Step 3: Clean and prepare the data for our CNN Model

Here we declare our text token field and our label field. We also instantiate the dataset object

In [None]:
# Text field: tokenised with spaCy, batches come out batch-first
text_field = torch_data.Field(tokenize = 'spacy', batch_first = True)
# Label field: labels are delivered as float tensors
label_field = torch_data.LabelField(dtype = torch.float)
# Wrap the training DataFrame; the field order matches [text, label] in the dataset
dataset = MovieReviewDataset(train, 
                             fields = [('text', text_field), 
                             ('label', label_field)])

Let's split the dataset into a training part and a validation part.

In [None]:
# random.seed() returns None, so its result must not be passed as random_state.
# Seed the global generator first, then hand its state to split() explicitly.
random.seed(48484)
train_data, valid_data = dataset.split(random_state = random.getstate())

In [None]:
print(len(train_data), len(valid_data))

Let's build our word embeddings from the pretrained GloVe embedding model, which we will cover next week.

In [None]:
# Build the vocabulary from the training split only, capped at 25,000 tokens,
# and attach the 100-dimensional glove.6B vectors to it.
# unk_init is the initialiser handed to torchtext (presumably applied to words
# that have no pretrained vector -- see the torchtext docs).
text_field.build_vocab(train_data, 
                 max_size = 25_000, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

In [None]:
label_field.build_vocab(train_data)

In [None]:
# Batch both splits in groups of 64 and place the batches on the same device
# that was selected at the top of the notebook (GPU if available, else CPU),
# so the notebook also runs on machines without CUDA.
train_iterator, valid_iterator = torch_data.BucketIterator.splits(
                                    (train_data, valid_data),
                                    batch_size = 64,
                                    device = device)

### Step 4: Let's declare a basic model architecture

In [None]:
class LanguageModel(nn.Module):
    """Convolutional text classifier over pretrained word embeddings.

    The input token ids are embedded, passed through one ``Conv2d`` per entry
    of ``layer_sizes`` (each kernel spans ``size`` tokens and the whole
    embedding width), max-pooled over time, concatenated, regularised with
    dropout and projected to ``out_dim`` logits.

    The model owns its Adam optimizer and its loss. Loss and accuracy treat
    the task as binary classification (``BCEWithLogitsLoss`` on a single
    logit, predictions rounded after a sigmoid).

    Args:
        input_size: vocabulary size (number of embedding rows).
        emb_dim: embedding width.
        layer_sizes: kernel heights, i.e. how many tokens each filter spans.
        hidden_size: number of filters per kernel height.
        out_dim: number of output logits.
        pad_idx: vocabulary index of the padding token.
        device: device the model is moved to.
        activation: activation module class applied after each convolution.
        pretrained_vectors: ``(input_size, emb_dim)`` tensor copied into the
            embedding layer. Defaults to the notebook-level
            ``text_field.vocab.vectors``.
        unk_idx: vocabulary index of the unknown token. Defaults to the index
            looked up in the notebook-level ``text_field`` vocabulary.
    """

    def __init__(self,
                 input_size,
                 emb_dim,
                 layer_sizes,
                 hidden_size,
                 out_dim,
                 pad_idx,
                 device='cpu',
                 activation=nn.ReLU,
                 pretrained_vectors=None,
                 unk_idx=None):

        super().__init__()

        self.pad_idx = pad_idx
        self.embed = nn.Embedding(input_size, emb_dim, padding_idx=pad_idx)

        self.convs = nn.ModuleDict({})
        for idx in layer_sizes:
            self.convs['conv' + str(idx)] = nn.Conv2d(in_channels=1,
                                                      out_channels=hidden_size,
                                                      kernel_size=(idx, emb_dim))
        # Shortest sequence every convolution can still process.
        self.max_kernel = max(
            (conv.kernel_size[0] for conv in self.convs.values()), default=0)

        # Sized from the registered convolutions so the width always matches
        # what forward() concatenates.
        self.out = nn.Linear(len(self.convs) * hidden_size, out_dim)
        self.dropout = nn.Dropout(0.5)
        self.activation = activation()
        self.criterion = nn.BCEWithLogitsLoss()

        self.min_loss = float('inf')

        # Fall back to the notebook-level vocabulary when nothing is injected.
        if pretrained_vectors is None:
            pretrained_vectors = text_field.vocab.vectors
        if unk_idx is None:
            unk_idx = text_field.vocab.stoi[text_field.unk_token]

        # Start from the pretrained embeddings; the <unk> and <pad> rows are
        # zeroed so they carry no pretrained signal.
        with torch.no_grad():
            self.embed.weight.copy_(pretrained_vectors)
            self.embed.weight[unk_idx].zero_()
            self.embed.weight[pad_idx].zero_()

        # Move the model (including the loss module) to the target device
        # first, then build the optimizer over the final set of parameters.
        self.to(device)
        self.optimizer = torch.optim.Adam(self.parameters())

    def forward(self, in_text):
        """Return the logits for a batch of token ids shaped (batch, seq_len)."""
        # A convolution cannot run on a sequence shorter than its kernel, so
        # right-pad short batches with the padding token.
        shortfall = self.max_kernel - in_text.shape[1]
        if shortfall > 0:
            padding = in_text.new_full((in_text.shape[0], shortfall), self.pad_idx)
            in_text = torch.cat([in_text, padding], dim=1)

        x = self.embed(in_text).unsqueeze(1)  # add the single input channel
        conv = [self.activation(conv(x)).squeeze(3) for conv in self.convs.values()]
        # Max over the time axis keeps one value per filter.
        pool = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in conv]
        cat = self.dropout(torch.cat(pool, dim=1))
        return self.out(cat)

    @staticmethod
    def _binary_accuracy(predictions, labels):
        """Fraction of rounded sigmoid predictions that equal ``labels``."""
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == labels).float()  # float for the division
        return correct.sum() / len(correct)

    def train_(self,
               train_iter,
               epochs=5,
               model_out_path=None,
               valid_iter=None):
        """Train for ``epochs`` epochs and print the metrics of each epoch.

        Args:
            train_iter: iterable of batches exposing ``.text`` and ``.label``.
            epochs: number of passes over ``train_iter``.
            model_out_path: where to save the weights whenever the validation
                loss improves; only used when ``valid_iter`` is given.
            valid_iter: optional validation batches evaluated after each epoch.
        """
        self.train()

        for e_idx in range(epochs):
            train_loss, train_acc = self._train_(train_iter)

            print(f'Epoch: {e_idx}')
            print(f'\t Loss: {train_loss:.3f} | Acc: {train_acc*100:.2f}%')

            if valid_iter is None:
                continue

            eval_loss, eval_acc = self.evaluate(valid_iter)
            if eval_loss < self.min_loss:
                self.min_loss = eval_loss
                if model_out_path:
                    self.save(model_out_path)
            print(f'\t Val. Loss: {eval_loss:.3f} |  Val. Acc: {eval_acc*100:.2f}%')

    def _train_(self, iterator):
        """Run one training epoch; return (mean batch loss, mean batch accuracy)."""
        # evaluate() switches to eval mode, so re-enable dropout every epoch.
        self.train()

        t_loss = 0
        t_acc = 0

        for batch in iterator:
            self.optimizer.zero_grad()
            predictions = self(batch.text).squeeze(1)
            loss = self.criterion(predictions, batch.label)
            acc = self._binary_accuracy(predictions, batch.label)

            loss.backward()
            self.optimizer.step()

            t_loss += loss.item()
            t_acc += acc.item()

        return t_loss / len(iterator), t_acc / len(iterator)

    def evaluate(self, iterator):
        """Score ``iterator`` without gradients; return (mean loss, mean accuracy).

        Leaves the model in eval mode.
        """
        e_loss = 0
        e_acc = 0

        self.eval()

        with torch.no_grad():
            for batch in iterator:
                predictions = self(batch.text).squeeze(1)
                loss = self.criterion(predictions, batch.label)
                acc = self._binary_accuracy(predictions, batch.label)

                e_loss += loss.item()
                e_acc += acc.item()

        return e_loss / len(iterator), e_acc / len(iterator)

    def save(self, path):
        """Write the model's state dict to ``path``."""
        torch.save(self.state_dict(), path)
        

#### Now we declare our hyperparameters, instantiate our model and start training

In [None]:
# Embedding width; has to match the 100-dimensional GloVe vectors copied into the model
em_bed = 100
# Number of convolution filters per kernel height
hidden_size = 100
# Kernel heights: each filter looks at 3, 4 or 5 consecutive tokens
layer_size = [3,4,5]
# The model emits a single logit per example
output_dim = 1
# One embedding row per vocabulary entry
input_size = len(text_field.vocab)
# Index of the padding token in the vocabulary
pad_idx = text_field.vocab.stoi[text_field.pad_token]

# Arguments are positional: vocabulary size, embedding width, kernel heights,
# filters per height, output size, padding index, device
model = LanguageModel( 
                      input_size, 
                      em_bed,
                      layer_size, 
                      hidden_size, 
                      output_dim, 
                      pad_idx,
                      device)

In [None]:
print(model)

In [None]:
# Train for 5 epochs on the training iterator
model.train_(train_iterator, 5)

## Exercises

1. What is wrong with the model and/or the training above? Can you fix it?
2. What other hyperparameters would be better here?
3. Do predictions on the test set and share your best metrics and how you got there