# Sentiment classification with GRU
In this notebook we will use a GRU to do sentiment classification on the [IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

In [0]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

## Dataset

In [0]:
def unpack_glove():
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    ! mkdir data
    ! unzip glove.6B.zip -C data

In [0]:
def loadGloveModel(gloveFile=None):
    """Load word vectors from a GloVe text file into a dictionary.

    Each non-blank line is split on whitespace: the first field is the word,
    the remaining fields are parsed as the components of its vector.

    Args:
        gloveFile: path of the GloVe text file. When None (the default),
            PATH/"glove.6B.50d.txt" is used. PATH is looked up when the
            function is called rather than when it is defined, so it only
            needs to exist by the time of the call.

    Returns:
        dict mapping each word to a 1-D numpy array of floats.
    """
    if gloveFile is None:
        gloveFile = PATH/"glove.6B.50d.txt"
    word_vecs = {}
    # The context manager closes the file even if parsing raises; the explicit
    # encoding keeps decoding independent of the platform default.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [0]:
# unpack_glove()

In [0]:
# NOTE: the default GloVe path is built from PATH, which is assigned in a later
# cell (PATH = Path("data/aclImdb/")); that cell has to be run first.
word_vecs = loadGloveModel()

In [28]:
# Number of words that have a pretrained vector.
print(len(word_vecs))

400000


In [0]:
def delete_rare_words(word_vecs, word_count, min_df=4):
    """Prune rare words without a pretrained vector from word_count, in place.

    A word is removed when its count is below min_df and it has no entry in
    word_vecs; every other word is kept.

    Args:
        word_vecs: container of words that have a vector (tested with `in`).
        word_count: mapping word -> count; modified in place.
        min_df: minimum count a vector-less word needs to be kept.

    Returns:
        The same word_count object, after pruning.
    """
    # Collect first, delete afterwards: the mapping must not change size
    # while it is being iterated.
    doomed = [
        word
        for word, freq in word_count.items()
        if freq < min_df and word not in word_vecs
    ]
    for word in doomed:
        del word_count[word]
    return word_count

In [0]:
def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Build an embedding matrix and vocabulary from pretrained word vectors.

    word_count is first pruned (in place) by delete_rare_words. Row 0 of the
    matrix is the all-zero padding vector and row 1 is a random vector shared
    by unknown words ("UNK"). The remaining words follow from row 2 onwards in
    the iteration order of word_count: words found in word_vecs use their
    pretrained vector, all others get a vector drawn uniformly from
    [-0.25, 0.25).

    Args:
        word_vecs: mapping word -> vector of length emb_size.
        word_count: mapping word -> count; pruned in place.
        min_df: passed on to delete_rare_words.
        emb_size: length of every embedding vector.

    Returns:
        W: float32 array of shape (len(word_count) + 2, emb_size).
        vocab: numpy array of words; vocab[i] is the word stored in row i.
        vocab2index: dict word -> row index, including "" -> 0 (padding)
            and "UNK" -> 1.
    """
    word_count = delete_rare_words(word_vecs, word_count, min_df)
    V = len(word_count) + 2
    # np.zeros already leaves row 0 (padding) as the zero vector.
    W = np.zeros((V, emb_size), dtype="float32")
    vocab = ["", "UNK"]
    # Register the padding token as well, so the mapping is the exact inverse
    # of vocab.
    vocab2index = {"": 0, "UNK": 1}
    # Random vector shared by all unknown words.
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    for i, word in enumerate(word_count, start=2):
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            # No pretrained vector: initialise randomly.
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab2index[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab2index

In [0]:
# NOTE: `counts` is built in the "Computing vocab2index" section further down,
# so that section has to be run before this cell.
pretrained_weight_glove, vocab_glove, vocab2index_glove = create_embedding_matrix(word_vecs, counts)

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [0]:
def unpack_dataset():
    """Download the IMDB review archive and extract it into the data directory."""
    ! mkdir -p data/aclImdb
    ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    ! tar -zxvf aclImdb_v1.tar.gz -C data

In [0]:
# unpack_dataset()

In [4]:
from pathlib import Path
# Root directory of the dataset; the other cells build file paths from it.
PATH = Path("data/aclImdb/")
# Show what the directory contains.
list(PATH.iterdir())

[PosixPath('data/aclImdb/test'),
 PosixPath('data/aclImdb/imdbEr.txt'),
 PosixPath('data/aclImdb/train'),
 PosixPath('data/aclImdb/glove.6B.50d.txt'),
 PosixPath('data/aclImdb/README'),
 PosixPath('data/aclImdb/imdb.vocab')]

In [5]:
# Display the raw text of one positive training review.
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [0]:
# first time run this
#!python3 -m spacy download en

In [0]:
# Matches HTML line-break tags such as <br>, <br/> or < BR />.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return x with every HTML line-break tag replaced by a newline."""
    return re.sub(re_br, "\n", x)

my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize x with the spaCy tokenizer and return the token texts.

    HTML line-break tags are turned into newlines (via sub_br) before
    tokenizing.
    """
    cleaned = sub_br(x)
    return [token.text for token in my_tok.tokenizer(cleaned)]

In [8]:
# First ten tokens of the sample positive review.
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [9]:
# Every training review: positive files first, then negative ones.
pos_files = [*(PATH/"train"/"pos").iterdir()]
neg_files = [*(PATH/"train"/"neg").iterdir()]
all_files = [*pos_files, *neg_files]
all_files[:5]

[PosixPath('data/aclImdb/train/pos/1892_8.txt'),
 PosixPath('data/aclImdb/train/pos/2060_8.txt'),
 PosixPath('data/aclImdb/train/pos/11134_9.txt'),
 PosixPath('data/aclImdb/train/pos/9481_9.txt'),
 PosixPath('data/aclImdb/train/pos/10447_10.txt')]

In [0]:
# takes some time
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [0]:
# counts

In [12]:
# Number of distinct tokens before rare ones are removed.
len(counts)

103512

In [0]:
# Remove every token seen fewer than 5 times. `counts` is modified in place;
# list(counts) snapshots the keys so entries can be deleted while looping.
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [14]:
# Number of distinct tokens left after removing the rare ones.
len(counts)

33912

In [0]:
# Index 0 is the padding token "", index 1 the unknown-word token "UNK";
# the remaining tokens are numbered in the iteration order of `counts`.
words = ["", "UNK"]
vocab2index = {"": 0, "UNK": 1}
for word in counts:
    words.append(word)
    vocab2index[word] = len(words) - 1

In [0]:
#vocab2index

## Model with variable length
dynamic padding + pack_padded_sequence

`pack_padded_sequence` packs a Tensor containing padded sequences of variable length.

In [0]:
def encode_sentence_no_padding(path, vocab2index):
    """Encode the text file at path as an array of vocabulary indices.

    The file is read and tokenized with spacy_tok; each token is replaced by
    its index in vocab2index, falling back to the index of "UNK" for tokens
    that are not in the mapping. No padding is added, so the array has one
    entry per token.
    """
    indices = []
    for token in spacy_tok(path.read_text()):
        indices.append(vocab2index.get(token, vocab2index["UNK"]))
    return np.array(indices)

In [0]:
# A sample negative review; uncomment the call below to inspect its encoding.
path = PATH/"train/neg/211_4.txt"
#encode_sentence_no_padding(path, vocab2index)

In [0]:
class ImdbDataset2(Dataset):
    """IMDB reviews as variable-length index sequences with binary labels.

    Reviews are read from the "pos" and "neg" sub-directories of PATH/train.
    Positive reviews are labelled 1 and negative reviews 0. Sentences are
    encoded with the module-level vocab2index mapping.
    """

    def __init__(self, PATH, train="train"):
        """Collect the review files under PATH/train and encode all of them."""
        root = PATH/train
        self.path_to_images = root
        self.pos_files = [*(root/"pos").iterdir()]
        self.neg_files = [*(root/"neg").iterdir()]
        self.files = [*self.pos_files, *self.neg_files]
        # Labels follow the order of self.files: pos -> 1, then neg -> 0.
        n_pos, n_neg = len(self.pos_files), len(self.neg_files)
        self.y = np.array([1] * n_pos + [0] * n_neg, dtype=int)
        # Encode every review once here so that __getitem__ is a plain lookup.
        self.X = []
        for review_path in self.files:
            self.X.append(encode_sentence_no_padding(review_path, vocab2index))

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the (encoded sentence, label) pair stored at position idx."""
        return self.X[idx], self.y[idx]

In [0]:
# Each dataset encodes all of its reviews when it is constructed.
train_ds = ImdbDataset2(PATH)
valid_ds = ImdbDataset2(PATH, "test")

In [0]:
#train_ds[0]

### collate_fn function
The `collate_fn` merges a list of samples to form a mini-batch. It is an optional parameter to our data loader.

In [0]:
def collate_fn(data):
    """Creates mini-batch tensors from a list of (sentence, label) tuples.

    A custom collate_fn is needed because the default one cannot merge
    sequences of different lengths. Sequences are zero-padded to the length of
    the longest sequence in the mini-batch (dynamic padding) and ordered by
    decreasing length.

    Args:
        data: list of tuples (sentence, label).
            - sentence: sequence of word indices of variable length
            - label: 0 or 1
    Returns:
        sents: torch long tensor of shape (batch_size, max_len), zero-padded.
        lengths: list with the unpadded length of each row of sents.
        labels: torch float tensor of shape (batch_size,).
    """
    # Order by sentence length (descending). sorted() builds a new list, so the
    # caller's list is left untouched.
    ordered = sorted(data, key=lambda x: len(x[0]), reverse=True)
    sentences, labels = zip(*ordered)

    # stack labels
    labels = torch.as_tensor(np.asarray(labels), dtype=torch.float32)

    # Merge sentences
    lengths = [len(s) for s in sentences]

    sents = torch.zeros(len(sentences), max(lengths), dtype=torch.long)
    for i, s in enumerate(sentences):
        # Convert straight to long: going through a float32 tensor first would
        # corrupt indices above 2**24.
        sents[i, :lengths[i]] = torch.as_tensor(np.asarray(s), dtype=torch.long)

    return sents, lengths, labels

In [0]:
class GRUModel(nn.Module):
    """Sentence classifier: embedding -> dropout -> GRU -> linear.

    The final hidden state of the GRU is projected to a single logit per
    sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights=None):
        """Build the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each embedding vector (GRU input size).
            hidden_dim: size of the GRU hidden state.
            glove_weights: optional numpy array copied into the embedding
                table; when given, the embeddings are frozen.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Index 0 is reserved for padding.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if glove_weights is not None:
            pretrained = torch.from_numpy(glove_weights)
            self.embeddings.weight.data.copy_(pretrained)
            # Freeze the embeddings: they are not updated during training.
            self.embeddings.weight.requires_grad_(False)

        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, lengths):
        """Return one logit per sequence.

        Args:
            x: batch of padded index sequences (batch-first).
            lengths: unpadded length of each sequence, as expected by
                pack_padded_sequence.
        """
        embedded = self.dropout(self.embeddings(x))
        packed = pack_padded_sequence(embedded, lengths, batch_first=True)
        _, last_hidden = self.gru(packed)
        return self.linear(last_hidden[-1])

In [0]:
def train_epocs(model, optimizer, train_dl, valid_dl, epochs=10):
    """Train model for a number of epochs and periodically report metrics.

    Batches are moved to the device the model's parameters live on, so the
    function works on GPU and CPU alike.

    Args:
        model: module called as model(x, lengths) and returning one logit per
            sequence.
        optimizer: optimizer over the trainable parameters of model.
        train_dl: iterable yielding (x, lengths, y) training batches.
        valid_dl: iterable of validation batches, passed to val_metrics.
        epochs: number of passes over train_dl.
    """
    # Follow the model's device instead of hard-coding CUDA.
    device = next(model.parameters()).device
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x, s)
            optimizer.zero_grad()
            # The model returns shape (batch, 1), hence the unsqueeze on y.
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # Weight by batch size so sum_loss/total is a per-sample mean.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        # Validation runs after every epoch but is only printed when i % 5 == 1.
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [0]:
def val_metrics(model, valid_dl):
    """Compute the mean loss and accuracy of model over valid_dl.

    The model is switched to eval mode and evaluated without gradient
    tracking. Batches are moved to the device of the model's parameters.

    Args:
        model: module called as model(x, lengths) and returning one logit per
            sequence, with shape (batch, 1).
        valid_dl: iterable yielding (x, lengths, y) batches.

    Returns:
        (loss, accuracy): per-sample mean binary cross-entropy and the
        fraction of correct predictions, both as Python floats.
    """
    model.eval()
    # Follow the model's device instead of hard-coding CUDA.
    device = next(model.parameters()).device
    correct = 0
    total = 0
    sum_loss = 0.0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for x, s, y in valid_dl:
            x = x.long().to(device)
            y = y.float().unsqueeze(1).to(device)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # A positive logit means the positive class is predicted.
            y_pred = y_hat > 0
            # .item() keeps the running count a plain number, not a tensor.
            correct += (y_pred.float() == y).sum().item()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [0]:
# Both loaders use collate_fn for per-batch padding; only the training data
# is shuffled.
batch_size = 3000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate_fn)

In [0]:
vocab_size = len(words)
# embedding_dim=50 and hidden_dim=50; the embeddings start from the GloVe matrix.
model = GRUModel(vocab_size, 50, 50, pretrained_weight_glove).cuda()

# Optimize only parameters that require gradients (the GloVe embeddings are
# frozen by GRUModel).
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [44]:
# First training run, with the optimizer's initial learning rate (0.01).
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.677 val loss 0.649 and val accuracy 0.622
train loss 0.477 val loss 0.423 and val accuracy 0.809
train loss 0.409 val loss 0.399 and val accuracy 0.822
train loss 0.389 val loss 0.393 and val accuracy 0.825
train loss 0.372 val loss 0.414 and val accuracy 0.813
train loss 0.358 val loss 0.394 and val accuracy 0.826


In [0]:
def update_optimizer(optimizer, lr):
    """Set the learning rate of every parameter group of optimizer to lr."""
    for group in optimizer.param_groups:
        group["lr"] = lr

In [45]:
# Continue training the same model with a much smaller learning rate.
update_optimizer(optimizer, lr=0.0001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.342 val loss 0.356 and val accuracy 0.844
train loss 0.341 val loss 0.348 and val accuracy 0.849
train loss 0.343 val loss 0.344 and val accuracy 0.852
train loss 0.341 val loss 0.348 and val accuracy 0.850
train loss 0.341 val loss 0.347 and val accuracy 0.851
train loss 0.343 val loss 0.344 and val accuracy 0.852


In [46]:
# 30 more epochs with the optimizer left as it is.
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.338 val loss 0.346 and val accuracy 0.852
train loss 0.339 val loss 0.347 and val accuracy 0.851
train loss 0.339 val loss 0.350 and val accuracy 0.850
train loss 0.341 val loss 0.351 and val accuracy 0.850
train loss 0.338 val loss 0.352 and val accuracy 0.850
train loss 0.336 val loss 0.349 and val accuracy 0.851


In [47]:
# Set the learning rate to 0.0005 and train for another 30 epochs.
update_optimizer(optimizer, lr=0.0005)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.337 val loss 0.347 and val accuracy 0.852
train loss 0.334 val loss 0.369 and val accuracy 0.843
train loss 0.339 val loss 0.364 and val accuracy 0.845
train loss 0.331 val loss 0.354 and val accuracy 0.850
train loss 0.338 val loss 0.357 and val accuracy 0.849
train loss 0.334 val loss 0.350 and val accuracy 0.852
