In [0]:
import pandas as pd
import numpy as np
import os
import spacy
import string
import re
from spacy.symbols import ORTH #ID for each word
from collections import Counter
import torch
import torch.nn as nn #import modules and parameters and convolution layers
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [0]:
def download_dataset():
  ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz > .None
  ! tar -xzf aclImdb_v1.tar.gz > .None

In [0]:
# Fetch the IMDB archive and extract it into the working directory.
download_dataset()

--2020-04-28 19:09:06--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-04-28 19:09:13 (12.6 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
# List the working directory to confirm the archive was downloaded and extracted.
!ls

aclImdb  aclImdb_v1.tar.gz  sample_data


In [0]:
# NOTE(review): this asks the shell to execute the directory itself, which fails
# with "Is a directory"; presumably `!ls ./aclImdb` was intended -- confirm.
!./aclImdb

/bin/bash: ./aclImdb: Is a directory


In [0]:
from pathlib import Path
# Root directory of the extracted dataset; every later path is built from it.
PATH = Path('./aclImdb')
# Show the top-level entries of the dataset directory.
list(PATH.iterdir())

[PosixPath('aclImdb/test'),
 PosixPath('aclImdb/README'),
 PosixPath('aclImdb/train'),
 PosixPath('aclImdb/imdbEr.txt'),
 PosixPath('aclImdb/imdb.vocab')]

In [0]:
# Read one negative training review to see what the raw text looks like
# (note the embedded <br /> tags in the output).
path = PATH/'train/neg/211_4.txt'
path.read_text()

'Hilariously obvious "drama" about a bunch of high school (I think) kids who enjoy non-stop hip-hop, break dancing, graffiti and trying to become a dj at the Roxy--or something. To be totally honest I was so bored I forgot! Even people who love the music agree this movie is terribly acted and--as a drama--failed dismally. We\'re supposed to find this kids likable and nice. I found them bland and boring. The one that I REALLY hated was Ramon. He does graffiti on subway trains and this is looked upon as great. Excuse me? He\'s defacing public property that isn\'t his to begin with. Also these "great" kids tap into the city\'s electricity so they can hold a big dance party at an abandoned building. Uh huh. So we\'re supposed to find a bunch of law breakers lovable and fun.<br /><br />I could forgive all that if the music was good but I can\'t stand hip hop. The songs were--at best--mediocre and they were nonstop! They\'re ALWAYS playing! It got to the point that I was fast-forwarding thro

In [0]:
# Matches HTML line-break tags such as <br>, <br/>, <br /> or < BR >.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)

def sub_br(x):
    """Lower-case the text `x` and replace every HTML <br> tag with a newline."""
    lowered = x.lower()
    return re_br.sub("\n", lowered)

# spaCy 3 removed shortcut links such as 'en', so load the model by its package
# name first and only fall back to the legacy shortcut on older installations.
try:
    my_tok = spacy.load('en_core_web_sm')
except OSError:
    my_tok = spacy.load('en')

def spacy_tok(x):
    """Return the list of token strings for the raw review text `x`.

    The text is lower-cased and has its <br> tags turned into newlines by
    sub_br; only spaCy's tokenizer is run on it, not the rest of the pipeline.
    """
    return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [0]:
# Tokenize the same sample review and show its first 10 tokens.
path = PATH/'train/neg/211_4.txt'
spacy_tok(path.read_text())[:10]

['hilariously',
 'obvious',
 '"',
 'drama',
 '"',
 'about',
 'a',
 'bunch',
 'of',
 'high']

In [0]:
# Collect every positive and negative training review file.
# Path.iterdir() yields entries in arbitrary order, so the listing is not sorted.
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('aclImdb/train/pos/9667_9.txt'),
 PosixPath('aclImdb/train/pos/10342_7.txt'),
 PosixPath('aclImdb/train/pos/7144_10.txt'),
 PosixPath('aclImdb/train/pos/5424_10.txt'),
 PosixPath('aclImdb/train/pos/3615_9.txt')]

In [0]:
# Count how often each token occurs across all training reviews.
counts = Counter() # dict subclass for counting hashable objects
for path in all_files:
  counts.update(spacy_tok(path.read_text())) # like dict.update(), but adds to the existing counts instead of replacing them

In [0]:
# Number of distinct tokens found in the training reviews.
len(counts.keys())

87048

In [0]:
# Drop tokens that occur fewer than 5 times. The loop iterates over a copy of
# the keys (list(counts)) because entries are deleted from the Counter as it goes.
for word in list(counts):
  if counts[word] < 5:
    del counts[word]

In [0]:
# Number of distinct tokens left after removing the rare ones.
len(counts.keys())

29364

In [0]:
# Index 0 is reserved for padding ("") and index 1 for unknown tokens ("UNK");
# the remaining tokens follow in the Counter's iteration order.
words = ["", "UNK"]
words.extend(counts)
# Map each token to its position in `words`.
vocab2index = {token: idx for idx, token in enumerate(words)}

In [0]:
# Display the token -> index mapping.
vocab2index

{'': 0,
 'UNK': 1,
 'story': 2,
 'of': 3,
 'ireland': 4,
 'in': 5,
 'the': 6,
 '70': 7,
 '/': 8,
 's': 9,
 '.': 10,
 'this': 11,
 'film': 12,
 'is': 13,
 'a': 14,
 'beautiful': 15,
 'reconstruction': 16,
 'small': 17,
 'time': 18,
 '1970': 19,
 'all': 20,
 'gang': 21,
 'are': 22,
 'there': 23,
 'see': 24,
 'below': 25,
 'master': 26,
 'boyle': 27,
 ',': 28,
 'boys': 29,
 'cannon': 30,
 'sp': 31,
 "o'donnell": 32,
 'senator': 33,
 "'s": 34,
 'rose': 35,
 'agnes': 36,
 'and': 37,
 'una': 38,
 'as': 39,
 'it': 40,
 'was': 41,
 '\n\n': 42,
 'melvyn': 43,
 'douglas': 44,
 'once': 45,
 'more': 46,
 'gives': 47,
 'polished': 48,
 'performance': 49,
 'which': 50,
 'he': 51,
 'inhabits': 52,
 'role': 53,
 'detective': 54,
 'who': 55,
 'ca': 56,
 "n't": 57,
 'place': 58,
 'love': 59,
 'before': 60,
 'duty': 61,
 'adventure': 62,
 'warmly': 63,
 'joan': 64,
 'blondell': 65,
 '(': 66,
 'far': 67,
 'from': 68,
 'being': 69,
 'illiterate': 70,
 'one': 71,
 'reviewer': 72,
 'suggested': 73,
 'wrote':

In [0]:
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    """Encode the review stored at `path` as a fixed-length array of token ids.

    The file is tokenized with spacy_tok, each token is looked up in
    `vocab2index` (tokens that are missing map to the "UNK" id), and only the
    first N token ids are kept.

    NOTE: despite its name, padding_start=True writes the token ids at the
    START of the array, leaving the zero padding at the END; padding_start=False
    right-aligns the ids so the zeros come first.

    Returns:
        (encoded, length): an int32 array of size N and the number of token
        ids stored in it (at most N).
    """
    tokens = spacy_tok(path.read_text())
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    length = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=np.int32)
    # Left-aligned when padding_start is truthy, right-aligned otherwise.
    start = 0 if padding_start else N - length
    encoded[start:start + length] = token_ids[:length]
    return encoded, length

In [0]:
# Encode one positive review; with padding_start=False the token ids are
# right-aligned, so the zeros appear at the beginning of the array.
path = PATH/'train/pos/10544_8.txt'
encode_sentence(path, vocab2index, N = 400, padding_start= False)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   549,    28,   226, 11593,
          226,    66,  2680,    80,    28, 13368,    34,   479,    92,
           28,    41,    71,     3,     6,  1058,  1168,    37,  6113,
         5278,    96,    94,  1439,    82,   140,   133,   681,    76,
         1368,    10,   286,   319,   196,    71,  3542,    28,    14,
         3649,     3,   119,    37,    14,  1263,    10,   125,    28,
          465,   283,    40,   125,   804,   136,    20,     6,  5880,
          120,    28,   211,  7051,    37,    14,  2487,   410,    96,
          649,   319,   100,  1893,   138,  9445,     5,    14,  7762,
           10,    83,  1600,    92,    28,   226, 11522,   226,    66,
      

In [0]:
class ImdbDataset(Dataset):
    """IMDB reviews of one split, encoded up front as fixed-length id arrays.

    Each item is a tuple (token_ids, length, label) where label is 1 for a
    review from the "pos" folder and 0 for one from the "neg" folder.
    Encoding uses the notebook-level `vocab2index` mapping.
    """

    def __init__(self, PATH, train = "train", N = 400, padding_start = True):
        """Read and encode every review under PATH/<train>/pos and PATH/<train>/neg.

        N and padding_start are passed straight through to encode_sentence.
        """
        split_dir = PATH/train
        self.path_to_images = split_dir
        self.pos_files = list((split_dir/"pos").iterdir())
        self.neg_files = list((split_dir/"neg").iterdir())
        # Positive files come first, so the labels are all ones followed by all zeros.
        self.files = self.pos_files + self.neg_files
        pos_labels = np.ones(len(self.pos_files), dtype = int)
        neg_labels = np.zeros(len(self.neg_files), dtype = int)
        self.y = np.concatenate((pos_labels, neg_labels), axis = 0)
        self.X = [encode_sentence(review, vocab2index, N, padding_start)
                  for review in self.files]

    def __len__(self):
        """Number of reviews in the split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return (token_ids, length, label) for the review at position idx."""
        token_ids, length = self.X[idx]
        label = self.y[idx]
        return token_ids, length, label

In [0]:
# Train/test datasets whose token ids are right-aligned (zeros first),
# because padding_start=False.
train_ds_v0 = ImdbDataset(PATH, padding_start= False)
valid_ds_v0 = ImdbDataset(PATH, "test", padding_start= False)

In [0]:
# Batch the right-aligned datasets; only the training loader is shuffled.
batch_size = 1000
train_dl_v0 = DataLoader(train_ds_v0, batch_size= batch_size, shuffle= True)
valid_dl_v0 = DataLoader(valid_ds_v0, batch_size = batch_size)

In [0]:
# Inspect one training example: (token ids, length, label).
train_ds_v0[1]

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [0]:
# Datasets with the padding at the end: the default padding_start=True writes
# the token ids from index 0 and leaves the trailing positions at zero.
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

LSTM model with dropout

In [0]:
class LSTMModel(torch.nn.Module):
  """Single-layer LSTM binary classifier over end-padded token-id sequences.

  Token ids are embedded (id 0 is the padding index), passed through dropout,
  packed by true length, run through an LSTM, and the last hidden state is
  projected to one logit per sequence.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(LSTMModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.dropout = nn.Dropout(0.5)
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx= 0)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)
    self.linear = nn.Linear(hidden_dim, 1)

  def forward(self, x, s):
    """Compute one logit per sequence.

    Args:
      x: integer tensor of token ids, shape (batch, seq_len), with the real
        tokens first and padding after them (required by the packing below).
      s: tensor holding the true length of every sequence in the batch.

    Returns:
      Tensor of shape (batch, 1) with the logits in the ORIGINAL batch order,
      on the same device as the model.
    """
    # pack_padded_sequence wants sequences sorted by decreasing length and the
    # lengths as a CPU list, so sort on the CPU whatever device `s` arrived on.
    s, sort_index = torch.sort(s.cpu(), 0, descending= True)
    lengths = s.tolist()
    x = x[sort_index.to(x.device)]
    x = self.embeddings(x)
    x = self.dropout(x)
    x_pack = pack_padded_sequence(x, lengths, batch_first= True)
    out_pack, (ht, ct) = self.lstm(x_pack)
    out = self.linear(ht[-1])
    # Undo the sort: row i of `out` goes back to row sort_index[i]. The index is
    # moved to out's device instead of a hard-coded .cuda() so CPU runs work too.
    restore_index = sort_index.unsqueeze(1).to(out.device)
    return torch.zeros_like(out).scatter_(0, restore_index, out)

In [0]:
def train_epochs(model, epochs = 10, lr = 0.001, train_loader = None, val_loader = None):
  """Train `model` with Adam and binary cross-entropy on logits.

  Args:
    model: module called as model(x, s) that returns (batch, 1) logits.
    epochs: number of passes over the training loader.
    lr: Adam learning rate.
    train_loader: iterable of (x, s, y) batches; defaults to the notebook
      global `train_dl` when omitted.
    val_loader: iterable of (x, s, y) batches used for validation; defaults
      to the notebook global `val_dl` when omitted.

  Validation runs after every epoch, but the metrics are printed only for
  epochs where i % 5 == 1 (i.e. epochs 1, 6, 11, ...).
  """
  # Fall back to the notebook-level loaders so existing calls keep working.
  if train_loader is None:
    train_loader = train_dl
  if val_loader is None:
    val_loader = val_dl
  # Send batches to wherever the model lives instead of assuming a GPU.
  device = next(model.parameters()).device
  parameters = filter(lambda p: p.requires_grad, model.parameters())
  optimizer = torch.optim.Adam(parameters, lr = lr)
  for i in range(epochs):
    model.train()
    sum_loss = 0.0
    total = 0
    for x, s, y in train_loader:
      x = x.long().to(device)
      y = y.float().to(device)
      y_pred = model(x, s)
      optimizer.zero_grad()
      loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
      loss.backward()
      optimizer.step()
      # Weight the batch-mean loss by batch size to get a per-sample epoch average.
      sum_loss += loss.item() * y.shape[0]
      total += y.shape[0]
    val_loss, val_acc = val_metrics(model, val_loader)
    if i %5 == 1:
      print("train loss %.3f val loss %.3f and val accuacy %.3f" % (sum_loss/total, val_loss, val_acc))


In [0]:
def val_metrics(model, val_dl):
  """Evaluate `model` on every batch of `val_dl`.

  Args:
    model: module called as model(x, s) that returns (batch, 1) logits.
    val_dl: iterable of (x, s, y) batches with binary labels y.

  Returns:
    (mean_loss, accuracy): the per-sample average binary cross-entropy as a
    Python float, and the fraction of correct predictions as a 0-dim tensor
    (a logit above 0 counts as predicting the positive class).
  """
  model.eval()
  # Send batches to wherever the model lives instead of assuming a GPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")
  correct = 0
  total = 0
  sum_loss = 0.0
  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    for x, s, y in val_dl:
      x = x.long().to(device)
      y = y.float().unsqueeze(1).to(device)
      y_hat = model(x, s)
      loss = F.binary_cross_entropy_with_logits(y_hat, y)
      y_pred = y_hat > 0
      correct += (y_pred.float() == y).float().sum()
      total += y.shape[0]
      # Weight the batch-mean loss by batch size to get a per-sample average.
      sum_loss += loss.item() * y.shape[0]
  return sum_loss/total, correct/total

In [0]:
# Loaders over the end-padded datasets; train_epochs picks up train_dl and
# val_dl from the notebook's global namespace. Only the training loader is shuffled.
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
val_dl = DataLoader(valid_ds, batch_size = batch_size)

In [0]:
# Vocabulary size, including the padding and "UNK" entries.
vocab_size = len(words)
print(vocab_size)
# LSTM classifier with 50-dim embeddings and a 100-unit hidden state, moved to the GPU.
model = LSTMModel(vocab_size, 50, 100).cuda()

29366


In [0]:
# Train the LSTM for 20 epochs with Adam at lr=0.01.
train_epochs(model, epochs = 20, lr = 0.01)

train loss 0.090 val loss 0.413 and val accuacy 0.869
train loss 0.058 val loss 0.546 and val accuacy 0.860
train loss 0.044 val loss 0.569 and val accuacy 0.855
train loss 0.030 val loss 0.644 and val accuacy 0.858


GRU with dropout


In [0]:
class GRUModel(torch.nn.Module):
  """Single-layer GRU binary classifier over end-padded token-id sequences.

  Token ids are embedded (id 0 is the padding index), passed through dropout,
  packed by true length, run through a GRU, and the last hidden state is
  projected to one logit per sequence.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(GRUModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
    self.dropout = nn.Dropout(0.5)
    self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first = True)
    self.linear = nn.Linear(hidden_dim, 1)

  def forward(self, x, s):
    """Compute one logit per sequence.

    Args:
      x: integer tensor of token ids, shape (batch, seq_len), with the real
        tokens first and padding after them (required by the packing below).
      s: tensor holding the true length of every sequence in the batch.

    Returns:
      Tensor of shape (batch, 1) with the logits in the ORIGINAL batch order,
      on the same device as the model.
    """
    # pack_padded_sequence wants sequences sorted by decreasing length and the
    # lengths as a CPU list, so sort on the CPU whatever device `s` arrived on.
    s, sort_index = torch.sort(s.cpu(), 0, descending = True)
    lengths = s.tolist()
    x = x[sort_index.to(x.device)]
    x = self.embeddings(x)
    x = self.dropout(x)
    x_pack = pack_padded_sequence(x, lengths, batch_first = True)
    out_pack, ht = self.gru(x_pack)
    out = self.linear(ht[-1])
    # torch.zeros_like(out) creates a zero tensor with the same shape as `out`;
    # scatter_ along dim 0 then writes row i of `out` into row sort_index[i],
    # which restores the original batch order. The index is moved to out's
    # device instead of a hard-coded .cuda() so CPU runs work too.
    restore_index = sort_index.unsqueeze(1).to(out.device)
    return torch.zeros_like(out).scatter_(0, restore_index, out)


In [0]:
# Vocabulary size, including the padding and "UNK" entries.
vocab_size = len(words)
print(vocab_size)
# GRU classifier with 50-dim embeddings and a 100-unit hidden state, moved to the GPU.
model2 = GRUModel(vocab_size, 50, 100).cuda()

29366


In [0]:
# Train the GRU for 20 epochs with Adam at lr=0.009.
train_epochs(model2, epochs = 20, lr = 0.009)

train loss 0.053 val loss 0.461 and val accuacy 0.878
train loss 0.029 val loss 0.608 and val accuacy 0.873
train loss 0.021 val loss 0.680 and val accuacy 0.873
train loss 0.020 val loss 0.795 and val accuacy 0.867
