In [0]:
import pandas as pd
import numpy as np
import os
import spacy
import string
import re
from spacy.symbols import ORTH #ID for each word
from collections import Counter
import torch
import torch.nn as nn #import modules and parameters and convolution layers
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [0]:
def download_dataset():
  ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz > .None
  ! tar -xzf aclImdb_v1.tar.gz > .None

In [3]:
download_dataset()

--2020-04-27 21:32:54--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-04-27 21:32:55 (89.8 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [4]:
!ls

aclImdb  aclImdb_v1.tar.gz  sample_data


In [5]:
!./aclImdb

/bin/bash: ./aclImdb: Is a directory


In [6]:
from pathlib import Path
PATH = Path('./aclImdb')
list(PATH.iterdir())

[PosixPath('aclImdb/train'),
 PosixPath('aclImdb/test'),
 PosixPath('aclImdb/README'),
 PosixPath('aclImdb/imdb.vocab'),
 PosixPath('aclImdb/imdbEr.txt')]

In [7]:
path = PATH/'train/neg/211_4.txt'
path.read_text()

'Hilariously obvious "drama" about a bunch of high school (I think) kids who enjoy non-stop hip-hop, break dancing, graffiti and trying to become a dj at the Roxy--or something. To be totally honest I was so bored I forgot! Even people who love the music agree this movie is terribly acted and--as a drama--failed dismally. We\'re supposed to find this kids likable and nice. I found them bland and boring. The one that I REALLY hated was Ramon. He does graffiti on subway trains and this is looked upon as great. Excuse me? He\'s defacing public property that isn\'t his to begin with. Also these "great" kids tap into the city\'s electricity so they can hold a big dance party at an abandoned building. Uh huh. So we\'re supposed to find a bunch of law breakers lovable and fun.<br /><br />I could forgive all that if the music was good but I can\'t stand hip hop. The songs were--at best--mediocre and they were nonstop! They\'re ALWAYS playing! It got to the point that I was fast-forwarding thro

In [0]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE) #ignore ', <, \', ?
def sub_br(x): return re_br.sub("\n", x.lower()) #replace a string using regular expression instead of a perfect match, then use .sub()

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [9]:
path = PATH/'train/neg/211_4.txt'
spacy_tok(path.read_text())[:10]

['hilariously',
 'obvious',
 '"',
 'drama',
 '"',
 'about',
 'a',
 'bunch',
 'of',
 'high']

In [10]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('aclImdb/train/pos/5331_10.txt'),
 PosixPath('aclImdb/train/pos/1385_8.txt'),
 PosixPath('aclImdb/train/pos/4655_7.txt'),
 PosixPath('aclImdb/train/pos/25_7.txt'),
 PosixPath('aclImdb/train/pos/12467_7.txt')]

In [0]:
counts = Counter() #dict subclass for counting hash objects
for path in all_files:
  counts.update(spacy_tok(path.read_text())) #like dict.update(), add counts instead of replacing them

In [12]:
len(counts.keys())

87048

In [0]:
for word in list(counts):
  if counts[word] < 5:
    del counts[word]

In [14]:
len(counts.keys())

29364

In [0]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
  vocab2index[word] = len(words)
  words.append(word)

In [16]:
vocab2index

{'': 0,
 'UNK': 1,
 'just': 2,
 'watched': 3,
 'it': 4,
 'on': 5,
 'sky': 6,
 'tv': 7,
 'missed': 8,
 'the': 9,
 'first': 10,
 'half': 11,
 'an': 12,
 'hour': 13,
 '.': 14,
 'i': 15,
 'did': 16,
 'wonder': 17,
 'if': 18,
 'was': 19,
 'a': 20,
 'true': 21,
 'story': 22,
 'so': 23,
 'to': 24,
 'end': 25,
 'there': 26,
 'no': 27,
 'brief': 28,
 'at': 29,
 'say': 30,
 'what': 31,
 'happened': 32,
 'everyone': 33,
 'remind': 34,
 'me': 35,
 'of': 36,
 'speed': 37,
 'but': 38,
 'day': 39,
 'do': 40,
 "n't": 41,
 'suppose': 42,
 'released': 43,
 'cinema': 44,
 'as': 45,
 'we': 46,
 'see': 47,
 'following': 48,
 'error': 49,
 'goof': 50,
 'that': 51,
 'saw': 52,
 'they': 53,
 'remove': 54,
 'bonnet': 55,
 '(': 56,
 'hood': 57,
 ')': 58,
 'and': 59,
 'then': 60,
 'later': 61,
 'are': 62,
 'two': 63,
 'shots': 64,
 'car': 65,
 'with': 66,
 'police': 67,
 'in': 68,
 'front': 69,
 'trying': 70,
 'slow': 71,
 'down': 72,
 'when': 73,
 'is': 74,
 'back': 75,
 'have': 76,
 'edge': 77,
 'my': 78,
 'se

In [0]:
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    x = spacy_tok(path.read_text())
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [18]:
path = PATH/'train/pos/10544_8.txt'
encode_sentence(path, vocab2index, N = 400, padding_start= False)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   949,   112,   369,  1974,
          369,    56,  6638,    58,   112, 14475,   235,    10,   245,
          112,    19,    96,    36,     9,   305,   410,    59, 19789,
         2768,    51,    15,   821,   822,   319,    73,  1760,   106,
          412,    14,    53,   322,     2,    96,  4981,   112,    20,
         3744,    36,   675,    59,    20,   743,    14,    23,   112,
           31,   335,     4,    23,   159,   269,   113,     9,  4135,
         1819,   112,   307,  7789,    59,    20,  6933,  7025,    51,
          557,   322,    24,  1714,   134,  9662,    68,    20,  8870,
           14,   198,  1662,   245,   112,   369, 11628,   369,    56,
      

In [0]:
class ImdbDataset(Dataset):
    def __init__(self, PATH, train = "train", N = 400, padding_start = True):
       self.path_to_images = PATH/train
       self.pos_files = list((self.path_to_images/"pos").iterdir())
       self.neg_files = list((self.path_to_images/"neg").iterdir())
       self.files = self.pos_files + self.neg_files
       self.y = np.concatenate((np.ones(len(self.pos_files), dtype= int),
                             np.zeros(len(self.neg_files), dtype = int)), axis = 0)
       self.X = [encode_sentence(path, vocab2index, N, padding_start) for path in self.files]
    def __len__(self):
       return len(self.y)

    def __getitem__(self, idx):
       x, s = self.X[idx]
       return x, s, self.y[idx]

In [0]:
train_ds_v0 = ImdbDataset(PATH, padding_start= False)
valid_ds_v0 = ImdbDataset(PATH, "test", padding_start= False)

In [0]:
batch_size = 1000
train_dl_v0 = DataLoader(train_ds_v0, batch_size= batch_size, shuffle= True)
valid_dl_v0 = DataLoader(valid_ds_v0, batch_size = batch_size)

In [22]:
train_ds_v0[1]

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

model

In [0]:
class LSTMModel(torch.nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(LSTMModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.dropout = nn.Dropout(0.5)
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx= 0)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)
    self.linear = nn.Linear(hidden_dim, 1)
  
  def forward(self, x, s):
    s, sort_index = torch.sort(s, 0, descending= True)
    s = s.numpy().tolist()
    x = x[sort_index]
    x = self.embeddings(x)
    x = self.dropout(x)
    x_pack = pack_padded_sequence(x, s, batch_first= True)
    out_pack, (ht, ct) = self.lstm(x_pack)
    out = self.linear(ht[-1])
    return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1).cuda(), out)

In [0]:
def train_epochs(model, epochs = 10, lr = 0.001):
  parameters = filter(lambda p: p.requires_grad, model.parameters())
  optimizer = torch.optim.Adam(parameters, lr = lr)
  for i in range(epochs):
    model.train()
    sum_loss = 0.0
    total = 0
    for x, s, y in train_dl:
      x = x.long().cuda()
      y = y.float().cuda()
      y_pred = model(x, s)
      optimizer.zero_grad()
      loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
      loss.backward()
      optimizer.step()
      sum_loss += loss.item() * y.shape[0]
      total += y.shape[0]
    val_loss, val_acc = val_metrics(model, val_dl)
    if i %5 == 1:
      print("train loss %.3f val loss %.3f and val accuacy %.3f" % (sum_loss/total, val_loss, val_acc))


In [0]:
def val_metrics(model, val_dl):
  model.eval()
  correct = 0
  total = 0
  sum_loss = 0.0
  for x, s, y in val_dl:
    x = x.long().cuda()
    y = y.float().unsqueeze(1).cuda()
    y_hat = model(x, s)
    loss = F.binary_cross_entropy_with_logits(y_hat, y)
    y_pred = y_hat > 0
    correct += (y_pred.float() == y).float().sum()
    total += y.shape[0]
    sum_loss += loss.item() * y.shape[0]
  return sum_loss/total, correct/total

In [0]:
#dataset with padding at the end
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH,"test")

In [0]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
val_dl = DataLoader(valid_ds, batch_size = batch_size)

In [30]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 100).cuda()

29366


In [32]:
train_epochs(model, epochs = 20, lr = 0.01)

train loss 0.095 val loss 0.519 and val accuacy 0.866
train loss 0.062 val loss 0.623 and val accuacy 0.859
train loss 0.042 val loss 0.732 and val accuacy 0.853
train loss 0.034 val loss 0.702 and val accuacy 0.858
