In [None]:
import numpy as np
import pandas as pd
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
def unpack_dataset():
  """Download the IMDB review archive and extract it under ``data/``.

  The body is three shell commands issued through IPython's ``!`` syntax, so
  this function only works inside a notebook / IPython session.
  """
  # Make sure the extraction target directory exists (no error if it already does).
  ! mkdir -p data/aclImdb
  # Fetch the tarball; the next command reads it from the current working directory.
  ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
  # Extract into ./data. The -v flag lists every extracted file, which is why
  # the cell output is very long.
  ! tar -zxvf aclImdb_v1.tar.gz -C data

In [None]:
unpack_dataset()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
aclImdb/train/unsup/44983_0.txt
aclImdb/train/unsup/44982_0.txt
aclImdb/train/unsup/44981_0.txt
aclImdb/train/unsup/44980_0.txt
aclImdb/train/unsup/44979_0.txt
aclImdb/train/unsup/44978_0.txt
aclImdb/train/unsup/44977_0.txt
aclImdb/train/unsup/44976_0.txt
aclImdb/train/unsup/44975_0.txt
aclImdb/train/unsup/44974_0.txt
aclImdb/train/unsup/44973_0.txt
aclImdb/train/unsup/44972_0.txt
aclImdb/train/unsup/44971_0.txt
aclImdb/train/unsup/44970_0.txt
aclImdb/train/unsup/44969_0.txt
aclImdb/train/unsup/44968_0.txt
aclImdb/train/unsup/44967_0.txt
aclImdb/train/unsup/44966_0.txt
aclImdb/train/unsup/44965_0.txt
aclImdb/train/unsup/44964_0.txt
aclImdb/train/unsup/44963_0.txt
aclImdb/train/unsup/44962_0.txt
aclImdb/train/unsup/44961_0.txt
aclImdb/train/unsup/44960_0.txt
aclImdb/train/unsup/44959_0.txt
aclImdb/train/unsup/44958_0.txt
aclImdb/train/unsup/44957_0.txt
aclImdb/train/unsup/44956_0.txt
aclImdb/train/unsup/44955_0.txt
aclImdb

In [None]:
from pathlib import Path
PATH = Path("data/aclImdb/")
list(PATH.iterdir())

[PosixPath('data/aclImdb/imdb.vocab'),
 PosixPath('data/aclImdb/README'),
 PosixPath('data/aclImdb/train'),
 PosixPath('data/aclImdb/imdbEr.txt'),
 PosixPath('data/aclImdb/test')]

## Tokenization

In [None]:
# Matches HTML line-break tags in any case, with optional inner whitespace and
# an optional self-closing slash (e.g. <br>, <br/>, <BR />).
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)

def sub_br(x):
  """Return `x` with every HTML line-break tag replaced by a newline."""
  return re.sub(re_br, "\n", x)

# Load the small English spaCy pipeline once; only its tokenizer is used below.
my_tok = spacy.load('en_core_web_sm')

def spacy_tok(x):
  """Tokenize raw text into a list of token strings.

  HTML line-break tags are first turned into newlines via `sub_br`, then the
  text is run through the spaCy tokenizer only (not the full pipeline).
  """
  cleaned = sub_br(x)
  return [token.text for token in my_tok.tokenizer(cleaned)]

In [None]:
path = PATH/'train/pos/0_9.txt'
spacy_tok(path.read_text())[: 10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

In [None]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[: 5]

[PosixPath('data/aclImdb/train/pos/10187_7.txt'),
 PosixPath('data/aclImdb/train/pos/9444_10.txt'),
 PosixPath('data/aclImdb/train/pos/8108_10.txt'),
 PosixPath('data/aclImdb/train/pos/2856_10.txt'),
 PosixPath('data/aclImdb/train/pos/1393_7.txt')]

In [None]:
# Token frequencies over every file in `all_files` (train/pos + train/neg).
# NOTE: the loop variable `path` stays bound to the last file after the loop,
# and the later cell `z = spacy_tok(path.read_text())` reads it.
counts = Counter()
for path in all_files:
  counts.update(spacy_tok(path.read_text()))

In [None]:
counts

Counter({'Well': 1544,
         ',': 275407,
         'I': 81720,
         "'ll": 2788,
         'be': 26732,
         'honest': 467,
         ':': 9399,
         'It': 18353,
         'is': 109276,
         'not': 29004,
         'exactly': 958,
         'a': 156296,
         'Sholay': 37,
         '.': 275453,
         'But': 7318,
         'you': 30704,
         'ca': 3632,
         'nt': 490,
         'get': 9082,
         'every': 3488,
         'week': 443,
         'In': 5948,
         'fact': 3501,
         'could': 9238,
         'see': 11092,
         'distinct': 84,
         'signatures': 3,
         'of': 144098,
         '"': 64995,
         'without': 3026,
         'my': 10299,
         'Daughter"(Sally': 1,
         'Field': 68,
         '1991': 65,
         ')': 34367,
         'in': 87772,
         'this': 60739,
         'movie': 43236,
         'However': 1974,
         'as': 43522,
         'most': 8056,
         'inspired': 342,
         'movies': 7425,
         '

In [None]:
len(counts.keys())

103163

In [None]:
# Drop every token seen fewer than 5 times. Iterate over a snapshot of the
# items because entries are deleted from `counts` during the loop.
for word, freq in list(counts.items()):
  if freq < 5:
    del counts[word]

In [None]:
len(counts.keys())

33893

In [None]:
# Index 0 is the empty string (used as padding by the encoder) and index 1 is
# the "UNK" fallback; the surviving tokens follow in `counts` iteration order.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {}
for idx, word in enumerate(words):
    vocab2index[word] = idx

## Dataset

In [None]:
z = spacy_tok(path.read_text())

In [None]:
z

['Pretty',
 'bad',
 'movie',
 'offers',
 'nothing',
 'new',
 '.',
 'The',
 'usual',
 'creaks',
 'and',
 'moans',
 'attempt',
 'to',
 'make',
 '-',
 'up',
 'for',
 'a',
 'muddled',
 ',',
 'but',
 'thin',
 'story',
 '.',
 'Acting',
 'is',
 'barely',
 'above',
 'pathetic',
 '.',
 'Why',
 'Liam',
 'Neeson',
 'signed',
 'on',
 'for',
 'this',
 'is',
 'anyone',
 "'s",
 'guess',
 '.',
 'Owen',
 'Wilson',
 'truly',
 'turns',
 'in',
 'one',
 'of',
 'the',
 'worst',
 'performances',
 'in',
 'recent',
 'horror',
 '-',
 'movie',
 'history',
 '.',
 'Catherine',
 'Zeta',
 'Jones',
 'is',
 'fun',
 'to',
 'look',
 'at',
 'and',
 'not',
 'much',
 'else',
 'although',
 'Lili',
 'Tayor',
 'did',
 'an',
 'above',
 '-',
 'average',
 'job',
 '.',
 'The',
 'special',
 'effects',
 'were',
 'fairly',
 'memorable',
 'and',
 'the',
 'house',
 'itself',
 'was',
 'breathtaking',
 'and',
 'hauntingly',
 'gorgeous',
 '.',
 'However',
 'they',
 'ca',
 "n't",
 'makeup',
 'for',
 'the',
 'poor',
 'acting',
 'and',
 'th

In [None]:
def encode_sentence(path, vocab2index, N = 400, padding_start = True):
  """Read a text file and encode it as a fixed-length array of vocabulary ids.

  Args:
    path: object with a `read_text()` method (e.g. `pathlib.Path`).
    vocab2index: mapping token -> id; must contain "UNK", which is used for
      tokens missing from the mapping.
    N: output length. Longer texts are cut to their first N tokens.
    padding_start: despite the name, True writes the tokens at the START of
      the array and leaves the zero padding at the END; False right-aligns
      the tokens so the zeros come first.

  Returns:
    (enc, length): `enc` is an int32 array of size N, `length` is the number
    of real (non-padding) tokens kept, i.e. min(N, number of tokens).
  """
  tokens = spacy_tok(path.read_text())
  enc = np.zeros(N, dtype = np.int32)
  token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
  length = min(N, len(token_ids))
  kept = token_ids[:length]
  if not padding_start:
    # Right-align: zeros first, tokens last.
    enc[N-length: ] = kept
  else:
    # Left-align: tokens first, zeros last.
    enc[ :length] = kept
  return enc, length


In [None]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N = 400, padding_start = False)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            1,   374,    29,   899,    29,    66,    13,  5159,    28,
         2435,  2437,   106,     4,   356,    34,  2203,   117,   518,
         1101,    47,  1215, 12260,    47,  2420,     3,  2670,   300,
            3, 22637,    49,   189,    55,   402,    13,     1,   152,
      

In [None]:
class ImdbDataset(Dataset):
    """IMDB reviews encoded as fixed-length id arrays with binary labels.

    Each item is `(x, s, y)`: the encoded review, its unpadded length, and the
    label (1 for files under `pos`, 0 for files under `neg`).
    """

    def __init__(self, PATH, train="train", N=400, padding_start=True):
        """Collect the files of one split and encode all of them up front.

        Args:
            PATH: dataset root; `PATH/train` must contain `pos` and `neg`.
            train: name of the split sub-directory (e.g. "train" or "test").
            N: encoded length, forwarded to `encode_sentence`.
            padding_start: forwarded to `encode_sentence`.

        Relies on the module-level `vocab2index` and `encode_sentence`.
        """
        split_dir = PATH/train
        self.path_to_images = split_dir
        self.pos_files = list((split_dir/"pos").iterdir())
        self.neg_files = list((split_dir/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # Labels follow the file order: all positives (1) then all negatives (0).
        n_pos, n_neg = len(self.pos_files), len(self.neg_files)
        self.y = np.array([1] * n_pos + [0] * n_neg, dtype=int)
        # Encode once here so __getitem__ is a cheap lookup instead of
        # re-tokenizing a file on every access.
        self.X = [encode_sentence(review, vocab2index, N, padding_start)
                  for review in self.files]

    def __len__(self):
        """Number of reviews in the split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(encoded_ids, length, label)` for item `idx`."""
        encoded, length = self.X[idx]
        label = self.y[idx]
        return encoded, length, label

## LSTM with Variable Lengths

In [None]:
# dataset with padding at the end
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [None]:
class LSTMModel(torch.nn.Module) :
    """Single-layer LSTM classifier over variable-length token-id sequences.

    Pipeline: embedding (index 0 is padding) -> dropout(0.5) -> LSTM ->
    linear layer on the final hidden state. Outputs one raw logit per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        """Compute one logit per sequence, in the same order as the input batch.

        Args:
            x: integer tensor of token ids, indexed as (batch, seq).
            s: 1-D integer tensor with the unpadded length of each row of `x`;
               every length must be >= 1 for packing to succeed.

        Returns:
            Tensor of shape (batch, 1) on the same device as the model output.
        """
        # pack_padded_sequence expects the batch sorted by decreasing length.
        s, sort_index = torch.sort(s, 0, descending=True)
        # Keep the permutation on x's device so both the gather below and the
        # final scatter work on CPU and GPU alike (previously hard-coded .cuda()).
        sort_index = sort_index.to(x.device)
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s.tolist(), batch_first=True)
        _, (ht, _) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        # Undo the sort: row i of `out` belongs at original position sort_index[i].
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1), out)


In [None]:
def train_epochs(model, epochs=10, lr=0.001):
    """Train `model` with Adam and binary cross-entropy on logits.

    Reads the module-level `train_dl` and `valid_dl` data loaders and calls the
    module-level `val_metrics` after every epoch. Metrics are printed only on
    epochs where `i % 5 == 1` (epochs 1, 6, 11, ...).

    Args:
        model: module called as `model(x, s)` returning one logit per example.
        epochs: number of passes over `train_dl`.
        lr: Adam learning rate.
    """
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            # `s` (sequence lengths) is passed through untouched; the models
            # convert it to a Python list for packing.
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-example mean.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [None]:
def val_metrics(model, valid_dl):
    """Evaluate `model` on `valid_dl` and return `(mean_loss, accuracy)`.

    Args:
        model: module called as `model(x, s)` returning logits of shape (batch, 1).
        valid_dl: iterable of `(x, s, y)` batches.

    Returns:
        mean_loss: Python float, per-example binary cross-entropy.
        accuracy: fraction of examples whose logit sign matches the label,
            returned as a 0-dim tensor (as before).

    Side effect: leaves the model in eval mode.
    """
    model.eval()
    # Evaluate on the model's own device rather than assuming CUDA.
    device = next(model.parameters()).device
    correct = 0
    total = 0
    sum_loss = 0.0
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory on the large validation batches.
    with torch.no_grad():
        for x, s, y in valid_dl:
            x = x.long().to(device)
            y = y.float().unsqueeze(1).to(device)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # A positive logit corresponds to a predicted probability above 0.5.
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total


In [None]:
# Module-level loaders: `train_epochs` reads `train_dl` and `valid_dl` as
# globals, so rebinding either name later changes what subsequent training uses.
batch_size = 2000
# Only the training loader is shuffled.
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size= batch_size)

In [None]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 50).cuda()

33895


In [None]:
train_epochs(model, epochs = 30, lr = 0.01)

train loss 0.594 val loss 0.607 and val accuracy 0.661
train loss 0.213 val loss 0.475 and val accuracy 0.827
train loss 0.076 val loss 0.762 and val accuracy 0.823
train loss 0.043 val loss 0.667 and val accuracy 0.854
train loss 0.025 val loss 0.767 and val accuracy 0.854
train loss 0.018 val loss 0.946 and val accuracy 0.843


In [None]:
train_epochs(model, epochs = 30, lr = 0.001)

train loss 0.012 val loss 0.974 and val accuracy 0.846
train loss 0.013 val loss 0.906 and val accuracy 0.853
train loss 0.012 val loss 1.003 and val accuracy 0.848
train loss 0.010 val loss 0.936 and val accuracy 0.852
train loss 0.009 val loss 1.046 and val accuracy 0.851
train loss 0.008 val loss 1.049 and val accuracy 0.852


In [None]:
def save_model(m, p):
    """Write the `state_dict` of module `m` to path `p` (weights only)."""
    weights = m.state_dict()
    torch.save(weights, p)
def load_model(m, p):
    """Load the `state_dict` stored at path `p` into module `m`, in place."""
    weights = torch.load(p)
    m.load_state_dict(weights)

In [None]:
# Create the checkpoint directory with pathlib instead of `! mkdir $PATH/...`:
# the shell form depends on IPython substituting the Python variable PATH for
# `$PATH` (easily confused with the shell's own PATH), and plain `mkdir`
# errors out when the directory already exists on a re-run.
(PATH/"models").mkdir(parents=True, exist_ok=True)

In [None]:
p = PATH/"models/model-86.pth"
save_model(model, p)

In [None]:
val_metrics(model, valid_dl)

(1.0444632053375245, tensor(0.8513, device='cuda:0'))

In [None]:
load_model(model, p)

## GRU model with dropout

In [None]:
class GRUModel(torch.nn.Module):
  """Single-layer GRU classifier over variable-length token-id sequences.

  Pipeline: embedding (index 0 is padding) -> dropout(0.5) -> GRU ->
  linear layer on the final hidden state. Outputs one raw logit per sequence.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(GRUModel, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
    self.hidden_dim = hidden_dim
    self.dropout = nn.Dropout(0.5)
    self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first = True)
    self.linear = nn.Linear(hidden_dim, 1)

  def forward(self, x, s):
    """Compute one logit per sequence, in the same order as the input batch.

    Args:
      x: integer tensor of token ids, indexed as (batch, seq).
      s: 1-D integer tensor with the unpadded length of each row of `x`;
         every length must be >= 1 for packing to succeed.

    Returns:
      Tensor of shape (batch, 1) on the same device as the model output.
    """
    # pack_padded_sequence expects the batch sorted by decreasing length.
    s, sort_index = torch.sort(s, 0, descending = True)
    # Keep the permutation on x's device so the gather and the final scatter
    # work on CPU and GPU alike (previously hard-coded .cuda()).
    sort_index = sort_index.to(x.device)
    x = x[sort_index]
    x = self.embeddings(x)
    x = self.dropout(x)
    x_pack = pack_padded_sequence(x, s.tolist(), batch_first = True)
    _, ht = self.gru(x_pack)
    out = self.linear(ht[-1])
    # Undo the sort: row i of `out` belongs at original position sort_index[i].
    return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1), out)


In [None]:
vocab_size = len(words)
print(vocab_size)
model2 = GRUModel(vocab_size, 50, 50).cuda()

33895


In [None]:
train_epochs(model2, epochs = 30, lr = 0.01)

train loss 0.671 val loss 0.746 and val accuracy 0.565
train loss 0.344 val loss 0.440 and val accuracy 0.838
train loss 0.183 val loss 0.412 and val accuracy 0.876
train loss 0.111 val loss 0.501 and val accuracy 0.874
train loss 0.070 val loss 0.562 and val accuracy 0.871
train loss 0.048 val loss 0.647 and val accuracy 0.868


In [None]:
p = PATH/"models/model-gru-87.pth"
save_model(model2, p)

## Bidirectional and Multiple Layers GRUs / LSTMs

In [None]:
# Draw one small batch to inspect tensor shapes in the cells below.
# NOTE: this rebinds the module-level `batch_size` and `train_dl`. Because
# `train_epochs` reads `train_dl` as a global, every training call made after
# this cell runs with batches of 7.
batch_size = 7
train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
x, s, y = next(iter(train_dl))

In [None]:
vocab_size = len(words)
# Small toy sizes (presumably chosen so the inspected shapes are easy to read).
embedding_dim = 10
hidden_dim = 9
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0)
# One bidirectional layer.
lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first = True, bidirectional = True)
# Two stacked bidirectional layers with dropout 0.1 passed to nn.LSTM.
lstm2 = nn.LSTM(embedding_dim, hidden_dim, num_layers = 2, batch_first = True, dropout = 0.1, bidirectional = True)

In [None]:
# Sort the batch by decreasing length, reorder `x` to match, embed it, and pack it.
# NOTE: `s` and `x` are rebound in place (x goes from token ids to embeddings),
# so this cell is not safe to re-run without re-running the batch-sampling cell.
s, sort_index = s.sort(0, descending = True)
x = x[sort_index]
x = embed(x.long())
x_pack = pack_padded_sequence(x, list(s), batch_first = True)

In [None]:
# Feed the packed batch (built in the previous cell) rather than the padded
# tensor, so the final hidden states are taken at each sequence's true last
# token instead of after the padding. `lstm_out` is therefore a PackedSequence;
# the shape of `ht` is unaffected.
lstm_out, (ht, ct) = lstm1(x_pack)

In [None]:
ht.shape

torch.Size([2, 7, 9])

In [None]:
ht[-2,:,:].shape

torch.Size([7, 9])

In [None]:
# Use the packed batch so padded positions do not influence the final hidden
# states; `lstm_out` is a PackedSequence, and the shape of `ht2` is unaffected.
lstm_out, (ht2, ct2) = lstm2(x_pack)

In [None]:
ht2[-2,:, :].shape, ht2[-1,:, :].shape

(torch.Size([7, 9]), torch.Size([7, 9]))

In [None]:
#concat the final forward (ht[-2, :, :]) and backward( ht[-1,:,:]) hidden layers
h = torch.cat((ht2[-2,:, :], ht2[-1,:,:]), dim =1)
h.shape

torch.Size([7, 18])

In [None]:
class LSTMBiModel(torch.nn.Module):
  """Two-layer bidirectional LSTM classifier over variable-length sequences.

  Pipeline: embedding (index 0 is padding) -> 2-layer bidirectional LSTM with
  inter-layer dropout 0.3 -> linear layer on the concatenated final forward and
  backward hidden states of the last layer. Outputs one raw logit per sequence.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(LSTMBiModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx= 0)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers = 2, batch_first = True,
                        dropout = 0.3, bidirectional = True)
    # Input width is doubled because both directions are concatenated.
    self.linear = nn.Linear(2* hidden_dim, 1)

  def forward(self, x, s):
    """Compute one logit per sequence, in the same order as the input batch.

    Args:
      x: integer tensor of token ids, indexed as (batch, seq).
      s: 1-D integer tensor with the unpadded length of each row of `x`;
         every length must be >= 1 for packing to succeed.

    Returns:
      Tensor of shape (batch, 1) on the same device as the model output.
    """
    # pack_padded_sequence expects the batch sorted by decreasing length.
    s, sort_index = torch.sort(s, 0, descending = True)
    # Keep the permutation on x's device so the gather and the final scatter
    # work on CPU and GPU alike (previously hard-coded .cuda()).
    sort_index = sort_index.to(x.device)
    x = x[sort_index]
    x = self.embeddings(x)
    x_pack = pack_padded_sequence(x, s.tolist(), batch_first = True)
    _, (ht, _) = self.lstm(x_pack)
    # ht[-2] / ht[-1]: last layer's forward / backward final hidden state.
    h = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim =1)
    h = self.linear(h)
    # Undo the sort: row i of `h` belongs at original position sort_index[i].
    return torch.zeros_like(h).scatter_(0, sort_index.unsqueeze(1), h)

In [None]:
vocab_size = len(words)
model3 = LSTMBiModel(vocab_size, 50, 50).cuda()

In [None]:
train_epochs(model3, epochs = 5, lr = 0.01)

train loss 0.216 val loss 0.454 and val accuracy 0.829


## Bidirectional GRUs

In [None]:
class GRUBiModel(torch.nn.Module):
  """Two-layer bidirectional GRU classifier over variable-length sequences.

  Pipeline: embedding (index 0 is padding) -> 2-layer bidirectional GRU with
  inter-layer dropout 0.3 -> linear layer on the concatenated final forward and
  backward hidden states of the last layer. Outputs one raw logit per sequence.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(GRUBiModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx =0)
    self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers = 2,
                      batch_first = True, dropout = 0.3, bidirectional = True)
    # Input width is doubled because both directions are concatenated.
    self.linear = nn.Linear(2 * hidden_dim, 1)

  def forward(self, x, s):
    """Compute one logit per sequence, in the same order as the input batch.

    Args:
      x: integer tensor of token ids, indexed as (batch, seq).
      s: 1-D integer tensor with the unpadded length of each row of `x`;
         every length must be >= 1 for packing to succeed.

    Returns:
      Tensor of shape (batch, 1) on the same device as the model output.
    """
    # pack_padded_sequence expects the batch sorted by decreasing length.
    s, sort_index = torch.sort(s, 0, descending = True)
    sort_index = sort_index.to(x.device)
    x = x[sort_index]
    x = self.embeddings(x)
    x_pack = pack_padded_sequence(x, s.tolist(), batch_first = True)
    _, ht = self.gru(x_pack)
    # ht[-2] / ht[-1]: last layer's forward / backward final hidden state.
    h = torch.cat((ht[-2,:, :], ht[-1,:,:]), dim =1)
    h = self.linear(h)
    # Restore the caller's batch order. The logits used to be returned in
    # length-sorted order, so they were compared against the wrong labels and
    # the model could not learn (validation accuracy stayed near chance).
    return torch.zeros_like(h).scatter_(0, sort_index.unsqueeze(1), h)

In [None]:
vocab_size = len(words)
model4 = GRUBiModel(vocab_size, 50, 50).cuda()

In [None]:
train_epochs(model4, epochs = 10, lr = 0.01)

train loss 0.742 val loss 0.741 and val accuracy 0.483
train loss 0.742 val loss 0.738 and val accuracy 0.499
