In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import torchtext.experimental
import torchtext.experimental.vectors

import collections
import time
import random

In [3]:
from classifier import *
from helpers import ep_time, predict

In [4]:
seed = 1234
max_len = 500
max_size = 25000

torch.manual_seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
raw_train, raw_test = torchtext.experimental.datasets.raw.IMDB()
raw_train, raw_valid = get_train_valid_split(raw_train)

aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 9.11MB/s]


In [6]:
tokenizer = Tokenizer(max_len=max_len)

In [7]:
vocab = gen_vocab(raw_train, tokenizer, max_size=max_size)

In [8]:
train_data = process_raw(raw_train,tokenizer, vocab)
test_data = process_raw(raw_test,tokenizer, vocab)
valid_data = process_raw(raw_valid,tokenizer, vocab)

In [9]:
batch_size = 256
pad_token = '<pad>'
unk_token = '<unk>'
pad_idx = vocab[pad_token]
input_dim = len(vocab)
emb_dim = 100
hid_dim = 256
output_dim = 2
n_layers = 2
dropout = 0.5
n_epochs = 10
best_valid_loss = float('inf')

In [10]:
collator = Collator(pad_idx)

In [11]:
train_iterator = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True, collate_fn=collator.collate)

valid_iterator = torch.utils.data.DataLoader(valid_data, batch_size, shuffle=False, collate_fn=collator.collate)

test_iterator = torch.utils.data.DataLoader(test_data, batch_size, shuffle=False, collate_fn=collator.collate)

In [12]:
model = BiLSTM(input_dim, emb_dim, hid_dim, output_dim, n_layers, dropout, pad_idx)

glove = torchtext.experimental.vectors.GloVe(name = '6B',
                                             dim = emb_dim)
# for n,p in model.named_parameters():
#     print(f'name:{n}\nshape:{p.shape}\n')
model.apply(init_params)

glove.6B.zip: 100%|██████████| 862M/862M [08:18<00:00, 1.73MB/s] 


BiLSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (lstm): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [13]:
pretrained_embedding, unk_tokens = get_pretrained_embedding(model.embedding, glove, vocab, unk_token)

In [14]:
model.embedding.weight.data.copy_(pretrained_embedding)

tensor([[-0.0398,  0.0357, -0.0046,  ..., -0.0485, -0.0088,  0.0329],
        [-0.0330,  0.0428,  0.0304,  ...,  0.0236,  0.0487,  0.0101],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2925,  0.1087,  0.7920,  ..., -0.3641,  0.1822, -0.4104],
        [-0.7250,  0.7545,  0.1637,  ..., -0.0144, -0.1761,  0.3418],
        [ 1.1753,  0.0460, -0.3542,  ...,  0.4510,  0.0485, -0.4015]])

In [15]:
model.embedding.weight.data[pad_idx] = torch.zeros(emb_dim)

In [16]:
optimizer = optim.Adam(model.parameters())

In [17]:
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
model = model.to(device)
criterion = criterion.to(device)

In [19]:
for epoch in range(n_epochs):

    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    
    end_time = time.monotonic()

    epoch_mins, epoch_secs = ep_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bilstm.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 47s
	Train Loss: 0.788 | Train Acc: 50.93%
	 Val. Loss: 0.659 |  Val. Acc: 63.48%
Epoch: 02 | Epoch Time: 0m 48s
	Train Loss: 0.678 | Train Acc: 58.33%
	 Val. Loss: 0.662 |  Val. Acc: 60.19%
Epoch: 03 | Epoch Time: 0m 49s
	Train Loss: 0.633 | Train Acc: 64.07%
	 Val. Loss: 0.533 |  Val. Acc: 73.59%
Epoch: 04 | Epoch Time: 0m 49s
	Train Loss: 0.481 | Train Acc: 77.48%
	 Val. Loss: 0.539 |  Val. Acc: 81.34%
Epoch: 05 | Epoch Time: 0m 50s
	Train Loss: 0.356 | Train Acc: 85.18%
	 Val. Loss: 0.362 |  Val. Acc: 83.42%
Epoch: 06 | Epoch Time: 0m 49s
	Train Loss: 0.284 | Train Acc: 88.68%
	 Val. Loss: 0.275 |  Val. Acc: 89.03%
Epoch: 07 | Epoch Time: 0m 49s
	Train Loss: 0.252 | Train Acc: 90.10%
	 Val. Loss: 0.270 |  Val. Acc: 89.64%
Epoch: 08 | Epoch Time: 0m 49s
	Train Loss: 0.211 | Train Acc: 91.93%
	 Val. Loss: 0.274 |  Val. Acc: 89.33%
Epoch: 09 | Epoch Time: 0m 51s
	Train Loss: 0.182 | Train Acc: 93.41%
	 Val. Loss: 0.283 |  Val. Acc: 90.11%
Epoch: 10 | Epoch T

In [20]:
model.load_state_dict(torch.load('bilstm.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.291 | Test Acc: 88.15%


In [21]:
sent = 'I fucking hate you'
predict(tokenizer, vocab, model, device, sent)

0.4891465902328491