In [1]:
import pickle
import dill
import os
import settings as s
import time
import csv

import torch
import torchtext

import torch.optim as O
import torch.nn as nn

from gensim.models import KeyedVectors

from model import BinarySARNN
from configs import *
from utils import dir_to_csv, tokenize, binary_accuracy, epoch_time

In [2]:
train_files_path = os.path.join(s.DATASET_PATH, s.TRAIN_DIR)
test_files_path = os.path.join(s.DATASET_PATH, s.TEST_DIR)
dataset_path = os.path.join(s.DATA_DIR, s.CSV)

train_file = os.path.join(s.DATA_DIR, "train.csv")
val_file = os.path.join(s.DATA_DIR, "val.csv")
test_file = os.path.join(s.DATA_DIR, "test.csv")

d_embedding = s.D_EMBEDDING

embeddings_path = os.path.join(s.DATASET_PATH, s.EMBEDDINGS_FILE)
vocab_path = os.path.join(s.DATA_DIR, s.VOCAB_FILE)

model_config = LSTM_CONFIG3
train_config = TRAIN_CONFIG5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
def ds_to_csv(ds, file):
    if not os.path.isfile(file):
        with open(file, "w+") as csvf:
            writer = csv.writer(csvf,
                                delimiter=" ",
                                quoting=csv.QUOTE_NONNUMERIC)
            for e in ds:  
                writer.writerow([" ".join(e.review), e.label])

In [4]:
# create train set
dataset_paths = [train_files_path, test_files_path]
dir_to_csv(s.CSV, dataset_paths)
  
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

print("Loading dataset...")
if not (os.path.isfile(train_file) or os.path.isfile(val_file) or os.path.isfile(test_file)):
    
    REVIEW = torchtext.data.Field(tokenize=tokenize, lower=True, include_lengths=True)
    
    print("Splitting train, val and test sets...")
    dataset = torchtext.data.TabularDataset(path=dataset_path, format="CSV",
                                          fields=[("review", REVIEW),
                                                  ("label", LABEL)],
                                          csv_reader_params={"delimiter": " "})

    train, val, test = dataset.split(split_ratio=[0.7, 0.2, 0.1])
    
    #serialize train, val and test
    ds_to_csv(train, train_file)
    ds_to_csv(val, val_file)
    ds_to_csv(test, test_file)
    
    REVIEW.build_vocab(train)
    
    with open(os.path.join(s.DATA_DIR, "REVIEW.field"), "wb+") as reviewf:
        dill.dump(REVIEW, reviewf)
        
    print("Train and Validation sets successfully created.")
    
else:
    print("Split already done, loading...")
    
    with open(os.path.join(s.DATA_DIR, "REVIEW.field"), "rb") as reviewf:
        REVIEW = dill.load(reviewf)
        
    train = torchtext.data.TabularDataset(path=train_file, format="CSV",
                                          fields=[("review", REVIEW),
                                                  ("label", LABEL)],
                                          csv_reader_params={"delimiter": " "})
    
    val = torchtext.data.TabularDataset(path=val_file, format="CSV",
                                          fields=[("review", REVIEW),
                                                  ("label", LABEL)],
                                          csv_reader_params={"delimiter": " "})
    
    test = torchtext.data.TabularDataset(path=test_file, format="CSV",
                                          fields=[("review", REVIEW),
                                                  ("label", LABEL)],
                                          csv_reader_params={"delimiter": " "})
    print("Train and Validation sets successfully loaded.")


train_iter = torchtext.data.Iterator(train, model_config.batch_size, device=device, sort_within_batch = True,
                                     sort_key=lambda x: len(x.review))
val_iter = torchtext.data.Iterator(val, model_config.batch_size, device=device, sort_within_batch = True,
                                   sort_key=lambda x: len(x.review))

test_iter = torchtext.data.Iterator(test, 64, device=device, sort_within_batch = True,
                               sort_key=lambda x: len(x.review))

Directory already exists, skipping creation...
Loading dataset...
Split already done, loading...
Train and Validation sets successfully loaded.


In [5]:
# load word embeddings
print("Loading word embeddings...")
embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False,
                                               unicode_errors='ignore')
print("Word embeddings successfully loaded.\n")

vocab = embeddings.vocab
if not os.path.isfile(vocab_path):
    print("Vocabulary file not present, creating...")
    with open(vocab_path, "wb+") as vf:
        pickle.dump(vocab, vf)
    print("Done.\n")
else:
    print("Vocabulary file already exists, skipping creation.")

Loading word embeddings...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Word embeddings successfully loaded.

Vocabulary file already exists, skipping creation.


In [6]:
# set model configurations
model_config.vocab_size = len(vocab)
model_config.d_embed = d_embedding

model = BinarySARNN(model_config)

model.embed.weight.data.copy_(torch.from_numpy(embeddings.vectors))
model.to(device)

BinarySARNN(
  (embed): Embedding(168994, 300)
  (rnn): LSTM(300, 128, num_layers=2, bidirectional=True)
  (hidden_to_label): Linear(in_features=256, out_features=2, bias=True)
)

In [7]:
UNK_IDX = REVIEW.vocab.stoi[REVIEW.unk_token]
PAD_IDX = REVIEW.vocab.stoi[REVIEW.pad_token]

model.embed.weight.data[UNK_IDX] = torch.zeros(model_config.d_embed)
model.embed.weight.data[PAD_IDX] = torch.zeros(model_config.d_embed)

In [8]:
# set training configuration
criterion = train_config.criterion()
criterion.to(device)

opt = train_config.optimizer(model.parameters(), **train_config.o_kwargs)

print(str(model_config) + "\n")
print(str(train_config) + "\n")

Model Config
d_hidden = 128
vocab_size = 168994
d_embed = 300
batch_size = 64
n_layers = 2
nonlin = 
dropout = 0
bidir = True
arch = LSTM

Train Config
criterion = <class 'torch.nn.modules.loss.CrossEntropyLoss'>
optimizer = <class 'torch.optim.rmsprop.RMSprop'>
optimizer args = {}
epochs = 5



In [9]:
iterations = 0
start = time.time()
best_val_acc = -1
val_every = 100
train_iter.repeat = False
best_epoch = 1

model_fname = f"{model_config.id}|{train_config.id}.pt"

for epoch in range(train_config.epochs):
    train_iter.init_epoch()
    #print(torch.cuda.memory_allocated(device=device))
    #print("train...")
    model.train()
    for batch_idx, batch in enumerate(train_iter):
        #print(torch.cuda.memory_allocated(device=device))
        
        opt.zero_grad()
        
        iterations += 1
        #forward pass
        #print(batch.review)
        reviews, review_lengths = batch.review
        answer = model(reviews, review_lengths)
        
        #calculate accuracy in current batch
        #print(torch.max(answer, 1)[1], " vs", batch.label)
        train_acc = binary_accuracy(torch.max(answer, 1)[1], batch.label).item()
        #calculate loss
        loss = criterion(answer, batch.label)

        #backpropagate, calculating gradients
        loss.backward()
        
        #update model parameters
        opt.step()
    #print("eval..")
    #evaluate
    model.eval()
    val_iter.init_epoch()
    
    epoch_loss, epoch_acc = 0, 0
    
    
    with torch.no_grad():
        
        for val_batch_idx, val_batch in enumerate(val_iter):
            #print(torch.cuda.memory_allocated(device=device))
            reviews, review_lengths = val_batch.review
            answer = model(reviews, review_lengths)
            epoch_acc += binary_accuracy(torch.max(answer, 1)[1], val_batch.label).item()
            epoch_loss += criterion(answer, val_batch.label)

    epoch_loss /= len(val_iter)
    epoch_acc /= len(val_iter)
    torch.save(model.state_dict(), os.path.join(s.MODELS_PATH, f"{model_fname}"))
    if epoch_acc > best_val_acc:
        print("Saving to", str(os.path.join(s.MODELS_PATH, model_fname)))
        best_val_acc = epoch_acc
        torch.save(model.state_dict(), os.path.join(s.MODELS_PATH, model_fname))
        best_epoch = epoch
                
    epoch_mins, epoch_secs = epoch_time(start, time.time())
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f'\t Val. Loss: {epoch_loss:.3f} |  Val. Acc: {epoch_acc*100:.2f}%')
print(f'Best Epoch -> {best_epoch+1:02}')

Saving to /home/jupyter/models/aclImdb/lstm_2layers_bidir|rmsprop.pt
Epoch: 01 | Epoch Time: 4m 54s
	Train Loss: 0.518 | Train Acc: 73.21%
	 Val. Loss: 0.529 |  Val. Acc: 72.77%
Saving to /home/jupyter/models/aclImdb/lstm_2layers_bidir|rmsprop.pt
Epoch: 02 | Epoch Time: 9m 50s
	Train Loss: 0.348 | Train Acc: 85.71%
	 Val. Loss: 0.326 |  Val. Acc: 86.23%
Epoch: 03 | Epoch Time: 14m 42s
	Train Loss: 0.223 | Train Acc: 91.07%
	 Val. Loss: 0.345 |  Val. Acc: 85.70%
Epoch: 04 | Epoch Time: 19m 36s
	Train Loss: 0.060 | Train Acc: 96.43%
	 Val. Loss: 0.392 |  Val. Acc: 85.82%
Epoch: 05 | Epoch Time: 24m 28s
	Train Loss: 0.066 | Train Acc: 96.43%
	 Val. Loss: 0.598 |  Val. Acc: 83.03%
Best Epoch -> 02


## Test

In [None]:
archs =  {"simple_rnn": RNN_CONFIG1, "simple_rnn_2layers": RNN_CONFIG2, "simple_rnn_2layers_bidir": RNN_CONFIG3, 
         "lstm": LSTM_CONFIG1, "lstm_2layers": LSTM_CONFIG2, "lstm_2layers_bidir": LSTM_CONFIG3}
optimizers = ["sgd", "rmsprop", "adadelta", "adagrad", "adam"]

models = ["|".join([arch, optimizer]) for arch in archs for optimizer in optimizers]

In [None]:
for arch in archs:
    archs[arch].vocab_size = len(embeddings.vocab)
    archs[arch].d_embed = embeddings.vector_size
    
criterion = nn.modules.CrossEntropyLoss()

### JUST IN CASE

In [None]:
modelb = BinarySARNN(LSTM_CONFIG3)
modelb.load_state_dict(torch.load(os.path.join(s.MODELS_PATH, "e3-lstm_2layers_bidir|adam.pt")))
modelb.to(device)

modelc = BinarySARNN(LSTM_CONFIG3)
modelc.load_state_dict(torch.load(os.path.join(s.MODELS_PATH, "lstm_2layers_bidir|adam.pt")))
modelc.to(device)

In [None]:
criterion = nn.modules.CrossEntropyLoss()
#test_iter = torchtext.data.Iterator(test, 64, device=device, sort_within_batch = True,
 #                              sort_key=lambda x: len(x.review))
modelt = model
modelt.eval()
tloss, tacc = 0, 0
with torch.no_grad():
    for test_batch in test_iter:
        treviews, treview_lengths = test_batch.review
        tanswer = modelt(treviews, treview_lengths)
        tacc += binary_accuracy(torch.max(tanswer, 1)[1], test_batch.label).item()
        tloss += criterion(tanswer, test_batch.label)
tloss /= len(test_iter)
tacc /= len(test_iter)

print(f"\tTest Loss: {tloss:.3f} | Test Acc: {tacc*100:.2f}%\n")