In [1]:
import pickle
import os
import settings as s
import time

import torch
import torchtext

import torch.optim as O
import torch.nn as nn

from gensim.models import KeyedVectors

from model import BinarySARNN
from configs import *
from utils import dir_to_csv, tokenize, binary_accuracy, epoch_time

In [26]:
train_files_path = os.path.join(s.DATASET_PATH, s.TRAIN_DIR)
test_files_path = os.path.join(s.DATASET_PATH, s.TEST_DIR)
dataset_path = os.path.join(s.DATA_DIR, s.CSV)

embeddings_path = os.path.join(s.DATASET_PATH, s.EMBEDDINGS_FILE)
vocab_path = os.path.join(s.DATA_DIR, s.VOCAB_FILE)

model_config = RNN_CONFIG1
train_config = TRAIN_CONFIG3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [27]:
# create train set
dataset_paths = [train_files_path, test_files_path]
dir_to_csv(s.CSV, dataset_paths)

REVIEW = torchtext.data.Field(tokenize=tokenize, lower=True)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

print("Loading dataset...")
dataset = torchtext.data.TabularDataset(path=dataset_path, format="CSV",
                                      fields=[("review", REVIEW),
                                              ("label", LABEL)],
                                      csv_reader_params={"delimiter": " "})

train, val, test = dataset.split(split_ratio=[0.7, 0.2, 0.1])


train_iter = torchtext.data.Iterator(train, model_config.batch_size, device=device)
val_iter = torchtext.data.Iterator(val, model_config.batch_size, device=device)

REVIEW.build_vocab(train)
print("Training set successfully loaded.")

Directory already exists, skipping creation...
Loading dataset...
Training set successfully loaded.


In [28]:
# load word embeddings
print("Loading word embeddings...")
embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False,
                                               unicode_errors='ignore')
print("Word embeddings successfully loaded.\n")

vocab = embeddings.vocab
if not os.path.isfile(vocab_path):
    print("Vocabulary file not present, creating...")
    with open(vocab_path, "wb+") as vf:
        pickle.dump(vocab, vf)
    print("Done.\n")
else:
    print("vocabulary file already exists, skipping creation...\n")

Loading word embeddings...
Word embeddings successfully loaded.

vocabulary file already exists, skipping creation...



In [29]:
# set model configurations
model_config.vocab_size = len(embeddings.vocab)
model_config.d_embed = embeddings.vector_size

model = BinarySARNN(model_config)
model.embed.weight.data.copy_(torch.from_numpy(embeddings.vectors))
model.to(device)

BinarySARNN(
  (embed): Embedding(168994, 300)
  (rnn): RNN(300, 64)
  (hidden_to_label): Linear(in_features=64, out_features=2, bias=True)
)

In [30]:
# set training configuration
criterion = train_config.criterion()
criterion.to(device)

opt = train_config.optimizer(model.parameters(), **train_config.o_kwargs)

print(str(model_config) + "\n")
print(str(train_config) + "\n")

Model Config
d_hidden = 64
vocab_size = 168994
d_embed = 300
batch_size = 64
n_layers = 1
nonlin = tanh
dropout = 0
bidir = False

Train Config
criterion = <class 'torch.nn.modules.loss.CrossEntropyLoss'>
optimizer = <class 'torch.optim.adagrad.Adagrad'>
optimizer args = {}
epochs = 5



In [31]:
iterations = 0
start = time.time()
best_val_acc = -1
val_every = 100
train_iter.repeat = False
best_epoch = 1

model_fname = f"{model_config.id}|{train_config.id}.pt"

for epoch in range(train_config.epochs):
    train_iter.init_epoch()


    for batch_idx, batch in enumerate(train_iter):
        model.train()
        opt.zero_grad()
        
        iterations += 1
        #forward pass
        answer = model(batch.review)
        
        #calculate accuracy in current batch
        train_acc = binary_accuracy(torch.max(answer, 1).values, batch.label.float()).item()
        #calculate loss
        loss = criterion(answer, batch.label)

        #backpropagate, calculating gradients
        loss.backward()
        
        #update model parameters
        opt.step()
        
    #evaluate
    model.eval()
    val_iter.init_epoch()
    
    epoch_loss, epoch_acc = 0, 0
    
    with torch.no_grad():
        
        for val_batch_idx, val_batch in enumerate(val_iter):
            answer = model(val_batch.review)
            epoch_acc += binary_accuracy(torch.max(answer, 1).values, val_batch.label.float()).item()
            epoch_loss += criterion(answer, val_batch.label)

    epoch_loss /= len(val_iter)
    epoch_acc /= len(val_iter)

    if epoch_acc > best_val_acc:
        best_val_acc = epoch_acc
        torch.save(model.state_dict(), os.path.join(s.MODELS_PATH, model_fname))
        best_epoch = epoch
                
    epoch_mins, epoch_secs = epoch_time(start, time.time())
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f'\t Val. Loss: {epoch_loss:.3f} |  Val. Acc: {epoch_acc*100:.2f}%')
print(f'Best Epoch -> {best_epoch+1:02}')

Epoch: 01 | Epoch Time: 1m 32s
	Train Loss: 0.695 | Train Acc: 57.14%
	 Val. Loss: 0.693 |  Val. Acc: 49.25%
Best Epoch -> 01
Epoch: 02 | Epoch Time: 3m 6s
	Train Loss: 0.688 | Train Acc: 55.36%
	 Val. Loss: 0.694 |  Val. Acc: 49.29%
Best Epoch -> 02
Epoch: 03 | Epoch Time: 4m 40s
	Train Loss: 0.710 | Train Acc: 35.71%
	 Val. Loss: 0.693 |  Val. Acc: 49.49%
Best Epoch -> 03
Epoch: 04 | Epoch Time: 6m 13s
	Train Loss: 0.691 | Train Acc: 57.14%
	 Val. Loss: 0.695 |  Val. Acc: 49.27%
Best Epoch -> 03
Epoch: 05 | Epoch Time: 7m 44s
	Train Loss: 0.714 | Train Acc: 50.00%
	 Val. Loss: 0.694 |  Val. Acc: 49.05%
Best Epoch -> 03


In [32]:
len(val_iter)

79

In [33]:
type(train)

torchtext.data.dataset.Dataset

In [34]:
import sys

In [35]:
torch.cuda.max_memory_allocated(device)

1991483904

In [36]:
torch.cuda.memory_allocated(device)

1424389632