In [1]:
import pickle
import os
import settings as s
import time

import torch
import torchtext

import torch.optim as O
import torch.nn as nn

from gensim.models import KeyedVectors

from model import BinarySARNN
from utils import dir_to_csv, tokenize, binary_accuracy, epoch_time

In [4]:
train_files_path = os.path.join(s.DATASET_PATH, s.TRAIN_DIR)
train_dataset_path = os.path.join(s.DATA_DIR, s.TRAIN_CSV)
embeddings_path = os.path.join(s.DATASET_PATH, s.EMBEDDINGS_FILE)
vocab_path = os.path.join(s.DATA_DIR, s.VOCAB_FILE)

model_config = s.MODEL_CONFIG
train_config = s.TRAIN_CONFIG

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [5]:
# create train set
dir_to_csv(s.TRAIN_CSV, train_files_path)

REVIEW = torchtext.data.Field(tokenize=tokenize, lower=True)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

print("Loading training set...")
train = torchtext.data.TabularDataset(path=train_dataset_path, format="CSV",
                                      fields=[("review", REVIEW),
                                              ("label", LABEL)],
                                      csv_reader_params={"delimiter": " "})


train_iter = torchtext.data.Iterator(train, model_config.batch_size, device=device)

REVIEW.build_vocab(train)
print("Training set successfully loaded.")


Directory already exists, skipping creation...
Loading training set...
Training set successfully loaded.


In [7]:
0.2 * 25000

5000.0

In [4]:
# load word embeddings
print("Loading word embeddings...")
embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False,
                                               unicode_errors='ignore')
print("Word embeddings successfully loaded.\n")

vocab = embeddings.vocab
if not os.path.isfile(vocab_path):
    print("Vocabulary file not present, creating...")
    with open(vocab_path, "wb+") as vf:
        pickle.dump(vocab, vf)
    print("Done.\n")
else:
    print("vocabulary file already exists, skipping creation...\n")

Loading word embeddings...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Word embeddings successfully loaded.

vocabulary file already exists, skipping creation...



In [5]:
# set model configurations
model_config.vocab_size = len(embeddings.vocab)
model_config.d_embed = embeddings.vector_size

model = BinarySARNN(model_config)
model.embed.weight.data.copy_(torch.from_numpy(embeddings.vectors))
model.to(device)

BinarySARNN(
  (embed): Embedding(168994, 300)
  (rnn): RNN(300, 128, num_layers=2, bidirectional=True)
  (hidden_to_label): Linear(in_features=128, out_features=2, bias=True)
)

In [6]:
# set training configuration
criterion = train_config.criterion()
criterion.to(device)

opt = train_config.optimizer(model.parameters(), **train_config.o_kwargs)

print(str(model_config) + "\n")
print(str(train_config) + "\n")

Model Config
d_hidden = 128
vocab_size = 168994
d_embed = 300
batch_size = 128
n_layers = 2
nonlin = tanh
dropout = 0
bidir = True

Train Config
criterion = <class 'torch.nn.modules.loss.CrossEntropyLoss'>
optimizer = <class 'torch.optim.adagrad.Adagrad'>
optimizer args = {}
epochs = 5



In [7]:
iterations = 0
start = time.time()
best_dev_acc = -1
train_iter.repeat = False

for epoch in range(train_config.epochs):
    train_iter.init_epoch()


    for batch_idx, batch in enumerate(train_iter):
        model.train()
        opt.zero_grad()
        
        iterations += 1
        #forward pass
        answer = model(batch.review)
        
        #calculate accuracy in current batch
        train_acc = binary_accuracy(torch.max(answer, 1).values, batch.label.float()).item()
        #calculate loss
        loss = criterion(answer, batch.label)

        #backpropagate, calculating gradients
        loss.backward()
        
        #update model parameters
        opt.step()
        
    epoch_mins, epoch_secs = epoch_time(start, time.time())
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {loss:.3f} | Train Acc: {train_acc*100:.2f}%")

model_fname = f"{model_config.id}|{train_config.id}.pt"
torch.save(model.state_dict(), os.path.join(s.MODELS_PATH, model_fname))

Epoch: 01 | Epoch Time: 1m 51s
	Train Loss: 0.702 | Train Acc: 50.00%
Epoch: 02 | Epoch Time: 3m 42s
	Train Loss: 0.683 | Train Acc: 57.50%
Epoch: 03 | Epoch Time: 5m 33s
	Train Loss: 0.592 | Train Acc: 52.50%
Epoch: 04 | Epoch Time: 7m 25s
	Train Loss: 0.535 | Train Acc: 52.50%
Epoch: 05 | Epoch Time: 9m 16s
	Train Loss: 0.479 | Train Acc: 55.00%
