## Imports

In [1]:
# Standard-library imports.
import os
import re
import sys
import typing
import gc

# Append the relative path ``../src`` to the module search path (relative
# entries are resolved against the kernel's working directory).
sys.path.append(
    os.path.join('..','src')
)

# Project-local code.
# NOTE(review): the entry added above is ``../src`` itself, yet the imports below
# are package-qualified as ``src.*`` -- confirm which sys.path entry actually
# resolves them (the append may be redundant or may need to point at ``..``).
from src.models import NextWordPredictorModel
# NOTE(review): the star import hides where names come from. FromRawTextVocabulary
# and SequenceDataset, used in later cells, are presumably provided by it --
# consider importing them explicitly.
from src.data_processing import *

In [2]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [3]:
import nltk

# from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

## Global Variables & Hyperparameters

In [4]:
# Everything runs on the first CUDA device; stop immediately if no GPU is visible.
DEVICE = "cuda:0"
assert torch.cuda.is_available()
# apex provides the `amp` mixed-precision wrapper applied to the model when fp16 is True.
# NOTE(review): `optimizers` is not referenced in any visible cell.
from apex import amp, optimizers

# Sequence-length bounds handed to SequenceDataset (the maximum is passed as MAX_SEQ_LEN + 1).
MIN_SEQ_LEN = 2
MAX_SEQ_LEN = 20

# BATCH_SIZE is used by the train and validation DataLoaders (the test loader uses 2).
BATCH_SIZE = 16
# The remaining values are forwarded to NextWordPredictorModel.
NUM_LSTM_LAYERS = 2
EMB_SIZE = 128
HIDDEN_STATE_SIZE = 100
DROPOUT = 0.5
# When True, the model and its optimizer are wrapped with apex amp after construction.
fp16 = True
POS_ENCODING = False
STARTING_LR = 1e-3

# NOTE(review): val_split / test_split are not referenced in any visible cell;
# WikiText2() already returns separate train / validation / test splits.
val_split = 0.2
test_split = 0.1

# Seed the torch and numpy RNGs for reproducibility.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is documented by NVIDIA as forcing synchronous
# kernel launches (typically a debugging aid that slows training) -- confirm it is
# really wanted here; it is also set only after torch has been imported.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.manual_seed(0)
np.random.seed(23)

## Vocabulary

In [5]:
# Training split of WikiText2 plus torchtext's 'basic_english' tokenizer.
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')

# Join every item yielded by the split into one space-separated string; the
# vocabulary in the next cell is built from this text.
train_text = ' '.join(train_iter)

In [6]:
# Build the vocabulary from the training text only, using the tokenizer created above
# and no extra text cleaning.
# NOTE(review): max_voc_size / min_word_occ presumably cap the vocabulary at 10000
# entries and drop words seen fewer than 10 times -- confirm in FromRawTextVocabulary.
vocabulary = FromRawTextVocabulary(
    text = train_text,
    tokenizer = tokenizer,
    text_cleaner = None,
    max_voc_size = 10000,
    min_word_occ = 10
)

## Dataset

In [7]:
# Load the three WikiText2 splits and flatten each one into a single
# space-separated string (evaluated in train, val, test order).
train_iter, val_iter, test_iter = WikiText2()
train_text, val_text, test_text = (
    ' '.join(split_iter) for split_iter in (train_iter, val_iter, test_iter)
)

# Constructor arguments shared by all three splits.
# NOTE(review): the "+ 1" on the maximum length presumably leaves room for the
# next-word target -- confirm in SequenceDataset.
shared_dataset_kwargs = dict(
    vocabulary = vocabulary,
    max_seq_length = MAX_SEQ_LEN + 1,
    min_seq_length = MIN_SEQ_LEN,
    device = DEVICE,
)

train_dataset = SequenceDataset(text = train_text, **shared_dataset_kwargs)
val_dataset = SequenceDataset(text = val_text, **shared_dataset_kwargs)
test_dataset = SequenceDataset(text = test_text, **shared_dataset_kwargs)

# The raw strings (and the helper dict) are not needed once the datasets exist:
# drop the names and ask the garbage collector to reclaim the memory.
del train_text, val_text, test_text, shared_dataset_kwargs

gc.collect()

0

In [8]:
# Settings common to every loader: no pinned memory, and incomplete final
# batches are dropped so every batch has the full batch size.
loader_defaults = {'pin_memory': False, 'drop_last': True}

# Only the training loader shuffles; train/val share BATCH_SIZE while the test
# loader uses batches of 2.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size = BATCH_SIZE, shuffle = True, **loader_defaults
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size = BATCH_SIZE, shuffle = False, **loader_defaults
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size = 2, shuffle = False, **loader_defaults
)

del loader_defaults

In [9]:
def map_weights(weights, m_ = 0.01, M_ = 1):
    """Turn per-item values into inverse weights rescaled to ``[m_, M_]``.

    Every entry is inverted (``1 / value``) and the inverted values are then
    min-max scaled, so the largest input maps to ``m_`` and the smallest input
    maps to ``M_``.

    Args:
        weights: Array-like of non-zero numbers (ndarray, list or tuple).
            Zero entries are not special-cased: they invert to ``inf`` and
            lead to non-finite results.
        m_: Lower bound of the output range.
        M_: Upper bound of the output range.

    Returns:
        np.ndarray with the same shape as ``weights``. When every input is
        identical the scaling is undefined (0/0); in that case all entries
        are set to ``M_`` instead of NaN.

    Raises:
        ValueError: If ``weights`` is empty.
    """
    # Convert before dividing so plain lists/tuples work (`1 / list` is a TypeError).
    inverse = 1 / np.asarray(weights)
    # ndarray reductions instead of the Python builtins: faster, and not
    # limited to 1-D input.
    low, high = inverse.min(), inverse.max()
    spread = high - low
    if spread == 0:
        # All inputs equal: avoid the 0/0 division that would yield NaN.
        return np.full_like(inverse, M_)
    return (inverse - low) * (M_ - m_) / spread + m_

# Rescaled inverse weights computed from the values stored in vocabulary.vocab
# (presumably per-word occurrence counts -- confirm).
weights = map_weights(np.array(list(vocabulary.vocab.values())))
# NOTE(review): the result above is discarded immediately, so the model in the
# next cell is built with weight=None. Remove this override to actually use the
# computed weights.
weights = None

In [10]:
# Instantiate the predictor from the hyperparameters of the config cell and move
# it to DEVICE.
model = NextWordPredictorModel(
    emb_dim  = EMB_SIZE,
    vocab_size = vocabulary.get_vocab_size(),
    num_lstm_hidden_layers = NUM_LSTM_LAYERS,
    hidden_state_size = HIDDEN_STATE_SIZE,
    dropout = DROPOUT,
    device = DEVICE,
    lr = STARTING_LR,
    fp16 = fp16,
    weight = weights,
    positional_encoding = POS_ENCODING
).to(DEVICE)

# With fp16 enabled, hand the model and its optimizer to apex amp (opt_level 'O1').
# The tuple assignment rebinds `model` first, then stores the returned optimizer
# on that returned model.
if fp16:
    model, model.optimizer = amp.initialize(
        model,
        model.optimizer,
        opt_level = 'O1' # https://nvidia.github.io/apex/amp.html
    )

# Attach a StepLR schedule (step size 1.0, gamma 0.95) to the (possibly amp-wrapped)
# optimizer; it must be created after amp.initialize so it tracks that optimizer.
# NOTE(review): when scheduler.step() is called is decided inside model.fit, which
# is not visible here.
model.scheduler = torch.optim.lr_scheduler.StepLR(model.optimizer, 1.0, gamma=0.95)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [None]:
# Train on the training loader and evaluate on the validation loader for up to
# 5 epochs, with early stopping on 'val_loss' (lower is better, patience 2).
# The returned `metrics` object is turned into a DataFrame in the next cell.
# NOTE(review): the exact early-stopping semantics are defined by
# NextWordPredictorModel.fit, which is not visible here.
metrics = model.fit(
    train_dataloader = train_dataloader,
    eval_dataloader = val_dataloader,
    num_epochs = 5,
    early_stopping = True,
    early_stopping_patience = 2,
    early_stopping_metric = 'val_loss',
    early_stopping_metric_best = 'min', # if lower is better (like for loss)
)

 30%|███       | 2084/6870 [00:06<00:15, 303.11it/s]

In [None]:
# Transpose so that 'train_loss' and 'val_loss' become columns, then draw both
# curves on one axes with a matching legend.
df = pd.DataFrame(metrics).T
fig, ax = plt.subplots()
for loss_name in ('train_loss', 'val_loss'):
    ax.plot(df[loss_name])
ax.legend(['train_loss', 'val_loss'])
plt.show()

In [None]:
# Qualitative check: run one hand-written sentence through the model.
sent = 'Historians write in the context of their own time and with due regard to the current dominant ideas of how to interpret the past'
# Lower-case each space-separated word and look up its index through the dataset.
# NOTE(review): the vocabulary was built with the 'basic_english' tokenizer; a plain
# split(' ') presumably only matches it for punctuation-free text like this -- confirm.
ind = [train_dataset.get_idx(w.lower()) for w in sent.split(' ')]
print(ind)
# Initial hidden state; the argument 1 is presumably the batch size -- confirm.
hidden = model.init_hidden(1)
# Wrap the indices in an outer list so the tensor has a leading dimension of 1.
inputs = torch.tensor([ind]).to(DEVICE)
# Switch to evaluation mode before the forward pass.
model.eval()
# NOTE(review): the forward pass runs with autograd enabled; wrapping it in
# torch.no_grad() would avoid building a graph for this inference-only call.
output, _ = model(inputs, hidden)
# Flatten the output into rows of model.vocab_size scores each.
preds = output.view(-1, model.vocab_size)

In [None]:
# For every row of `preds`, print the words behind its four highest scores.
for row_top_indices in preds.topk(4).indices:
    print([vocabulary.idx_to_word[token_id.item()] for token_id in row_top_indices])