## Imports

In [41]:
import os
import re
import sys
import typing
import gc

# Extend the import path so the project sources can be imported from the notebook.
# NOTE(review): this appends '../src', yet the imports below use the
# package-qualified form `src.<module>`, which needs the *parent* of `src` on
# the path -- presumably the kernel is started somewhere `src` is already
# importable; confirm.
sys.path.append(
    os.path.join('..','src')
)

from src.models import NextWordPredictorModel
# NOTE(review): wildcard import hides where names come from; the notebook uses
# FromRawTextVocabulary and SequenceDataset, presumably defined in this module
# -- confirm and import them explicitly.
from src.data_processing import *

In [42]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [43]:
import nltk

from nltk.corpus import webtext
# from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

## Global Variables

In [44]:
# Everything runs on the first CUDA device; fail fast when no GPU is visible.
DEVICE = "cuda:0"
assert torch.cuda.is_available()
# apex `amp` wraps the model/optimizer for mixed precision in the model cell.
# NOTE(review): `optimizers` is not referenced in the visible cells.
from apex import amp, optimizers

# Special vocabulary tokens.
# NOTE(review): neither constant is referenced in the visible cells.
PADDING_TOKEN = 'PAD' # voc 0 -- presumably vocabulary index 0; confirm in src.data_processing
UNKNOWN_TOKEN = 'UKN' # voc 1 -- presumably vocabulary index 1; confirm in src.data_processing

# Sequence length; the datasets are built with MAX_SEQ_LEN + 1
# (presumably one extra token for the next-word target -- confirm).
MAX_SEQ_LEN = 20

# Batch size for both dataloaders and embedding dimension passed to the model.
BATCH_SIZE = 16
EMB_SIZE = 256

# NOTE(review): these fractions are not referenced in the visible cells; the
# train/val/test splits used below come straight from WikiText2().
val_split = 0.2
test_split = 0.1

# for reproducibility
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.manual_seed(0)
np.random.seed(23)

## Vocabulary

In [45]:
# Only the training split is loaded here: the vocabulary in the next cell is
# built from the training text alone.
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')

# Join all training entries into one space-separated string.
# The Dataset cell reloads the splits and rebuilds (then deletes) `train_text`.
train_text = ' '.join(train_iter)

In [46]:
# Build the vocabulary from the raw training text with the basic_english
# tokenizer and no extra text cleaning.
# NOTE(review): max_voc_size / min_word_occ presumably cap the vocabulary at
# 10000 entries and drop words seen fewer than 10 times -- confirm in
# src.data_processing.
vocabulary = FromRawTextVocabulary(
    text = train_text,
    tokenizer = tokenizer,
    text_cleaner = None,
    max_voc_size = 10000,
    min_word_occ = 10
)

## Dataset

In [47]:
# Fetch the three WikiText2 splits in a single call.
train_iter, val_iter, test_iter = WikiText2()

# Flatten each split into one space-joined string.
train_text, val_text, test_text = (
    ' '.join(split_iter)
    for split_iter in (train_iter, val_iter, test_iter)
)

# One SequenceDataset per split, all sharing the vocabulary built earlier and
# the same sequence length (MAX_SEQ_LEN + 1) and device.
train_dataset, val_dataset, test_dataset = (
    SequenceDataset(
        vocabulary = vocabulary,
        text = split_text,
        max_seq_length = MAX_SEQ_LEN + 1,
        device = DEVICE
    )
    for split_text in (train_text, val_text, test_text)
)

# The raw strings are no longer needed once the datasets exist; drop them and
# ask the garbage collector to reclaim the memory.
del train_text, val_text, test_text

gc.collect()

0

In [48]:
# Both loaders share every setting except shuffling: training batches are
# shuffled, validation batches keep dataset order. drop_last = True is applied
# to both splits, so a trailing batch smaller than BATCH_SIZE is discarded.
train_dataloader, val_dataloader = (
    torch.utils.data.DataLoader(
        split_dataset,
        batch_size = BATCH_SIZE,
        shuffle = reshuffle,
        pin_memory = False,
        drop_last = True
    )
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
    )
)

In [51]:
def map_weights(weights, m_ = 0.1, M_ = 1):
    """Convert occurrence counts into inverse-frequency weights within [m_, M_].

    Each count ``c`` is inverted (``1 / c``) and the inverted values are
    min-max rescaled, so the largest count receives ``m_`` and the smallest
    non-zero count receives ``M_``.

    Args:
        weights: array-like of counts (NumPy array, list or tuple).
        m_: lower bound of the output range.
        M_: upper bound of the output range.

    Returns:
        A NumPy float array with the same shape as ``weights``.

    Degenerate inputs, which previously produced NaN/inf or raised:
        * a count of 0 has an unbounded inverse, so it is assigned ``M_`` and
          kept out of the min/max used for rescaling;
        * when all non-zero counts are equal there is no range to rescale, so
          every entry is ``M_`` (a uniform weighting);
        * an empty input yields an empty array.
    """
    counts = np.asarray(weights)
    nonzero = counts != 0

    # Invert only the non-zero counts so no inf/NaN ever enters the rescaling.
    inverse = np.true_divide(1, counts[nonzero])

    # Start from the upper bound: this is the final value for zero counts and
    # for the "no spread" case handled below.
    scaled = np.full(counts.shape, M_, dtype = inverse.dtype)
    if inverse.size == 0:
        return scaled

    low, high = inverse.min(), inverse.max()
    if high > low:
        scaled[nonzero] = (inverse - low) * (M_ - m_) / (high - low) + m_
    return scaled

# Class weights computed from the values stored in vocabulary.vocab; they are
# handed to the model through its `weight` argument.
# NOTE(review): presumably those values are token occurrence counts and their
# order matches the class indices used by the model -- confirm in
# src.data_processing.
weights = map_weights(np.array(list(vocabulary.vocab.values())))
# Alternative kept for experiments: pass no class weights to the model.
#weights = None

In [53]:
# Instantiate the next-word model (2 LSTM hidden layers, hidden state of 200)
# and move it to the GPU. `weight` carries the class weights computed above;
# `fp16 = True` is what triggers the amp initialisation below.
model = NextWordPredictorModel(
    emb_dim  = EMB_SIZE,
    vocab_size = vocabulary.get_vocab_size(),
    num_lstm_hidden_layers = 2,
    hidden_state_size = 200,
    dropout = 0.5,
    device = DEVICE,
    lr = 1e-3,
    fp16 = True,
    weight = weights
).to(DEVICE)

# Wrap model and optimizer with apex amp for mixed precision.
# The tuple assignment rebinds `model` first and then stores the returned
# optimizer on that (new) `model` object, so the statement order matters.
if model.fp16:
    model, model.optimizer = amp.initialize(
        model,
        model.optimizer,
        # O1 inserts automatic casts around PyTorch functions and Tensor
        # methods (see the apex log printed by this cell).
        opt_level = 'O1' # https://nvidia.github.io/apex/amp.html
    )

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [54]:
# Train for at most 100 epochs with early stopping driven by the validation
# loss ('min': lower is better).
# NOTE(review): early_stopping_patience = 2 presumably means training stops
# after 2 consecutive epochs without a new best val_loss -- confirm in
# NextWordPredictorModel.fit.
metrics = model.fit(
    train_dataloader = train_dataloader,
    eval_dataloader = val_dataloader,
    num_epochs = 100,
    early_stopping = True,
    early_stopping_patience = 2,
    early_stopping_metric = 'val_loss',
    early_stopping_metric_best = 'min', # if lower is better (like for loss)
)

100%|██████████| 7076/7076 [00:18<00:00, 377.92it/s]
100%|██████████| 742/742 [00:01<00:00, 380.10it/s]
  0%|          | 13/7076 [00:00<00:57, 122.27it/s]

Train loss at epoch 0 : 9.208509548024447
Eval loss at epoch 0 : 9.20819157731501
updating best metric


100%|██████████| 7076/7076 [00:52<00:00, 134.16it/s]
100%|██████████| 742/742 [00:02<00:00, 366.01it/s]
  0%|          | 12/7076 [00:00<01:00, 116.78it/s]

Train loss at epoch 1 : 8.769418183324564
Eval loss at epoch 1 : 7.661188159027511
updating best metric


 13%|█▎        | 947/7076 [00:07<00:47, 128.35it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 42%|████▏     | 2972/7076 [00:23<00:31, 129.17it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 70%|███████   | 4981/7076 [00:38<00:16, 123.53it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 99%|█████████▉| 7005/7076 [00:54<00:00, 130.02it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.82it/s]
100%|██████████| 742/742 [00:02<00:00, 336.57it/s]
  0%|          | 13/7076 [00:00<00:56, 124.71it/s]

Train loss at epoch 2 : 7.76934762692438
Eval loss at epoch 2 : 7.299711206209949
updating best metric


 27%|██▋       | 1919/7076 [00:15<00:41, 125.00it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 56%|█████▌    | 3957/7076 [00:30<00:24, 129.81it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 84%|████████▍ | 5956/7076 [00:46<00:08, 129.24it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.97it/s]
100%|██████████| 742/742 [00:02<00:00, 332.79it/s]
  0%|          | 12/7076 [00:00<01:01, 115.56it/s]

Train loss at epoch 3 : 7.569675821985748
Eval loss at epoch 3 : 7.1844001336881735
updating best metric


 13%|█▎        | 907/7076 [00:07<00:47, 130.05it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 34%|███▎      | 2380/7076 [00:18<00:37, 124.79it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 90%|█████████ | 6388/7076 [00:50<00:05, 126.56it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.67it/s]
100%|██████████| 742/742 [00:02<00:00, 334.22it/s]
  0%|          | 13/7076 [00:00<00:57, 123.51it/s]

Train loss at epoch 4 : 7.475848264785862
Eval loss at epoch 4 : 7.118252758067252
updating best metric


 19%|█▉        | 1328/7076 [00:10<00:44, 129.22it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 47%|████▋     | 3342/7076 [00:26<00:28, 130.86it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 76%|███████▌  | 5355/7076 [00:41<00:13, 126.42it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.65it/s]
100%|██████████| 742/742 [00:02<00:00, 337.39it/s]
  0%|          | 13/7076 [00:00<00:56, 124.47it/s]

Train loss at epoch 5 : 7.416583713061935
Eval loss at epoch 5 : 7.073075287425614
updating best metric


  4%|▍         | 299/7076 [00:02<00:51, 130.46it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 32%|███▏      | 2293/7076 [00:17<00:37, 127.51it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 61%|██████    | 4304/7076 [00:33<00:21, 127.74it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 65%|██████▍   | 4591/7076 [00:35<00:19, 129.38it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:54<00:00, 129.09it/s]
100%|██████████| 742/742 [00:02<00:00, 354.75it/s]
  0%|          | 12/7076 [00:00<00:59, 118.81it/s]

Train loss at epoch 6 : 7.373476982925355
Eval loss at epoch 6 : 7.037618966758091
updating best metric


 22%|██▏       | 1544/7076 [00:11<00:42, 129.33it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 45%|████▌     | 3208/7076 [00:24<00:30, 128.57it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 128.22it/s]
100%|██████████| 742/742 [00:02<00:00, 335.00it/s]
  0%|          | 12/7076 [00:00<01:01, 114.75it/s]

Train loss at epoch 7 : 7.339355691265022
Eval loss at epoch 7 : 7.010763108569657
updating best metric


  2%|▏         | 162/7076 [00:01<00:53, 130.33it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 31%|███       | 2172/7076 [00:16<00:37, 129.85it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 59%|█████▉    | 4187/7076 [00:32<00:22, 126.90it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 74%|███████▎  | 5214/7076 [00:40<00:13, 133.73it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 127.82it/s]
100%|██████████| 742/742 [00:02<00:00, 327.33it/s]
  0%|          | 13/7076 [00:00<00:56, 125.25it/s]

Train loss at epoch 8 : 7.313255053445268
Eval loss at epoch 8 : 6.987489354578312
updating best metric


  3%|▎         | 190/7076 [00:01<00:53, 128.54it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 59%|█████▉    | 4198/7076 [00:33<00:22, 126.33it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 88%|████████▊ | 6216/7076 [00:49<00:06, 127.45it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 126.94it/s]
100%|██████████| 742/742 [00:02<00:00, 326.34it/s]
  0%|          | 13/7076 [00:00<00:57, 122.24it/s]

Train loss at epoch 9 : 7.2909041876978895
Eval loss at epoch 9 : 6.9683936374849385
updating best metric


 16%|█▋        | 1151/7076 [00:08<00:45, 130.44it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 45%|████▍     | 3178/7076 [00:24<00:30, 127.95it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 48%|████▊     | 3380/7076 [00:26<00:29, 125.10it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 99%|█████████▉| 7024/7076 [00:54<00:00, 130.55it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 128.01it/s]
100%|██████████| 742/742 [00:02<00:00, 331.16it/s]
  0%|          | 11/7076 [00:00<01:06, 106.46it/s]

Train loss at epoch 10 : 7.273355776607283
Eval loss at epoch 10 : 6.9530030505033835
updating best metric


 56%|█████▌    | 3954/7076 [00:30<00:25, 124.48it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 84%|████████▍ | 5950/7076 [00:46<00:08, 126.28it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.91it/s]
100%|██████████| 742/742 [00:02<00:00, 341.20it/s]
  0%|          | 13/7076 [00:00<00:55, 128.01it/s]

Train loss at epoch 11 : 7.257020458048383
Eval loss at epoch 11 : 6.939559532946975
updating best metric


 13%|█▎        | 885/7076 [00:06<00:48, 128.18it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 34%|███▍      | 2409/7076 [00:18<00:36, 129.56it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 84%|████████▎ | 5912/7076 [00:46<00:09, 127.84it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 126.84it/s]
100%|██████████| 742/742 [00:02<00:00, 339.08it/s]
  0%|          | 12/7076 [00:00<01:02, 113.75it/s]

Train loss at epoch 12 : 7.243448868318087
Eval loss at epoch 12 : 6.9268055572664
updating best metric


 40%|████      | 2837/7076 [00:22<00:32, 129.73it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 68%|██████▊   | 4845/7076 [00:38<00:17, 129.08it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 93%|█████████▎| 6614/7076 [00:51<00:03, 120.46it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 127.41it/s]
100%|██████████| 742/742 [00:02<00:00, 330.61it/s]
  0%|          | 12/7076 [00:00<01:00, 116.30it/s]

Train loss at epoch 13 : 7.231267703057953
Eval loss at epoch 13 : 6.916154973269151
updating best metric


 50%|█████     | 3555/7076 [00:27<00:27, 128.36it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 58%|█████▊    | 4112/7076 [00:32<00:22, 129.73it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 127.57it/s]
100%|██████████| 742/742 [00:02<00:00, 342.86it/s]
  0%|          | 13/7076 [00:00<00:57, 121.86it/s]

Train loss at epoch 14 : 7.221177163132171
Eval loss at epoch 14 : 6.906679170150962
updating best metric


 15%|█▍        | 1050/7076 [00:08<00:47, 127.58it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 44%|████▎     | 3089/7076 [00:24<00:30, 129.98it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 45%|████▍     | 3157/7076 [00:24<00:30, 128.73it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 128.09it/s]
100%|██████████| 742/742 [00:02<00:00, 342.71it/s]
  0%|          | 11/7076 [00:00<01:05, 108.57it/s]

Train loss at epoch 15 : 7.210969419314943
Eval loss at epoch 15 : 6.898872493090977
updating best metric


  1%|▏         | 105/7076 [00:00<00:53, 129.93it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 30%|███       | 2124/7076 [00:16<00:38, 128.30it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 58%|█████▊    | 4137/7076 [00:32<00:23, 124.16it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 87%|████████▋ | 6150/7076 [00:47<00:07, 130.39it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 128.25it/s]
100%|██████████| 742/742 [00:02<00:00, 340.89it/s]
  0%|          | 12/7076 [00:00<01:02, 113.30it/s]

Train loss at epoch 16 : 7.202778196456137
Eval loss at epoch 16 : 6.890169549181134
updating best metric


 15%|█▌        | 1075/7076 [00:08<00:46, 129.08it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 37%|███▋      | 2617/7076 [00:20<00:34, 130.81it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 93%|█████████▎| 6611/7076 [00:51<00:03, 129.82it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 128.22it/s]
100%|██████████| 742/742 [00:02<00:00, 341.99it/s]
  0%|          | 12/7076 [00:00<01:01, 114.34it/s]

Train loss at epoch 17 : 7.195461872743981
Eval loss at epoch 17 : 6.884262874120008
updating best metric


 22%|██▏       | 1542/7076 [00:11<00:42, 130.08it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 50%|█████     | 3551/7076 [00:27<00:27, 127.97it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 79%|███████▊  | 5569/7076 [00:43<00:11, 125.65it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:54<00:00, 128.93it/s]
100%|██████████| 742/742 [00:02<00:00, 339.71it/s]
  0%|          | 13/7076 [00:00<00:57, 123.25it/s]

Train loss at epoch 18 : 7.187982795061939
Eval loss at epoch 18 : 6.877918049331624
updating best metric


  7%|▋         | 496/7076 [00:03<00:54, 120.63it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 36%|███▌      | 2512/7076 [00:19<00:35, 127.69it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 64%|██████▍   | 4517/7076 [00:35<00:19, 128.96it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 92%|█████████▏| 6536/7076 [00:51<00:04, 127.27it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 126.80it/s]
100%|██████████| 742/742 [00:02<00:00, 338.28it/s]
  0%|          | 13/7076 [00:00<00:57, 122.51it/s]

Train loss at epoch 19 : 7.1819617641113656
Eval loss at epoch 19 : 6.873857768719408
updating best metric


 21%|██        | 1470/7076 [00:11<00:44, 125.72it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 49%|████▉     | 3491/7076 [00:27<00:28, 127.87it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 78%|███████▊  | 5527/7076 [00:43<00:12, 128.03it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:56<00:00, 126.27it/s]
100%|██████████| 742/742 [00:02<00:00, 325.15it/s]
  0%|          | 12/7076 [00:00<01:02, 112.54it/s]

Train loss at epoch 20 : 7.1761504020173374
Eval loss at epoch 20 : 6.8676851200607585
updating best metric


  6%|▋         | 454/7076 [00:03<00:52, 126.72it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 35%|███▍      | 2458/7076 [00:19<00:36, 125.52it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 63%|██████▎   | 4483/7076 [00:35<00:20, 126.93it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 67%|██████▋   | 4731/7076 [00:37<00:18, 126.19it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:56<00:00, 125.04it/s]
100%|██████████| 742/742 [00:02<00:00, 325.02it/s]
  0%|          | 13/7076 [00:00<00:58, 121.70it/s]

Train loss at epoch 21 : 7.170753499826784
Eval loss at epoch 21 : 6.864586236341944
updating best metric


 23%|██▎       | 1655/7076 [00:13<00:42, 128.30it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 31%|███▏      | 2226/7076 [00:17<00:37, 128.46it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


 77%|███████▋  | 5433/7076 [00:42<00:12, 129.49it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 127.69it/s]
100%|██████████| 742/742 [00:02<00:00, 326.79it/s]
  0%|          | 12/7076 [00:00<01:01, 114.50it/s]

Train loss at epoch 22 : 7.166206660211389
Eval loss at epoch 22 : 6.860817593705622
updating best metric


 34%|███▍      | 2390/7076 [00:18<00:35, 130.25it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 62%|██████▏   | 4400/7076 [00:34<00:20, 128.60it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 91%|█████████ | 6447/7076 [00:50<00:04, 126.98it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.94it/s]
100%|██████████| 742/742 [00:02<00:00, 320.12it/s]
  0%|          | 13/7076 [00:00<00:56, 124.34it/s]

Train loss at epoch 23 : 7.161859458351351
Eval loss at epoch 23 : 6.857652884609294
updating best metric


 19%|█▉        | 1371/7076 [00:10<00:44, 129.36it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 48%|████▊     | 3371/7076 [00:26<00:28, 128.43it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 76%|███████▌  | 5385/7076 [00:42<00:13, 127.18it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 128.29it/s]
100%|██████████| 742/742 [00:02<00:00, 342.22it/s]
  0%|          | 12/7076 [00:00<00:59, 118.60it/s]

Train loss at epoch 24 : 7.158572209359833
Eval loss at epoch 24 : 6.853058049620965
updating best metric


  5%|▍         | 324/7076 [00:02<00:54, 124.44it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 33%|███▎      | 2325/7076 [00:18<00:44, 107.93it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 61%|██████    | 4334/7076 [00:34<00:21, 127.30it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 90%|████████▉ | 6344/7076 [00:49<00:05, 131.15it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.52it/s]
100%|██████████| 742/742 [00:02<00:00, 339.82it/s]
  0%|          | 13/7076 [00:00<00:57, 123.84it/s]

Train loss at epoch 25 : 7.154474325743417
Eval loss at epoch 25 : 6.851182203729841
updating best metric


 18%|█▊        | 1287/7076 [00:10<00:44, 129.58it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 47%|████▋     | 3294/7076 [00:26<00:29, 130.24it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 75%|███████▍  | 5297/7076 [00:41<00:14, 124.78it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.83it/s]
100%|██████████| 742/742 [00:02<00:00, 335.63it/s]
  0%|          | 13/7076 [00:00<00:56, 124.13it/s]

Train loss at epoch 26 : 7.150242644356361
Eval loss at epoch 26 : 6.84709467078155
updating best metric


  3%|▎         | 217/7076 [00:01<00:54, 127.00it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 31%|███▏      | 2223/7076 [00:17<00:37, 129.23it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 60%|█████▉    | 4232/7076 [00:33<00:21, 130.02it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 88%|████████▊ | 6239/7076 [00:48<00:06, 129.98it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 127.58it/s]
100%|██████████| 742/742 [00:02<00:00, 341.14it/s]
  0%|          | 12/7076 [00:00<01:00, 116.22it/s]

Train loss at epoch 27 : 7.147063966320355
Eval loss at epoch 27 : 6.845296387402517
updating best metric


 16%|█▋        | 1166/7076 [00:09<00:46, 127.25it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 45%|████▍     | 3163/7076 [00:24<00:30, 130.05it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 73%|███████▎  | 5166/7076 [00:40<00:15, 124.06it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 97%|█████████▋| 6896/7076 [00:53<00:01, 127.40it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 128.47it/s]
100%|██████████| 742/742 [00:02<00:00, 338.25it/s]
  0%|          | 12/7076 [00:00<01:03, 111.30it/s]

Train loss at epoch 28 : 7.14457305930576
Eval loss at epoch 28 : 6.842483534645841
updating best metric


 54%|█████▍    | 3842/7076 [00:30<00:24, 129.50it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 57%|█████▋    | 4063/7076 [00:31<00:22, 131.15it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 128.50it/s]
100%|██████████| 742/742 [00:02<00:00, 343.99it/s]
  0%|          | 13/7076 [00:00<00:57, 122.10it/s]

Train loss at epoch 29 : 7.1415078367736795
Eval loss at epoch 29 : 6.840082850417679
updating best metric


 14%|█▍        | 993/7076 [00:07<00:46, 130.90it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 42%|████▏     | 3007/7076 [00:23<00:31, 127.72it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 71%|███████▏  | 5048/7076 [00:39<00:15, 130.09it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 89%|████████▊ | 6277/7076 [00:48<00:06, 131.61it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:54<00:00, 129.16it/s]
100%|██████████| 742/742 [00:02<00:00, 341.59it/s]
  0%|          | 13/7076 [00:00<00:58, 121.51it/s]

Train loss at epoch 30 : 7.1380744629353705
Eval loss at epoch 30 : 6.838504279720173
updating best metric


 45%|████▌     | 3205/7076 [00:24<00:29, 130.87it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 74%|███████▎  | 5209/7076 [00:40<00:14, 130.98it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:55<00:00, 128.51it/s]
100%|██████████| 742/742 [00:02<00:00, 343.04it/s]
  0%|          | 12/7076 [00:00<01:01, 114.43it/s]

Train loss at epoch 31 : 7.135695480842116
Eval loss at epoch 31 : 6.836815010826543
updating best metric


  2%|▏         | 133/7076 [00:01<00:57, 121.46it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 30%|███       | 2154/7076 [00:17<00:43, 113.52it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 59%|█████▉    | 4168/7076 [00:32<00:22, 130.75it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 79%|███████▉  | 5592/7076 [00:43<00:11, 130.86it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:55<00:00, 128.03it/s]
100%|██████████| 742/742 [00:02<00:00, 339.95it/s]
  0%|          | 12/7076 [00:00<01:01, 115.14it/s]

Train loss at epoch 32 : 7.133329972094098
Eval loss at epoch 32 : 6.834031897413762
updating best metric


 36%|███▌      | 2526/7076 [00:19<00:35, 127.67it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 65%|██████▍   | 4581/7076 [00:35<00:19, 130.11it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 86%|████████▌ | 6063/7076 [00:47<00:07, 130.91it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


100%|██████████| 7076/7076 [00:54<00:00, 128.83it/s]
100%|██████████| 742/742 [00:02<00:00, 333.91it/s]
  0%|          | 12/7076 [00:00<01:00, 115.90it/s]

Train loss at epoch 33 : 7.1322627364072915
Eval loss at epoch 33 : 6.832373656995213
updating best metric


 43%|████▎     | 3024/7076 [00:23<00:29, 135.12it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


 71%|███████   | 5024/7076 [00:38<00:14, 137.53it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|█████████▉| 7054/7076 [00:53<00:00, 136.60it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


100%|██████████| 7076/7076 [00:53<00:00, 133.05it/s]
100%|██████████| 742/742 [00:02<00:00, 355.17it/s]
  0%|          | 14/7076 [00:00<00:53, 131.25it/s]

Train loss at epoch 34 : 7.129407206317541
Eval loss at epoch 34 : 6.830802625080325
updating best metric


 11%|█         | 792/7076 [00:05<00:46, 134.41it/s]


KeyboardInterrupt: 

In [55]:
# Display the per-epoch metrics (train_loss, val_loss, lr) collected by model.fit.
# NOTE(review): the fit cell above was stopped with KeyboardInterrupt, so its
# assignment to `metrics` never completed; the values shown here (e.g. lr 0.01
# while the model cell sets lr = 1e-3) look like leftovers from an earlier run.
# Re-run training to completion before relying on them.
metrics

{0: {'train_loss': 9.211254693074979,
  'val_loss': 9.211471538338056,
  'lr': 0.01},
 1: {'train_loss': 7.481150862791903,
  'val_loss': 6.791930608672273,
  'lr': 0.0099},
 2: {'train_loss': 7.032236319062668,
  'val_loss': 6.694209414351018,
  'lr': 0.009801},
 3: {'train_loss': 6.965919334447741,
  'val_loss': 6.655035271477506,
  'lr': 0.00970299},
 4: {'train_loss': 6.935509015571741,
  'val_loss': 6.638524394151014,
  'lr': 0.0096059601},
 5: {'train_loss': 6.917404641735817,
  'val_loss': 6.628796935402801,
  'lr': 0.009509900499},
 6: {'train_loss': 6.905118976965539,
  'val_loss': 6.624028734119754,
  'lr': 0.00941480149401},
 7: {'train_loss': 6.896221652820731,
  'val_loss': 6.622018424005843,
  'lr': 0.0093206534790699},
 8: {'train_loss': 6.8900200257940165,
  'val_loss': 6.614226840577036,
  'lr': 0.0092274469442792},
 9: {'train_loss': 6.884402382838787,
  'val_loss': 6.61305994036384,
  'lr': 0.009135172474836408},
 10: {'train_loss': 6.880649321290853,
  'val_loss': 6

In [56]:
# Sanity check: feed a short prompt through the model and collect the
# per-position scores over the vocabulary.
sent = 'so how are'
# NOTE(review): the prompt is split on single spaces and looked up directly,
# bypassing `tokenizer`; assuming word_to_idx is a plain mapping, a word that
# is not in the vocabulary raises KeyError here.
ind = [vocabulary.word_to_idx[w] for w in sent.split(' ')]
print(ind)
# Fresh hidden state for a batch of one sequence.
hidden = model.init_hidden(1)
inputs = torch.tensor([ind]).to(DEVICE)
model.eval()
# Inference only: disable autograd so no computation graph is built and kept
# alive on the GPU (previously `preds` carried a grad_fn).
with torch.no_grad():
    output, hidden = model(inputs, hidden)
    # One row of vocabulary scores per prompt position.
    preds = output.view(-1, model.vocab_size)

[126, 372, 27]


In [57]:
# Print the highest-scoring vocabulary word for every prompt position.
top1_ids = preds.topk(1).indices.view(-1)
for token_id in top1_ids.tolist():
    print(vocabulary.idx_to_word[token_id])

the
the
the


In [58]:
# Ten highest scores and their vocabulary indices for each prompt position.
# NOTE(review): in the saved output the index ranking is identical for all
# three positions, which suggests the predictions barely depend on the
# context -- worth investigating.
preds.topk(10)

torch.return_types.topk(
values=tensor([[2.8789, 2.4219, 2.3633, 2.3262, 2.2344, 2.2227, 2.1797, 1.9902, 1.8594,
         1.8535],
        [4.5586, 3.9941, 3.9199, 3.8750, 3.7578, 3.7402, 3.6875, 3.4121, 3.2031,
         3.1836],
        [5.7578, 5.1094, 5.0234, 4.9805, 4.8281, 4.8203, 4.7461, 4.4062, 4.1367,
         4.0977]], device='cuda:0', dtype=torch.float16,
       grad_fn=<TopkBackward>),
indices=tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 12],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 12],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 12]], device='cuda:0'))