# HW 2: Language Modeling

In this homework you will be building several varieties of language models.

## Goal

We ask that you construct the following models in Torch / NamedTensor:

1. A count-based trigram model with linear interpolation. $$p(y_t | y_{1:t-1}) = \alpha_1 p(y_t | y_{t-2}, y_{t-1}) + \alpha_2 p(y_t | y_{t-1}) + (1 - \alpha_1 - \alpha_2) p(y_t) $$
2. A neural network language model (consult *A Neural Probabilistic Language Model* http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)
3. An LSTM language model (consult *Recurrent Neural Network Regularization*, https://arxiv.org/pdf/1409.2329.pdf) 
4. Your own extensions to these models.


Consult the papers provided for hyperparameters.

 


## Setup

This notebook provides a working definition of the setup of the problem itself. You may construct your models inline or use an external setup (preferred) to build your system.

In [None]:
!pip install -q torch torchtext opt_einsum
!pip install -qU git+https://github.com/harvardnlp/namedtensor
!pip install bayesian-optimization

In [None]:
import torch
import torchtext
from torchtext.vocab import Vectors

from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField

import numpy as np

from load_data import load_text
from models import TrigramModel, LSTM, NeuralNetwork, Transformer
from train_models import make_kaggle_submission

from bayes_opt import BayesianOptimization

In [None]:
# Load the three data iterators and the TEXT object (whose .vocab is used
# below) from ./data. device='cuda' is forwarded to load_text -- presumably
# where the batches are placed; confirm against load_data.load_text.
train_iter, val_iter, test_iter, TEXT = load_text("./data", device = 'cuda')

In [None]:
# Attach the pretrained 'fasttext.simple.300d' vectors to the vocabulary.
TEXT.vocab.load_vectors('fasttext.simple.300d')

### Trigram

In [None]:
# Count-based trigram model fitted on the training iterator.
# NOTE(review): .8 and .16 are presumably the interpolation weights alpha_1
# and alpha_2 from the Goal section (leaving .04 for the unigram term) --
# confirm against TrigramModel's signature.
model = TrigramModel(.8, .16, len(TEXT.vocab))
model.fit(train_iter)

In [None]:
# Accumulate the trigram model's NLL over the validation batches.
# running_loss sums one criterion value per batch and count tracks the
# number of batches, so running_loss / count is a mean of per-batch losses.
criterion = torch.nn.NLLLoss()
running_loss = 0.
count = 0
for batch in val_iter:
    # NLLLoss consumes log-probabilities, hence the .log() on the prediction;
    # the batch is moved to CPU before being handed to the model.
    outputs = model.predict(batch.text.cpu()).log()
    # Reorder the named dims so 'distribution' (the class axis) comes second
    # in the raw tensor, and the target is laid out as (batch, seqlen).
    running_loss += criterion(
        outputs.transpose('batch', 'distribution', 'seqlen').values,
        batch.target.transpose('batch', 'seqlen').cpu().values).item()
    count += 1

In [None]:
# Validation perplexity: exp of the mean per-batch NLL accumulated above.
np.exp(running_loss / count)

364.8028120807864

### Optimize NN Hyperparameters

In [None]:
def test_net_hyperparams(hidden, lr, dropout, kernel_size):
    """Train a NeuralNetwork with the given hyperparameters and score it.

    Intended as the objective function for BayesianOptimization, which
    supplies every parameter as a float; the integer-valued ones are
    converted here before the model is built.

    Args:
        hidden: Hidden size; truncated to an int.
        lr: Learning rate forwarded to ``fit``.
        dropout: Dropout value forwarded to the model constructor.
        kernel_size: Kernel size; rounded to the nearest int (half up).

    Returns:
        The negated ``val_loss`` of the freshly trained network, so that
        maximizing this function minimizes validation loss.
    """
    # The network must be bound to the same name that is trained and scored
    # below; previously it was assigned to `model` while `net` was used,
    # which raised NameError (or silently reused a stale global `net`).
    net = NeuralNetwork(TEXT,
                        device='cuda',
                        hidden_size=int(hidden),
                        kernel_size=int(kernel_size + .5),
                        dropout=dropout,
                        freeze_embedding=False)
    net.fit(train_iter,
            val_iter=val_iter,
            lr=lr,
            batch_size=128,
            epochs=50,
            early_stopping=True,
            verbose=False,)
    return -net.val_loss

In [None]:
# Continuous (low, high) search ranges for the NN objective. The optimizer
# samples floats; test_net_hyperparams converts 'hidden' and 'kernel_size'
# to ints before building the model.
net_pbounds = {
    'hidden': (32, 512),
    'lr': (.0001, .01),
    'dropout': (0, .6),
    'kernel_size': (2, 5),
}

# Bayesian optimizer over the ranges above; keys of pbounds must match the
# keyword parameters of the objective function.
net_optimizer = BayesianOptimization(
    f=test_net_hyperparams,
    pbounds=net_pbounds,
)

In [None]:
# Run the hyperparameter search with the library's default init_points and
# n_iter. The objective returns -val_loss, so maximizing it looks for the
# configuration with the lowest validation loss.
net_optimizer.maximize()

### Optimize LSTM Hyperparameters

In [None]:
def test_lstm_hyperparams(hidden, nlayers, dropout, lr):
    """Objective for the LSTM search: train one candidate and score it.

    Args:
        hidden: Hidden size; truncated to an int.
        nlayers: Number of layers; rounded to the nearest int (half up).
        dropout: Dropout value forwarded to the model constructor.
        lr: Learning rate forwarded to ``fit``.

    Returns:
        The negated ``val_loss`` of the trained candidate, so a larger
        return value means a lower validation loss.
    """
    # The optimizer proposes floats; derive the integer settings up front.
    model_settings = dict(
        device='cuda',
        hidden_size=int(hidden),
        layers=int(nlayers + .5),
        dropout=dropout,
        freeze_embedding=False,
    )
    candidate = LSTM(TEXT, **model_settings)

    training_settings = dict(
        val_iter=val_iter,
        lr=lr,
        batch_size=128,
        epochs=50,
        early_stopping=True,
        verbose=False,
    )
    candidate.fit(train_iter, **training_settings)

    return -candidate.val_loss

In [None]:
# Continuous (low, high) search ranges for the LSTM objective. The optimizer
# samples floats; test_lstm_hyperparams converts 'hidden' and 'nlayers' to
# ints before building the model.
lstm_pbounds = {'hidden': (32, 256),
                'nlayers': (1, 2),
                'dropout': (0, .6),
                'lr': (.0001, .01)
               }

# Bayesian optimizer over the ranges above; keys of pbounds must match the
# keyword parameters of the objective function.
lstm_optimizer = BayesianOptimization(
    f=test_lstm_hyperparams,
    pbounds=lstm_pbounds,
)

In [None]:
# 3 initial random probes followed by 10 Bayesian-optimization steps; each
# evaluation trains a full LSTM via test_lstm_hyperparams.
lstm_optimizer.maximize(init_points=3, n_iter=10)

In [None]:
# Train the LSTM that is later used for the Kaggle submission, with
# hand-picked settings. Note hidden_size=400 lies outside the (32, 256)
# range explored by the search above.
lstm = LSTM(TEXT,
            device='cuda',
            hidden_size=400,
            dropout=.5,
            layers=1)
# NOTE(review): interval=50 is presumably a progress-reporting interval --
# confirm against LSTM.fit.
lstm.fit(train_iter,
        val_iter=val_iter,
        lr=.005,
        batch_size=128,
        epochs=50,
        interval=50)

In [None]:
# One more epoch with lr=.0 and test_iter passed as the validation iterator.
# NOTE(review): presumably a no-update pass whose only purpose is to report
# the loss on the test split -- confirm against LSTM.fit.
lstm.fit(train_iter,
        val_iter=test_iter,
        lr=.0,
        batch_size=128,
        epochs=1,
        interval=100)

### And the Transformer

In [None]:
def test_transformer_hyperparams(num_layers,
                                 num_heads,
                                 k_depth,
                                 v_depth,
                                 filt_size):
    """Train a Transformer with the given hyperparameters and score it.

    Intended as the objective function for BayesianOptimization, which
    supplies every parameter as a float; all of them are converted to the
    integers the model needs before it is built.

    Args:
        num_layers: Number of layers; rounded to the nearest int (half up).
        num_heads: Number of attention heads; rounded to the nearest int
            (half up).
        k_depth: Total key depth; floored to a multiple of ``num_heads``.
        v_depth: Total value depth; floored to a multiple of ``num_heads``.
        filt_size: Filter size; rounded to the nearest int (half up).

    Returns:
        The negated ``val_loss`` of the freshly trained model, so that
        maximizing this function minimizes validation loss.
    """
    num_layers = int(num_layers + .5)
    # Round the sampled head count itself. This previously rounded
    # num_layers, which discarded the num_heads search dimension and could
    # push the depths below to 0 whenever depth < number of layers.
    num_heads = int(num_heads + .5)
    # Snap the depths down to the nearest multiple of the head count so they
    # split evenly across heads.
    k_depth = int(k_depth // num_heads) * num_heads
    v_depth = int(v_depth // num_heads) * num_heads
    filt_size = int(filt_size + .5)
    tra = Transformer(TEXT,
                      device='cuda',
                      num_layers=num_layers,
                      num_heads=num_heads,
                      total_key_depth=k_depth,
                      total_value_depth=v_depth,
                      filter_size=filt_size,
                      freeze_embedding=False)
    tra.fit(train_iter,
            val_iter=val_iter,
            early_stopping=True,
            epochs=25,
            lr=.001,
            verbose=False)
    return -tra.val_loss

In [None]:
# Continuous (low, high) search ranges for the Transformer objective. The
# optimizer samples floats; test_transformer_hyperparams derives the integer
# values it passes to the model from them.
tra_pbounds = {
    'num_layers': (3, 8),
    'num_heads': (1, 4),
    'k_depth': (4, 16),
    'v_depth': (4, 16),
    'filt_size': (2, 5),
}

# Bayesian optimizer over the ranges above; keys of pbounds must match the
# keyword parameters of the objective function.
tra_optimizer = BayesianOptimization(
    f=test_transformer_hyperparams,
    pbounds=tra_pbounds,
)

In [None]:
# Run the Transformer search with the library's default init_points and
# n_iter; each evaluation trains a model for up to 25 epochs.
tra_optimizer.maximize()

In [None]:
# Train a Transformer with hand-picked settings (3 layers, 1 head, key/value
# depth 16, filter size 2), additionally setting layer_dropout=.1, which the
# search objective does not pass.
tra = Transformer(TEXT,
                  device='cuda',
                  num_layers=3,
                  num_heads=1,
                  total_key_depth=16,
                  total_value_depth=16,
                  filter_size=2,
                  layer_dropout=.1,
                  freeze_embedding=False)
# Longer schedule and a higher learning rate than the search objective uses
# (50 epochs at lr=.01 versus 25 epochs at lr=.001), with verbose output.
tra.fit(train_iter,
        val_iter=val_iter,
        early_stopping=True,
        epochs=50,
        lr=.01,
        interval=50,
        verbose=True)

In [None]:
# Three further epochs with test_iter passed as the validation iterator.
# NOTE(review): lr is small but non-zero and early_stopping is on, so this
# appears to keep updating the model on train_iter while the test split
# drives validation/early stopping -- confirm this is intended and that the
# test split is not being used for model selection.
tra.fit(train_iter,
        val_iter=test_iter,
        early_stopping=True,
        epochs=3,
        lr=0.00001,
        interval=50,
        verbose=True)

In [None]:
def make_kaggle_submission(model, TEXT, path_to_data = "./data/", device = 'cpu'):
    """Write the model's top-20 next-word predictions as a Kaggle CSV.

    Reads ``input.txt`` under ``path_to_data``, runs ``model.predict`` on it
    with ``predict_last=True``, and writes ``sample.txt`` under the same
    directory: an ``id,word`` header followed by one row per input line,
    holding a 1-based id and 20 space-separated words.

    Args:
        model: Object exposing ``predict(batch, predict_last=True)``.
        TEXT: Object whose ``vocab`` maps between words and indices.
        path_to_data: Directory prefix for the input and output files.
        device: Device the encoded input is moved to.
    """
    source_path = path_to_data+"/input.txt"
    target_path = path_to_data+"/sample.txt"

    encoded = load_kaggle_data(source_path, TEXT, device)
    scores = model.predict(encoded, predict_last=True)

    # Keep only the final sequence position, then take the indices of the 20
    # highest-scoring entries along dim 1 (assumed to be the vocabulary axis
    # of a 2-D tensor -- TODO confirm against model.predict).
    final_step = scores[{'seqlen':-1}].values
    top20 = final_step.topk(20, dim = 1)[1]

    index_to_word = TEXT.vocab.itos
    with open(target_path, "w") as fout:
        print("id,word", file=fout)
        for row_id, word_ids in enumerate(top20, 1):
            guesses = " ".join([index_to_word[word_id] for word_id in word_ids])
            print("%d,%s"%(row_id, guesses), file=fout)

def load_kaggle_data(path_to_data, TEXT, device):
    """Read the Kaggle input file and encode it as a (batch, seqlen) tensor.

    Each line of the file is one example. Lines are split on single spaces
    and the final token of every line is discarded (presumably the blank
    placeholder that is to be predicted -- confirm against input.txt); the
    remaining words are mapped to indices through ``TEXT.vocab.stoi``.

    Args:
        path_to_data: Path of the text file to read.
        TEXT: Object whose ``vocab.stoi`` maps words to integer indices.
        device: Device the resulting tensor is moved to.

    Returns:
        A NamedTensor with dims ('batch', 'seqlen') holding the word
        indices. All lines are expected to yield the same number of tokens.
    """
    with open(path_to_data) as f:
        data = f.read()
    sentences = data.split('\n')
    # Drop only the empty piece left by a trailing newline. Slicing off the
    # last element unconditionally lost the final example whenever the file
    # did not end with a newline.
    if sentences[-1] == '':
        sentences.pop()
    stoi = TEXT.vocab.stoi
    sent_list = np.array([[stoi[word] for word in sent.split(' ')[:-1]]
                          for sent in sentences])
    return NamedTensor(torch.from_numpy(sent_list).to(device), names = ('batch','seqlen'))


In [None]:
# Generate the submission from the trained LSTM: with path_to_data="." this
# reads ./input.txt and writes ./sample.txt.
make_kaggle_submission(lstm, TEXT, path_to_data = ".", device = 'cuda')