In [1]:
import os
import numpy as np
import time
import math
from data_utils import * 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import LambdaLR
from copy import deepcopy

In [2]:
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    The decoder shares its weight matrix with the embedding layer (weight
    tying), as in:
      "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
      https://arxiv.org/abs/1608.05859
      "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
      https://arxiv.org/abs/1611.01462
    Because of the tying, ``nhid`` must equal ``ninp``.

    Args:
        ntoken: vocabulary size (rows of the shared embedding/decoder matrix).
        ninp: embedding dimension fed to the LSTM.
        nhid: LSTM hidden size; must equal ``ninp``.
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability used on the embeddings, inside the LSTM,
            and on the LSTM output.
        softmax_temp: factor the LSTM output is multiplied by before the
            decoder projection.

    Raises:
        ValueError: if ``nhid != ninp``.
    """

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.2, softmax_temp=6):
        super(RNNModel, self).__init__()
        if nhid != ninp:
            # The decoder reuses the (ntoken, ninp) embedding matrix, so the
            # LSTM output width has to match the embedding width.
            raise ValueError('With tied weights, nhid must be equal to ninp')

        self.nhid = nhid
        self.nlayers = nlayers
        # Stored on the instance so forward() does not depend on a global.
        self.softmax_temp = softmax_temp

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)  # Token2Embeddings
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken, bias=False)

        # Tie the output projection to the input embedding.
        self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self):
        """Initialise the shared encoder/decoder matrix uniformly in [-0.1, 0.1]."""
        initrange = 0.1
        # encoder.weight and decoder.weight are the same Parameter, so a single
        # initialisation covers both layers.
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Run one chunk of tokens through the model.

        Args:
            input: integer token ids. The LSTM is built with the default
                ``batch_first=False``, so this is presumably laid out as
                (seq_len, batch) -- confirm against ``get_batch``.
            hidden: ``(h, c)`` LSTM state, e.g. from :meth:`init_hidden`.

        Returns:
            A tuple ``(decoded, hidden, output)`` where ``decoded`` are the
            logits with the first two axes of the LSTM output flattened
            (``size(0) * size(1)`` rows, ``ntoken`` columns), ``hidden`` is the
            new LSTM state, and ``output`` is the un-scaled, post-dropout LSTM
            output with its original three axes.
        """
        emb = self.drop(self.encoder(input))

        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # Flatten the two leading axes and apply the temperature before the
        # projection onto the vocabulary.
        flat = output.view(output.size(0) * output.size(1), output.size(2))
        output_scaled = self.softmax_temp * flat

        decoded = self.decoder(output_scaled)
        return decoded, hidden, output

    def init_hidden(self, bsz):
        """Return a zeroed ``(h, c)`` state of shape (nlayers, bsz, nhid) each.

        The tensors are created with the dtype and device of the model's first
        parameter.
        """
        weight = next(self.parameters())
        shape = (self.nlayers, bsz, self.nhid)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [3]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# (`is_available` lives on `torch.cuda`; `torch.device` has no `cuda` attribute.)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
train_batch_size = 32
eval_batch_size = 10

# Dataset location. Defaults to the original path; set the WIKITEXT2_DIR
# environment variable to run the notebook on a different machine.
corpus_dir = os.environ.get('WIKITEXT2_DIR', '/home/ec2-user/wikitext-2/')
corpus_raw = Corpus(corpus_dir)

# `Corpus` and `batchify` come from the star-import of `data_utils`.
train_data = batchify(corpus_raw.train, train_batch_size) # size(total_len//bsz, bsz)
val_data = batchify(corpus_raw.valid, eval_batch_size)
test_data = batchify(corpus_raw.test, eval_batch_size)

In [5]:
# Vocabulary size, taken from the corpus dictionary.
ntokens = len(corpus_raw.dictionary)

# Number of training batches between progress reports.
interval = 200

# NOTE(review): presumably the sequence length per step; the loops in
# `evaluate` and `train` hard-code 64 instead of reading this -- confirm.
bptt = 64

# Model hyperparameters: embedding and hidden width are both `hidden_size`.
hidden_size = 650
n_layers = 2

net = RNNModel(
    ntoken=ntokens,
    ninp=hidden_size,
    nhid=hidden_size,
    nlayers=n_layers,
    dropout=.2,
)

In [None]:
# Move the model's parameters to the selected device.
net.to(device)

In [34]:
# Optimiser settings: Adam with L2 weight decay.
lr = 1e-2
weight_decay = 1e-4
opt = torch.optim.Adam(
    params=net.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

# Token-level cross-entropy on the flattened logits.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [7]:
def evaluate(data_source):
    """Compute the average cross-entropy of `net` over `data_source`.

    Uses the module-level `net`, `criterion`, `device` and `eval_batch_size`,
    plus `get_batch` / `repackage_hidden`, which are not defined in this
    notebook (presumably they come from `data_utils`).

    Args:
        data_source: batchified token tensor; it is walked along axis 0 in
            strides of 64.

    Returns:
        The accumulated loss divided by `len(data_source)`, as a tensor.
    """
    # Turn on evaluation mode which disables dropout.
    net.eval()
    with torch.no_grad():
        total_loss = 0
        hidden = net.init_hidden(eval_batch_size)  # hidden size(nlayers, bsz, hdsize)
        # NOTE(review): the stride 64 is hard-coded and presumably has to match
        # the window length `get_batch` uses -- confirm.
        for i in range(0, data_source.size(0) - 1, 64):
            data, targets = get_batch(data_source, i)
            data, targets = data.to(device), targets.to(device)
            # model input and output
            # inputdata size(bptt, bsz), and size(bptt, bsz, embsize) after embedding
            # output size(bptt*bsz, ntoken)
            output, hidden, _ = net(data, hidden)
            # Weight each chunk's mean loss by its length along axis 0.
            total_loss += len(data) * criterion(output, targets).data
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)

In [35]:
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Build a LambdaLR schedule: linear warmup followed by linear decay.

    The multiplier applied to the optimizer's initial lr rises linearly from 0
    to 1 over the first ``num_warmup_steps`` steps, then falls linearly so that
    it reaches 0 at ``num_training_steps`` (and stays at 0 afterwards).

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            The optimizer whose learning rate is scheduled.
        num_warmup_steps (:obj:`int`):
            Length of the warmup phase, in steps.
        num_training_steps (:obj:`int`):
            Total number of training steps.
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            Index of the last epoch when resuming training.
    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` implementing the schedule.
    """
    # Both denominators are clamped to at least 1 to avoid division by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step: int):
        if current_step >= num_warmup_steps:
            steps_left = float(num_training_steps - current_step)
            return max(0.0, steps_left / decay_span)
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda, last_epoch)

# NOTE(review): the decay horizon is derived from the length of `test_data`,
# not from the number of optimiser steps the training loop performs; once it
# is exceeded the multiplier stays at 0. Confirm this is intended.
num_training_steps = len(test_data) // 2

# No warmup: the learning rate starts at its initial value and decays linearly.
lr_scheduler = get_linear_schedule_with_warmup(
    opt,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



In [10]:
def l2_loss(weight=None):
    """Penalty pulling the L2 norm of every row of `weight` toward 1.

    Args:
        weight: 2-D tensor whose rows are penalised. Defaults to the model's
            embedding matrix ``net.encoder.weight``.

    Returns:
        Scalar tensor: the mean over rows of ``(||row|| - 1) ** 2``.
    """
    if weight is None:
        # Use the live parameter, not a deepcopy: a copy is a separate leaf
        # tensor, so the penalty would send no gradient to the model.
        weight = net.encoder.weight
    row_norms = torch.norm(weight, dim=1, keepdim=True)
    return ((row_norms - torch.ones_like(row_norms)) ** 2).mean()
    

In [15]:
def normalize_queries(queries):
    """Penalty pulling the L2 norm of every query vector toward 1.

    Despite the name, this does not rescale `queries`; it returns a loss term.

    Args:
        queries: tensor whose last axis holds the query features. The norm is
            taken over that last axis, so a 3-D LSTM output is handled per
            token rather than across the batch axis. For 2-D input this is
            identical to a norm over axis 1.

    Returns:
        Scalar tensor: the mean of ``(||q|| - 1) ** 2`` over all query vectors.
    """
    q_norm = torch.norm(queries, dim=-1, keepdim=True)
    return ((q_norm - torch.ones_like(q_norm)) ** 2).mean()

In [36]:
n_epochs = 200
lambda_key = 2 # weight of the penalty that keeps key (embedding-row) norms close to 1
lambda_query = 2 # weight of the penalty that keeps query (LSTM-output) norms close to 1

def train():
    """Run one pass over `train_data`, updating `net` in place.

    Each step minimises cross-entropy plus the query-norm and key-norm
    penalties, weighted by `lambda_query` and `lambda_key`. Every `interval`
    batches a progress line is printed with the averaged total loss and the
    perplexity derived from the averaged cross-entropy alone.

    Relies on module-level state: `net`, `opt`, `lr_scheduler`, `criterion`,
    `device`, `train_data`, `train_batch_size`, `interval`, and `epoch`
    (read only for the progress line). `get_batch` and `repackage_hidden` are
    not defined in this notebook (presumably they come from `data_utils`).
    """
    net.train()
    total_loss = 0
    ce_total_loss = 0
    start_time = time.time()

    hidden = net.init_hidden(train_batch_size)

    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, 64)):
        data, targets = get_batch(train_data, i)
        data, targets = data.to(device), targets.to(device)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)

        output, hidden, query = net(data, hidden)

        loss = criterion(output, targets)
        full_loss = loss + lambda_query * normalize_queries(query) + lambda_key * l2_loss()

        opt.zero_grad()
        full_loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
        opt.step()
        lr_scheduler.step()

        total_loss += full_loss.item()
        ce_total_loss += loss.item()

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            ce_loss = ce_total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | learning rate {:5.4f}'.format(
                epoch, batch, len(train_data) // 64,
                elapsed * 1000 / interval, cur_loss, math.exp(ce_loss), opt.param_groups[0]['lr']))
            # Reset BOTH running sums; otherwise the cross-entropy keeps
            # accumulating and the reported perplexity inflates each interval.
            total_loss = 0
            ce_total_loss = 0
            start_time = time.time()




In [None]:
# Lowest validation loss seen so far; None until the first epoch finishes.
best_val_loss = None

try:
    # `epoch` is deliberately a module-level name: train() reads it for its
    # progress line.
    for epoch in range(1, n_epochs+1):
        epoch_start_time = time.time()
        train()
        # Convert to a plain float so comparison and formatting do not depend
        # on whether evaluate() returns a tensor or a number.
        val_loss = float(evaluate(val_data))
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Explicit None check: a legitimate loss of 0 must not count as "unset".
        if best_val_loss is None or val_loss < best_val_loss:

            with open('/home/ec2-user/trained_models/wikitext2_reg_lstm.pkl', 'wb') as f:
                torch.save(net.state_dict(), f)

            best_val_loss = val_loss

except KeyboardInterrupt:
    # Allow interrupting a long run from the notebook without a traceback.
    print('-' * 89)
    print('Exiting from training early')

In [8]:
# Re-create the loss (same construction as in the optimiser cell) and score the test split.
# NOTE(review): this evaluates whatever weights `net` currently holds; no saved
# checkpoint is loaded in this cell -- confirm that is the intended model.
criterion = nn.CrossEntropyLoss().to(device)
evaluate(test_data)

tensor(4.6573, device='cuda:0')

In [9]:
# Perplexity = exp(cross-entropy). The literal is the test loss copied by hand
# from the previous cell's output, so it goes stale if that cell is re-run.
math.exp(4.6573)

105.35124943205062