In [0]:
import torch 
import torch.nn as nn

import numpy as np
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt

import argparse
import time
import collections
import os
import sys

In [17]:
# Mount the user's Google Drive inside the Colab VM so the data corpus and the
# experiment output directories persist across sessions.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Base directory used further down as the default for both --data and --save_dir.
# It is a RELATIVE path, so it resolves against the current working directory.
# NOTE(review): presumably the kernel runs from /content on Colab -- confirm.
path = 'drive/My Drive/IFT6135/A2/'

Mounted at /content/drive


# RNN

In [0]:
class RNNCell(nn.Module):
    """One layer of a vanilla RNN, applied to a single time-step.

    Computes ``tanh(fc_x(dropout(inputs)) + fc_h(hidden))``.
    """

    def __init__(self, input_size, hidden_size, dp_keep_prob):
        """
        input_size: number of features of the tensor fed into the cell
        hidden_size: number of units of the recurrent state
        dp_keep_prob: probability of KEEPING an input unit
                      (the dropout probability is 1 - dp_keep_prob)
        """
        super().__init__()
        self.hidden_size = hidden_size

        # input-to-hidden and hidden-to-hidden affine maps
        self.fc_x = nn.Linear(input_size, hidden_size)
        self.fc_h = nn.Linear(hidden_size, hidden_size)

        self.dropout = nn.Dropout(1 - dp_keep_prob)
        self.tanh = nn.Tanh()

    def init_weights(self):
        """Re-draw all weights and biases from U(-k, k), k = sqrt(1 / hidden_size).

        Not called by __init__; the owner of the cell has to invoke it.
        """
        bound = np.sqrt(1 / self.hidden_size)
        for linear in (self.fc_x, self.fc_h):
            nn.init.uniform_(linear.weight, -bound, bound)
            nn.init.uniform_(linear.bias, -bound, bound)

    def forward(self, inputs, hidden):
        """Return the new hidden state; dropout is applied to `inputs` only."""
        pre_activation = self.fc_x(self.dropout(inputs)) + self.fc_h(hidden)
        return self.tanh(pre_activation)

class RNN(nn.Module):
    """Stacked vanilla-RNN language model.

    Pipeline per time-step: token embedding -> `num_layers` RNNCells (each
    applies dropout to its own input) -> dropout -> linear decoder to vocabulary
    logits.
    """

    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size, num_layers, dp_keep_prob):
        """
        emb_size: size of the token embeddings
        hidden_size: number of units in every recurrent layer
        seq_len: nominal sequence length (stored; forward() follows the input)
        batch_size: nominal batch size (used by init_hidden())
        vocab_size: number of distinct tokens
        num_layers: number of stacked RNNCells
        dp_keep_prob: probability of KEEPING a unit (drop prob = 1 - dp_keep_prob)
        """
        super(RNN, self).__init__()

        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dp_keep_prob = dp_keep_prob

        self.dropout = nn.Dropout(1 - dp_keep_prob)
        self.embedding_layer = nn.Embedding(vocab_size, emb_size)

        # Only the first layer consumes embeddings; the others consume the
        # hidden state of the layer below.
        self.hidden_layers = nn.ModuleList([
            RNNCell(emb_size if layer == 0 else hidden_size, hidden_size, dp_keep_prob)
            for layer in range(num_layers)
        ])

        self.output_layer = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Embedding and decoder weights ~ U(-0.1, 0.1), decoder bias = 0;
        every recurrent cell initialises itself."""
        nn.init.uniform_(self.embedding_layer.weight, -0.1, 0.1)

        for hidden_layer in self.hidden_layers:
            hidden_layer.init_weights()

        nn.init.uniform_(self.output_layer.weight, -0.1, 0.1)
        nn.init.constant_(self.output_layer.bias, 0)

    def init_hidden(self):
        """Return an all-zero initial state of shape (num_layers, batch_size, hidden_size).

        The tensor is created on the default device; the caller moves it.
        """
        return torch.zeros((self.num_layers, self.batch_size, self.hidden_size))

    def _step(self, layer_input, hidden):
        """Advance every layer by one time-step.

        layer_input: embedded tokens for this step, (batch, emb_size)
        hidden: previous state of each layer, indexable by layer
        Returns (logits of shape (batch, vocab_size), stacked new hidden states).
        """
        new_hidden = []
        for cell, previous in zip(self.hidden_layers, hidden):
            layer_input = cell(layer_input, previous)
            new_hidden.append(layer_input)
        logits = self.output_layer(self.dropout(layer_input))
        return logits, torch.stack(new_hidden)

    def forward(self, inputs, hidden):
        """Run the network over a whole sequence.

        inputs: integer token ids, (seq_len, batch)
        hidden: initial state, (num_layers, batch, hidden_size)
        Returns (logits of shape (seq_len, batch, vocab_size), final hidden state).

        The sequence length and batch size are taken from `inputs`, so a batch
        that differs from the sizes given to the constructor is processed too.
        """
        embedded_inputs = self.embedding_layer(inputs)

        step_logits = []
        for embedded_t in embedded_inputs:
            logits_t, hidden = self._step(embedded_t, hidden)
            step_logits.append(logits_t)

        return torch.stack(step_logits), hidden

    def generate(self, input, hidden, generated_seq_len):
        """Greedily decode `generated_seq_len` tokens after the seed tokens.

        input: seed token ids, one per sequence, shape (batch,)
        hidden: initial state, (num_layers, batch, hidden_size)
        generated_seq_len: number of tokens to produce
        Returns a tensor of shape (generated_seq_len + 1, batch) whose first row
        is the seed.

        Dropout follows the module's train/eval mode; call eval() first for
        deterministic output.
        """
        token = input.view(-1)
        samples = [token.view(1, -1)]

        with torch.no_grad():
            for _ in range(generated_seq_len):
                logits, hidden = self._step(self.embedding_layer(token), hidden)
                # softmax is monotonic, so the arg-max of the logits is the
                # most probable token -- no need to normalise first
                token = torch.argmax(logits, dim=1)
                samples.append(token.view(1, -1))

        return torch.cat(samples, dim=0)

# GRU

In [0]:
# Problem 2
class GRUCell(nn.Module):
    """One GRU layer applied to a single time-step.

    The three input-side projections are fused into `xh` and the three
    hidden-side projections into `hh` (no bias); each output is split into
    reset / update / candidate chunks along dim 1.
    """

    def __init__(self, input_size, hidden_size, dropout=0.1):
        """
        input_size: number of features of the tensor fed into the cell
        hidden_size: number of units of the recurrent state
        dropout: probability of DROPPING a unit of the value passed upward
        """
        super().__init__()
        self.xh = nn.Linear(input_size, 3 * hidden_size)
        self.hh = nn.Linear(hidden_size, 3 * hidden_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp, hidden):
        """Return (y, h): `h` is the new recurrent state and `y` is `h` after dropout."""
        in_reset, in_update, in_cand = torch.chunk(self.xh(inp), 3, dim=1)
        hid_reset, hid_update, hid_cand = torch.chunk(self.hh(hidden), 3, dim=1)

        reset_gate = torch.sigmoid(in_reset + hid_reset)
        update_gate = torch.sigmoid(in_update + hid_update)
        # the reset gate scales the already-projected hidden contribution
        candidate = torch.tanh(in_cand + reset_gate * hid_cand)

        new_hidden = (1 - update_gate) * hidden + update_gate * candidate
        return self.dropout(new_hidden), new_hidden


class GRU(nn.Module):
    """Stacked GRU language model.

    Pipeline per time-step: token embedding -> dropout -> `num_layers` GRUCells
    (each returns a dropped-out output for the layer above and a clean state to
    carry forward) -> linear decoder to vocabulary logits.
    """

    def __init__(self, emb_size, hidden_size, seq_len, batch_size, vocab_size, num_layers, dp_keep_prob):
        """
        emb_size: size of the token embeddings
        hidden_size: number of units in every recurrent layer
        seq_len: nominal sequence length (stored; forward() follows the input)
        batch_size: nominal batch size (used by init_hidden())
        vocab_size: number of distinct tokens
        num_layers: number of stacked GRUCells
        dp_keep_prob: probability of KEEPING a unit (drop prob = 1 - dp_keep_prob)
        """
        super(GRU, self).__init__()

        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.dp_keep_prob = dp_keep_prob

        self.embed = nn.Embedding(vocab_size, emb_size)
        self.embed_dropout = nn.Dropout(1 - dp_keep_prob)

        # Only the first layer consumes embeddings; the others consume the
        # output of the layer below.
        self.hiddens = nn.ModuleList([
            GRUCell(emb_size if layer == 0 else hidden_size, hidden_size, 1 - dp_keep_prob)
            for layer in range(num_layers)
        ])

        self.output = nn.Linear(hidden_size, vocab_size)

        self.init_weights_uniform()

    def init_weights_uniform(self):
        """Embedding and decoder weights ~ U(-0.1, 0.1), decoder bias = 0,
        recurrent parameters ~ U(-k, k) with k = sqrt(1 / hidden_size)."""
        nn.init.uniform_(self.embed.weight, -0.1, 0.1)
        nn.init.uniform_(self.output.weight, -0.1, 0.1)
        nn.init.constant_(self.output.bias, 0)

        # The bound depends on hidden_size, NOT on the leading dimension of the
        # parameter: the cells fuse three gates into one matrix, so that leading
        # dimension is 3 * hidden_size and would shrink the range by sqrt(3).
        k = 1.0 / math.sqrt(self.hidden_size)
        for param in self.hiddens.parameters():
            if param.requires_grad:
                nn.init.uniform_(param, -k, k)

    def init_hidden(self):
        """Return an all-zero initial state of shape (num_layers, batch_size, hidden_size).

        The tensor is created on the default device; the caller moves it.
        """
        return torch.zeros((self.num_layers, self.batch_size, self.hidden_size))

    def _step(self, tokens, hidden):
        """Advance every layer by one time-step.

        tokens: integer token ids for this step, (batch,)
        hidden: previous state of each layer, indexable by layer
        Returns (logits of shape (batch, vocab_size), stacked new hidden states).
        """
        outp = self.embed_dropout(self.embed(tokens))
        new_hidden = []
        for cell, previous in zip(self.hiddens, hidden):
            outp, state = cell(outp, previous)
            new_hidden.append(state)
        return self.output(outp), torch.stack(new_hidden)

    def forward(self, inputs, hidden):
        """Run the network over a whole sequence.

        inputs: integer token ids, (seq_len, batch)
        hidden: initial state, (num_layers, batch, hidden_size)
        Returns (logits of shape (seq_len, batch, vocab_size), final hidden state).

        The sequence length and batch size are taken from `inputs`, so a batch
        that differs from the sizes given to the constructor is processed too.
        """
        step_logits = []
        for tokens_t in inputs:
            logits_t, hidden = self._step(tokens_t, hidden)
            step_logits.append(logits_t)
        return torch.stack(step_logits), hidden

    def generate(self, input, hidden, generated_seq_len):
        """Sample `generated_seq_len` tokens after the seed tokens.

        input: seed token ids, one per sequence, shape (batch,)
        hidden: initial state, (num_layers, batch, hidden_size)
        generated_seq_len: number of tokens to produce
        Returns a tensor of shape (generated_seq_len + 1, batch) whose first row
        is the seed.

        Each token is drawn from the softmax distribution over the vocabulary.
        Dropout follows the module's train/eval mode.
        """
        tokens = input.view(-1)
        samples = [tokens]
        with torch.no_grad():
            for _ in range(generated_seq_len):
                logits, hidden = self._step(tokens, hidden)
                probabilities = logits.softmax(dim=-1)
                # multinomial returns (batch, 1); drop the sample axis so the
                # result can be fed back as a (batch,) vector of token ids
                tokens = torch.multinomial(probabilities, num_samples=1).squeeze(1)
                samples.append(tokens)
        return torch.stack(samples)


# Problem 3
##############################################################################
#
# Code for the Transformer model
#
##############################################################################

"""
Implement the MultiHeadedAttention module of the transformer architecture.
All other necessary modules have already been implemented for you.
We're building a transformer architecture for next-step prediction tasks, and 
applying it to sequential language modelling. We use a binary "mask" to specify 
which time-steps the model can use for the current prediction.
This ensures that the model only attends to previous time-steps.
The model first encodes inputs using the sum of a learned WordEmbedding 
and a (in our case, hard-coded) PositionalEncoding.
The word embedding maps a word's one-hot encoding into a dense real vector.
The positional encoding 'tags' each element of an input sequence with a code that 
identifies its position (i.e. time-step).
These encodings of the inputs are then transformed repeatedly using multiple
copies of a TransformerBlock.
This block consists of an application of MultiHeadedAttention, followed by a 
standard MLP; the MLP applies *the same* mapping at every position.
Both the attention and the MLP are applied with Resnet-style skip connections, 
and layer normalization.
The complete model consists of the embeddings, the stacked transformer blocks, 
and a linear layer followed by a softmax.
"""

#This code has been modified from an open-source project, by David Krueger.
#The original license is included below:
#MIT License
#
#Copyright (c) 2018 Alexander Rush
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
# ----------------------------------------------------------------------------------
# TODO: implement this class
class MultiHeadedAttention(nn.Module):
    """Masked multi-head scaled dot-product attention.

    Uses only nn.Linear, nn.Dropout and softmax. All weights and biases are
    drawn from U(-k, k) with k = sqrt(1 / n_units).
    """

    def __init__(self, n_heads, n_units, dropout=0.1):
        """
        n_heads: the number of attention heads
        n_units: the number of input and output units
        dropout: probability of DROPPING units
        """
        super(MultiHeadedAttention, self).__init__()
        # This requires the number of n_heads to evenly divide n_units.
        assert n_units % n_heads == 0
        # The keys, values and queries of every head all have size d_k.
        self.d_k = n_units // n_heads
        self.n_units = n_units
        self.n_heads = n_heads

        # Four independent projections, in order: query, key, value, output.
        # They are built directly (rather than by copying one template layer)
        # because every parameter is re-drawn below anyway.
        self.attn_transform = nn.ModuleList(
            [nn.Linear(n_units, n_units) for _ in range(4)]
        )
        self.dropout = nn.Dropout(p=dropout)

        k = 1.0 / math.sqrt(n_units)
        for param in self.parameters():
            nn.init.uniform_(param, -k, k)

    def forward(self, query, key, value, mask=None):
        """Apply masked multi-head attention.

        query, key, value: (batch_size, seq_len, n_units)
        mask: optional, broadcastable to (batch_size, seq_len, seq_len);
              positions where mask == 0 receive (effectively) zero attention
        Returns a tensor of shape (batch_size, seq_len, n_units).
        """
        batch_size = query.shape[0]

        # project, then split the feature axis into heads:
        # (batch_size, n_heads, seq_len, d_k)
        query, key, value = [
            linear(x).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
            for x, linear in zip((query, key, value), self.attn_transform)
        ]

        # scaled dot-product scores: (batch_size, n_heads, seq_len, seq_len)
        score = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # add a head axis so the same mask is shared by every head
            mask = mask.unsqueeze(1)
            score = score.masked_fill(mask == 0, -1e9)

        # attention values, with dropout applied to them
        score = F.softmax(score, dim=-1)
        score = self.dropout(score)

        # (batch_size, n_heads, seq_len, d_k)
        weighted_value = torch.matmul(score, value)
        # merge the heads back: (batch_size, seq_len, n_units)
        weighted_value = weighted_value.transpose(1, 2).contiguous().view(
            batch_size, -1, self.n_heads * self.d_k)

        # the last projection is the output transform
        return self.attn_transform[-1](weighted_value)


# ----------------------------------------------------------------------------------
# The encodings of elements of the input sequence

class WordEmbedding(nn.Module):
    """Learned token-embedding table whose output is scaled by sqrt(n_units)."""

    def __init__(self, n_units, vocab):
        """
        n_units: size of each embedding vector
        vocab: number of rows of the lookup table
        """
        super().__init__()
        self.n_units = n_units
        self.lut = nn.Embedding(vocab, n_units)

    def forward(self, x):
        """Look up the ids in `x` and multiply the vectors by sqrt(n_units)."""
        scale = math.sqrt(self.n_units)
        return self.lut(x) * scale


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position codes to its input, then applies dropout."""

    def __init__(self, n_units, dropout, max_len=5000):
        """
        n_units: size of the feature axis (even or odd)
        dropout: probability of DROPPING units of the sum
        max_len: longest sequence for which codes are pre-computed
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, n_units)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, n_units, 2).float() *
                             -(math.log(10000.0) / n_units))
        # even columns carry sines, odd columns carry cosines
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd n_units there is one cosine column fewer than sine
        # columns, so the frequency vector is trimmed to fit.
        pe[:, 1::2] = torch.cos(position * div_term[: n_units // 2])
        pe = pe.unsqueeze(0)
        # a buffer follows the module across devices but is never trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        """x: (batch, seq_len, n_units) with seq_len <= max_len."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


# ----------------------------------------------------------------------------------
# The TransformerBlock and the full Transformer


class TransformerBlock(nn.Module):
    """Self-attention followed by a position-wise MLP, each inside a residual sublayer."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        """
        size: feature width handed to the residual wrappers
        self_attn: attention module, called as self_attn(q, k, v, mask)
        feed_forward: module applied to the output of the attention sublayer
        dropout: dropout probability handed to the residual wrappers
        """
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # NOTE(review): `clones` is not defined in the visible part of this
        # file; presumably it returns an indexable container holding two
        # copies of the wrapper -- confirm it exists before instantiating.
        self.sublayer = clones(ResidualSkipConnectionWithLayerNorm(size, dropout), 2)

    def forward(self, x, mask):
        def attend(normed):
            # queries, keys and values are all the same tensor (self-attention)
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)


class TransformerStack(nn.Module):
    """
    Applies `n_blocks` copies of `layer` in sequence, then a final LayerNorm.
    `layer` must expose a `size` attribute and be callable as layer(x, mask).
    """

    def __init__(self, layer, n_blocks):
        super().__init__()
        # NOTE(review): `clones` is not defined in the visible part of this
        # file; presumably it yields n_blocks independent copies -- confirm.
        self.layers = clones(layer, n_blocks)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)


class FullTransformer(nn.Module):
    """Embedding -> transformer stack -> linear decoder -> log-softmax."""

    def __init__(self, transformer_stack, embedding, n_units, vocab_size):
        """
        transformer_stack: module called as transformer_stack(embeddings, mask)
        embedding: module mapping the input sequence to embeddings
        n_units: feature width produced by the stack
        vocab_size: number of output classes
        """
        super().__init__()
        self.transformer_stack = transformer_stack
        self.embedding = embedding
        self.output_layer = nn.Linear(n_units, vocab_size)

    def forward(self, input_sequence, mask):
        """Return log-probabilities over the vocabulary along the last axis."""
        embedded = self.embedding(input_sequence)
        encoded = self.transformer_stack(embedded, mask)
        scores = self.output_layer(encoded)
        return F.log_softmax(scores, dim=-1)


def make_model(vocab_size, n_blocks=6,
               n_units=512, n_heads=16, dropout=0.1):
    """Helper: Construct a model from hyperparameters.

    vocab_size: number of tokens (embedding rows and output classes)
    n_blocks: number of transformer blocks in the stack
    n_units: feature width used throughout the model
    n_heads: number of attention heads (must divide n_units)
    dropout: dropout probability for the MLP, the positional encoding and the
             residual sublayers
    """
    # NOTE(review): `dropout` is not forwarded here, so the attention module
    # keeps its own default drop probability -- confirm this is intended.
    attention = MultiHeadedAttention(n_heads, n_units)
    feed_forward = MLP(n_units, dropout)
    positional = PositionalEncoding(n_units, dropout)

    # the prototypes above are deep-copied so the model owns its own instances
    block = TransformerBlock(n_units, copy.deepcopy(attention),
                             copy.deepcopy(feed_forward), dropout)
    stack = TransformerStack(block, n_blocks)
    embedding = nn.Sequential(WordEmbedding(n_units, vocab_size),
                              copy.deepcopy(positional))
    model = FullTransformer(
        transformer_stack=stack,
        embedding=embedding,
        n_units=n_units,
        vocab_size=vocab_size,
    )

    # Initialize parameters with Glorot / fan_avg. This overwrites whatever
    # initialisation the sub-modules gave to their matrices; vectors
    # (biases, layer-norm gains) are left untouched.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model


# ----------------------------------------------------------------------------------
# Data processing

def subsequent_mask(size):
    """Return a (1, size, size) mask that is true where column <= row.

    Entry [0, i, j] tells whether position i may look at position j, i.e.
    every position sees itself and the positions before it.
    """
    # ones strictly above the diagonal mark the "future" entries ...
    future = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # ... and the mask is everything that is NOT in the future
    return torch.from_numpy(future) == 0


class Batch:
    """Object for holding a batch of data with mask during training.

    NOTE(review): a second class named Batch (default pad=-1) is defined
    further down in this file and replaces this one once that code has run.
    """

    def __init__(self, x, pad=0):
        """
        x: tensor of token ids; the last axis is the sequence axis
        pad: token id treated as padding and hidden by the mask
        """
        self.data = x
        self.mask = self.make_mask(x, pad)

    @staticmethod
    def make_mask(data, pad):
        "Create a mask to hide padding and future words."
        # (..., 1, seq_len): true for every real (non-padding) token
        not_padding = (data != pad).unsqueeze(-2)
        # (1, seq_len, seq_len): true where a position may be attended to
        causal = subsequent_mask(data.size(-1)).type_as(not_padding.data)
        return not_padding & Variable(causal)


#----------------------------------------------------------------------------------
# Some standard modules

class LayerNorm(nn.Module):
    """Layer normalization, as in: https://arxiv.org/abs/1607.06450

    Normalises over the last axis and applies a learned gain (a_2) and
    bias (b_2).
    """

    def __init__(self, features, eps=1e-6):
        """
        features: size of the last axis of the inputs
        eps: constant added to the standard deviation to avoid division by zero
        """
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself, not to the variance
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2


class ResidualSkipConnectionWithLayerNorm(nn.Module):
    """
    Residual wrapper with layer normalization: x + dropout(sublayer(norm(x))).
    Note for code simplicity the norm is applied first (to the sublayer's
    input) as opposed to last.
    """

    def __init__(self, size, dropout):
        """
        size: feature width handed to the LayerNorm
        dropout: probability of DROPPING units of the sublayer output
        """
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        transformed = sublayer(self.norm(x))
        return x + self.dropout(transformed)


class MLP(nn.Module):
    """
    This is just an MLP with 1 hidden layer: Linear -> ReLU -> Dropout -> Linear.
    It acts on the last axis, so the same mapping is applied at every position.
    """
    def __init__(self, n_units, dropout=0.1, n_hidden=2048):
        """
        n_units: number of input and output units
        dropout: probability of DROPPING hidden units
        n_hidden: width of the hidden layer (defaults to the previously
                  hard-coded 2048)
        """
        super(MLP, self).__init__()
        self.w_1 = nn.Linear(n_units, n_hidden)
        self.w_2 = nn.Linear(n_hidden, n_units)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

# Setup

In [43]:
##############################################################################
#
# ARG PARSING AND EXPERIMENT SETUP
#
##############################################################################

# Command-line style configuration for the experiment. In this notebook the
# parser is only used as a typed container of defaults (see parse_args below);
# to change a setting, edit the corresponding `default=` value.
parser = argparse.ArgumentParser(description='PyTorch Penn Treebank Language Modeling')

# Arguments you may need to set to run different experiments in 4.1 & 4.2.
parser.add_argument('--data', type=str, default= path + 'data',
                    help='location of the data corpus. We suggest you change the default\
                    here, rather than passing as an argument, to avoid long file paths.')
parser.add_argument('--model', type=str, default='RNN',
                    help='type of recurrent net (RNN, GRU, TRANSFORMER)')
parser.add_argument('--optimizer', type=str, default='ADAM',
                    help='optimization algo to use; SGD, SGD_LR_SCHEDULE, ADAM')
parser.add_argument('--seq_len', type=int, default=35,
                    help='number of timesteps over which BPTT is performed')
parser.add_argument('--batch_size', type=int, default=20,
                    help='size of one minibatch')
parser.add_argument('--initial_lr', type=float, default=0.0001,
                    help='initial learning rate')
parser.add_argument('--hidden_size', type=int, default=1500,
                    help='size of hidden layers. IMPORTANT: for the transformer\
                    this must be a multiple of 16.')
parser.add_argument('--save_best', action='store_true',
                    help='save the model for the best validation performance')
parser.add_argument('--num_layers', type=int, default=2,
                    help='number of hidden layers in RNN/GRU, or number of transformer blocks in TRANSFORMER')

# Other hyperparameters you may want to tune in your exploration
parser.add_argument('--emb_size', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--num_epochs', type=int, default=40,
                    help='number of epochs to stop after')
parser.add_argument('--dp_keep_prob', type=float, default=0.35,
                    help='dropout *keep* probability. drop_prob = 1-dp_keep_prob \
                    (dp_keep_prob=1 means no dropout)')

# Arguments that you may want to make use of / implement more code for
parser.add_argument('--debug', action='store_true') 
parser.add_argument('--save_dir', type=str, default=path,
                    help='path to save the experimental config, logs, model \
                    This is automatically generated based on the command line \
                    arguments you pass and only needs to be set if you want a \
                    custom dir name')
parser.add_argument('--evaluate', action='store_true',
                    help="use this flag to run on the test set. Only do this \
                    ONCE for each model setting, and only after you've \
                    completed ALL hyperparameter tuning on the validation set.\
                    Note we are not requiring you to do this.")

# DO NOT CHANGE THIS (setting the random seed makes experiments deterministic,
# which helps for reproducibility)
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')

# An explicit empty argument list is parsed instead of sys.argv, so every
# option takes its default and the kernel's own command line is ignored.
args = parser.parse_args(args=[])
# `argsdict` is a live view of the namespace: writing to it also sets the
# attribute on `args`.
argsdict = args.__dict__
argsdict['code_file'] = sys.argv[0]

# Use the model, optimizer, and the flags passed to the script to make the
# name for the experimental dir
print("\n########## Setting Up Experiment ######################")
# Command-line flags with their leading dashes removed (not used for the
# directory name below; kept for any later code that reads it).
flags = [flag.lstrip('--') for flag in sys.argv[1:]]
# save_dir is joined as a directory, so it works with or without a trailing
# separator.
experiment_path = os.path.join(args.save_dir,
                               '_'.join([argsdict['model'],
                                         argsdict['optimizer']]))

# Increment a counter so that previous results with the same args will not
# be overwritten. Comment out the next four lines if you only want to keep
# the most recent results.
i = 0
while os.path.exists(experiment_path + "_" + str(i)):
    i += 1
experiment_path = experiment_path + "_" + str(i)

# Creates an experimental directory (including missing parents) and dumps all
# the args to a text file
os.makedirs(experiment_path)
print ("\nPutting log in %s"%experiment_path)
# from here on save_dir points at this run's own directory
argsdict['save_dir'] = experiment_path
with open (os.path.join(experiment_path,'exp_config.txt'), 'w') as f:
    for key in sorted(argsdict):
        f.write(key+'    '+str(argsdict[key])+'\n')

# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(args.seed)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using the GPU")
else:
    print("WARNING: You are about to run on cpu, and this will likely run out \
      of memory. \n You can try setting batch_size=1 to reduce memory usage")


###############################################################################
#
# DATA LOADING & PROCESSING
#
###############################################################################

# HELPER FUNCTIONS
def _read_words(filename):
    with open(filename, "r") as f:
      return f.read().replace("\n", "<eos>").split()

def _build_vocab(filename):
    data = _read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict((v, k) for k, v in word_to_id.items())

    return word_to_id, id_to_word

def _file_to_word_ids(filename, word_to_id):
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

# Processes the raw data from text files
def ptb_raw_data(data_path=None, prefix="ptb"):
    """Load the train / valid / test splits as lists of word ids.

    Reads <data_path>/<prefix>.train.txt, .valid.txt and .test.txt. The
    vocabulary is built from the training file only and then used to encode
    all three files.

    Returns (train_data, valid_data, test_data, word_to_id, id_2_word).
    """
    splits = ("train", "valid", "test")
    split_paths = {
        split: os.path.join(data_path, prefix + "." + split + ".txt")
        for split in splits
    }

    word_to_id, id_2_word = _build_vocab(split_paths["train"])
    train_data, valid_data, test_data = (
        _file_to_word_ids(split_paths[split], word_to_id) for split in splits
    )
    return train_data, valid_data, test_data, word_to_id, id_2_word

# Yields minibatches of data
def ptb_iterator(raw_data, batch_size, num_steps):
    """Yield (x, y) minibatches from a flat sequence of token ids.

    The sequence is cut into `batch_size` contiguous rows of equal length
    (left-over tokens at the end are dropped). Each yielded `x` is a
    (batch_size, num_steps) int32 window and `y` is the same window shifted
    one token to the right.

    Raises ValueError (on first iteration) when a row is too short to produce
    a single window.
    """
    tokens = np.array(raw_data, dtype=np.int32)

    batch_len = len(tokens) // batch_size
    # row i holds tokens[i * batch_len : (i + 1) * batch_len]
    data = tokens[:batch_size * batch_len].reshape(batch_size, batch_len)

    # one token per row is reserved so the last window still has a target
    epoch_size = (batch_len - 1) // num_steps
    if epoch_size == 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for start in range(0, epoch_size * num_steps, num_steps):
        stop = start + num_steps
        yield (data[:, start:stop], data[:, start + 1:stop + 1])


class Batch:
    """Data processing for the transformer. This class adds a mask to the data.

    NOTE(review): a class with the same name (default pad=0) is defined earlier
    in this file; once this definition has run it is the one bound to `Batch`.
    """

    def __init__(self, x, pad=-1):
        """
        x: tensor of token ids; the last axis is the sequence axis
        pad: token id treated as padding and hidden by the mask
        """
        self.data = x
        self.mask = self.make_mask(x, pad)

    @staticmethod
    def make_mask(data, pad):
        "Create a mask to hide padding and future words."
        seq_len = data.size(-1)

        # (1, seq_len, seq_len): true on and below the diagonal, i.e. each
        # position may see itself and the positions before it
        future = np.triu(np.ones((1, seq_len, seq_len)), k=1).astype('uint8')
        causal = torch.from_numpy(future) == 0

        # (..., 1, seq_len): true for every real (non-padding) token
        not_padding = (data != pad).unsqueeze(-2)
        return not_padding & Variable(causal.type_as(not_padding.data))


# LOAD DATA
# Reads the three corpus files under args.data and encodes them as word ids.
print('Loading data from '+args.data)
raw_data = ptb_raw_data(data_path=args.data)
# raw_data is the 5-tuple returned by ptb_raw_data: the three encoded splits
# followed by the two vocabulary dictionaries.
train_data, valid_data, test_data, word_to_id, id_2_word = raw_data
# the vocabulary is built from the training split only (see _build_vocab)
vocab_size = len(word_to_id)
print('  vocabulary size: {}'.format(vocab_size))


###############################################################################
# 
# MODEL SETUP
#
###############################################################################

# NOTE ==============================================
# This is where your model code will be called. You may modify this code
# if required for your implementation, but it should not typically be necessary,
# and you must let the TAs know if you do so.
if args.model == 'RNN':
    model = RNN(emb_size=args.emb_size, hidden_size=args.hidden_size, 
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers, 
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'GRU':
    model = GRU(emb_size=args.emb_size, hidden_size=args.hidden_size,
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'TRANSFORMER':
    # The transformer is built by the make_model factory defined in this file;
    # no name `TRANSFORMER` exists here.
    if args.debug:  # use a very small model
        model = make_model(vocab_size=vocab_size, n_units=16, n_blocks=2)
    else:
        # Note that we're using num_layers and hidden_size to mean slightly
        # different things here than in the RNNs.
        # Also, the Transformer also has other hyperparameters
        # (such as the number of attention heads) which can change its behavior.
        model = make_model(vocab_size=vocab_size, n_units=args.hidden_size,
                           n_blocks=args.num_layers, dropout=1.-args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size = args.batch_size
    model.seq_len = args.seq_len
    model.vocab_size = vocab_size
else:
    # Fail here with a clear message instead of continuing with an undefined
    # (or, in a re-run notebook, stale) `model`.
    raise ValueError("Model type not recognized: {}".format(args.model))


model = model.to(device)

# LOSS FUNCTION
# Cross-entropy over the vocabulary; with its default settings the loss is
# averaged over all the predictions it is given.
loss_fn = nn.CrossEntropyLoss()
# `optimizer` is only bound when ADAM is selected.
# NOTE(review): for the other --optimizer choices nothing is created here;
# presumably the training code applies those updates itself -- confirm.
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE SCHEDULE
lr = args.initial_lr
# multiplicative decay factor (about 0.87)
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0 # we will not touch lr for the first m_flat_lr epochs


###############################################################################
# 
# DEFINE COMPUTATIONS FOR PROCESSING ONE EPOCH
#
###############################################################################

def repackage_hidden(h):
    """
    Detach hidden states from the graph that produced them.

    The final hidden state of one mini-batch is reused as the initial state of
    the next one; detaching it stops backpropagation from reaching into the
    previous mini-batches. That reuse is meaningful because consecutive
    mini-batches hold successive pieces of the same long sequences, which is
    how the Penn Treebank data is batched in this file.

    A tensor is detached IN PLACE and returned; any other container is walked
    recursively and returned as a tuple.
    """
    if not isinstance(h, Variable):
        return tuple(repackage_hidden(item) for item in h)
    return h.detach_()


def run_epoch(model, data, is_train=False, lr=1.0):
    """
    One epoch of training/validation (depending on flag is_train).

    Args:
        model: the network to run. It must expose `batch_size`, `seq_len` and
            `vocab_size`; non-Transformer models must also provide
            `init_hidden()`.
        data: the dataset handed to `ptb_iterator` to produce (x, y)
            mini-batches.
        is_train: if True, back-propagate and update the parameters after
            every mini-batch; otherwise only evaluate.
        lr: step size of the manual SGD update. Unused when
            args.optimizer == 'ADAM', where the global optimizer steps instead.

    Returns:
        A pair (perplexity, losses). `perplexity` is
        exp(accumulated cost / number of time-steps processed); `losses` holds
        the value of the accumulated cost after each mini-batch.
    """
    if is_train:
        model.train()
    else:
        model.eval()
    start_time = time.time()
    if args.model != 'TRANSFORMER':
        hidden = model.init_hidden()
        hidden = hidden.to(device)
    costs = 0.0
    iters = 0
    b_time, f_time = 0, 0
    losses = []

    # LOOP THROUGH MINIBATCHES
    for step, (x, y) in enumerate(ptb_iterator(data, model.batch_size, model.seq_len)):
        model.zero_grad()

        # Only record an autograd graph when backward() will actually be
        # called; evaluation passes then do not pay for it in memory or time.
        with torch.set_grad_enabled(is_train):
            if args.model == 'TRANSFORMER':
                batch = Batch(torch.from_numpy(x).long().to(device))
                forward_time = time.time()
                outputs = model.forward(batch.data, batch.mask).transpose(1, 0)
                f_time += time.time() - forward_time
            else:
                inputs = torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous().to(device)
                # Cut the graph between consecutive mini-batches
                hidden = repackage_hidden(hidden)

                forward_time = time.time()
                outputs, hidden = model(inputs, hidden)
                f_time += time.time() - forward_time

            targets = torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous().to(device)
            # view(-1) always yields a 1-D target, including the corner case
            # batch_size * seq_len == 1.
            tt = targets.view(-1)

            # LOSS COMPUTATION
            # This line currently averages across all the sequences in a mini-batch
            # and all time-steps of the sequences.
            # For problem 5.3, you will (instead) need to compute the average loss
            # at each time-step separately.
            loss = loss_fn(outputs.contiguous().view(-1, model.vocab_size), tt)

        loss_value = loss.item()
        costs += loss_value * model.seq_len
        # NOTE(review): this stores the running cumulative cost, not the loss
        # of this mini-batch alone -- confirm that is what the plots expect.
        losses.append(costs)
        iters += model.seq_len
        if args.debug:
            print(step, loss)
        if is_train:  # Only update parameters if training
            backward_time = time.time()
            loss.backward()
            b_time += time.time() - backward_time
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            if args.optimizer == 'ADAM':
                optimizer.step()
            else:
                # Plain SGD step: p <- p - lr * grad
                for p in model.parameters():
                    if p.grad is not None:
                        p.data.add_(p.grad.data, alpha=-lr)
            if step % 10 == 0:
                print('\rstep: {}; loss: {:.5f}; costs: {:.2f}; speed (wps) {:.2f}; b_time = {:.2f}; f_time = {:.2f}'
                      ''.format(step, loss_value, costs, iters * model.batch_size / (time.time() - start_time), b_time, f_time),
                      end='')
    print('')
    return np.exp(costs / iters), losses



########## Setting Up Experiment ######################

Putting log in drive/My Drive/IFT6135/A2/RNN_ADAM_4
Using the GPU
Loading data from drive/My Drive/IFT6135/A2/data
  vocabulary size: 10000


In [0]:
###############################################################################
#
# RUN MAIN LOOP (TRAIN AND VAL)
#
###############################################################################

print("\n########## Running Main Loop ##########################")

# Accumulators filled in by the main loop: one perplexity and one wall-clock
# time per epoch, plus whatever loss records run_epoch returns.
train_ppls, val_ppls = [], []
train_losses, val_losses = [], []
times = []
best_val_so_far = np.inf

# In debug mode, only run one epoch
num_epochs = 1 if args.debug else args.num_epochs

# MAIN LOOP
for epoch in range(num_epochs):
    t0 = time.time()
    print('\nEPOCH ' + str(epoch) + ' ------------------')

    # Only the scheduled-SGD setting changes lr between epochs
    if args.optimizer == 'SGD_LR_SCHEDULE':
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
        lr = lr_decay * lr

    # One pass over the training data (with parameter updates), followed by
    # one pass over the validation data (evaluation only)
    train_ppl, train_loss = run_epoch(model, train_data, is_train=True, lr=lr)
    val_ppl, val_loss = run_epoch(model, valid_data)

    # Keep track of the best validation perplexity; checkpoint on improvement
    if val_ppl < best_val_so_far:
        best_val_so_far = val_ppl
        if args.save_best:
            print("Saving model parameters to best_params.pt")
            torch.save(model.state_dict(),
                       os.path.join(args.save_dir, 'best_params.pt'))
        # NOTE ==============================================
        # These saved parameters have to be loaded back into the same model
        # for a couple of Problems: to compute the gradient of the loss
        # w.r.t. the hidden state (Problem 5.2) and to sample from the
        # model (Problem 5.3).
        # Running on the test data is not required, but to look at test
        # performance, load the saved model and run it on the test data
        # with batch_size=1.

    # LOG RESULTS
    train_ppls.append(train_ppl)
    val_ppls.append(val_ppl)
    train_losses.extend(train_loss)
    val_losses.extend(val_loss)
    times.append(time.time() - t0)
    log_str = '\t'.join([
        'epoch: ' + str(epoch),
        'train ppl: ' + str(train_ppl),
        'val ppl: ' + str(val_ppl),
        'best val: ' + str(best_val_so_far),
        'time (s) spent in epoch: ' + str(times[-1]),
    ])
    print(log_str)
    with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f_:
        f_.write(log_str + '\n')

# SAVE LEARNING CURVES
lc_path = os.path.join(args.save_dir, 'learning_curves.npy')
print('\nDONE\n\nSaving learning curves to ' + lc_path)
# np.save wraps the dict in a 0-d object array and pickles it
np.save(lc_path, dict(train_ppls=train_ppls,
                      val_ppls=val_ppls,
                      train_losses=train_losses,
                      val_losses=val_losses))
# NOTE ==============================================
# To load these, run
# >>> x = np.load(lc_path, allow_pickle=True)[()]
# (allow_pickle=True is needed on NumPy >= 1.16.3, where it defaults to False)
# You will need these values for plotting learning curves (Problem 4)



########## Running Main Loop ##########################

EPOCH 0 ------------------
step: 1320; loss: 6.15342; costs: 299604.69; speed (wps) 2376.17; b_time = 132.37; f_time = 37.52

epoch: 0	train ppl: 651.2823708968865	val ppl: 401.53776257487937	best val: 401.53776257487937	time (s) spent in epoch: 397.9311761856079

EPOCH 1 ------------------
step: 1320; loss: 5.96623; costs: 278771.03; speed (wps) 2377.73; b_time = 132.31; f_time = 37.53

epoch: 1	train ppl: 415.4680243275044	val ppl: 322.47107944402575	best val: 322.47107944402575	time (s) spent in epoch: 397.68679690361023

EPOCH 2 ------------------
step: 1320; loss: 5.88561; costs: 270628.15; speed (wps) 2375.71; b_time = 132.38; f_time = 37.58

epoch: 2	train ppl: 348.4440338307249	val ppl: 280.12250525626547	best val: 280.12250525626547	time (s) spent in epoch: 398.022433757782

EPOCH 3 ------------------
step: 1320; loss: 5.81370; costs: 265333.08; speed (wps) 2375.40; b_time = 132.40; f_time = 37.41

epoch: 3	train ppl: 3