# Base RNN Language Models for English and Mi'kmaq 

In [122]:
import os
import time
import math

import torch
import torch.nn as nn

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [45]:
bptt = 35

## Corpus Processing 

In [14]:
class Dictionary():
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        
    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]
    
    def __len__(self):
        return len(self.idx2word)

In [17]:
class Corpus():
    def __init__(self, path):
        self.dictionary = Dictionary()
        
        try:
            self.train = self.tokenize(os.path.join(path, 'train.txt'))
        except FileNotFoundError:
            print("train.txt not found")
            
        
        try:
            self.dev = self.tokenize(os.path.join(path, 'dev.txt'))
        except FileNotFoundError:
            print("dev.txt not found")
        
        try:
            self.test = self.tokenize(os.path.join(path, 'test.txt'))
        except FileNotFoundError:
            print("test.txt not found")
        
    def tokenize(self, path):        
        with open(path, 'r', encoding="utf8") as f:
            tokens = 0
            for line in f:
                words = line.split()# + ['</s>']
                
                for word in words:
                    if word != '<s>':
                        tokens += 1
                        self.dictionary.add_word(word)
                        
        with open(path, 'r', encoding="utf8") as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() # + ['</s>']
                for word in words:
                    if word != '<s>':
                        ids[token] = self.dictionary.word2idx[word]
                        token += 1            
                    
        return ids
                    

## Reuters Corpus

In [18]:
reuters_corpus = Corpus('data/reuters')

test.txt not found


Arrange the data into batches, we won't learn the relation between the start and end of a batch, but it'll be a lot easier to batch

In [77]:
def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    
    data = data.narrow(0, 0, nbatch * bsz)
    
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)
    

## Mi'kmaq Corpus 

In [27]:
mic_corpus = Corpus('data/micmac')

test.txt not found


## Model Definition 

In [81]:
class RNNModel(nn.Module):
    def __init__(self, rnn_type, ntoken, ninput, nhidden, nlayers, 
                 dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninput)
        
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(ninput, nhidden, nlayers, dropout=dropout)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(ninput, nhidden, nlayers, dropout=dropout)
        elif rnn_type == "RNN_TANH":
            self.rnn = nn.RNN(ninput, nhidden, nlayers, nonlinearity='TANH', 
                              dropout=dropout)
        elif rnn_type == "RNN_RELU":
            self.rnn = nn.RNN(ninput, nhidden, nlayers, nonlinearity='RELU', 
                              dropout=dropout)
            
        self.decoder = nn.Linear(nhidden, ntoken)
        
        if tie_weights:
            if nhidden != ninput:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
            
        self.init_weights()
        
        self.rnn_type = rnn_type
        self.nhidden = nhidden
        self.nlayers = nlayers
    
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    
    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
    
    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhidden),
                    weight.new_zeros(self.nlayers, bsz, self.nhidden))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhidden)

## Models and Loss 

In [52]:
def make_model(corpus, rnn_type, emsize, nhidden, nlayers, dropout, tied):
    model = RNNModel(rnn_type, len(corpus.dictionary), emsize, nhidden, 
                     nlayers, dropout, tied)\
                .to(device)
    
    return model

In [36]:
criterion = nn.CrossEntropyLoss()

In [38]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

## Define training 

In [109]:
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [120]:
def evaluate(data_source, corpus, model):
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(10)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [116]:
def train_iter(model, corpus, train_data, epoch, lr):
    model.train()
    total_loss = 0.
    start_time = time.time()    
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(20)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), lr)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
            
        total_loss += loss.item()
        
        if batch % 200 == 0 and batch > 0:
            cur_loss = total_loss / 200
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'\
                  .format(epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / 200, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


In [118]:
def train(model, corpus, lr, epochs, path):
    train_data = batchify(corpus.train, 20)
    dev_data = batchify(corpus.dev, 10)
    #test_data = batchify(corpus.test, 10)
    best_val_loss = None
    
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train_iter(model, corpus, train_data, epoch, lr)
        val_loss = evaluate(dev_data, corpus, model)
        
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
            
        if not best_val_loss or val_loss < best_val_loss:
            with open(os.path.join(path, 'model.txt'), 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            lr /= 4

In [83]:
reuters_lstm = make_model(reuters_corpus, "LSTM", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

## Reuters Models 

In [None]:
train(reuters_lstm, reuters_corpus, 20, 40, 'data/reuters')

-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 153.26s | valid loss  8.05 | valid ppl  3142.04
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
