# Base RNN Language Models for English and Mi'kmaq 

In [None]:
import os
import time
import math

import torch
import torch.nn as nn

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
bptt = 35

## Corpus Processing 

In [None]:
class Dictionary():
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        
    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]
    
    def __len__(self):
        return len(self.idx2word)

In [33]:
class Corpus():
    def __init__(self, path):
        self.dictionary = Dictionary()
        
        try:
            self.train = self.tokenize(os.path.join(path, 'train.txt'))
        except FileNotFoundError:
            print("train.txt not found")
            
        
        try:
            self.dev = self.tokenize(os.path.join(path, 'dev.txt'))
        except FileNotFoundError:
            print("dev.txt not found")
        
        try:
            self.test = self.tokenize(os.path.join(path, 'test.txt'))
        except FileNotFoundError:
            print("test.txt not found")
        
    def tokenize(self, path):        
        with open(path, 'r', encoding="utf8") as f:
            tokens = 0
            for line in f:
                words = line.split()
                
                for word in words:
                    tokens += 1
                    self.dictionary.add_word(word)
                        
        with open(path, 'r', encoding="utf8") as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split()
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1            
                    
        return ids
                    

## Reuters Corpus

In [34]:
reuters_corpus = Corpus('data/reuters')

test.txt not found


Arrange the data into batches, we won't learn the relation between the start and end of a batch, but it'll be a lot easier to batch

In [None]:
def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    
    data = data.narrow(0, 0, nbatch * bsz)
    
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)
    

## Mi'kmaq Corpus 

In [None]:
mic_corpus = Corpus('data/micmac')

## Model Definition 

In [None]:
class RNNModel(nn.Module):
    def __init__(self, rnn_type, ntoken, ninput, nhidden, nlayers, 
                 dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninput)
        
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(ninput, nhidden, nlayers, dropout=dropout)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(ninput, nhidden, nlayers, dropout=dropout)
        elif rnn_type == "RNN_TANH":
            self.rnn = nn.RNN(ninput, nhidden, nlayers, nonlinearity='tanh', 
                              dropout=dropout)
        elif rnn_type == "RNN_RELU":
            self.rnn = nn.RNN(ninput, nhidden, nlayers, nonlinearity='relu', 
                              dropout=dropout)
        else:
            print("{} is an invalid rnn type".format(rnn_type))
            
        self.decoder = nn.Linear(nhidden, ntoken)
        
        if tie_weights:
            if nhidden != ninput:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
            
        self.init_weights()
        
        self.rnn_type = rnn_type
        self.nhidden = nhidden
        self.nlayers = nlayers
    
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    
    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
    
    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhidden),
                    weight.new_zeros(self.nlayers, bsz, self.nhidden))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhidden)

## Models and Loss 

In [None]:
def make_model(corpus, rnn_type, emsize, nhidden, nlayers, dropout, tied):
    model = RNNModel(rnn_type, len(corpus.dictionary), emsize, nhidden, 
                     nlayers, dropout, tied)\
                .to(device)
    
    return model

In [None]:
criterion = nn.CrossEntropyLoss()

In [None]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

## Define training 

In [None]:
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [None]:
def evaluate(data_source, corpus, model):
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(10)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [37]:
def train_iter(model, corpus, train_data, epoch, lr):
    model.train()
    total_loss = 0.
    start_time = time.time()    
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(20)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), lr)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
            
        total_loss += loss.item()
        
        if batch % 25 == 0 and batch > 0:
            cur_loss = total_loss / 25
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'\
                  .format(epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / 25, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


In [23]:
def train(model, corpus, lr, epochs, path):
    train_data = batchify(corpus.train, 20)
    dev_data = batchify(corpus.dev, 10)
    #test_data = batchify(corpus.test, 10)
    best_val_loss = None
    
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train_iter(model, corpus, train_data, epoch, lr)
        val_loss = evaluate(dev_data, corpus, model)
        
        print('-' * 89)
        try:
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        except MathRangeError:
            print('| end of epoch {:3d} | time: {:5.2f}s'.format(epoch, (time.time() - epoch_start_time)))
            print("PPL is uuugggeeeee")
            
        print('-' * 89)
            
        if not best_val_loss or val_loss < best_val_loss:
            with open(os.path.join(path, 'model.txt'), 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            lr /= 4

## Reuters Models 

In [24]:
reuters_lstm = make_model(reuters_corpus, "LSTM", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

reuters_gru = make_model(reuters_corpus, "GRU", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

reuters_thanh = make_model(reuters_corpus, "TANH", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

reuters_relu = make_model(reuters_corpus, "RELU", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

TANH is an invalid rnn type
RELU is an invalid rnn type


In [39]:
train(reuters_lstm, reuters_corpus, 20, 20, 'data/reuters')

| epoch   1 |    25/  100 batches | lr 20.00 | ms/batch 1469.38 | loss 15.72 | ppl 6736333.43
| epoch   1 |    50/  100 batches | lr 20.00 | ms/batch 1414.18 | loss 11.88 | ppl 144484.70
| epoch   1 |    75/  100 batches | lr 20.00 | ms/batch 1403.50 | loss 11.42 | ppl 91004.84
| epoch   1 |   100/  100 batches | lr 20.00 | ms/batch 1380.89 | loss 14.12 | ppl 1360690.08
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 145.35s | valid loss 12.46 | valid ppl 257077.08
-----------------------------------------------------------------------------------------
| epoch   2 |    25/  100 batches | lr 20.00 | ms/batch 1467.18 | loss 12.35 | ppl 231996.35
| epoch   2 |    50/  100 batches | lr 20.00 | ms/batch 1446.70 | loss 10.05 | ppl 23202.12
| epoch   2 |    75/  100 batches | lr 20.00 | ms/batch 1442.50 | loss 11.30 | ppl 81179.96
| epoch   2 |   100/  100 batches | lr 20.00 | ms/batch 1416.38 | loss 10.93 | ppl 56042.08
--

In [42]:
train(reuters_gru, reuters_corpus, 20, 40, 'data/reuters')

| epoch   1 |    25/  100 batches | lr 20.00 | ms/batch 290.66 | loss 99.19 | ppl 11958323269483587711212976867115459642654720.00
| epoch   1 |    50/  100 batches | lr 20.00 | ms/batch 214.57 | loss 50.84 | ppl 12022794092571099398144.00
| epoch   1 |    75/  100 batches | lr 20.00 | ms/batch 215.15 | loss 41.05 | ppl 675646400923478400.00
| epoch   1 |   100/  100 batches | lr 20.00 | ms/batch 226.84 | loss 53.95 | ppl 270350617514995236208640.00
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 24.20s | valid loss 90.58 | valid ppl 2171007146916599533901433112177284218880.00
-----------------------------------------------------------------------------------------
| epoch   2 |    25/  100 batches | lr 20.00 | ms/batch 242.55 | loss 70.02 | ppl 2560673317974902853734825459712.00
| epoch   2 |    50/  100 batches | lr 20.00 | ms/batch 252.84 | loss 81.09 | ppl 164865578941804859627737401689899008.00
| epoch   2 |    75

In [None]:
train(reuters_thanh, reuters_corpus, 20, 40, 'data/reuters')

In [None]:
train(reuters_relu, reuters_corpus, 20, 40, 'data/reuters')

## Mi'kmaq Models

In [40]:
micmac_lstm = make_model(mic_corpus, "LSTM", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

micmac_gru = make_model(mic_corpus, "GRU", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)


In [41]:
train(micmac_lstm, mic_corpus, 20, 40, 'data/micmac')

| epoch   1 |    25/  201 batches | lr 20.00 | ms/batch 1609.65 | loss 70.19 | ppl 3043738862814376850439020740608.00
| epoch   1 |    50/  201 batches | lr 20.00 | ms/batch 2049.27 | loss 57.42 | ppl 8633555709618255153856512.00
| epoch   1 |    75/  201 batches | lr 20.00 | ms/batch 2196.81 | loss 58.45 | ppl 24133229564286347568480256.00
| epoch   1 |   100/  201 batches | lr 20.00 | ms/batch 2221.97 | loss 62.20 | ppl 1029556726208738552923029504.00
| epoch   1 |   125/  201 batches | lr 20.00 | ms/batch 2374.86 | loss 54.87 | ppl 673240170147547222900736.00
| epoch   1 |   150/  201 batches | lr 20.00 | ms/batch 2067.02 | loss 59.13 | ppl 47682828660910310324961280.00
| epoch   1 |   175/  201 batches | lr 20.00 | ms/batch 2168.70 | loss 63.63 | ppl 4290550424691789843157483520.00
| epoch   1 |   200/  201 batches | lr 20.00 | ms/batch 2298.34 | loss 60.79 | ppl 250987635192146525648060416.00
-----------------------------------------------------------------------------------------

In [None]:
micmac_lstm2 = make_model(mic_corpus, "LSTM", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=True)

train(micmac_lstm2, mic_corpus, 20, 40, 'data/micmac')

In [None]:
micmac_lstm2 = make_model(mic_corpus, "LSTM", emsize=50, nhidden=50, nlayers=2, 
                          dropout=0.2, tied=False)

train(micmac_lstm2, mic_corpus, 20, 40, 'data/micmac')

In [43]:
train(micmac_gru, mic_corpus, 20, 40, 'data/micmac')

| epoch   1 |    25/  201 batches | lr 20.00 | ms/batch 790.33 | loss 326.89 | ppl 9286266198247115292586519714480277657649473013193173498282081372252742723394780397243182155643171744917138854305214443935897372697198363410432.00
| epoch   1 |    50/  201 batches | lr 20.00 | ms/batch 479.93 | loss 273.15 | ppl 42394188229441008340469049966062966046933482889558522968608766320888992415400098160918462774944241165698138120287944704.00
| epoch   1 |    75/  201 batches | lr 20.00 | ms/batch 466.10 | loss 204.91 | ppl 98215617301798750701494364066854155627123860091474860654175577757268408917499737448382464.00
| epoch   1 |   100/  201 batches | lr 20.00 | ms/batch 500.85 | loss 128.17 | ppl 46160241108581485544008358540208759580292018606111195136.00
| epoch   1 |   125/  201 batches | lr 20.00 | ms/batch 451.11 | loss 191.61 | ppl 164618502592794908643882645840203344850985135870670475609275640181481542617080528896.00
| epoch   1 |   150/  201 batches | lr 20.00 | ms/batch 465.36 | loss 194.8

In [44]:
micmac_tanh = make_model(mic_corpus, "RNN_TANH", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

In [45]:
train(micmac_tanh, mic_corpus, 20, 40, 'data/micmac')

| epoch   1 |    25/  201 batches | lr 20.00 | ms/batch 736.38 | loss 111.85 | ppl 3749259989727947814465805423753716685849373966336.00
| epoch   1 |    50/  201 batches | lr 20.00 | ms/batch 379.20 | loss 109.49 | ppl 355308240430623094834143746127675079124047626240.00
| epoch   1 |    75/  201 batches | lr 20.00 | ms/batch 354.28 | loss 97.58 | ppl 2391027848102916344237111316278949342871552.00
| epoch   1 |   100/  201 batches | lr 20.00 | ms/batch 337.72 | loss 79.97 | ppl 53966953842370299752225249591033856.00
| epoch   1 |   125/  201 batches | lr 20.00 | ms/batch 308.39 | loss 68.88 | ppl 819209373263647827058472517632.00
| epoch   1 |   150/  201 batches | lr 20.00 | ms/batch 311.85 | loss 77.50 | ppl 4540143106259459597248125074407424.00
| epoch   1 |   175/  201 batches | lr 20.00 | ms/batch 330.11 | loss 70.23 | ppl 3174866259668741762463911903232.00
| epoch   1 |   200/  201 batches | lr 20.00 | ms/batch 299.50 | loss 70.67 | ppl 4934853283224804067800508268544.00
---------

In [46]:
micmac_relu = make_model(mic_corpus, "RNN_RELU", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=False)

In [47]:
train(micmac_relu, mic_corpus, 20, 40, 'data/micmac')

| epoch   1 |    25/  201 batches | lr 20.00 | ms/batch 194.02 | loss   nan | ppl      nan
| epoch   1 |    50/  201 batches | lr 20.00 | ms/batch 176.88 | loss   nan | ppl      nan
| epoch   1 |    75/  201 batches | lr 20.00 | ms/batch 186.48 | loss   nan | ppl      nan
| epoch   1 |   100/  201 batches | lr 20.00 | ms/batch 186.79 | loss   nan | ppl      nan
| epoch   1 |   125/  201 batches | lr 20.00 | ms/batch 190.20 | loss   nan | ppl      nan
| epoch   1 |   150/  201 batches | lr 20.00 | ms/batch 189.53 | loss   nan | ppl      nan
| epoch   1 |   175/  201 batches | lr 20.00 | ms/batch 186.91 | loss   nan | ppl      nan
| epoch   1 |   200/  201 batches | lr 20.00 | ms/batch 185.47 | loss   nan | ppl      nan
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 38.66s | valid loss   nan | valid ppl      nan
-----------------------------------------------------------------------------------------
| epoch   2 |    2

In [None]:
micmac_lstm3 = make_model(mic_corpus, "LSTM", emsize=200, nhidden=200, nlayers=2, 
                          dropout=0.2, tied=True)

train(micmac_lstm3, mic_corpus, 40, 40, 'data/micmac')

| epoch   1 |    25/  201 batches | lr 40.00 | ms/batch 2152.73 | loss 317.79 | ppl 1038166756649441953915467283222094431980084899390550954873741759396001371466203483903407732082507244125770191068871824168215391988942372864.00
| epoch   1 |    50/  201 batches | lr 40.00 | ms/batch 1612.45 | loss 377.06 | ppl 56816807207634645192958175526224980349597868443279527008882898665888270148358567135201255700123016188598379780827902296338789716303733266790351961173607273547694080.00
| epoch   1 |    75/  201 batches | lr 40.00 | ms/batch 1317.60 | loss 321.70 | ppl 51629002721751039823362344667283195154814866251889807350935131341218113024459360950042333338468887090901885476285920959861102318693086396416.00
| epoch   1 |   100/  201 batches | lr 40.00 | ms/batch 1607.88 | loss 322.86 | ppl 163833260741778198495883880061274661254083733417295731648395513607445875237491318597103361112419078524419065955951703179993792714225381015552.00
| epoch   1 |   125/  201 batches | lr 40.00 | ms/batch 1599.93 

In [None]:
!command pip install https://github.com/kpu/kenlm/archive/master.zippip install 


In [None]:
!command pip install nltk