In [0]:
!pip3 install bpemb
!pip3 install nlpaug



In [0]:
import unicodedata
import string
import re
import random
import time
import datetime
import math
import socket
hostname = socket.gethostname()

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence#, masked_cross_entropy

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

In [0]:
from bpemb import BPEmb
import textwrap
import nlpaug.augmenter.char as nac

In [0]:
bpemb_en = BPEmb(lang="en", dim=50)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# open text file and read in data as `text`
with open('big.txt', 'r') as f:
    text = f.read()

In [0]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [0]:
def get_encodes(arr, seq_length, use_aug=False):
  if use_aug:
    aug_rr = nac.KeyboardAug(aug_char_min=0, aug_char_max=None, aug_char_p=0.4, aug_word_p=0.4, aug_word_min=0, aug_word_max=arr.size//3, special_char=False)
    
    augmented_data = list(map(lambda x: aug_rr.augment(x), arr.ravel()))
    arr = np.array(augmented_data).reshape(arr.shape)
  flat_arr = arr.ravel()

  def padded_encode(x):
    k = np.zeros((seq_length,))
    enc = np.array(bpemb_en.encode_ids(x))
    k[:enc.size] = enc
    return enc.size, k

  res_arr = np.empty((*flat_arr.shape, seq_length), dtype="int32")
  len_vec = np.empty( (arr.shape[0]))
  for i in range(flat_arr.size):

    res = padded_encode(flat_arr[i])  
    res_arr[i] = res[1]
    len_vec[i] = res[0]

  if not use_aug:
    res_arr = np.insert(res_arr, 0, 1, 1)
  len_vec, perm_idx = torch.from_numpy(len_vec).sort(0, descending=True)
  res_arr = res_arr[perm_idx]

  leng, res = len_vec, one_hot_encode(res_arr, 10000)

  leng += (1 if not use_aug else 0)
  leng, res = leng, torch.from_numpy(res)

  return leng, res

In [0]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    # total number of batches we can make
    n_batches = len(arr)//batch_size
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size]    

  

    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1]):
        # The features
        base = arr[:, n:n+1]
        # y = np.vectorize(get_int)(base)

        x = base.copy()
        # y = one_hot_encode(y, len(words))
        lengths_x, x = get_encodes(x, seq_length, use_aug=True)
        lengths_y, y = get_encodes(base, seq_length)
        
        yield lengths_x, x, lengths_y, y

In [0]:

arr = np.array(textwrap.wrap(text=text, width=5))

batches = get_batches(arr, 2, 5)

In [0]:
lengths_x, x, lengths_y, y = next(batches)
x.shape

torch.Size([2, 5, 10000])

### Hide


In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.gru = nn.GRU(input_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True, batch_first=True)
        
    def forward(self, input_seqs, input_lengths, hidden=None):
        # Note: we run this all at once (over multiple batches of multiple sequences)
        packed = torch.nn.utils.rnn.pack_padded_sequence(input_seqs, input_lengths, batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) # unpack (back to padded)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs
        return outputs, hidden

In [0]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len)) # B x S


        attn_energies = attn_energies.cuda()

        # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                print(hidden[:, b].shape, encoder_outputs[i, b].shape)
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))

        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies).unsqueeze(1)
    
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy

In [0]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, input_size, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(input_size, hidden_size, n_layers, dropout=dropout, batch_first=True)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):

        rnn_output, hidden = self.gru(input_seq, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        # print(rnn_output.shape, encoder_outputs.shape)
        # attn_weights = self.attn(rnn_output, encoder_outputs)
        # context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        # rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        # context = context.squeeze(1)       # B x S=1 x N -> B x N
        # concat_input = torch.cat((rnn_output, context), 1)
        # concat_output = F.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(rnn_output)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden#, attn_weights

In [0]:
seq_length = 1000
small_batch_size = 3
data = np.array(textwrap.wrap(text=text, width=seq_length))

batches = get_batches(data, small_batch_size, seq_length=seq_length)
input_lengths, input_batches, target_lengths, target_batches = next(batches)
for i in [input_lengths, input_batches, target_lengths, target_batches]:
  print(i.shape)
  i = i.cuda()

print('input_batches', input_batches.size()) # (max_len x batch_size)
print('target_batches', target_batches.size()) # (max_len x batch_size)

torch.Size([3])
torch.Size([3, 1000, 10000])
torch.Size([3])
torch.Size([3, 1001, 10000])
input_batches torch.Size([3, 1000, 10000])
target_batches torch.Size([3, 1001, 10000])


In [0]:
small_hidden_size = 1000
small_n_layers = 2

decoder_test = LuongAttnDecoderRNN('general', CLASSES, small_hidden_size, CLASSES, small_n_layers)

encoder_test = EncoderRNN(10000, small_hidden_size, small_n_layers)

encoder_test = encoder_test.cuda()
decoder_test = decoder_test.cuda()


In [0]:

encoder_outputs, encoder_hidden = encoder_test(input_batches.cuda(), input_lengths.cuda(), None)

print('encoder_outputs', encoder_outputs.size(), encoder_hidden.size()) # max_len x batch_size x hidden_size

encoder_outputs torch.Size([3, 288, 1000]) torch.Size([4, 3, 1000])


In [0]:
target_batches[:, 1:].argmax(2)

tensor([[1788,  146, 1988,  ...,    0,    0,    0],
        [  42, 9945,  555,  ...,    0,    0,    0],
        [   7, 1361,   51,  ...,    0,    0,    0]])

In [0]:
l = decoder_test(target_batches[:, :-1].cuda(), encoder_hidden[:decoder_test.n_layers], encoder_outputs)


In [0]:
criterion = nn.CrossEntropyLoss()

In [0]:
tar = target_batches[:, 1:].argmax(2)
tar = tar.view(tar.size(0)*tar.size(1))
cur = l[0].view(l[0].size(0)*l[0].size(1), -1)


In [0]:
criterion(l[0], target_batches[:, 1:].argmax(2))

ValueError: ignored

In [0]:
CLASSES = 10000
hidden_size = 600
decoder = LuongAttnDecoderRNN('general', CLASSES, hidden_size, CLASSES, n_layers=2)

encoder = EncoderRNN(CLASSES, hidden_size, n_layers=2)

sl = 80
data = np.array(textwrap.wrap(text=text, width=sl))

train(encoder, decoder, data, epochs=1, batch_size=100, seq_length=sl, hidden_size=hidden_size, lr=0.001, clip=5, val_frac=0.1, print_every=20)

Epoch: 1/1... Step: 20... Loss: 2.0075... Val Loss: 1.9093
Epoch: 1/1... Step: 40... Loss: 1.7845... Val Loss: 1.7611
Epoch: 1/1... Step: 60... Loss: 1.7087... Val Loss: 1.7186
Epoch: 1/1... Step: 80... Loss: 1.7535... Val Loss: 1.7425
Epoch: 1/1... Step: 100... Loss: 1.7237... Val Loss: 1.7370
Epoch: 1/1... Step: 120... Loss: 1.6854... Val Loss: 1.6971
Epoch: 1/1... Step: 140... Loss: 1.7274... Val Loss: 1.7642
Epoch: 1/1... Step: 160... Loss: 1.7060... Val Loss: 1.6879
Epoch: 1/1... Step: 180... Loss: 1.6150... Val Loss: 1.6796
Epoch: 1/1... Step: 200... Loss: 1.7107... Val Loss: 1.7205
Epoch: 1/1... Step: 220... Loss: 1.7046... Val Loss: 1.6846
Epoch: 1/1... Step: 240... Loss: 1.6657... Val Loss: 1.6753
Epoch: 1/1... Step: 260... Loss: 1.6639... Val Loss: 1.6716
Epoch: 1/1... Step: 280... Loss: 1.6692... Val Loss: 1.7321
Epoch: 1/1... Step: 300... Loss: 1.6813... Val Loss: 1.6743
Epoch: 1/1... Step: 320... Loss: 1.6739... Val Loss: 1.6897
Epoch: 1/1... Step: 340... Loss: 1.6542... V

In [0]:
def train(encoder, decoder, data, epochs=10, batch_size=30, seq_length=500, hidden_size=1000, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    encoder.train()
    decoder.train()
    
    opt1 = torch.optim.Adam(encoder.parameters(), lr=lr)
    opt2 = torch.optim.Adam(decoder.parameters(), lr=lr)

    criterion = nn.CrossEntropyLoss()


    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    decoder.cuda()
    encoder.cuda()
    
    counter = 0

    for e in range(epochs):
        # initialize hidden state
        h = None

        for lenx, x, leny, y in get_batches(data, batch_size=batch_size, seq_length=seq_length):
            counter += 1
                        # One-hot encode our data and make them Torch tensors
            lenx, leny = lenx.cuda(), leny.cuda()
            inputs, targets = x.cuda(), y.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = None

            # zero accumulated gradients
            encoder.zero_grad()
            decoder.zero_grad()
            

            
            # get the output from the model
            encoder_outputs, encoder_hidden = encoder(inputs, lenx, h)
            h_dec = encoder_outputs
            out, h_dec = decoder(targets[:, :-1], encoder_hidden[:decoder.n_layers], h_dec)
            tar = targets[:, 1:].argmax(2)
            tar = tar.view(tar.size(0)*tar.size(1))
            cur = out.view(out.size(0)*out.size(1), -1)
            loss = criterion(cur, tar)
            
            # calculate the loss and perform backprop
            # loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), clip)
            nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            opt2.step()
            opt1.step()
            
            
            # loss stats
            if counter % print_every == 0:
                    # Get validation loss
                val_h = None
                val_losses = []
                encoder.eval()
                decoder.eval()

                for lenx, x, leny, y in get_batches(val_data, batch_size=batch_size, seq_length=seq_length):
                        # One-hot encode our data and make them Torch tensors
                        
                        # Creating new variables for the hidden state, otherwise
                        # we'd backprop through the entire training history
                    val_h = None
                        
                    inputs, targets = x, y
                    inputs, targets = inputs.cuda(), targets.cuda()
                    lenx, leny = lenx.cuda(), leny.cuda()
                        

                    encoder_outputs, encoder_hidden = encoder(inputs, lenx, val_h)
                    h_dec = encoder_outputs
                    out, h_dec = decoder(targets[:, :-1], encoder_hidden[:decoder.n_layers], h_dec)
                    tar = targets[:, 1:].argmax(2)
                    tar = tar.view(tar.size(0)*tar.size(1))
                    cur = out.view(out.size(0)*out.size(1), -1)
                    val_loss = criterion(cur, tar)
              

                    val_losses.append(val_loss.item())
                    
                encoder.train() # reset to train mode after iterationg through validation data
                decoder.train()
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [0]:
def predict(net, word, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        x = np.array([word])
        x = get_encodes(x).reshape(1, 1, -1)
        inputs = torch.from_numpy(x)
        print(inputs.shape)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        return p, h

In [0]:
def sample(encoder, decoder, sentence='The', top_k=None):
    lst_res = []
    encoder.eval()
    decoder.eval() # eval mode
    
        
    h = None
    length, inputs = get_encodes(np.array(sentence[:100]), seq_length=80)
    
    inputs, length = inputs.cuda(), length.cuda()
    res, h = encoder(inputs, length, h)
    while True:
      h_dec = res
      zero_s = np.zeros((100, 1, 10000))
      zero_s[:, :, 0] = 1
      print( h[:decoder.n_layers].shape, h_dec.shape)
      out, h_dec = decoder(zero_s, h[:decoder.n_layers], h_dec)

      tar = inputs.argmax(1)
      tar = tar.view(tar.size(0)*tar.size(1))
      cur = out.view(out.size(0)*out.size(1), -1)
      val_loss = criterion(cur, tar)
      

In [0]:
sample(encoder, decoder, data)

torch.Size([2, 100, 600]) torch.Size([100, 28, 600])


TypeError: ignored