In [3]:
!pip3 install fasttext spacy nlpaug

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/10/61/2e01f1397ec533756c1d893c22d9d5ed3fce3a6e4af1976e0d86bb13ea97/fasttext-0.9.1.tar.gz (57kB)
[K     |████████████████████████████████| 61kB 2.9MB/s 
Collecting nlpaug
[?25l  Downloading https://files.pythonhosted.org/packages/6e/45/ce353d60920cabe773de35ee8dac0989659c055540fa50eb0f6ac774e6f0/nlpaug-0.0.10-py3-none-any.whl (83kB)
[K     |████████████████████████████████| 92kB 6.0MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.1-cp36-cp36m-linux_x86_64.whl size=2384956 sha256=e9386ae7853270d57b003a6c464404f229ea5e83cd62c42d668751815d31010b
  Stored in directory: /root/.cache/pip/wheels/9f/f0/04/caa82c912aee89ce76358ff954f3f0729b7577c8ff23a292e3
Successfully built fasttext
Installing collected packages: fasttext, nlpaug
Successfully installed fasttext-0.9.1 nlpaug-0.0.10


In [0]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import nlpaug.augmenter.char as nac
import fasttext
import spacy
import gensim
from gensim.models.wrappers import FastText

In [0]:
with open('big.txt', 'r') as f:
    text = f.read()
spacy_nlp = spacy.load('en_core_web_sm')
spacy_nlp.max_length = 2*len(text)
x = spacy_nlp(text, disable=['parser', 'tagger', 'ner'])
splitted = [token.text for token in x if not (token.text.isspace() or (token.text[0].isdigit() and token.text[-1].isdigit()))]


In [0]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

encoded = np.array([char2int[ch] for ch in text])

In [0]:
words = tuple(set(splitted))
int2word = dict(enumerate(words))
word2int = {ch: ii for ii, ch in int2word.items()}


In [0]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [0]:
def one_hot_embedding(word, n_labels):
  return np.sum(one_hot_encode(word[1:-1], n_labels), axis=0)

def get_encodes(arr, use_aug=False):
  if use_aug:
    aug_rr = nac.KeyboardAug(aug_char_min=0, aug_char_max=None, aug_char_p=0.4, aug_word_p=0.4, aug_word_min=0, aug_word_max=arr.size//3, special_char=False)
    augmented_data = aug_rr.augment(" ".join(arr.ravel().tolist())).split()
    arr = np.array(augmented_data).reshape(arr.shape)
  
  flat_arr = arr.ravel()
  splitted_encoded = np.array(list(map(lambda x: np.array([char2int[ch] for ch in x]), flat_arr)))

  first_char = list(map(lambda x: x[0], splitted_encoded))
  last_char = list(map(lambda x: x[-1], splitted_encoded))
  middle = list(map(lambda x: x, splitted_encoded))

  first_char_encoded = one_hot_encode(np.array(first_char),  len(chars))
  last_char_encoded = one_hot_encode(np.array(last_char),  len(chars))

  middle_encoded = np.vstack(list(map(lambda x: one_hot_embedding(x, len(chars)), middle)))
  encoded_seq = np.hstack([first_char_encoded, middle_encoded, last_char_encoded]).reshape((*arr.shape, 3*len(chars)))
  return encoded_seq


def get_int(x):
  return word2int[x]

In [0]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]    

  

    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        base = arr[:, n:n+seq_length]
        y = np.vectorize(get_int)(base)

        x = base.copy()
        x = get_encodes(x, use_aug=True)
      
        
        yield x, y

In [0]:
train_on_gpu = torch.cuda.is_available()

batches = get_batches(np.array(splitted), 30, 50)
x, y = next(batches)

In [12]:
print(y)

[[28980 16906 21886 ...   827 33650  2186]
 [ 5801 31902 21796 ... 27236  9113 12591]
 [31800  6934 21626 ... 21395  2255 33872]
 ...
 [32421 34205 25412 ... 13386  2477 35384]
 [ 2950 24874  4751 ... 18282  4751 19934]
 [ 7248 11241 25810 ... 31902  7840 34205]]


In [0]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=650, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        self.rnn = nn.LSTM(3*len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        # self.fc = nn.Linear(n_hidden, len(words))
        self.fc = nn.Linear(n_hidden, len(words))
        
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        r_output, hidden = self.rnn(x, hidden)
        
        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)
        
        out = out.contiguous().view(-1, self.n_hidden)
        ## TODO: put x through the fully-connected layer
        out = F.log_softmax(self.fc(out))
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''

        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [0]:

def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)

    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = None

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            # loss = criterion(output.double(), targets.view(seq_length*batch_size, -1))

            training_loss.append(loss.item())

            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                val_acc = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = None
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                    # val_loss = criterion(output.double(), targets.view(seq_length*batch_size, -1))
                    validation_loss.append(val_loss.item())

                    val_losses.append(val_loss.item())
                    ind = min(len(validation_loss), 10)
                    acc_output = output.cpu().detach().numpy().reshape((100, 50, -1)).astype("int32")
                    current_accuracy = np.equal(acc_output.argmax(2), y.cpu().numpy()).sum()

                    # for i in range(100):
                    #   for j in range(50):
                    #     if y[i][j].item() == output[i][j].sum():
                    #       current_accuracy +=1
                    
                    val_acc.append(current_accuracy/5000)
                    validation_accuracy.append(current_accuracy/5000)
                    

                    if min(validation_loss[-ind:]) != min(validation_loss): break
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)),
                      "Val Accuracy: {:.4f}".format(np.mean(val_acc)))

In [0]:
# define and print the net
n_hidden=650
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
batch_size = 100
seq_length = 50
n_epochs = 10 
training_loss = []
validation_loss = []
validation_accuracy = []



In [20]:
train(net, np.array(splitted), epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)



Epoch: 1/10... Step: 10... Loss: 7.2601... Val Loss: 7.6032 Val Accuracy: 0.0308
Epoch: 1/10... Step: 20... Loss: 7.2875... Val Loss: 7.6562 Val Accuracy: 0.0410
Epoch: 1/10... Step: 30... Loss: 7.2212... Val Loss: 7.5488 Val Accuracy: 0.0400
Epoch: 1/10... Step: 40... Loss: 7.1460... Val Loss: 7.4799 Val Accuracy: 0.0445
Epoch: 1/10... Step: 50... Loss: 7.1079... Val Loss: 7.4158 Val Accuracy: 0.0394
Epoch: 1/10... Step: 60... Loss: 7.1366... Val Loss: 7.3999 Val Accuracy: 0.0408


KeyboardInterrupt: ignored

In [1]:
import seaborn as sns
plott = sns.lineplot(list(range(len(validation_accuracy))), validation_accuracy, color="blue")
plott.legend(['Validation Accuracy per iteration'])

NameError: ignored

In [2]:
model

NameError: ignored

In [0]:
model_name = 'scrnn_rnn_classification_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
  
    torch.save(checkpoint, f)

In [0]:
def predict(net, word, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        x = np.array([word])
        x = get_encodes(x).reshape(1, 1, -1)
        inputs = torch.from_numpy(x)
        print(inputs.shape)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        return p, h

In [0]:
def sample(net, sentence='The', top_k=None):
    lst_res = []
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    z = spacy_nlp(sentence, disable=['parser', 'tagger', 'ner'])
    spl = [token.text for token in z if not (token.text.isspace() or (token.text[0].isdigit() and token.text[-1].isdigit()))]
    h = net.init_hidden(1)
    for word in spl:
        pred, h = predict(net, word, h, top_k=top_k)
        lst_res.append(int2word[np.argmax(pred).item()])

    
    return ' '.join(lst_res)

In [0]:
sample(net, "Aoccdrnig to a rscheearch at Cmabrigde Uinervtisy,it deosn't mttaer in waht oredr the ltteers in a wrodare, the olny iprmoetnt tihng is taht the frist and lsatltteer be at the rghit pclae. The rset can be a toatlmses and you can sitll raed it wouthit porbelm. Tihsis bcuseae the huamn mnid deos not raed ervey lteterby istlef, but the wrod as a wlohe.")

In [0]:
model_name = 'scRNN_5_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
  
    torch.save(checkpoint, f)