# Text Character recognition using RNN with Pytorch


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
#imports

import torch
import torch as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
import numpy as np
import os

In [4]:
#Loading the data
with open('/content/drive/MyDrive/Colab Notebooks/Char_RNN/data/anna.txt','r')as f:
    text_data= f.read()

In [5]:
text_data[:150]

"Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverything was in confusion in the Oblonskys' house. The wi"

Tokenisation, by converting each character to and from intergers


In [6]:
chars=tuple(set(text_data))
int2char=dict(enumerate(chars))
char2int={ch:ii for ii, ch in int2char.items()}

#encode the text
encode_txt= np.array([char2int[ch] for ch in text_data])

In [7]:
encode_txt[:150]

array([18, 58, 41, 50, 48,  2, 21, 65, 23, 16, 16, 16, 59, 41, 50, 50, 71,
       65, 70, 41, 61, 35,  0, 35,  2, 80, 65, 41, 21,  2, 65, 41,  0,  0,
       65, 41,  0, 35, 73,  2, 34, 65,  2, 17,  2, 21, 71, 65, 28, 22, 58,
       41, 50, 50, 71, 65, 70, 41, 61, 35,  0, 71, 65, 35, 80, 65, 28, 22,
       58, 41, 50, 50, 71, 65, 35, 22, 65, 35, 48, 80, 65, 43, 24, 22, 16,
       24, 41, 71, 68, 16, 16, 60, 17,  2, 21, 71, 48, 58, 35, 22, 82, 65,
       24, 41, 80, 65, 35, 22, 65, 55, 43, 22, 70, 28, 80, 35, 43, 22, 65,
       35, 22, 65, 48, 58,  2, 65, 30, 32,  0, 43, 22, 80, 73, 71, 80, 14,
       65, 58, 43, 28, 80,  2, 68, 65, 77, 58,  2, 65, 24, 35])

In [8]:
#data preprocessing
def one_hot_encode(arr, n_labels):
   #initialise encoded array
    one_hot=np.zeros((arr.size, n_labels), dtype=np.float32)
    #fill arrays with zeros

    one_hot[np.arange(one_hot.shape[0]), arr.flatten()]=1.
    #reshaping array to original shape
    one_hot= one_hot.reshape((*arr.shape, n_labels))

    return one_hot

In [9]:
#check if one_hot_encode works as expected
test= np.array([[3,5,1]])
one_hot=one_hot_encode(test,8)
print(one_hot)


[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [10]:
#Creating a batch
#Creating a generator that returns batches of size= batch_size *seq_legth from arr
def batch_gen(arr, batch_size, seq_length):
    total_batch_size= batch_size*seq_length
    n_batches= len(arr)//total_batch_size

    #reserve only enough characters to make batches full
    arr=arr[:n_batches*total_batch_size]
    #reshape array
    arr= arr.reshape((batch_size, -1))
    for n in range(0,arr.shape[1], seq_length):
        x=arr[:,n:n+seq_length]

        y=np.zeros_like(x)

        try:
             y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y





In [11]:
#testing batch generator

batches=batch_gen(encode_txt, 8, 50)
x,y=next(batches)

In [12]:
#printing first 10 items in sequence

print('x\n', x[:10, :10])
print('y\n',y[:10,:10])

x
 [[18 58 41 50 48  2 21 65 23 16]
 [80 43 22 65 48 58 41 48 65 41]
 [ 2 22 79 65 43 21 65 41 65 70]
 [80 65 48 58  2 65 55 58 35  2]
 [65 80 41 24 65 58  2 21 65 48]
 [55 28 80 80 35 43 22 65 41 22]
 [65 31 22 22 41 65 58 41 79 65]
 [30 32  0 43 22 80 73 71 68 65]]
y
 [[58 41 50 48  2 21 65 23 16 16]
 [43 22 65 48 58 41 48 65 41 48]
 [22 79 65 43 21 65 41 65 70 43]
 [65 48 58  2 65 55 58 35  2 70]
 [80 41 24 65 58  2 21 65 48  2]
 [28 80 80 35 43 22 65 41 22 79]
 [31 22 22 41 65 58 41 79 65 80]
 [32  0 43 22 80 73 71 68 65 76]]


## Defining the neural network with pytorch


In [13]:
#checking for GPU
gpu_train= torch.cuda.is_available()

if(gpu_train):
    print(" Training on GPU")
else:
    print("No GPU available")

 Training on GPU


In [14]:
#Defining the RNN
 
class RNN(torch.nn.Module):
     n_hidden=256
     n_layers=2
     drop_prob=0.5
     lr=0.001
     def __init__(self, tokens, n_hidden,n_layers, drop_prob, lr):
            super().__init__()
            self.drop_prob=drop_prob
            self.n_layers=n_layers
            self.n_hidden=n_hidden
            self.lr=lr

         #creating dict for characters

            self.chars=tokens
            self.int2char=dict(enumerate(self.chars))
            self.char2int={ch: ii for ii, ch in self.int2char.items()}

         #defining the LSTM
            self.lstm=torch.nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)

         #dropout layer

            self.dropout=torch.nn.Dropout(drop_prob)

         #fully connected layer
            self.fc=torch.nn.Linear(n_hidden,len(self.chars))
            
     def forward(self, x, hidden):
                 #generate output and new hidden unit
                r_output, hidden=self.lstm(x, hidden)

                #send output via drop out layer

                out=self.dropout(r_output)

                    #Stacking up LSTM

                out=out.contiguous().view(-1, self.n_hidden)

        #pass output through fully connected layer

                out=self.fc(out)
                return out, hidden
     def init_hidden(self, batch_size):
        weight=next(self.parameters()).data

        if(gpu_train):
            hidden=(weight.new(self.n_layers, batch_size,self.n_hidden).zero_().cuda(),
            weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden=(weight.new(self.n_layers, batch_size,self.n_hidden).zero_(),
            weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        return hidden






    

In [15]:

#Defining the training loop

def trainer(network, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, val_frac=0.1, print_every=10):
    network.train()
    optimiser= torch.optim.Adam(network.parameters(),lr=0.001)
    criterion= torch.nn.CrossEntropyLoss()
    
    #generating training and valid set
    
    val_idx= int(len(data)*(1-val_frac))
    data,val_data=data[:val_idx], data[val_idx:]
    
    if (gpu_train):
      
        network.cuda()
        counter=0
        n_chars=len(network.chars)
    for e in range(epochs):
        h=net.init_hidden(batch_size)
        for x, y in batch_gen(data, batch_size, seq_length):
            counter+=1
            
            x=one_hot_encode(x, n_chars)
            inputs, goal= torch.from_numpy(x), torch.from_numpy(y)
            if (gpu_train):
                inputs,goal= inputs.cuda(), goal.cuda()
                #new variable for hidden state or else backprop through whole channel
               
                h=tuple([each.data for each in h])
                
                #accumulating zero grads
                
                network.zero_grad()
                
                #generate model output
                
                output, h= network(inputs, h)
                
                #determine loss and do backprop
                loss=criterion(output, goal.view(batch_size*seq_length).long())
                
                loss.backward()
                
                #avoid exploding gradient by clipping
                clip=1.0
                torch.nn.utils.clip_grad_norm(network.parameters(), clip)
                optimiser.step()
                
                #loss stats
                
                if counter %print_every==0:
                    val_h= network.init_hidden(batch_size)
                    
                    val_losses=[]
                    network.eval()
                    
                    for x, y in batch_gen(val_data, batch_size, seq_length):
                        x=one_hot_encode(x, n_chars)
                        x,y=torch.from_numpy(x), torch.from_numpy(y)
                        
                        #new val for val hidden state and backprop through whole channe
                        val_h= tuple([each.data for each in val_h])
                        
                        inputs, goal=x,y
                        if(gpu_train):
                            inputs, goal=inputs.cuda(), goal.cuda()
                            
                        output, val_h= network(inputs, val_h)
                        val_loss= criterion(output, goal.view(batch_size*seq_length).long())
                        val_losses.append(val_loss.item())
                    network.train()
                    print("Epoch: {}/{}...".format(e+1, epochs),
                          "Steps: {}...".format(counter),
                          "Loss: {:.4f}...".format(loss.item()),
                          "val_loss:  {:4f}".format(np.mean(val_losses)))



                    

In [16]:
#instantiating the model

n_hidden= 512
n_layers= 2
lr=0.001
drop_prob=0.5
net=RNN(chars,n_hidden, n_layers, drop_prob, lr)

print(net)

RNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [18]:
batch_size= 128
seq_length=100
n_epochs=50

#train the model
trainer(net, encode_txt, epochs=n_epochs, seq_length=seq_length, batch_size=batch_size)



Epoch: 1/50... Steps: 10... Loss: 2.2276... val_loss:  2.193624
Epoch: 1/50... Steps: 20... Loss: 2.1783... val_loss:  2.155711
Epoch: 1/50... Steps: 30... Loss: 2.1710... val_loss:  2.129650
Epoch: 1/50... Steps: 40... Loss: 2.1298... val_loss:  2.110672
Epoch: 1/50... Steps: 50... Loss: 2.1504... val_loss:  2.092978
Epoch: 1/50... Steps: 60... Loss: 2.0757... val_loss:  2.078940
Epoch: 1/50... Steps: 70... Loss: 2.0830... val_loss:  2.064860
Epoch: 1/50... Steps: 80... Loss: 2.0558... val_loss:  2.048202
Epoch: 1/50... Steps: 90... Loss: 2.0725... val_loss:  2.030150
Epoch: 1/50... Steps: 100... Loss: 2.0366... val_loss:  2.015803
Epoch: 1/50... Steps: 110... Loss: 2.0219... val_loss:  1.999008
Epoch: 1/50... Steps: 120... Loss: 1.9669... val_loss:  1.978324
Epoch: 1/50... Steps: 130... Loss: 2.0084... val_loss:  1.966215
Epoch: 2/50... Steps: 140... Loss: 2.0020... val_loss:  1.949462
Epoch: 2/50... Steps: 150... Loss: 1.9762... val_loss:  1.939422
Epoch: 2/50... Steps: 160... Loss:

In [32]:
#Saving the model
net=RNN(chars,n_hidden, n_layers, drop_prob, lr)
ckpt_path='/content/drive/MyDrive/Colab Notebooks/Char_RNN'
name='RNN_20_epoch.net'
epoch=20
path= os.path.join(ckpt_path, 'net_{}_{}.pth'.format(name, epoch))
torch.save(net.state_dict(), path)

In [31]:
#saving the model

model_name='rnn_20_epoch.net'

checkpoint={'n_hidden': net.n_hidden,
            'n_layers': net.n_layers,
            'state_dict': net.state_dict(),
            'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

In [33]:
def predict(net, char, h=None, top_k=None):
    x=np.array([[net.char2int[char]]])
    y=one_hot_encode(x, len(net.chars))
    inputs= torch.from_numpy(x)
    
    if (gpu_train):
        inputs=inputs.cuda()
        
        h=tuple ([each.data for each in h])
        output, h= net(inputs, h)

        p=F.softmax(output, dim=1).data
        
        #get top K characters
        if (gpu_train):
          p=p.cpu()

        
        if top_k is None:
            top_ch= np.arange(len(net.chars))
            
        else:
            p, top_ch=p.topk(top_k)
            top_ch= top_ch.numpy().squeeze()
        p=p.numpy().squeeze()
        
        char=np.random.choice(top_ch, p=p/p.sum())
        
        return net.int2char[char], h
        

In [34]:
#Printing and generating text by the neural network

def sample(net, size, prime='The', top_k=None):
    if(gpu_train):
        net.cuda()
    else:
        net.cpu()
    net.eval()
    
    chars=[ch for ch in prime]
    
    h=net.init_hidden(1)
    for ch in prime:
        char, h= predict(net, ch,h,top_k=top_k)
    chars.append(char)
    
    for ii in range(size):
        char, h= predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)
    return ''.join(chars)

In [41]:
#Loading the model and using it to generate a sample text

with open('/content/drive/MyDrive/Colab Notebooks/Char_RNN/net_RNN_20_epoch.net_20.pth', 'rb') as f:
  model=torch.load(f)

loaded_model=RNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'],drop_prob=0.5, lr=0.001 )
loaded_model.load_state_dict(checkpoint['state_dict'])

print(sample(net, 1000, prime='Levine Said', top_k=5))

RuntimeError: ignored

In [42]:
print(sample(net,1000,prime='Anna', top_k= 5))

RuntimeError: ignored