<a href="https://colab.research.google.com/github/GustaveRw/NLP-Fellowship/blob/master/Character_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Character Level LSTM Model

In [None]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd

In [None]:
# Fetch the text dataset (a CSV corpus) from the GitHub repo.
# NOTE: the file is read over the network, so this cell needs internet access.

text_df = pd.read_csv('https://raw.githubusercontent.com/agent87/zindi/main/final_corpus.csv')

In [None]:
# Merge every entry of the 'rw' column (cast to str) into one
# space-separated string that serves as the training corpus.
text = ' '.join(text_df['rw'].astype('str').tolist())

In [None]:
# Preview the first 100 characters of the corpus.
text[:100]

'mbere mbere imana yaremye ijuru nisi isi itagira ishusho yariho ubusa busa umwijima hejuru yimuhenge'

# Tokenisation Stage

In [None]:
# Build the vocabulary: the unique characters that occur in the corpus.
# The characters are sorted so that the character/integer mapping is the
# same on every run; iterating a bare set of strings follows hash order,
# which changes between interpreter sessions and would make any saved
# encoding or trained model unusable after a restart.
chars = tuple(sorted(set(text)))
int2char = dict(enumerate(chars))  # integer id -> character
char2int = {character: index for index, character in int2char.items()}  # character -> integer id

# Encode the whole text as an array of integer ids, one per character.
encoded = np.array([char2int[ch] for ch in text])

In [None]:
# String form of every character id; only used for the printout below.
encoded_p = [str(char2int[symbol]) for symbol in text]

print(text[:100])  # first 100 characters of the raw text
print('------------------')
print(' '.join(encoded_p[:100]))  # ids of those same 100 characters
print('-------------------------')
print(char2int)  # the character -> id mapping

mbere mbere imana yaremye ijuru nisi isi itagira ishusho yariho ubusa busa umwijima hejuru yimuhenge
------------------
5 33 26 23 26 16 5 33 26 23 26 16 18 5 11 24 11 16 31 11 23 26 5 31 26 16 18 21 19 23 19 16 24 18 3 18 16 18 3 18 16 18 20 11 14 18 23 11 16 18 3 13 19 3 13 22 16 31 11 23 18 13 22 16 19 33 19 3 11 16 33 19 3 11 16 19 5 34 18 21 18 5 11 16 13 26 21 19 23 19 16 31 18 5 19 13 26 24 14 26
-------------------------
{'k': 0, 'v': 1, 'f': 2, 's': 3, 'ī': 4, 'm': 5, 'ā': 6, 'z': 7, 'd': 8, 'í': 9, 'ü': 10, 'a': 11, 'q': 12, 'h': 13, 'g': 14, 'ē': 15, ' ': 16, 'c': 17, 'i': 18, 'u': 19, 't': 20, 'j': 21, 'o': 22, 'r': 23, 'n': 24, 'ū': 25, 'e': 26, 'ˮ': 27, 'ú': 28, 'ō': 29, 'ñ': 30, 'y': 31, 'l': 32, 'b': 33, 'w': 34, 'x': 35, 'p': 36}


# One Hot Encoding the Data

In [None]:
def encode_char(char: str):
    """Return the one-hot vector for a single character.

    The vector has one slot per entry of the module-level ``char2int``
    mapping; the slot at ``char2int[char]`` is 1 and every other slot is 0.
    A character missing from ``char2int`` raises ``KeyError``.
    """
    one_hot_vector = np.zeros(len(char2int), dtype=int)
    one_hot_vector[char2int[char]] = 1
    return one_hot_vector

In [None]:
def encode_word(word: str):
    """Return the one-hot encoding of a word, one row per character.

    Each character is encoded with ``encode_char`` and the rows are
    stacked in order, so the result has ``len(word)`` rows.
    """
    return np.array([encode_char(letter) for letter in word])

In [None]:
# One-hot encode the corpus word by word (words are split on single spaces).
encodes = [encode_word(token) for token in text.split(' ')]

In [None]:
# Words have different lengths, so the per-word matrices cannot be stacked
# into one rectangular array. Ask for an object array explicitly: without
# dtype=object NumPy warns about the ragged input on older versions and
# raises ValueError on NumPy 1.24 and later.
one_hot = np.array(encodes, dtype=object)

  one_hot = np.array(encodes)


In [None]:
# Only the last expression is displayed by the notebook.
one_hot  # corpus level: one entry per word
one_hot[0]  # word level: one row per character of the first word
one_hot[0][0]  # character level: one-hot vector of the first character

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Creation of Batches

In [None]:
def get_batches(arr, batch_size, seq_length):
    """Yield (inputs, targets) mini-batches cut from an encoded array.

    arr: array of encoded characters; it is truncated to a whole number
        of batches and reshaped into ``batch_size`` rows.
    batch_size: number of rows (parallel sequences) in every batch.
    seq_length: number of encoded characters per row in every batch.

    Every yielded ``x`` has ``seq_length`` columns, and ``y`` is ``x``
    shifted one step to the left. The last column of ``y`` is the
    character that follows the window; for the final window, where no
    such character exists, it wraps around to column 0 of the same row.
    """
    chars_per_batch = batch_size * seq_length
    # Number of complete batches the data can fill.
    full_batches = len(arr) // chars_per_batch

    # Drop the tail that cannot fill a complete batch, then lay the
    # remaining characters out as batch_size rows.
    rows = arr[:full_batches * chars_per_batch].reshape((batch_size, -1))
    width = rows.shape[1]

    # Walk across the columns one window at a time.
    for start in range(0, width, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]
        # Column right after the window, or column 0 once we run off the edge.
        follow = rows[:, stop] if stop < width else rows[:, 0]
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        y[:, -1] = follow
        yield x, y

In [None]:
# Smoke-test the generator: batches of 20 rows with 50 characters each,
# then pull the first (inputs, targets) pair.
batches = get_batches(encoded, 20, 50)
x, y = next(batches)

In [None]:
# Encode the corpus as a plain Python list of integer ids, one per character.
encode_text = [char2int[symbol] for symbol in text]

In [None]:
# Id of the first character of the corpus.
encode_text[0]

5

In [None]:
def window(text, batch_size, seq_size):
    """Cut encoded text into consecutive windows plus their shifted targets.

    text: the encoded text (a sliceable sequence of character ids).
    batch_size: number of sequences (samples) in a batch; kept so existing
        calls keep working, but this function does not use it.
    seq_size: window size, i.e. how many characters each sequence holds.

    Returns ``(batch_x, batch_y)``. When at least one window fits, both
    are 2-D arrays with ``seq_size`` columns and ``batch_y[i]`` is
    ``batch_x[i]`` moved one character ahead in the text. When no
    complete window fits, both arrays are empty.

    Only windows whose input AND target are complete are kept. A trailing
    partial window would make the rows unequal in length, which NumPy
    cannot stack into a rectangular array (it warns on old versions and
    raises on current ones) and which breaks 2-D indexing of the result.
    """
    # A start is usable only if start + seq_size + 1 <= len(text), so the
    # target slice still has seq_size characters.
    starts = range(0, len(text) - seq_size, seq_size)
    batch_x = [np.array(text[start:start + seq_size]) for start in starts]
    batch_y = [np.array(text[start + 1:start + seq_size + 1]) for start in starts]
    return np.array(batch_x), np.array(batch_y)

In [None]:
# Split the encoded corpus into windows of 40 characters and their targets.
x, y = window(encode_text, 10, 40)

  return np.array(batch_x), np.array(batch_y)


In [None]:
# Inspect the first 10 input windows.
x[:10]

array([array([ 5, 33, 26, 23, 26, 16,  5, 33, 26, 23, 26, 16, 18,  5, 11, 24, 11,
              16, 31, 11, 23, 26,  5, 31, 26, 16, 18, 21, 19, 23, 19, 16, 24, 18,
               3, 18, 16, 18,  3, 18])                                           ,
       array([16, 18, 20, 11, 14, 18, 23, 11, 16, 18,  3, 13, 19,  3, 13, 22, 16,
              31, 11, 23, 18, 13, 22, 16, 19, 33, 19,  3, 11, 16, 33, 19,  3, 11,
              16, 19,  5, 34, 18, 21])                                           ,
       array([18,  5, 11, 16, 13, 26, 21, 19, 23, 19, 16, 31, 18,  5, 19, 13, 26,
              24, 14, 26, 23, 18, 16, 19,  5, 34, 19,  0, 11, 16, 34, 18,  5, 11,
              24, 11, 16, 31, 11, 14])                                           ,
       array([26, 24,  8, 11, 14, 26, 24,  8, 11, 14, 11, 16, 13, 26, 21, 19, 23,
              19, 16, 31, 11,  5, 11,  7, 18, 16, 18,  5, 11, 24, 11, 16, 18, 23,
              11,  1, 19, 14, 11, 16])                                           ,
       array

In [None]:
# Inspect target windows 1 through 10.
y[1:11]

['ere mbere ',
 're mbere i',
 'e mbere im',
 ' mbere ima',
 'mbere iman',
 'bere imana',
 'ere imana ',
 're imana y',
 'e imana ya',
 ' imana yar']

In [None]:
# Decode the first ten ids of the first target window, one character per line.
for char_id in y[0, :10]:
    print(int2char[char_id])

In [None]:
# Print the first 10 characters of the first 10 sequences, inputs then
# targets, so the one-step shift between x and y can be checked by eye.
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])