<a href="https://colab.research.google.com/github/Manu-Sanchez/Learning/blob/ai%2Fbasic-gan/NLP_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [None]:
# Read the entire Shakespeare corpus into one in-memory string.
with open("shakespeare.txt", mode="r", encoding="utf8") as file:
  text = file.read()

# Quick look: corpus length in characters, then the first 1000 characters.
print(len(text), text[:1000], sep="\n")

5445609

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own

In [None]:
# Unique characters of the corpus, sorted into a list so that every character
# gets the same index on every run. Iterating a bare set of strings can yield a
# different order in each interpreter session (string hash randomization), which
# would make weights saved in one session meaningless in the next.
all_characters = sorted(set(text))
print(len(all_characters))
print(all_characters)

84
{'_', '8', 'j', 'h', 'J', 'T', 'U', 'x', 'X', 'G', 'N', 'e', 'H', 'f', ')', 's', 'i', 'B', 'S', 'n', '&', "'", '5', 'F', 'O', '<', '[', 'A', 'V', ',', '?', 'q', 'z', 'g', '6', '"', 'D', 'C', 'K', 't', '9', ']', 'W', '0', '4', '1', '>', 'R', '.', '|', '`', ';', 'b', 'Q', 'P', 'c', 'l', 'p', '(', 'w', '!', 'Z', 'd', 'o', '}', 'L', ':', 'm', 'y', 'k', 'a', 'Y', 'I', '7', '\n', 'r', 'v', ' ', 'u', '-', 'M', '2', 'E', '3'}


In [None]:
# index -> character lookup table
decoder = {idx: char for idx, char in enumerate(all_characters)}
# character -> index lookup table (inverse of `decoder`)
encoder = {char: idx for idx, char in enumerate(all_characters)}

In [None]:
# Show both lookup tables, one per line.
print(decoder, encoder, sep="\n")

{0: '_', 1: '8', 2: 'j', 3: 'h', 4: 'J', 5: 'T', 6: 'U', 7: 'x', 8: 'X', 9: 'G', 10: 'N', 11: 'e', 12: 'H', 13: 'f', 14: ')', 15: 's', 16: 'i', 17: 'B', 18: 'S', 19: 'n', 20: '&', 21: "'", 22: '5', 23: 'F', 24: 'O', 25: '<', 26: '[', 27: 'A', 28: 'V', 29: ',', 30: '?', 31: 'q', 32: 'z', 33: 'g', 34: '6', 35: '"', 36: 'D', 37: 'C', 38: 'K', 39: 't', 40: '9', 41: ']', 42: 'W', 43: '0', 44: '4', 45: '1', 46: '>', 47: 'R', 48: '.', 49: '|', 50: '`', 51: ';', 52: 'b', 53: 'Q', 54: 'P', 55: 'c', 56: 'l', 57: 'p', 58: '(', 59: 'w', 60: '!', 61: 'Z', 62: 'd', 63: 'o', 64: '}', 65: 'L', 66: ':', 67: 'm', 68: 'y', 69: 'k', 70: 'a', 71: 'Y', 72: 'I', 73: '7', 74: '\n', 75: 'r', 76: 'v', 77: ' ', 78: 'u', 79: '-', 80: 'M', 81: '2', 82: 'E', 83: '3'}
{'_': 0, '8': 1, 'j': 2, 'h': 3, 'J': 4, 'T': 5, 'U': 6, 'x': 7, 'X': 8, 'G': 9, 'N': 10, 'e': 11, 'H': 12, 'f': 13, ')': 14, 's': 15, 'i': 16, 'B': 17, 'S': 18, 'n': 19, '&': 20, "'": 21, '5': 22, 'F': 23, 'O': 24, '<': 25, '[': 26, 'A': 27, 'V': 28, 

In [None]:
# Translate every character of the corpus into its integer id.
encoded_text = np.array(list(map(encoder.__getitem__, text)))
# Peek at the first 500 ids.
print(encoded_text[:500])

[74 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 45 74
 77 77 23 75 63 67 77 13 70 16 75 11 15 39 77 55 75 11 70 39 78 75 11 15
 77 59 11 77 62 11 15 16 75 11 77 16 19 55 75 11 70 15 11 29 74 77 77  5
  3 70 39 77 39  3 11 75 11 52 68 77 52 11 70 78 39 68 21 15 77 75 63 15
 11 77 67 16 33  3 39 77 19 11 76 11 75 77 62 16 11 29 74 77 77 17 78 39
 77 70 15 77 39  3 11 77 75 16 57 11 75 77 15  3 63 78 56 62 77 52 68 77
 39 16 67 11 77 62 11 55 11 70 15 11 29 74 77 77 12 16 15 77 39 11 19 62
 11 75 77  3 11 16 75 77 67 16 33  3 39 77 52 11 70 75 77  3 16 15 77 67
 11 67 63 75 68 66 74 77 77 17 78 39 77 39  3 63 78 77 55 63 19 39 75 70
 55 39 11 62 77 39 63 77 39  3 16 19 11 77 63 59 19 77 52 75 16 33  3 39
 77 11 68 11 15 29 74 77 77 23 11 11 62 21 15 39 77 39  3 68 77 56 16 33
  3 39 21 15 77 13 56 70 67 11 77 59 16 39  3 77 15 11 56 13 79 15 78 52
 15 39 70 19 39 16 70 56 77 13 78 11 56 29 74 77 77 80 70 69 16 19 33 77
 70 77 13 70 67 16 19 11 77 59  3 11 75 11 77 70 52

In [None]:

# Scratch check: np.arange(n) yields the indices 0..n-1; one_hot_encode uses the
# same call to address one row per character.
np.arange(3)

array([0, 1, 2])

In [None]:
# Total number of encoded characters in the corpus.
encoded_text.size

5445609

In [None]:
def one_hot_encode(encoded_text, num_different_items):
  """One-hot encode an array of integer class indices.

  Args:
    encoded_text: Integer array (or array-like) of any shape whose values lie
      in ``[0, num_different_items)``.
    num_different_items: Length of every one-hot vector (number of classes).

  Returns:
    A ``float32`` array of shape ``(*encoded_text.shape, num_different_items)``
    holding a single 1.0 per vector, at the position given by the index.
  """
  encoded_text = np.asarray(encoded_text)
  flat_indices = encoded_text.reshape(-1)

  # Allocate float32 directly: creating float64 zeros and then calling
  # .astype() keeps both copies alive for a moment, which matters for large inputs.
  one_hot_encoding = np.zeros((flat_indices.size, num_different_items), dtype=np.float32)

  # Fancy indexing: for row r, set the column named by flat_indices[r].
  one_hot_encoding[np.arange(flat_indices.size), flat_indices] = 1

  # Restore the input's shape, with the one-hot axis appended last.
  return one_hot_encoding.reshape((*encoded_text.shape, num_different_items))

# One-hot encode the whole corpus for inspection: one float32 row of
# len(all_characters) entries per character, so the result is large in memory.
one_hot_encoded_text = one_hot_encode(encoded_text, len(all_characters))
# Show the one-hot rows of the first five characters.
print(one_hot_encoded_text[0:5])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [None]:
# Number of one-hot rows in each of 10 equal parts (integer division drops the remainder).
x = len(one_hot_encoded_text)//10
x

544560

In [None]:
# Exploration: keep a multiple of 10 rows and reshape them into 10 long rows.
# NOTE(review): reshape((10, -1)) also flattens the one-hot axis, so each row is
# a run of concatenated one-hot vectors; the slice shows the first 40 values per row.
one_hot_encoded_text[:x*10].reshape((10, -1))[:10, :40]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
   

In [None]:
"""
Windowed Data

X -> [H, e, l, l, o, , W, o, r, l]
Y -> [e, l, l, o, , W, o, r, l, d]

We can see that the actual Y is X shifted 1 element
"""
def generate_batches(encoded_text, batch_size=10, seq_len=50):
  """Yield ``(x, y)`` windows for next-character prediction.

  The 1-D input is trimmed to a multiple of ``batch_size * seq_len`` and
  reshaped into ``batch_size`` parallel rows. Each yielded ``x`` is a
  ``(batch_size, seq_len)`` slice of those rows, and ``y`` is ``x`` shifted
  left by one position. The last column of ``y`` is the character that follows
  the window; for the final window it wraps around to the first column of the row.

  Args:
    encoded_text: 1-D NumPy array of integer character ids.
    batch_size: Number of parallel sequences per batch.
    seq_len: Number of characters per sequence.

  Yields:
    Tuples ``(x, y)`` of arrays with shape ``(batch_size, seq_len)``. Nothing
    is yielded when the input holds fewer than ``batch_size * seq_len`` items.

  Raises:
    ValueError: If ``batch_size`` or ``seq_len`` is not positive (raised when
      iteration starts, because this is a generator).
  """
  if batch_size <= 0 or seq_len <= 0:
    raise ValueError("batch_size and seq_len must be positive")

  # Characters consumed by one (batch_size, seq_len) window.
  char_per_batch = seq_len * batch_size

  # Number of complete windows the input can provide.
  num_batches_avail = len(encoded_text) // char_per_batch
  if num_batches_avail == 0:
    return

  # Drop the tail that does not fill a complete window, then lay the text out
  # as batch_size contiguous rows.
  encoded_text = encoded_text[:num_batches_avail * char_per_batch]
  encoded_text = encoded_text.reshape((batch_size, -1))
  row_len = encoded_text.shape[1]

  for n in range(0, row_len, seq_len):

    x = encoded_text[:, n:n+seq_len]
    y = np.zeros_like(x)

    # Targets are the inputs shifted one step to the left.
    y[:, :-1] = x[:, 1:]

    next_col = n + seq_len
    if next_col < row_len:
      y[:, -1] = encoded_text[:, next_col]
    else:
      # Final window: there is no following character, so wrap to the row start.
      y[:, -1] = encoded_text[:, 0]

    yield x, y

In [None]:
# Small generator (5 sequences of 5 characters each) to inspect the batch layout.
batch_generator = generate_batches(encoded_text, batch_size=5, seq_len=5)

In [None]:
# Pull the first batch and print each input row next to its one-step-shifted target row.
x, y = next(batch_generator)
for x_itm, y_itm in zip(x,y):
  print(f"X: {x_itm} Y: {y_itm}")

X: [74 77 77 77 77] Y: [77 77 77 77 77]
X: [ 5 82 36 77 37] Y: [82 36 77 37 24]
X: [56 56 21 62 51] Y: [56 21 62 51 74]
X: [65  5 12 27 61] Y: [ 5 12 27 61 27]
X: [77 77 74 77 77] Y: [77 74 77 77 77]


In [None]:
class CharModel(nn.Module):
  """Character-level LSTM language model.

  A stacked LSTM reads one-hot encoded characters; dropout and a linear layer
  then map every time step to one score per character.

  Args:
    all_chars: Iterable of the distinct characters of the vocabulary. Its
      iteration order defines the character <-> index mapping.
    num_hidden: Size of the LSTM hidden state.
    num_layers: Number of stacked LSTM layers.
    dropout: Dropout probability, used between LSTM layers and on the LSTM output.
    use_gpu: When True, ``hidden_state`` creates its tensors on the CUDA device.
  """

  def __init__(self, all_chars, num_hidden=256, num_layers=4, dropout=.5, use_gpu=False):
    super().__init__()

    self.dropout = dropout
    self.num_layers = num_layers
    self.num_hidden = num_hidden
    self.use_gpu = use_gpu

    self.all_chars = all_chars
    self.decoder = dict(enumerate(all_chars))
    # Invert this instance's own decoder. Reading a module-level `decoder`
    # here would tie the class to notebook state outside of it.
    self.encoder = {char: idx for idx, char in self.decoder.items()}

    # batch_first=True: inputs arrive as (batch, seq, feature).
    self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=dropout, batch_first=True)
    self.dropout_layer = nn.Dropout(dropout)
    self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

  def forward(self, x, hidden):
    """Run the network on a batch of one-hot sequences.

    Args:
      x: Tensor of shape (batch, seq, len(all_chars)).
      hidden: Tuple ``(h, c)`` of LSTM states, e.g. from ``hidden_state``.

    Returns:
      ``(scores, hidden)`` where ``scores`` has shape
      (batch * seq, len(all_chars)) and ``hidden`` is the updated LSTM state.
    """
    lstm_out, hidden = self.lstm(x, hidden)
    drop_out = self.dropout_layer(lstm_out)
    # Merge the batch and sequence axes so the linear layer scores every time step.
    drop_out = drop_out.contiguous().view(-1, self.num_hidden)
    final_out = self.fc_linear(drop_out)

    return final_out, hidden

  def hidden_state(self, batch_size):
    """Return a zero-initialised ``(h, c)`` tuple for ``batch_size`` sequences.

    Each tensor has shape (num_layers, batch_size, num_hidden) and is moved to
    the CUDA device when ``use_gpu`` is set.
    """
    shape = (self.num_layers, batch_size, self.num_hidden)
    hidden = (torch.zeros(shape), torch.zeros(shape))

    if self.use_gpu:
      hidden = tuple(state.cuda() for state in hidden)

    return hidden

In [None]:
# Build the network. The GPU is requested only when CUDA is actually available,
# so this cell also runs on CPU-only machines.
model = CharModel(all_chars=all_characters, num_hidden=512, num_layers=3, dropout=.5, use_gpu=torch.cuda.is_available())
model

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout_layer): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

In [None]:
# Count every scalar weight held by the model's parameter tensors.
total_params = sum(param.numel() for param in model.parameters())
total_params

5470292

In [None]:
# Adam optimizer over all model parameters, with a learning rate of 1e-3.
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
# Cross-entropy between the per-step character scores and the target character ids.
criterion = nn.CrossEntropyLoss()

In [None]:
# Hold out the tail of the corpus for validation; the split keeps text order.
train_percent = .9
train_size = int(len(encoded_text) * train_percent)
train_data, validation_data = encoded_text[:train_size], encoded_text[train_size:]

print(
    f"Training items: {len(train_data)}",
    f"Validation items {len(validation_data)}",
    sep="\n",
)

Training items: 4901048
Validation items 544561


In [None]:
# Training hyper-parameters.
epochs = 60
batch_size = 100

seq_len = 100
# Global step counter, incremented once per training batch.
tracker = 0
# Width of the one-hot vectors: largest character id + 1. ndarray.max() runs in
# NumPy, whereas the builtin max() would iterate over every element in Python.
num_char = int(encoded_text.max()) + 1

In [None]:
model.train()

if model.use_gpu:
  model.cuda()

for i in range(epochs):

  # Start every epoch from a zero LSTM state.
  hidden = model.hidden_state(batch_size)

  for x, y in generate_batches(train_data, batch_size, seq_len):
    tracker += 1

    x = one_hot_encode(x, num_char)
    inputs = torch.from_numpy(x)
    targets = torch.from_numpy(y)

    if model.use_gpu:
      inputs = inputs.cuda()
      targets = targets.cuda()

    # Carry the state values over from the previous batch, but cut the graph so
    # backpropagation stops at the batch boundary.
    hidden = tuple(state.detach() for state in hidden)
    model.zero_grad()

    lstm_out, hidden = model(inputs, hidden)
    loss = criterion(lstm_out, targets.reshape(batch_size*seq_len).long())

    loss.backward()
    # Clip the gradient norm to keep LSTM updates bounded.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
    optim.step()

    # Every 25 training steps, evaluate on the complete validation split.
    if tracker % 25 == 0:
      # Validation keeps its own state, starting from zeros, independent of the
      # training state.
      val_hidden = model.hidden_state(batch_size)
      val_losses = []

      model.eval()
      # No gradients are needed for evaluation.
      with torch.no_grad():
        # Separate names so the training batch variables are not overwritten.
        for val_x, val_y in generate_batches(validation_data, batch_size, seq_len):
          val_inputs = torch.from_numpy(one_hot_encode(val_x, num_char))
          val_targets = torch.from_numpy(val_y)

          if model.use_gpu:
            val_inputs = val_inputs.cuda()
            val_targets = val_targets.cuda()

          val_out, val_hidden = model(val_inputs, val_hidden)
          val_loss = criterion(val_out, val_targets.reshape(batch_size*seq_len).long())

          val_losses.append(val_loss.item())

      model.train()
      # Report the mean over all validation batches, not just the last one.
      print(f"Epoch {i} Step: {tracker} Val loss: {np.mean(val_losses)}")

Epoch 0 Step: 25 Val loss: 3.2068777084350586
Epoch 0 Step: 50 Val loss: 3.19425368309021
Epoch 0 Step: 75 Val loss: 3.1957879066467285
Epoch 0 Step: 100 Val loss: 3.181749105453491
Epoch 0 Step: 125 Val loss: 3.0859487056732178
Epoch 0 Step: 150 Val loss: 2.98827862739563
Epoch 0 Step: 175 Val loss: 2.8832807540893555
Epoch 0 Step: 200 Val loss: 2.7572391033172607
Epoch 0 Step: 225 Val loss: 2.683788537979126
Epoch 0 Step: 250 Val loss: 2.599677085876465
Epoch 0 Step: 275 Val loss: 2.478297233581543
Epoch 0 Step: 300 Val loss: 2.3588197231292725
Epoch 0 Step: 325 Val loss: 2.2933597564697266
Epoch 0 Step: 350 Val loss: 2.2380192279815674
Epoch 0 Step: 375 Val loss: 2.196803569793701
Epoch 0 Step: 400 Val loss: 2.1656386852264404
Epoch 0 Step: 425 Val loss: 2.1348018646240234
Epoch 0 Step: 450 Val loss: 2.1135993003845215
Epoch 0 Step: 475 Val loss: 2.0704338550567627
Epoch 1 Step: 500 Val loss: 2.049184799194336
Epoch 1 Step: 525 Val loss: 2.0268375873565674
Epoch 1 Step: 550 Val loss

In [None]:
# Persist only the learned weights (state_dict); the file name notes the hidden size and layer count.
model_name = "hidden512_layers3_shakes.net"
torch.save(model.state_dict(), model_name)

In [None]:
def predict_next_character(model, char, hidden=None, k=1):
  """Sample the character that follows ``char`` from the model's top-k predictions.

  Args:
    model: Object exposing ``encoder``, ``decoder``, ``all_chars``, ``use_gpu``,
      ``hidden_state(batch_size)`` and being callable as ``model(inputs, hidden)``.
    char: Single input character; must be a key of ``model.encoder``.
    hidden: LSTM state tuple to continue from. ``None`` starts from
      ``model.hidden_state(1)``.
    k: Number of most likely candidates to sample from.

  Returns:
    ``(next_char, hidden)``: the sampled character and the updated state.
  """
  if hidden is None:
    hidden = model.hidden_state(1)

  # One-hot input of shape (batch=1, seq=1, vocabulary size).
  char_index = torch.tensor([[model.encoder[char]]])
  inputs = F.one_hot(char_index, num_classes=len(model.all_chars)).float()

  if model.use_gpu:
    inputs = inputs.cuda()

  hidden = tuple(state.detach() for state in hidden)

  # Inference only: no autograd bookkeeping is needed.
  with torch.no_grad():
    lstm_out, hidden = model(inputs, hidden)
    probs = F.softmax(lstm_out, dim=1).cpu()

  top_probs, top_indices = probs.topk(k)

  # Keep both arrays 1-D. A 0-d array (what squeeze() gives for k=1) would be
  # read by np.random.choice as an integer population size instead of a list
  # of candidates.
  top_indices = top_indices.numpy().reshape(-1)
  top_probs = top_probs.numpy().reshape(-1).astype(np.float64)

  # Renormalise so the k kept probabilities sum to 1.
  top_probs = top_probs / top_probs.sum()
  chosen = np.random.choice(top_indices, p=top_probs)

  return model.decoder[int(chosen)], hidden


def generate_text(module, size, seed="The", k=1):
  """Generate text by repeatedly sampling the next character from ``module``.

  The model is first primed with every character of ``seed``. The prediction
  that follows the last seed character is appended, and then ``size`` further
  characters are sampled, each one conditioned on the previous output.

  Args:
    module: Trained character model (see ``predict_next_character`` for the
      attributes it must provide).
    size: Number of sampling steps performed after the seed has been consumed.
    seed: Non-empty prompt the text starts with.
    k: Number of top candidates to sample from at each step.

  Returns:
    The seed followed by the generated characters
    (``len(seed) + 1 + size`` characters in total).

  Raises:
    ValueError: If ``seed`` is empty.
  """
  if not seed:
    raise ValueError("seed must contain at least one character")

  # `use_gpu` is a boolean attribute, not a method.
  if module.use_gpu:
    module.cuda()

  else:
    module.cpu()

  module.eval()
  output_chars = list(seed)

  # Prime the hidden state with the seed; only the final prediction is kept.
  hidden = module.hidden_state(1)
  for char in seed:
    char, hidden = predict_next_character(module, char, hidden, k)

  output_chars.append(char)

  for _ in range(size):
    char, hidden = predict_next_character(module, output_chars[-1], hidden, k)
    output_chars.append(char)

  return ''.join(output_chars)

# Generate text from the seed "The" with size=1000, sampling among the 3 most likely characters at each step.
print(generate_text(model, 1000, seed="The", k=3))