In [2]:
import numpy as np
import time

# I. Preprocessing the data

#### a. create the vocab

In [5]:
## Reading and processing text
with open('openpyxl.txt', 'r', encoding="utf8") as fp:
    text=fp.read()
    
char_set = set(text) # make a set to count the unique characters
print('There is a total of:', len(text), 'characters.')
print('Unique Characters:', len(char_set))

There is a total of: 834178 characters.
Unique Characters: 97


#### b. convert into a sequence of integers

In [7]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)} # contains our mapping
char_array = np.array(chars_sorted)

text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

print(f"Encoding <{text[:12]}> ==> {text_encoded[:12]}")
print(f"Decoding <{text_encoded[:12]}> ==> {''.join(char_array[i] for i in text_encoded[:12])}")

Encoding <def get_type> ==> [69 70 71  1 72 70 85 64 85 90 81 70]
Decoding <[69 70 71  1 72 70 85 64 85 90 81 70]> ==> def get_type


#### c.  divide text into chunks of equal length

In [9]:
seq_length = 40
chunk_size = seq_length + 1

# define sequence: sliding windows through the data
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded)-chunk_size+1)] 

## inspection:
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length] 
    
print('-'.join(str(el) for el in input_seq), ' -> ', target)
print(repr(''.join(char_array[input_seq])),  ' -> ', repr(''.join(char_array[target])))

69-70-71-1-72-70-85-64-85-90-81-70-9-85-13-1-87-66-77-86-70-10-27-0-1-1-1-1-74-71-1-74-84-74-79-84-85-66-79-68  ->  70
'def get_type(t, value):\n    if isinstanc'  ->  'e'


#### d. build data loader

In [11]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks
        
    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()
    
seq_dataset = TextDataset(torch.tensor(text_chunks)) # convert dataset into chunks

  seq_dataset = TextDataset(torch.tensor(text_chunks)) # convert dataset into chunks


In [12]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

 Input (x): 'def get_type(t, value):\n    if isinstanc'
Target (y): 'ef get_type(t, value):\n    if isinstance'

 Input (x): 'ef get_type(t, value):\n    if isinstance'
Target (y): 'f get_type(t, value):\n    if isinstance('



In [13]:
device = torch.device("cpu")

In [14]:
from torch.utils.data import DataLoader

batch_size = 64

torch.manual_seed(1)

# Return 2 matrices of 64 vectors each one of size 40. Input vector first 40 and output vector last 40 of a sequence of 41 characters.
# sequence -> def get
# input -> def ge
# output -> ef get
# d -> e
# de -> f
# ...
# def ge -> def get

seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# II. Build the model

In [16]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(97, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=97, bias=True)
)

# III. Train the Model

#### a. partailly train the model
Given a vector X. The output vector is of size 97 containing a probability associated with each character of the vocab.

In [19]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 501

torch.manual_seed(1)

for epoch in range(num_epochs):
    
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 

Epoch 0 loss: 4.5427
Epoch 500 loss: 0.9463


For each epoch we will go randomly through a batch of 64 sequences. Then we will go through each sequence (at once) passing sucessively the i-th character of the sequence. The embedding will make a good representation of each integer of the vocab.

In [21]:
seq_batch[1]

tensor([66, 77, 74, 72, 79, 78, 70, 79, 85, 16, 31,  0,  1,  1,  1,  1, 29, 67,
        80, 83, 69, 70, 83, 31,  0,  1,  1,  1,  1,  1,  1, 29, 77, 70, 71, 85,
        16, 31,  0,  1])

In [22]:
input_seq = "".join(str(char_array[j]) for j in seq_batch[1])
input_seq

'alignment/>\n    <border>\n      <left/>\n '

In [23]:
# we pass successively each character of the sequence
'-'.join(str(seq_batch[1, c].item()) for c in range(seq_length))

'66-77-74-72-79-78-70-79-85-16-31-0-1-1-1-1-29-67-80-83-69-70-83-31-0-1-1-1-1-1-1-29-77-70-71-85-16-31-0-1'

In [24]:
target_batch[1]

tensor([77, 74, 72, 79, 78, 70, 79, 85, 16, 31,  0,  1,  1,  1,  1, 29, 67, 80,
        83, 69, 70, 83, 31,  0,  1,  1,  1,  1,  1,  1, 29, 77, 70, 71, 85, 16,
        31,  0,  1,  1])

In [25]:
# prediction associated with the sequence '13-1-...34'
pred[0] # logit for each word in the vocab, which is the most probable

tensor([-1.1245,  0.0489, -2.6509, -0.5982, -2.5322, -2.9296, -6.3309, -6.1606,
         0.5573,  0.3158,  0.2778,  4.2142, -4.7120, -2.4285, -4.3358, -1.9903,
        -6.2573,  0.2355, -0.6779, -4.3647, -2.4662, -3.9219, -2.3750, -0.2091,
        -3.3720, -4.4933, -3.7280,  0.3501, -6.0850, -3.3400, -0.2872, -7.3120,
        -1.2690, -2.2167, -1.0912, -3.0538, -3.5304, -1.1028, -3.4811, -1.1156,
        -5.0963, -4.3124, -2.8485, -5.8801, -6.5585, -1.9057, -1.9401, -0.0290,
        -1.9691, -2.2149, -7.6845, -1.2522, -1.8178, -0.9566, -4.5071, -3.4762,
        -3.6400, -2.8625, -3.9558, -4.4462,  0.1113, -2.7414, -1.7847, -4.1110,
         1.7682, -4.0836,  1.5958, -1.7733,  0.8026,  0.2151,  4.7465,  0.6532,
        -0.6007, -2.3480,  1.6775, -5.3684,  3.8451, -0.8640,  2.3995,  2.5504,
         2.2107,  3.2204, -2.0138,  2.9113,  4.9218,  1.9882, -2.6164,  2.3710,
         1.4099,  3.6283, -0.1290, -2.3222, -0.4279, -4.3806, -4.7737, -6.2720,
        -6.5861], grad_fn=<SelectBackwar

In [26]:
i = pred[0].argmax().item()
val = pred[0][i]
print(f"Most probable item val={val:.3f}, index={i}.")

Most probable item val=4.922, index=84.


In [27]:
pred_char = char_array[i]
true_char = char_array[target_batch[1][-1]]
print('pred_char ->', pred_char, 'True_char ->', true_char)

pred_char -> s True_char ->  


In [28]:
# save the model (partial model)
torch.save(model.state_dict(), 'partial_model.pth')

#### b. save partially trained model

In [30]:
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model.load_state_dict(torch.load('partial_model.pth', weights_only=True))
model.eval()

RNN(
  (embedding): Embedding(97, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=97, bias=True)
)

In [31]:
def decode_seq(seq):
    print("-".join(str(el.item()) for el in seq))
    return "".join(char_array[j] for j in seq)

In [32]:
decode_seq(seq_batch[1]) # input seq

66-77-74-72-79-78-70-79-85-16-31-0-1-1-1-1-29-67-80-83-69-70-83-31-0-1-1-1-1-1-1-29-77-70-71-85-16-31-0-1


'alignment/>\n    <border>\n      <left/>\n '

In [33]:
decode_seq(target_batch[1]) # output seq

77-74-72-79-78-70-79-85-16-31-0-1-1-1-1-29-67-80-83-69-70-83-31-0-1-1-1-1-1-1-29-77-70-71-85-16-31-0-1-1


'lignment/>\n    <border>\n      <left/>\n  '

In [34]:
# last character prediction

In [35]:
pred, hidden, cell = model(seq_batch[:, -1], hidden, cell) 

In [36]:
pred[0]

tensor([ 0.1035, -0.7862, -1.4581,  0.4942, -0.9661, -1.0190, -6.3753, -5.4056,
         1.1921,  1.6248,  0.4791,  3.7483, -4.0674, -1.8715, -4.2060, -1.2852,
        -5.8493,  0.0992, -0.4466, -4.4085, -2.3427, -3.1154, -1.4382, -0.4925,
        -2.7121, -3.1308, -2.1250,  1.5444, -5.1638, -3.5885, -1.0242, -6.2112,
         1.8063, -1.1311, -0.2573, -2.0556, -2.1473, -0.4185, -2.9109,  0.1957,
        -5.0529, -3.2311, -2.8833, -5.0160, -5.1881, -2.1661, -0.7307,  0.4137,
        -2.3973, -1.0966, -6.8419, -0.3958,  0.0214, -0.8122, -4.5089, -2.2260,
        -1.8051, -2.4980, -2.5969, -3.6495,  1.9037, -1.1061, -1.5899, -2.0148,
         1.9990, -3.0049, -0.3741, -1.1033,  1.0346, -0.1056,  4.2064,  0.9622,
        -1.7771, -1.8486,  1.0415, -3.5869,  3.3563, -0.7338,  2.1549,  1.7562,
         0.6947,  3.2622, -1.6808,  1.7336,  5.0644,  0.7114, -2.5878,  1.9146,
         2.1550,  2.3643, -1.2161, -2.7538, -0.6194, -3.1247, -3.3311, -4.9865,
        -5.4289], grad_fn=<SelectBackwar

In [37]:
i = pred[0].argmax().item()
val = pred[0][i]
print(f"Most probable item val={val:.3f}, index={i}.")

Most probable item val=5.064, index=84.


In [38]:
pred_char = char_array[i]
true_char = char_array[target_batch[1][-1]]
print('pred_char ->', pred_char, 'True_char ->', true_char)

pred_char -> s True_char ->  


### c. fully train the model

In [40]:
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

# save the model (partial model)
torch.save(model.state_dict(), 'model.pth')

Epoch 0 loss: 4.5427
Epoch 500 loss: 0.9463
Epoch 1000 loss: 0.8106
Epoch 1500 loss: 0.8181
Epoch 2000 loss: 0.7479
Epoch 2500 loss: 0.7963
Epoch 3000 loss: 0.7100
Epoch 3500 loss: 0.6763
Epoch 4000 loss: 0.7382
Epoch 4500 loss: 0.6336
Epoch 5000 loss: 0.7140
Epoch 5500 loss: 0.7821
Epoch 6000 loss: 0.8590
Epoch 6500 loss: 0.7630
Epoch 7000 loss: 0.7782
Epoch 7500 loss: 0.7442
Epoch 8000 loss: 0.8220
Epoch 8500 loss: 0.8846
Epoch 9000 loss: 0.8854
Epoch 9500 loss: 0.8589
