<a href="https://colab.research.google.com/github/Jerryson520/NLP-Projects/blob/main/CharRNN_and_RandomDecoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import re

### Get and process the data
- This is *The Mysterious Island* by Jules Verne, as found on Project Gutenberg.

In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# opened from this notebook through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the whole book from the mounted Drive folder into one string.
book_path = '/content/drive/MyDrive/1268-0.txt'
with open(book_path, mode='r', encoding="utf8") as fp:
    text = fp.read()

# Number of characters in the raw file.
len(text)

1131478

In [None]:
## Reading and processing text
# The file is re-read here on purpose: `text` is trimmed in place further down,
# so starting from the untouched file keeps this cell safe to re-run.
# The path is the same mounted-Drive location the raw-load cell uses, so the
# cell also works on a fresh kernel (a bare '1268-0.txt' is not in the cwd).
START_MARKER = 'The Mysterious Island'
END_MARKER = 'END OF THE PROJECT GUTENBERG'

with open('/content/drive/MyDrive/1268-0.txt', 'r', encoding="utf8") as fp:
    text = fp.read()

# Index of the first occurrence of the title.
start_indx = text.find(START_MARKER)
if start_indx == -1:
    # str.find signals "not found" with -1, which would silently slice from
    # the last character; fail loudly instead.
    raise ValueError(f'Start marker {START_MARKER!r} not found in the text')
print(start_indx)

# Index of the last character of the last occurrence of the footer marker.
end_marker_pos = text.rfind(END_MARKER)
if end_marker_pos == -1:
    raise ValueError(f'End marker {END_MARKER!r} not found in the text')
end_indx = end_marker_pos + len(END_MARKER) - 1
print(end_indx)

# Keep only the text between the two markers (both ends inclusive).
text = text[start_indx:end_indx+1]
# Get the unique set of characters.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))
# NOTE: the reference values below do not match the recorded run of this cell
# (which printed 1113062 and 83), so the checks are left disabled.
# assert(len(text) == 1130711)
# assert(len(char_set) == 85)

32
1113093
Total Length: 1113062
Unique Characters: 83


### Tokenize and get other helpers
- We do this manually since everything is character based.

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars_sorted = sorted(char_set)

# Taken together, the two lookups below are the tokenizer.
# char -> integer id (a dict); ids follow the sorted order.
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
# integer id -> char (an np array, so it can be indexed with arrays of ids).
int2char = np.array(chars_sorted)

# Encode the entire corpus as an np.int32 array of ids.
text_encoded = np.array(list(map(char2int.__getitem__, text)), dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

# Round trip: encode the first 15 characters, decode the following 6 ids.
first_ids = text_encoded[:15]
next_ids = text_encoded[15:21]
print(text[:15], '     == Encoding ==> ', first_ids)
print(next_ids, ' == Reverse  ==> ', ''.join(int2char[next_ids]))

Text encoded shape:  (1113062,)
The Mysterious       == Encoding ==>  [45 60 57  1 38 77 71 72 57 70 61 67 73 71  1]
[34 71 64 53 66 56]  == Reverse  ==>  Island


#### Examples

In [None]:
# Tokenizer example: encode the first 15 characters of the corpus, then decode
# the 6 ids that follow them back into characters.
head_len = 15
tail = slice(head_len, 21)
print('Text encoded shape: ', text_encoded.shape)
print(text[:head_len], '     == Encoding ==> ', text_encoded[:head_len])
print(text_encoded[tail], ' == Reverse  ==> ', ''.join(int2char[text_encoded[tail]]))

Text encoded shape:  (1113062,)
The Mysterious       == Encoding ==>  [45 60 57  1 38 77 71 72 57 70 61 67 73 71  1]
[34 71 64 53 66 56]  == Reverse  ==>  Island


In [None]:
# assert(
#     np.array_equal(
#     text_encoded[:15],
#         [48, 36, 33, 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1]
#     )
# )

### Process the data and get the data loader

In [None]:
# Scratch check of list slicing: the stop index is exclusive, so this shows
# the first four elements.
a = [1, 24, 4, 5, 5, 5, 6]
a[:4]

[1, 24, 4, 5]

In [None]:
seq_length = 40
chunk_size = seq_length + 1

# Slide a window of chunk_size (= 41) ids over text_encoded, moving one
# position at a time. The result is a list of array slices; each chunk is
# later split into an (x, y) pair.
window_starts = range(len(text_encoded) - seq_length)
text_chunks = [text_encoded[start:start + chunk_size] for start in window_starts]

In [None]:
# Spot-check one chunk: it should hold chunk_size ids.
len(text_chunks[2])

41

In [None]:
class TextDataset(Dataset):
    """Dataset over a collection of chunks; each item is an (input, target) pair.

    For a chunk of n elements the input is its first n-1 elements and the
    target is its last n-1 elements, i.e. the input shifted by one position.
    """

    def __init__(self, text_chunks):
        # Any indexable, sized collection of sliceable chunks.
        self.text_chunks = text_chunks

    def __len__(self):
        # One item per chunk.
        return len(self.text_chunks)

    def __getitem__(self, idx):
        chunk = self.text_chunks[idx]
        inputs = chunk[:-1]   # everything except the last element
        targets = chunk[1:]   # everything except the first element
        return inputs, targets

# Stack the chunks into a single 2-D tensor (one row per chunk) and wrap it in
# the dataset; indexing a row then yields one (x, y) pair.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks))) # num_chunks * chunk_size

In [None]:
# Preview the first two (input, target) pairs, decoded back into text.
for i, (seq, target) in enumerate(seq_dataset):
    # Decode the ids of both halves; the target is the input shifted by one.
    decoded_input = ''.join(int2char[seq])
    decoded_target = ''.join(int2char[target])
    print(seq.shape, target.shape)
    print('Input (x):', repr(decoded_input))
    print('Target (y):', repr(decoded_target))
    print()
    # Stop once the second pair (index 1) has been shown.
    if i >= 1:
        break

torch.Size([40]) torch.Size([40])
Input (x): 'The Mysterious Island, by Jules Verne\n\nT'
Target (y): 'he Mysterious Island, by Jules Verne\n\nTh'

torch.Size([40]) torch.Size([40])
Input (x): 'he Mysterious Island, by Jules Verne\n\nTh'
Target (y): 'e Mysterious Island, by Jules Verne\n\nThi'



In [None]:
# Device that the model, the batches and the LSTM states are moved to with
# `.to(device)` elsewhere in the notebook; everything runs on the CPU.
device = torch.device("cpu")

In [None]:
batch_size = 64
# Seed the global RNG right before building the loader.
torch.manual_seed(1)
# Shuffled mini-batches of (x, y) pairs; a final incomplete batch is dropped.
seq_dl = DataLoader(
    dataset=seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

### Write the models

In [None]:
class RNN(nn.Module):
    """Character-level language model: embedding -> single-layer LSTM -> linear.

    Args:
        vocab_size: number of distinct token ids; also the size of the output
            logits for every time step.
        embed_dim: size of each token embedding (the LSTM input size).
        rnn_hidden_size: size of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        # Lookup table of vocab_size embeddings of size embed_dim.
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=None
        )
        self.rnn_hidden_size = rnn_hidden_size
        # LSTM over the embedded sequence; batch_first=True means inputs and
        # outputs are laid out as batch * seq_len * features.
        self.rnn = nn.LSTM(
            input_size = embed_dim,
            hidden_size = rnn_hidden_size,
            batch_first = True
        )

        # Projects every LSTM output to one logit per vocabulary entry.
        self.fc = nn.Linear(
            in_features = self.rnn_hidden_size,
            out_features = vocab_size)

    def forward(self, text, hidden=None, cell=None):
        """Return per-step logits and the final LSTM state.

        Args:
            text: integer token ids, batch_size * seq_len.
            hidden: optional initial hidden state, 1 * batch_size * rnn_hidden_size.
            cell: optional initial cell state, same shape as ``hidden``.
                Must be given whenever ``hidden`` is given.

        Returns:
            ``(logits, (hidden, cell))`` where logits is
            batch_size * seq_len * vocab_size.

        Raises:
            ValueError: if ``hidden`` is passed without ``cell``.
        """
        if hidden is not None and cell is None:
            # The LSTM needs both halves of its state; passing (hidden, None)
            # down would only fail deep inside the LSTM with an opaque error.
            raise ValueError('cell must be provided whenever hidden is provided')

        out = self.embedding(text) # batch_size * seq_len * embed_dim

        if hidden is not None:
            out, (hidden, cell) = self.rnn(out, (hidden, cell))
        else:
            # No state supplied: let the LSTM start from its default state.
            out, (hidden, cell) = self.rnn(out) # out: batch * seq_len * hidden_size

        out = self.fc(out)

        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, cell)`` states for ``batch_size`` sequences.

        Both tensors have shape 1 * batch_size * rnn_hidden_size and are
        created on the same device and with the same dtype as the model's
        weights, so the method does not depend on any notebook-level global.
        """
        reference = self.fc.weight
        hidden = torch.zeros(
            1, batch_size, self.rnn_hidden_size,
            device=reference.device, dtype=reference.dtype,
        )
        cell = torch.zeros_like(hidden)
        return hidden, cell

### Do this the right way - across all the data at once!

In [None]:
# Model hyper-parameters.
vocab_size = len(int2char)  # one output class per distinct character
embed_dim = 256
rnn_hidden_size = 512

# Seed right before construction, then build the model and move it to `device`.
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size).to(device)
# Display the module summary.
model

RNN(
  (embedding): Embedding(83, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Set to 10000.
num_epochs = 10000

torch.manual_seed(1)

# Training mode must be set explicitly: random_sample() puts the model in eval
# mode, so re-running this cell afterwards would otherwise train in eval mode.
model.train()

# "Epochs" here really mean single mini-batch updates.
# If the above takes too long, use 1000.

# Keep ONE iterator over the loader. Calling next(iter(seq_dl)) inside the
# loop would build a brand-new iterator - and, with shuffle=True, a new
# permutation of the whole dataset - on every single step.
batch_iter = iter(seq_dl)

for epoch in range(num_epochs):
    # Get the next batch, starting a new pass once the loader is exhausted.
    try:
        seq_batch, target_batch = next(batch_iter)
    except StopIteration:
        batch_iter = iter(seq_dl)
        seq_batch, target_batch = next(batch_iter)

    seq_batch = seq_batch.to(device) # batch_size * seq_len
    target_batch = target_batch.to(device)

    # Fresh zero state for every batch, sized from the batch itself.
    hidden, cell = model.init_hidden(seq_batch.size(0))

    optimizer.zero_grad()

    # Pass through the model.
    logits, _ = model(seq_batch, hidden, cell) # batch_size * seq_len * vocab_size

    # CrossEntropyLoss expects (N, classes) logits and (N,) integer targets, so
    # flatten the batch and time dimensions together.
    loss = criterion(
        logits.reshape(-1, logits.size(-1)),
        target_batch.reshape(-1).long(),
    )

    # Do back prop.
    loss.backward()
    optimizer.step()

    # Get the value in the tensor loss.
    loss = loss.item()

    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 1.8589
Epoch 10 loss: 1.9141
Epoch 20 loss: 1.8323
Epoch 30 loss: 1.7749
Epoch 40 loss: 1.7208
Epoch 50 loss: 1.7147
Epoch 60 loss: 1.6634
Epoch 70 loss: 1.6843
Epoch 80 loss: 1.5436
Epoch 90 loss: 1.5359
Epoch 100 loss: 1.5601
Epoch 110 loss: 1.5613
Epoch 120 loss: 1.5092
Epoch 130 loss: 1.5445
Epoch 140 loss: 1.4979
Epoch 150 loss: 1.5395
Epoch 160 loss: 1.5086
Epoch 170 loss: 1.5039
Epoch 180 loss: 1.4999
Epoch 190 loss: 1.4859
Epoch 200 loss: 1.4746
Epoch 210 loss: 1.4383
Epoch 220 loss: 1.4343
Epoch 230 loss: 1.4839
Epoch 240 loss: 1.4800
Epoch 250 loss: 1.4152
Epoch 260 loss: 1.4137
Epoch 270 loss: 1.4180
Epoch 280 loss: 1.4707
Epoch 290 loss: 1.4234
Epoch 300 loss: 1.3895
Epoch 310 loss: 1.3773
Epoch 320 loss: 1.4086
Epoch 330 loss: 1.4408
Epoch 340 loss: 1.3993
Epoch 350 loss: 1.4153
Epoch 360 loss: 1.3527
Epoch 370 loss: 1.3755
Epoch 380 loss: 1.3338
Epoch 390 loss: 1.3603
Epoch 400 loss: 1.3362
Epoch 410 loss: 1.3795
Epoch 420 loss: 1.3096
Epoch 430 loss: 1.3705

In [None]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)

logits = torch.tensor([[-1.0, 1.0, 3.0]])
# print(logits.shape)

# Get the probabilities for these logits. The softmax dimension is given
# explicitly (the class dimension is the last one): relying on the implicit
# choice of nn.Softmax() is deprecated and emits a warning. Computed once and
# reused below.
probs = torch.softmax(logits, dim=-1)
print('Probabilities:', probs)

# Get a Categorical random variable with the above probabilities for each of the classes.
m = Categorical(probs)
# Generate 10 things.
samples = m.sample((10,))

print(samples.numpy())

Probabilities: tensor([[0.0159, 0.1173, 0.8668]])
[[1]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [None]:
# Scratch check of NumPy slicing: `-1:` keeps the last column while preserving
# the second dimension, so the result is still 2-D.
a = np.array([
    [1, 2, 3],
    [3, 4, 5],
    [5, 6, 7],
])
a[:, -1:]

array([[3],
       [5],
       [7]])

### Random decoding.
- This compounds problems: once you make a mistake, you can't undo it.

In [None]:
def random_sample(
    model,
    starting_str,
    len_generated_text=500,
):
    """Generate text by sampling one character at a time from the model.

    The prompt is fed through the model to build up the LSTM state, then
    ``len_generated_text`` characters are drawn one after another, each from
    the categorical distribution given by the softmax of the model's logits
    and each fed back in as the next input.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and returning
            ``(logits, (hidden, cell))`` when called with ``(ids, hidden, cell)``.
        starting_str: non-empty prompt; every character must be a key of the
            notebook-level ``char2int`` map.
        len_generated_text: number of characters to generate after the prompt.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: if ``starting_str`` is empty.
        KeyError: if ``starting_str`` contains a character missing from ``char2int``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if not starting_str:
        # Without a prompt there is no "last character" to start sampling from.
        raise ValueError('starting_str must contain at least one character')

    # Work on whatever device the model lives on.
    model_device = next(model.parameters()).device

    # Encode the prompt with char2int and reshape to 1 * len(starting_str).
    encoded_input = torch.tensor(
        [char2int[s] for s in starting_str], device=model_device
    ).view(1, -1)

    # Pieces of the output; joined once at the end instead of repeatedly
    # concatenating strings.
    generated_chars = [starting_str]

    # Put model in eval mode. This matters if we had dropout or batch / layer norms.
    model.eval()

    hidden, cell = model.init_hidden(1) # 1 * batch_size * hidden_size
    hidden, cell = hidden.to(model_device), cell.to(model_device)

    # Pure inference: without no_grad every step would extend an autograd graph
    # through the carried-over hidden state, growing memory with each character.
    with torch.no_grad():
        # Build up the state from all prompt characters except the last one,
        # feeding them one by one and carrying hidden / cell forward.
        for c in range(encoded_input.size(1) - 1):
            _, (hidden, cell) = model(encoded_input[:, c].view(1, 1), hidden, cell)

        # The last prompt character is the first input of the generation loop.
        last_char = encoded_input[:, -1].view(1, 1) # 1 * 1

        for _ in range(len_generated_text):
            logits, (hidden, cell) = model(last_char, hidden, cell)

            # Drop the batch dimension: 1 * vocab_size.
            logits = torch.squeeze(logits, dim = 0)

            # Distribution over the vocabulary; softmax over the last (class)
            # dimension, stated explicitly to avoid the implicit-dim warning.
            m = Categorical(torch.softmax(logits, dim=-1))

            # Draw one id; shape 1 * 1, ready to be fed back into the model.
            last_char = m.sample((1,))

            # Decode the sampled id with int2char and keep it.
            generated_chars.append(int2char[last_char.item()])

    return ''.join(generated_chars)

# Seed right before sampling, make sure the model sits on `device`, then
# generate from a short prompt and show the result.
torch.manual_seed(1)
model = model.to(device)
sample_text = random_sample(model, starting_str='The island')
print(sample_text)

The island,
my forewhat game, as iron foliar raiser!

“You are there wase from an island, without the
planks. All hours in more was finished to the sould
have been energue to Granite, that the
from evenable sufficient
gliment he
did, that an one rocument. It has been seni.

Last Pencroft, he talking them back.”

“Consider 4

Cyrus
Harding, “I doubt nothing apmeriantly have explorations, among, the crusion; the jokers
on the poor for another
BOb Harding looking and for thoust could not an” confident, in th
