# Generating Bible

We are going to create two generating models with LSTM: at character and word level.

In [1]:
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.distributions.categorical import Categorical

### Define classes for datasets and models

In [2]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        # Get the text chunk at index idx.
        text_chunk = self.text_chunks[idx]
        # Return (x, y) where x has length 40 and y has length 40.
        # y should be x shifted by 1 time.
        return (text_chunk[:-1], text_chunk[1:])

In [3]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        # Set to an embedding layer of vocab_size by embed_dim.
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim
        ) 
        self.rnn_hidden_size = rnn_hidden_size
        
        # LSTM layer: recurrent network with long memory
        self.rnn = nn.LSTM(
            input_size=embed_dim,
            hidden_size=rnn_hidden_size,
            batch_first=True
        )
        
        # Make a linear layer from rnn_hidden_size to vocab_size.
        # This will be used to get the yt for each xt.
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, text, hidden=None, cell=None):
        # Get the embeddings for text.
        out = self.embedding(text)
        
        # Pass out, hidden and cell through the rnn.
        # If hidden is None, don't specify it and just use out.
        if hidden is not None:
            out, (hidden, cell) = self.rnn(out, (hidden, cell))
        else:
            out, (hidden, cell) = out, (None, None)
        
        # Pass out through fc.
        out = self.fc(out)
        
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        # Initialize to zeros of 1 by batch_size. Maybe Xavier initialization?
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)

## I At character level

### Get the data and process

In [4]:
with open('bible.txt', 'r', encoding="utf8") as fp:
    text=fp.read()
    
text[:250]

'1:1 In the beginning God created the heaven and the earth.\n\n1:2 And the earth was without form, and void; and darkness was upon\nthe face of the deep. And the Spirit of God moved upon the face of the\nwaters.\n\n1:3 And God said, Let there be light: and '

In [5]:
# Get the unique set of characters.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 4332453
Unique Characters: 74


### Tokenize and get other helpers

In [6]:
# The universe of characters.
chars_sorted = sorted(char_set)

# Effectively, these maps are the tokenizer.
# Map each char to a unique int. This is a dict.
char2int = {char: i for i, char in enumerate(chars_sorted)}
# Do the revverse of the above, this should be a np array.
int2char = np.array(chars_sorted)

# Tokenize the entire corpus. This should be an np array of np.int32 type.
text_encoded = np.array([char2int[c] for c in text])

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:30], ' == Reverse  ==> ', ''.join(int2char[text_encoded[15:30]]))

Text encoded shape:  (4332453,)
1:1 In the begi      == Encoding ==>  [11 20 11  1 31 61  1 67 55 52  1 49 52 54 56]
[61 61 56 61 54  1 29 62 51  1 50 65 52 48 67]  == Reverse  ==>  nning God creat


### Process the data and get the data loader

In [7]:
seq_length = 40
chunk_size = seq_length + 1

# Break up the data into chunks of size 41. This should be a list of lists.
# Use text_encoded. This will be used to get (x, y) pairs.
text_chunks = np.array([list(text_encoded[i:i+chunk_size]) for i in range(len(text_encoded)-chunk_size)])

seq_dataset = TextDataset(torch.tensor(text_chunks))

In [8]:
# y is x shifted by 1
for i, (seq, target) in enumerate(seq_dataset):
    # 40 characters for source and target ...
    print(seq.shape, target.shape)
    print('Input (x):', repr(''.join(int2char[seq])))
    print('Target (y):', repr(''.join(int2char[target])))
    print()
    if i == 1:
        break 

torch.Size([40]) torch.Size([40])
Input (x): '1:1 In the beginning God created the hea'
Target (y): ':1 In the beginning God created the heav'

torch.Size([40]) torch.Size([40])
Input (x): ':1 In the beginning God created the heav'
Target (y): '1 In the beginning God created the heave'



In [9]:
device = torch.device("cpu")
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Instantiate the model

In [10]:
vocab_size = len(int2char)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)

char_model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
char_model = char_model.to(device)
char_model

RNN(
  (embedding): Embedding(74, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=74, bias=True)
)

### Training phase

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(char_model.parameters(), lr=0.005)

# Set to 1000.
num_epochs = 1000

# epochs here will mean batches.

for epoch in range(num_epochs):
    hidden, cell = char_model.init_hidden(batch_size)
    
    # Get the next batch from seq_dl
    seq_batch, target_batch = next(iter(seq_dl))
    
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    
    optimizer.zero_grad()
    
    loss = 0

    # Pass through the model.
    logits, _ = char_model(seq_batch, hidden=hidden, cell=cell)
    
    # Get the loss.
    # You'll need to reshape / view things to make this work.
    #target_ = torch.Tensor([[[1 if k==target_batch[i, j] else 0 for k in range(vocab_size)] for j in range(seq_length)] for i in range(batch_size)])
    loss += criterion(input=logits.transpose(1,2), target=target_batch.long())   
    
    # Do back prop.
    loss.backward()
    
    # Do an optimization step.
    optimizer.step()
    
    # Get the value in the tensor loss.
    loss = loss.item()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 4.3080
Epoch 100 loss: 1.6483
Epoch 200 loss: 1.4469
Epoch 300 loss: 1.4253
Epoch 400 loss: 1.3399
Epoch 500 loss: 1.2940
Epoch 600 loss: 1.2677
Epoch 700 loss: 1.3161
Epoch 800 loss: 1.2719
Epoch 900 loss: 1.2610


### Random decoding : Sample each character according to the proba distribution given by the model

In [12]:
def random_sample_char(
    model,
    starting_str, 
    len_generated_text=500, 
):

    # Encode starting string into a tensor using char2str.
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    
    # Reshape to be 1 by ??? - let PyTorch figure this out.
    encoded_input = encoded_input.reshape(1, len(starting_str))

    # This will be what you generate, but it starts off with something.
    generated_str = starting_str

    # Put model in eval mode. This matters if we had dropout o batch / layer norms.
    model.eval()
    
    hidden, cell = model.init_hidden(1)
    
    hidden = hidden.to(device)
    cell = cell.to(device)
        
    # Build up the starting hidden and cell states.
    # You can do this all in one go?
    for c in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state.
        out = encoded_input[0, c].reshape(1,1)
        # Pass out through, note we update hidden and cell and use them again
        _, (hidden, cell) = model(out, hidden=hidden, cell=cell)
        
    
    # Gte the last char; note we did not do go to the last char above.
    last_char = starting_str[-1]
    # Generate chars one at a time, add them to generated_str.
    # Do this over and over until you get the desired length.
    for i in range(len_generated_text):
        
        # Use hidden and cell from the above.
        # Use last_char, which will be updated over and over.
        logits, (hidden, cell) = model(torch.as_tensor([[char2int[last_char]]]), hidden=hidden, cell=cell) 
        
        # Get the logits.
        logits = logits[0][0]
        
        # m is a random variable with probabilities based on the softmax of the logits.
        m = Categorical(nn.functional.softmax(logits))
        
        # Generate from m 1 char.
        last_char_index = m.sample().item()
        last_char = int2char[last_char_index]
        
        # Add the geenrated char to generated_str, but pass it through int2str so that 
        generated_str += last_char
        
    return generated_str

char_model.to(device)
print(random_sample_char(char_model, starting_str='23:4 And the Lord said unto Moses', len_generated_text=1000))

  m = Categorical(nn.functional.softmax(logits))


23:4 And the Lord said unto Moses, My servants idol, and Hamah,
underadgeth, and in
Jerusaleaz.

9:14 He done the cannot shall be with ointment, and Hozor and said, Go of
NebrethKseph and Aaron, and two life is
ne, Go yet heart.

38:37 And the children of Isroah,
and tell doctcrine beproys the priests:
therewing, nor mitting of silves are trulf was divest tree? which shut on his
fasters: 30:8 She into Egypt,: the name
on his time is iniquity.

9:32 And it did not the manness with
the ground against her.

44:20 And the I have given them and I will do it live; changed I am the unterices and walking over the
Phechehioh, and have build, he was rise to the
LORD; and understand het an oil, les spake eleven and hations men, she shall she
that man laughters, and come for anger hand, and
the people, and wilt passeth to this
prey and all the names, against the hosseh, O gland cutcity acceptanting her.

47:23 And the law is a mother waiteth, and and
full be bullock, nor habitation; who disple to


## II At word level

### Get data and preprocess

In [13]:
with open('bible.txt', 'r', encoding="utf8") as fp:
    text=fp.read()
    
# remove \n and \r and lower case
text = text.replace('\r', '').replace('\n', ' ').lower()

# remover the paragaphs numbers
import re
pattern = r'[0-9]+:[0-9]+'
text = re.sub(pattern, '', text)[1:].replace('   ', ' ').replace('  ', ' ')

# remove punctutation
text = re.sub(r'[^\w\s]', '', text)
text[:250]

'in the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and god said let there be light and there was light and g'

In [14]:
# Get the unique set of words.
text = text.split()
word_set = set(text)
print('Total Length:', len(text))
print('Unique Words:', len(word_set))

Total Length: 790010
Unique Words: 12697


### Tokenize and get helpers

In [15]:
# The universe of words.
words_sorted = sorted(word_set)

# Effectively, these maps are the tokenizer.
# Map each char to a unique int. This is a dict.
w2int = {w: i for i, w in enumerate(words_sorted)}
# Do the revverse of the above, this should be a np array.
int2w = np.array(words_sorted)

# Tokenize the entire corpus. This should be an np array of np.int32 type.
text_encoded = np.array([w2int[w] for w in text])

print('Text encoded shape: ', text_encoded.shape)

print(' '.join(text[:5]), '     == Encoding ==> ', text_encoded[:5])
print(text_encoded[5:10], ' == Reverse  ==> ', ' '.join(int2w[text_encoded[5:10]]))

Text encoded shape:  (790010,)
in the beginning god created      == Encoding ==>  [ 5723 11307  1205  4787  2657]
[11307  5258   532 11307  3439]  == Reverse  ==>  the heaven and the earth


### Process the data and get dataloader

In [16]:
seq_length = 40
chunk_size = seq_length + 1

# Break up the data into chunks of size 41. This should be a list of lists.
# Use text_encoded. This will be used to get (x, y) pairs.
text_chunks = np.array([list(text_encoded[i:i+chunk_size]) for i in range(len(text_encoded)-chunk_size)])

seq_dataset = TextDataset(torch.tensor(text_chunks))

In [17]:
for i, (seq, target) in enumerate(seq_dataset):
    # 40 characters for source and target ...
    print(seq.shape, target.shape)
    print('Input (x):', repr(' '.join(int2w[seq])))
    print('Target (y):', repr(' '.join(int2w[target])))
    print()
    if i == 1:
        break 

torch.Size([40]) torch.Size([40])
Input (x): 'in the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and'
Target (y): 'the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and god'

torch.Size([40]) torch.Size([40])
Input (x): 'the beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and god'
Target (y): 'beginning god created the heaven and the earth and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters and god said'



In [18]:
device = torch.device("cpu")
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Instantiate the model

In [19]:
vocab_size = len(int2w)
embed_dim = 512
rnn_hidden_size = 2048
torch.manual_seed(1)

word_model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
word_model = word_model.to(device)
word_model

RNN(
  (embedding): Embedding(12697, 512)
  (rnn): LSTM(512, 2048, batch_first=True)
  (fc): Linear(in_features=2048, out_features=12697, bias=True)
)

### Training phase

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(word_model.parameters(), lr=0.005)

# Set to 500.
num_epochs = 1000

# epochs here will mean batches.

for epoch in range(num_epochs):
    hidden, cell = word_model.init_hidden(batch_size)
    
    # Get the next batch from seq_dl
    seq_batch, target_batch = next(iter(seq_dl))
    
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    
    optimizer.zero_grad()
    
    loss = 0

    # Pass through the model.
    logits, _ = word_model(seq_batch, hidden=hidden, cell=cell)
    
    # Get the loss.
    # You'll need to reshape / view things to make this work.
    #target_ = torch.Tensor([[[1 if k==target_batch[i, j] else 0 for k in range(vocab_size)] for j in range(seq_length)] for i in range(batch_size)])
    loss += criterion(input=logits.transpose(1,2), target=target_batch.long())   
    
    # Do back prop.
    loss.backward()
    
    # Do an optimization step.
    optimizer.step()
    
    # Get the value in the tensor loss.
    loss = loss.item()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 9.4491
Epoch 100 loss: 5.0692
Epoch 200 loss: 4.4230
Epoch 300 loss: 3.9703
Epoch 400 loss: 3.8680
Epoch 500 loss: 3.6103
Epoch 600 loss: 3.1783
Epoch 700 loss: 3.2783
Epoch 800 loss: 2.9378
Epoch 900 loss: 2.9400


### Random decoding

In [22]:
def random_sample_word(
    model,
    starting_str, 
    len_generated_text=100, 
):

    starting_str = starting_str.split()
    
    # Encode starting string into a tensor using char2str.
    encoded_input = torch.tensor([w2int[w] for w in starting_str])
    
    # Reshape to be 1 by ??? - let PyTorch figure this out.
    encoded_input = encoded_input.reshape(1, len(starting_str))

    # This will be what you generate, but it starts off with something.
    generated_str = starting_str

    # Put model in eval mode. This matters if we had dropout o batch / layer norms.
    model.eval()
    
    hidden, cell = model.init_hidden(1)
    
    hidden = hidden.to(device)
    cell = cell.to(device)
        
    # Build up the starting hidden and cell states.
    # You can do this all in one go?
    for c in range(len(starting_str)-1):
        # Feed each letter 1 by 1 and then get the final hidden state.
        out = encoded_input[0, c].reshape(1,1)
        # Pass out through, note we update hidden and cell and use them again
        _, (hidden, cell) = model(out, hidden=hidden, cell=cell)
        
    
    # Gte the last word; note we did not do go to the last word above.
    last_word = starting_str[-1]
    # Generate chars one at a time, add them to generated_str.
    # Do this over and over until you get the desired length.
    for i in range(len_generated_text):
        
        # Use hidden and cell from the above.
        # Use last_word, which will be updated over and over.
        logits, (hidden, cell) = model(torch.as_tensor([[w2int[last_word]]]), hidden=hidden, cell=cell) 
        
        # Get the logits.
        logits = logits[0][0]
        
        # m is a random variable with probabilities based on the softmax of the logits.
        m = Categorical(nn.functional.softmax(logits))
        
        # Generate from m 1 char.
        last_word_index = m.sample().item()
        last_word = int2w[last_word_index]
        
        # Add the geenrated char to generated_str 
        generated_str.append(last_word)
        
    return ' '.join(generated_str)

word_model.to(device)
print(random_sample_word(word_model, starting_str='and the lord said unto moses', len_generated_text=100))

  m = Categorical(nn.functional.softmax(logits))


and the lord said unto moses the people after that paul had appeared unto him and moses said unto aaron take a censer and put fire therein and when the trumpet of sin is the crown of man wherefore evil in my ways work in righteousness and in the house of god with all his heart and with all his host against the lord his maker is mine and i am bereaved of my spots therefore were the princes rebuked the congregation of the lord hath no meat therefore my fury shall devour flesh and the dead bodies of the righteousness of the increase he shall
