# 0. Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### GPU setup if available

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# 1. Data preprocessing

## 1.1 Loading Data

In [4]:
dataset = datasets.load_dataset('KyiThinNu/FairyTales')

Downloading data: 100%|██████████| 446k/446k [00:00<00:00, 676kB/s]
Downloading data: 100%|██████████| 561k/561k [00:00<00:00, 1.61MB/s]
Downloading data: 100%|██████████| 236k/236k [00:00<00:00, 720kB/s]
Downloading data: 100%|██████████| 983k/983k [00:00<00:00, 1.06MB/s]
Downloading data: 100%|██████████| 49.8k/49.8k [00:00<00:00, 152kB/s]
Downloading data: 100%|██████████| 279k/279k [00:00<00:00, 860kB/s]
Downloading data: 100%|██████████| 1.53M/1.53M [00:00<00:00, 3.37MB/s]
Downloading data: 100%|██████████| 246k/246k [00:00<00:00, 744kB/s]
Downloading data: 100%|██████████| 301k/301k [00:00<00:00, 714kB/s]
Downloading data: 100%|██████████| 403k/403k [00:00<00:00, 1.13MB/s]
Downloading data: 100%|██████████| 351k/351k [00:00<00:00, 978kB/s]
Downloading data: 100%|██████████| 523k/523k [00:00<00:00, 1.51MB/s]
Downloading data: 100%|██████████| 629k/629k [00:00<00:00, 1.76MB/s]
Downloading data: 100%|██████████| 113k/113k [00:00<00:00, 358kB/s]
Downloading data: 100%|██████████| 657

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 102542
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 48274
    })
    test: Dataset({
        features: ['text'],
        num_rows: 41472
    })
})

In [6]:
dataset['train'].shape

(102542, 1)

In [7]:
# import os

# # Specify the directory containing your text files (one file per book)
# data_directory = './Datasets/'

# # Initialize an empty list to store the texts
# texts = []

# # Loop through each file in the directory
# for filename in os.listdir(data_directory):
#     if filename.endswith('.txt'):
#         file_path = os.path.join(data_directory, filename)
        
#         # Read the contents of the file and append to the list
#         with open(file_path, 'r', encoding='utf-8') as file:
#             text = file.read()
#             texts.append(text)

# # Now 'texts' contains the content of each book in your dataset


## 1.2 Text preprocessing

In [8]:
# # Split the dataset into paragraphs based on double line breaks : to get one paragraph in a list item ['paragraph1', 'paragraph']
# paragraphs = [paragraph.strip() for text in texts for paragraph in text.split('\n\n') ]

In [9]:
# len(paragraphs)

In [10]:
# for i in range (len(paragraphs)):
#     # Replace newline characters with spaces
#     paragraphs[i] = paragraphs[i].replace('\n', ' ')

#     # Replace Byte Order Mark (BOM) characters with spaces
#     paragraphs[i] = paragraphs[i].replace('\ufeff', ' ')

In [11]:
# data = datasets.map(paragraphs)

## 1.3 Tokenization

In [12]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}

tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})

Map: 100%|██████████| 102542/102542 [00:02<00:00, 42988.09 examples/s]
Map: 100%|██████████| 48274/48274 [00:01<00:00, 43209.99 examples/s]
Map: 100%|██████████| 41472/41472 [00:00<00:00, 41719.27 examples/s]


In [13]:
print(tokenized_dataset['train'][99]['tokens'])

['the', 'three', 'little', 'pigs']


In [14]:
# tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# tokenized_paragraphs = [{'tokens': tokenizer(paragraph)} for paragraph in paragraphs]

## 1.4 Numericalization

In [15]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [16]:
print(len(vocab))

12950


In [17]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', ',', 'the', 'and', '.', 'of', 'to', 'a', 'he']


## 2. Prepare the batch loader

In [18]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens']
            # Add <eos> at the end of each sequence when a period occurs
            if '.' in tokens:
                tokens.append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [19]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [20]:
train_data.shape

torch.Size([128, 9019])

## 3. Modeling

In [21]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

## 4. Training

In [22]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [23]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 43,328,150 trainable parameters


In [24]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [25]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [26]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [31]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'fairy-tale-lstm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 124.889
	Valid Perplexity: 94.358


                                                           

	Train Perplexity: 104.552
	Valid Perplexity: 84.630


                                                           

	Train Perplexity: 90.980
	Valid Perplexity: 78.312


## 5. Testing

In [28]:
seq_len  = 50
model.load_state_dict(torch.load('fairy-tale-lstm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 118.325


## 6. Real world inference

In [29]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [30]:
prompt = 'Dr. Watson is going '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
dr . <unk> is going to

0.7
dr . <unk> is going to

0.75
dr . <unk> is going to

0.8
dr . <unk> is going to

1.0
dr . <unk> is going to

