In [3]:
import torch 
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math # hugging face dataset loading
from tqdm import tqdm # to track the process bar

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## 1. Load data - Harry potter books

train_dataset = book no 1-5

test_dataset  = book no 7

valid_dataset = book no 6

In [5]:
seed = 1234 
torch.manual_seed(seed) 
torch.backends.cudnn.deterministic = True  

In [6]:
dataset = datasets.load_dataset('KaungHtetCho/Harry_Potter_LSTM') # load_dataset from my repo (https://huggingface.co/datasets/KaungHtetCho/Harry_Potter_LSTM)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 57435
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5897
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6589
    })
})

In [8]:
print(dataset['train'][100]['text'])

"Fancy seeing you here, Professor McGonagall." 


In [9]:
print(dataset['train'].shape)

(57435, 1)


## 2. Preprocessing

### tokenization

In [10]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english') # built-in tokenizer from torchtext

In [11]:
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])} 

# def tokenize_data(example, tokenizer):
#     tokens = tokenizer(example['text'])
#     return tokens

In [12]:
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})

In [13]:
print(tokenized_dataset['train'])
print(tokenized_dataset['test'])
print(tokenized_dataset['validation'])


Dataset({
    features: ['tokens'],
    num_rows: 57435
})
Dataset({
    features: ['tokens'],
    num_rows: 6589
})
Dataset({
    features: ['tokens'],
    num_rows: 5897
})


### Numericalizing

In [14]:
from torchtext.vocab import build_vocab_from_iterator

vocab = build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3) 
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>']) # if there is no index, assigned '<unk>'

In [15]:
print(len(vocab))

9803


In [16]:
print(vocab.get_itos()[:10]) # .get_itos > map index to acutal tokens
                            # need to <eos> for the end of sentence

['<unk>', '<eos>', '.', ',', 'the', 'and', 'to', "'", 'of', 'a']


## 3. Prepare the batch loader


Given "Chaky loves eating at AIT", and "I really love deep learning", and given batch size = 3,

we will get three batches of data "Chaky loves eating at", "AIT `<eos>` I really", "love deep learning `<eos>`".  

In [17]:
def get_data(dataset, vocab, batch_size): # vocab = tokens to integer index

    data = []

    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens) # creating new list # append = extra list

    data = torch.LongTensor(data) # coverting to tensor int

    num_batches = data.shape[0]  // batch_size # integer division # data.shape[0] = 12 as in example
    data        = data[:num_batches * batch_size] 

    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len] 

# can use dataloader in pytorch way also

In [18]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [19]:
train_data.shape, valid_data.shape, test_data.shape

(torch.Size([128, 7204]), torch.Size([128, 1711]), torch.Size([128, 1902]))

## 4. Modeling

In [20]:
class LSTMLanguageModel(nn.Module):

    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):

        super().__init__()
        self.num_layers = num_layers # defining lstm (how many layers of LSTM)
        self.hid_dim    = hid_dim    # vector size
        self.emb_dim    = emb_dim    
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim) # input the text > get_embedded > sent to LSTM (vectorized)
                                                            # word -> embedding(vectorized)
        
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True) # dropout connect -> drop weights between LSTM
                                                                                                                   # in paper
        
        # seq length -> 

        self.dropout    = nn.Dropout(dropout_rate) # after certain process

        # hidden dim to vocab size to softmax 
        self.fc         = nn.Linear(hid_dim, vocab_size) # prediction head

        self.init_weights()

    # optionally # inital weight with range
    def init_weights(self):
        # from the paper
        # by bounding them into specific range, the weight doesn't go too big
        
        init_range_emb   = 0.1 
        init_range_other = 1/math.sqrt(self.hid_dim) 
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other) 
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_() # bias is not effecting a lot, then zero

        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim, self.hid_dim).uniform_(-init_range_other, init_range_other) #We #work with x
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh #work with previous h
    
    # will be called in training (hidden,cell)
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device) # to take fully control of hidden 
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device) 
        return hidden, cell
           
    def detach_hidden(self, hidden):
        hidden, cell = hidden 
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden): 
        #src: [batch_size, seq len]
        embedding  = self.dropout(self.embedding(src)) #harry potter is ... # can learn pattern # embedding dropout
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden) 
        #ouput: [batch size, seq len, hid dim] # 
        #hidden: [num_layers * direction, seq len, hid_dim]
        output     = self.dropout(output) # variation dropout is similar to dropout 

        #
        prediction = self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden # to carry foward hidden

## 5. Training

In [21]:
vocab_size   = len(vocab)
emb_dim      = 1024                # 400 in the paper
hid_dim      = 1024                # 1150 in the paper
num_layers   = 2                   # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [22]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr) # similar to SGD, having learning rate that is adaptive
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 36,879,947 trainable parameters


In [23]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [24]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train() 
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device) # to ensure that the previous hidden is not effected
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden) # hidden is not effected our training 

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # to sure not to explode
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [25]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss  = 0
    model.eval()
    num_batches = data.shape[-1]
    data        = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden      = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [26]:
import time

n_epochs = 50
seq_len  = 50 # <----decoding length
clip     = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0) # reduce the learing rate by 50

best_valid_loss = float('inf')

start_time = time.time()

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './app/models/best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

end_time = time.time()
total_time = end_time - start_time
total_mins, total_secs = divmod(total_time, 60)
print(f"Training completed in {int(total_mins):02d}m {int(total_secs):02d}s")

                                                           

	Train Perplexity: 503.092
	Valid Perplexity: 288.698


                                                           

	Train Perplexity: 237.758
	Valid Perplexity: 150.560


                                                           

	Train Perplexity: 151.484
	Valid Perplexity: 114.608


                                                           

	Train Perplexity: 119.994
	Valid Perplexity: 99.491


                                                           

	Train Perplexity: 103.955
	Valid Perplexity: 90.662


                                                           

	Train Perplexity: 93.269
	Valid Perplexity: 84.886


                                                           

	Train Perplexity: 85.048
	Valid Perplexity: 81.244


                                                           

	Train Perplexity: 78.625
	Valid Perplexity: 77.842


                                                           

	Train Perplexity: 73.444
	Valid Perplexity: 75.940


                                                           

	Train Perplexity: 69.072
	Valid Perplexity: 74.356


                                                           

	Train Perplexity: 65.358
	Valid Perplexity: 72.787


                                                           

	Train Perplexity: 62.149
	Valid Perplexity: 71.659


                                                           

	Train Perplexity: 59.285
	Valid Perplexity: 71.156


                                                           

	Train Perplexity: 56.783
	Valid Perplexity: 70.322


                                                           

	Train Perplexity: 54.591
	Valid Perplexity: 69.628


                                                           

	Train Perplexity: 52.612
	Valid Perplexity: 69.468


                                                           

	Train Perplexity: 50.834
	Valid Perplexity: 68.948


                                                           

	Train Perplexity: 49.180
	Valid Perplexity: 68.494


                                                           

	Train Perplexity: 47.570
	Valid Perplexity: 68.397


                                                           

	Train Perplexity: 46.136
	Valid Perplexity: 68.126


                                                           

	Train Perplexity: 44.854
	Valid Perplexity: 67.506


                                                           

	Train Perplexity: 43.577
	Valid Perplexity: 67.387


                                                           

	Train Perplexity: 42.490
	Valid Perplexity: 67.318


                                                           

	Train Perplexity: 41.414
	Valid Perplexity: 67.326


                                                           

	Train Perplexity: 39.542
	Valid Perplexity: 66.944


                                                           

	Train Perplexity: 38.641
	Valid Perplexity: 67.139


                                                           

	Train Perplexity: 37.713
	Valid Perplexity: 67.158


                                                           

	Train Perplexity: 37.145
	Valid Perplexity: 66.686


                                                           

	Train Perplexity: 36.791
	Valid Perplexity: 66.759


                                                           

	Train Perplexity: 36.484
	Valid Perplexity: 66.649


                                                           

	Train Perplexity: 36.402
	Valid Perplexity: 66.662


                                                           

	Train Perplexity: 36.170
	Valid Perplexity: 66.604


                                                           

	Train Perplexity: 36.077
	Valid Perplexity: 66.669


                                                           

	Train Perplexity: 36.043
	Valid Perplexity: 66.701


                                                           

	Train Perplexity: 35.956
	Valid Perplexity: 66.710


                                                           

	Train Perplexity: 35.907
	Valid Perplexity: 66.716


                                                           

	Train Perplexity: 35.949
	Valid Perplexity: 66.713


                                                           

	Train Perplexity: 35.937
	Valid Perplexity: 66.716


                                                           

	Train Perplexity: 35.882
	Valid Perplexity: 66.718


                                                           

	Train Perplexity: 35.902
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.957
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.875
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.938
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.910
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.903
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.850
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.894
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.861
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.862
	Valid Perplexity: 66.717


                                                           

	Train Perplexity: 35.901
	Valid Perplexity: 66.717
Training completed in 27m 15s


## 6. Testing

In [27]:
seq_len = 50

In [29]:
model.load_state_dict(torch.load('./app/models/best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 84.753


## 7. Real-world inference

In [30]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] 
            #probability of last vocab
            
            # is = 0.3 on = 0.5 eat 0.2
            # is is is on on on on on eat eat >  sampling

            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [31]:
prompt = 'Hagrid scare '
max_seq_len = 100
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
hagrid scare up , said ron , pointing at the piece of parchment .

0.7
hagrid scare up to the castle for years that was well worked . he had to exercise it . . . and then he was not going to think he was in the head , but he was starting to be able to find himself to try and get rid of him , so he hadn ' t been stuck in , and there was no sign of him .

0.75
hagrid scare up to the castle for years that was well worked . he had to exercise it . . . and then he was not going to think he was in the head , but he was starting his firebolt , too . . . he was going to show him , so he hadn ' t been stuck in , and there was no sign of him .

0.8
hagrid scare up to the castle for years that was well worked . he had to exercise it . . . and then he was not going to think to the dementors . . . but he said , his eyes were definitely as though he was going to be a bit sorry .

1.0
hagrid scare being killed , tell me that ! well , you may see you . it is .



In [32]:
# Save training data for website

import pickle

Data = {
    'vocab_size': vocab_size,
    'emb_dim': emb_dim,
    'hid_dim': hid_dim,
    'num_layers': num_layers,
    'dropout_rate': dropout_rate,
    'tokenizer': tokenizer,
    'vocab': vocab
}

pickle.dump(Data,open('./app/models/data.pkl', 'wb'))