# Recurrent Neural Networks and Language Models

https://huggingface.co/datasets/myamjechal/star-wars-dataset --> This is path to my dataset from Huggin Face

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1112
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# torch.cuda.get_device_name(0)

cuda


## 1. Load data - Star Wars

- I have selected a Star Wars dataset from Hugging Face, which consists of approximately 22,900 rows in the training set, 2,543 rows in the validation set, and 2,825 rows in the test set.

In [3]:
dataset = datasets.load_dataset('myamjechal/star-wars-dataset')
print(dataset)

README.md:   0%|          | 0.00/581 [00:00<?, ?B/s]

star_wars_train_80.csv:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

star_wars_val_10.csv:   0%|          | 0.00/153k [00:00<?, ?B/s]

star_wars_test_10.csv:   0%|          | 0.00/168k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/22878 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2543 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2825 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 22878
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2543
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2825
    })
})


In [5]:
print(dataset['train'][333]['text'])

Luke thinks he hears a tiny softening


## 2. Preprocessing

### Tokenizing

Simply tokenize the given text to tokens.

In [6]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

#function to tokenize
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  

#map the function to each example
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][333]['tokens'])



Map:   0%|          | 0/22878 [00:00<?, ? examples/s]

Map:   0%|          | 0/2543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2825 [00:00<?, ? examples/s]

['luke', 'thinks', 'he', 'hears', 'a', 'tiny', 'softening']


### Numericalizing
- We’ll configure torchtext to add any word that appears at least three times in the dataset to the vocabulary. This helps keep the vocabulary size manageable. Additionally, we’ll ensure that the special tokens `unk` (unknown) and `eos` (end of sequence) are included in the vocabulary.

In [7]:
## numericalizing
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])       



5752
['<unk>', '<eos>', 'the', ',', 'and', 'a', 'to', 'of', 'is', 'in']


## 3. Prepare the batch loader

### Prepare data

In [8]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size #get the int number of batches...
    data = data[:num_batches * batch_size] #make the batch evenly, and cut out any remaining                      
    data = data.view(batch_size, num_batches)          
    return data #[batch size, bunch of tokens]


In [9]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

## 4. Modeling 

In [10]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

## 5. Training 

- The process follows a straightforward approach. One thing to keep in mind is that some of the input sequences fed into the model might contain parts from different sequences in the original dataset or be a subset of a single sequence, depending on the decoding length. Because of this, I reset the hidden state at the start of each epoch. This approach assumes that the next batch of sequences is likely a continuation of the previous ones in the original dataset.

In [11]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [12]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 28,579,448 trainable parameters


In [13]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [14]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, bunch of tokens]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [15]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

- On the next cell of code, I'll be using the `ReduceLROnPlateau` learning rate scheduler, which automatically reduces the learning rate by a certain factor if the loss doesn't improve after a specified number of epochs.

In [16]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

	Train Perplexity: 683.480
	Valid Perplexity: 435.430


                                                         

	Train Perplexity: 430.917
	Valid Perplexity: 325.457


                                                         

	Train Perplexity: 307.490
	Valid Perplexity: 229.548


                                                         

	Train Perplexity: 239.335
	Valid Perplexity: 191.843


                                                         

	Train Perplexity: 200.203
	Valid Perplexity: 163.574


                                                         

	Train Perplexity: 173.476
	Valid Perplexity: 144.391


                                                         

	Train Perplexity: 153.276
	Valid Perplexity: 131.126


                                                         

	Train Perplexity: 138.222
	Valid Perplexity: 120.743


                                                         

	Train Perplexity: 125.899
	Valid Perplexity: 112.960


                                                         

	Train Perplexity: 116.802
	Valid Perplexity: 107.323


                                                         

	Train Perplexity: 109.156
	Valid Perplexity: 102.732


                                                         

	Train Perplexity: 102.638
	Valid Perplexity: 99.081


                                                         

	Train Perplexity: 96.858
	Valid Perplexity: 95.568


                                                         

	Train Perplexity: 91.946
	Valid Perplexity: 93.063


                                                         

	Train Perplexity: 87.501
	Valid Perplexity: 91.131


                                                         

	Train Perplexity: 83.573
	Valid Perplexity: 88.903


                                                         

	Train Perplexity: 80.094
	Valid Perplexity: 87.339


                                                         

	Train Perplexity: 76.697
	Valid Perplexity: 85.847


                                                         

	Train Perplexity: 73.905
	Valid Perplexity: 85.197


                                                         

	Train Perplexity: 71.310
	Valid Perplexity: 83.593


                                                         

	Train Perplexity: 68.784
	Valid Perplexity: 82.721


                                                         

	Train Perplexity: 66.372
	Valid Perplexity: 81.786


                                                         

	Train Perplexity: 64.331
	Valid Perplexity: 81.306


                                                         

	Train Perplexity: 62.391
	Valid Perplexity: 80.426


                                                         

	Train Perplexity: 60.558
	Valid Perplexity: 80.063


                                                         

	Train Perplexity: 58.703
	Valid Perplexity: 79.674


                                                         

	Train Perplexity: 57.143
	Valid Perplexity: 79.162


                                                         

	Train Perplexity: 55.552
	Valid Perplexity: 78.996


                                                         

	Train Perplexity: 53.916
	Valid Perplexity: 78.674


                                                         

	Train Perplexity: 52.582
	Valid Perplexity: 78.585


                                                         

	Train Perplexity: 51.162
	Valid Perplexity: 78.546


                                                         

	Train Perplexity: 49.853
	Valid Perplexity: 78.243


                                                         

	Train Perplexity: 48.611
	Valid Perplexity: 78.650


                                                         

	Train Perplexity: 46.476
	Valid Perplexity: 77.767


                                                         

	Train Perplexity: 45.572
	Valid Perplexity: 77.792


                                                         

	Train Perplexity: 44.373
	Valid Perplexity: 77.976


                                                         

	Train Perplexity: 43.608
	Valid Perplexity: 77.498


                                                         

	Train Perplexity: 43.197
	Valid Perplexity: 77.580


                                                         

	Train Perplexity: 42.975
	Valid Perplexity: 77.485


                                                         

	Train Perplexity: 42.790
	Valid Perplexity: 77.546


                                                         

	Train Perplexity: 42.655
	Valid Perplexity: 77.482


                                                         

	Train Perplexity: 42.691
	Valid Perplexity: 77.493


                                                         

	Train Perplexity: 42.653
	Valid Perplexity: 77.487


                                                         

	Train Perplexity: 42.593
	Valid Perplexity: 77.492


                                                         

	Train Perplexity: 42.551
	Valid Perplexity: 77.495


                                                         

	Train Perplexity: 42.604
	Valid Perplexity: 77.494


                                                         

	Train Perplexity: 42.650
	Valid Perplexity: 77.494


                                                         

	Train Perplexity: 42.559
	Valid Perplexity: 77.494


                                                         

	Train Perplexity: 42.612
	Valid Perplexity: 77.494


                                                         

	Train Perplexity: 42.565
	Valid Perplexity: 77.494


## 6. Testing

In [17]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 78.050


## 7. Real-world inference

1. First, I take the input prompt, tokenize it, encode it, and pass it into the model to get predictions. Then, I apply the softmax function, focusing on the output of the last word in the sequence, which predicts the next word. To adjust the model's confidence, I divide the logits by a temperature value, which changes the softmax probability distribution.

2. After getting the softmax distribution, I randomly sample from it to predict the next word. If the result is `<unk>`, I try again. The process stops once I get the `<eos>` token, which marks the end of the sequence.

3. Finally, I decode the predicted tokens back into text.

In [20]:
import pickle ## To save a file

In [21]:
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

In [22]:
with open("vocab.pkl", "rb") as f:
    vocab = pickle.load(f)

In [24]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [25]:
prompt = "Star Wars is"
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature) + '\n' + ' '.join(generation) + '\n')

0.5
star wars is a small , shrouded , and dark figure , and the trade federation has silenced contact

0.7
star wars is a small explosion with the ship

0.75
star wars is a small explosion with the ship

0.8
star wars is a small explosion with the ship

1.0
star wars is old city

