# 0. Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
import pickle
from tqdm import tqdm


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Jue\Desktop\NLP\A2\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\Jue\Desktop\NLP\A2\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\Jue\Desktop\NLP\A2\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\Ju

### GPU setup if available

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
# Set a fixed random seed for reproducibility
seed = 1234
torch.manual_seed(seed)

# Ensure deterministic behavior for operations on CUDA
torch.backends.cudnn.deterministic = True


# 1. Data preprocessing

## 1.1 Loading Data

In [4]:
dataset_main = datasets.load_dataset('minmarn/Gone_with_the_wind')

In [5]:
dataset_main

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 48491
    })
})

In [6]:
from datasets import DatasetDict

# Split the dataset into train (80%) and temporary (20%)
splits = dataset_main["train"].train_test_split(test_size=0.2, seed=42)
train_data = splits["train"]
temp_data = splits["test"]

# Further split the temporary set into validation (50%) and test (50%)
temp_splits = temp_data.train_test_split(test_size=0.5, seed=42)
validation_data = temp_splits["train"]
test_data = temp_splits["test"]

# Create the final DatasetDict
dataset = DatasetDict({
    "train": train_data,
    "validation": validation_data,
    "test": test_data,
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 38792
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 4849
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4850
    })
})


## 1.3 Tokenization

In [7]:
from torchtext.data.utils import get_tokenizer

# Initialize the tokenizer
tokenizer = get_tokenizer('basic_english')

# Define a lambda function to tokenize the dataset
def tokenize_data(example, tokenizer):
    return {'tokens': tokenizer(example['text'])}

# Apply the tokenizer using the map function
tokenized_dataset = dataset.map(
    lambda example: tokenize_data(example, tokenizer),
    remove_columns=['text']
)

print(tokenized_dataset)

Map: 100%|██████████| 38792/38792 [00:01<00:00, 28997.59 examples/s]
Map: 100%|██████████| 4849/4849 [00:00<00:00, 30996.90 examples/s]
Map: 100%|██████████| 4850/4850 [00:00<00:00, 28235.61 examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens'],
        num_rows: 38792
    })
    validation: Dataset({
        features: ['tokens'],
        num_rows: 4849
    })
    test: Dataset({
        features: ['tokens'],
        num_rows: 4850
    })
})





In [8]:
print(tokenized_dataset['train']['tokens'])



## 1.4 Numericalization

In [9]:
from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the tokens in the training dataset
vocab = build_vocab_from_iterator(
    tokenized_dataset['train']['tokens'],
    min_freq=3  # Only include tokens that appear at least 3 times
)

# Add special tokens to the vocabulary
vocab.insert_token('<unk>', 0)  # Unknown token
vocab.insert_token('<eos>', 1)  # End-of-sequence token

# Set the default index for unknown tokens
vocab.set_default_index(vocab['<unk>'])


In [10]:
print(len(vocab))

6996


In [11]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', ',', '.', 'the', 'and', "'", 'to', 'of', 'she']


## 2. Prepare the batch loader

In [12]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens']
            # Add <eos> at the end of each sequence when a period occurs
            if '.' in tokens:
                tokens.append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [13]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [14]:
train_data.shape

torch.Size([128, 3238])

## 3. Modeling

In [15]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

## 4. Training

In [16]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [17]:
# Data dictionary to save
Data = {
    'vocab_size': vocab_size,
    'emb_dim': emb_dim,
    'hid_dim': hid_dim,
    'num_layers': num_layers,
    'dropout_rate': dropout_rate,
    'tokenizer': tokenizer,
    'vocab': vocab
}

# Save the dictionary to a .pkl file
output_path = './app/models/Data.pkl'

pickle.dump(Data,open(output_path, 'wb'))


In [18]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 31,128,404 trainable parameters


In [19]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [20]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [21]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [27]:
n_epochs = 100
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './app/models/best-val-lstm_lm.pt')

    if epoch % 10 == 0:
        print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
        print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

	Train Perplexity: 426.947
	Valid Perplexity: 324.734


                                                         

	Train Perplexity: 111.901
	Valid Perplexity: 112.742


                                                         

	Train Perplexity: 80.807
	Valid Perplexity: 105.712


                                                         

	Train Perplexity: 66.701
	Valid Perplexity: 105.358


                                                         

	Train Perplexity: 66.761
	Valid Perplexity: 105.330


                                                         

	Train Perplexity: 66.748
	Valid Perplexity: 105.330


                                                         

	Train Perplexity: 66.696
	Valid Perplexity: 105.329


                                                         

	Train Perplexity: 66.606
	Valid Perplexity: 105.329


                                                         

	Train Perplexity: 66.715
	Valid Perplexity: 105.329


                                                         

	Train Perplexity: 66.747
	Valid Perplexity: 105.329


                                                         

## 5. Testing

In [None]:
seq_len  = 50
model.load_state_dict(torch.load('./app/models/best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

## 6. Real world inference

In [25]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [26]:
prompt = 'Scarlett was not beautiful,'
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
scarlett was not beautiful , dragging defeated somber dog awkward prissy hairpins reach catholic runner buckets stillness exclaimed maddening graces rudeness alabama grade wooden emaciated barefooted kill tidings purple candles recognized dilcey years macon wench

0.7
scarlett was not beautiful , dragging defeated somber dog awkward well-bred hairpins reach catholic runner buckets stillness exclaimed maddening graces rudeness alabama grade wooden emaciated barefooted kill tidings purple candles recognized dilcey years macon wench

0.75
scarlett was not beautiful , dragging defeated somber dog awkward well-bred hairpins reach catholic runner buckets stillness exclaimed maddening graces rudeness alabama grade wooden emaciated barefooted kill tidings purple candles recognized dilcey years macon wench

0.8
scarlett was not beautiful , dragging defeated somber dog awkward well-bred hairpins reach catholic runner buckets stillness exclaimed maddening graces rudeness alabama grade wooden 