# Step 0: Preparations and code from Assignment 1

Group 17: Jakob Svensson, Mahdi Afarideh, Maximilian Forsell

In [1]:
!git clone https://github.com/MahdiTheGreat/Intro-to-language-modeling.git
%cd Intro-to-language-modeling

Cloning into 'Intro-to-language-modeling'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 82 (delta 43), reused 2 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (82/82), 33.44 MiB | 19.22 MiB/s, done.
Resolving deltas: 100% (43/43), done.
/content/Intro-to-language-modeling


In [2]:
import sklearn
import nltk
import torch
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Set random seed for reproducibility
def set_seed(seed=2024):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(1998)

In [5]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f'Using device: {device}')

Using device: cuda


In [6]:
dataset='lmdemo'
zip_file = f"{dataset}.zip"
!unzip -q $zip_file
!rm $zip_file

In [7]:
training_set=open(f'{dataset}/train.txt','r',encoding='utf-8').read()
val_set=open(f'{dataset}/val.txt','r',encoding='utf-8').read()

In [8]:
from collections import Counter
class VocabularyBuilder:
    def __init__(self, max_voc_size):
        self.max_voc_size = max_voc_size
        self.str_to_int = {}
        self.int_to_str = {}
        self.special_tokens = ["BEGINNING", "END", "UNKNOWN", "PADDING"] #Added padding
        self.token_counter = Counter()

    def build_vocabulary(self, text):

        sents=nltk.word_tokenize(text.lower())

        for token in sents:
            self.token_counter[token] += 1

    def create_vocabulary(self):
        for idx, token in enumerate(self.special_tokens):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

        max_words = self.max_voc_size - len(self.special_tokens)
        most_common_tokens = self.token_counter.most_common(max_words)

        for idx, (token, _) in enumerate(most_common_tokens, start=len(self.special_tokens)):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

    def create_premade_vocabulary(self, c):
        for idx, token in enumerate(self.special_tokens):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

        max_words = self.max_voc_size - len(self.special_tokens)
        most_common_tokens = c.most_common(max_words) # Here we can use a premade counter from a previous run

        for idx, (token, _) in enumerate(most_common_tokens, start=len(self.special_tokens)):
            self.str_to_int[token] = idx
            self.int_to_str[idx] = token

    def get_token_id(self, token):
        return self.str_to_int.get(token.lower(), self.str_to_int["UNKNOWN"])

    def get_token_str(self, token_id):
        return self.int_to_str.get(token_id, "UNKNOWN")

    def sanity_check(self): # Here we run the sanity tests recommended in the assignment
        assert len(self.str_to_int) <= self.max_voc_size, "Vocabulary size exceeds max_voc_size."

        for token in self.special_tokens:
            assert token in self.str_to_int, f"Missing special token: {token}"

        common_words = ["the", "and"]
        rare_words = ["cuboidal", "epiglottis"]

        for word in common_words:
            assert word in self.str_to_int, f"Common word '{word}' not in vocabulary."

        for word in rare_words:
            assert word not in self.str_to_int, f"Rare word '{word}' should not be in vocabulary."

        test_word = "the"
        token_id = self.get_token_id(test_word)
        assert self.get_token_str(token_id) == test_word.lower(), "Round-trip token mapping failed."

        print("Sanity check passed!")

vocab_builder = VocabularyBuilder(max_voc_size=16384)


In [9]:
for paragraph in tqdm(training_set.splitlines()):
  vocab_builder.build_vocabulary(paragraph)
vocab_builder.create_vocabulary()

100%|██████████| 294118/294118 [01:37<00:00, 3023.45it/s]


In [10]:
# Perform sanity check
vocab_builder.sanity_check()

Sanity check passed!


In [11]:
# Modified for assignment 2
class TrainingDataPreparerRNN:
    def __init__(self, vocab_builder, max_sequence_length):
        self.vocab_builder = vocab_builder
        self.max = max_sequence_length

    def encode_text(self, text):
        """Tokenizes and encodes a single string with special symbols.

        Parameters:
        - text (str): The input string to encode.

        Returns:
        - List[int]: A list of token IDs including BEGINNING and END tokens.
        """
        # Tokenize the text
        tokens = nltk.word_tokenize(text.lower())

        token_ids = [self.vocab_builder.get_token_id(token) for token in tokens]
        modified_tokens = [0] # Add 1 BEGINNING
        modified_tokens.extend(token_ids)
        modified_tokens.append(1) # Add 1 END

        return modified_tokens

    def create_training_sequences(self, text):
        """
        Creates training sequences from a single string by generating sequences of length N+1.

        Parameters:
        - text (str): The input string to create sequences from.

        Returns:
        - List[Tuple[List[int], int]]: A list of (context, target) pairs.
        """
        encoded_text = self.encode_text(text)

        # Taken from: https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
        training_sequences = [encoded_text[i * self.max:(i + 1) * self.max] for i in range((len(encoded_text) + self.max - 1) // self.max )]

        return training_sequences


# Step 1

In [12]:
# Splitting
preparer = TrainingDataPreparerRNN(vocab_builder, max_sequence_length=128)

training_sequences = []
split_training_set = list(filter(''.__ne__, training_set.splitlines())) # Split and remove empty lines
for paragraph in tqdm(split_training_set):
  training_sequences.append(preparer.create_training_sequences(paragraph))
flattened_training_sequences =  [
    x
    for xs in training_sequences
    for x in xs
]

100%|██████████| 147059/147059 [01:26<00:00, 1703.43it/s]


In [13]:
# Prepare validation data also
val_sequences = []
split_val_set = list(filter(''.__ne__, val_set.splitlines())) # Split and remove empty lines
for paragraph in tqdm(split_val_set):
  val_sequences.append(preparer.create_training_sequences(paragraph))
flattened_val_sequences =  [
    x
    for xs in val_sequences
    for x in xs
]

100%|██████████| 17874/17874 [00:10<00:00, 1710.44it/s]


In [14]:
# Sanity check
for context in flattened_training_sequences[:10]:  # Show the first few sequences
    print([vocab_builder.get_token_str(id) for id in context])

['BEGINNING', 'anatomy', 'END']
['BEGINNING', 'anatomy', '(', 'greek', 'UNKNOWN', ',', '“', 'dissection', '”', ')', 'is', 'the', 'branch', 'of', 'biology', 'concerned', 'with', 'the', 'study', 'of', 'the', 'structure', 'of', 'organisms', 'and', 'their', 'parts', '.', 'anatomy', 'is', 'a', 'branch', 'of', 'natural', 'science', 'dealing', 'with', 'the', 'structural', 'organization', 'of', 'living', 'things', '.', 'it', 'is', 'an', 'old', 'science', ',', 'having', 'its', 'beginnings', 'in', 'prehistoric', 'times', '.', 'anatomy', 'is', 'inherently', 'tied', 'to', 'UNKNOWN', ',', 'comparative', 'anatomy', ',', 'evolutionary', 'biology', ',', 'and', 'phylogeny', ',', 'as', 'these', 'are', 'the', 'processes', 'by', 'which', 'anatomy', 'is', 'generated', 'over', 'immediate', '(', 'UNKNOWN', ')', 'and', 'long', '(', 'evolution', ')', 'UNKNOWN', '.', 'human', 'anatomy', 'is', 'one', 'of', 'the', 'basic', 'essential', 'sciences', 'of', 'medicine', '.', 'END']
['BEGINNING', 'the', 'discipline', '

In [15]:
# Sanity check nr. 2
print(len(flattened_training_sequences))
print(len(flattened_val_sequences))

179387
22232


In [16]:
#Adapted batcher
from torch.utils.data import DataLoader, TensorDataset
def TorchDataLoaderRNN(training_sequences, batch_size):
  # Find longest length in sequence
  longest = len(max(training_sequences, key=len)) # Should never exceed max_sequence_length

  # Padding
  padded_sequences = [sequence +([3] * (longest - len(sequence))) for sequence in training_sequences] # PADDING has integer code 3

  # Convert lists to tensors
  context_tensor = torch.tensor(padded_sequences, dtype=torch.long)  # Shape: (num_samples, 3)

  # Create a TensorDataset
  dataset = TensorDataset(context_tensor)

  # Create a DataLoader for batching
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  return dataloader

In [17]:
trainloader = TorchDataLoaderRNN(flattened_training_sequences, 64)

In [18]:
valloader = TorchDataLoaderRNN(flattened_val_sequences, 64)

In [19]:
# Sanity check
for batch_context in trainloader:
    print(batch_context[0])
    print(batch_context[0].shape)
    break

tensor([[    4,   175,     6,  ...,     3,     3,     3],
        [    0,    35,  6315,  ...,     3,     3,     3],
        [    0,    32,   354,  ...,     3,     3,     3],
        ...,
        [    0,     4,  1112,  ...,    38,   212,   101],
        [    0, 16347,  5158,  ...,     3,     3,     3],
        [    0,  4661,    40,  ...,     3,     3,     3]])
torch.Size([64, 128])


# Step 2: RNN model

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim


# EarlyStopping class remains the same
class EarlyStopping:
    def __init__(self, patience=5, delta=0, verbose=False, path='checkpoint.pth'):
        self.patience = patience  # Number of epochs to wait for improvement
        self.delta = delta  # Minimum change to qualify as an improvement
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.path = path  # Path to save the best model

    def __call__(self, val_loss, model):
        if self.best_score is None:
            self.best_score = val_loss
            self.save_checkpoint(val_loss, model)
        elif val_loss < self.best_score - self.delta:
            self.best_score = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Save model when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

class RNN(nn.Module):

    def __init__(self, num_layers, hidden_dim, vocab_size, embed_size, activation=nn.ReLU,last_layer_activation=nn.Softmax,dropout=0.05):

        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embed_size

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_dim, num_layers=num_layers,
                    dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedding = self.dropout(self.embedding(x))
        output, hidden = self.lstm(embedding)
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction


In [21]:
# Sanity check
model = RNN(num_layers=2, hidden_dim=1024, vocab_size=16384, embed_size=128)
test_input = torch.tensor([0, 6 , 8 , 10, 15, 1])
output = model(test_input)
print(output.shape)

test_input = torch.tensor([0, 7 , 7 , 32, 32, 18, 99, 500, 12, 1])
output = model(test_input)
print(output.shape)

torch.Size([6, 16384])
torch.Size([10, 16384])


In [22]:
model = RNN(num_layers=2, hidden_dim=1024, vocab_size=16384, embed_size=128)
model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=3) # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)
patience = 5
early_stopping = EarlyStopping(patience=patience, verbose=True)

number_of_epochs = 10

for epoch in range(number_of_epochs):
    model.train()  # Set model to training mode
    for batch_context in tqdm(trainloader):
        #FORWARD PASS:
        X = batch_context[0][:,:-1]
        Y = batch_context[0][:,1:]
        X, Y = X.to(device), Y.to(device)
        logits = model(X)  # Model output for X
        targets = Y.view(-1)                      # 2-dimensional -> 1-dimensional
        logits = logits.view(-1, logits.shape[-1])  # 3-dimensional -> 2-dimensional
        loss = criterion(logits, targets) # Compute the loss between model output and Y

        #BACKWARD PASS (updating the model parameters):
        optimizer.zero_grad()  # Clear gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update model parameters

    print(f"Epoch [{epoch+1}/{number_of_epochs}], Loss: {loss.item():.4f}")

    # Validation loop
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # No gradient computation for validation
        for batch_context in valloader:
        #FORWARD PASS:
          X = batch_context[0][:,:-1]
          Y = batch_context[0][:,1:]
          X, Y = X.to(device), Y.to(device)
          logits = model(X)  # Model output for X
          targets = Y.view(-1)                      # 2-dimensional -> 1-dimensional
          logits = logits.view(-1, logits.shape[-1])  # 3-dimensional -> 2-dimensional
          loss = criterion(logits, targets) # Compute the loss between model output and Y
          val_loss += loss.item()

    avg_val_loss = val_loss / len(valloader)  # Average validation loss
    print(f"Epoch {epoch+1}/{number_of_epochs} - Perplexity: {np.exp(avg_val_loss):.6f}")

    # Call early stopping after each epoch
    early_stopping(avg_val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

# Optionally, load the best model after training
model.load_state_dict(torch.load('checkpoint.pth'))

100%|██████████| 2803/2803 [20:49<00:00,  2.24it/s]


Epoch [1/10], Loss: 4.8157
Epoch 1/10 - Perplexity: 123.971015
Validation loss decreased (inf --> 4.820048).  Saving model ...


100%|██████████| 2803/2803 [21:06<00:00,  2.21it/s]


Epoch [2/10], Loss: 4.5877
Epoch 2/10 - Perplexity: 92.578977
Validation loss decreased (4.820048 --> 4.528062).  Saving model ...


100%|██████████| 2803/2803 [21:10<00:00,  2.21it/s]


Epoch [3/10], Loss: 4.3119
Epoch 3/10 - Perplexity: 80.842643
Validation loss decreased (4.528062 --> 4.392505).  Saving model ...


100%|██████████| 2803/2803 [21:09<00:00,  2.21it/s]


Epoch [4/10], Loss: 4.1769
Epoch 4/10 - Perplexity: 75.559984
Validation loss decreased (4.392505 --> 4.324927).  Saving model ...


100%|██████████| 2803/2803 [21:10<00:00,  2.21it/s]


Epoch [5/10], Loss: 4.0304
Epoch 5/10 - Perplexity: 72.764355
Validation loss decreased (4.324927 --> 4.287226).  Saving model ...


100%|██████████| 2803/2803 [21:10<00:00,  2.21it/s]


Epoch [6/10], Loss: 3.9923
Epoch 6/10 - Perplexity: 71.632367
Validation loss decreased (4.287226 --> 4.271547).  Saving model ...


100%|██████████| 2803/2803 [21:12<00:00,  2.20it/s]


Epoch [7/10], Loss: 3.7958
Epoch 7/10 - Perplexity: 71.081673
Validation loss decreased (4.271547 --> 4.263830).  Saving model ...


100%|██████████| 2803/2803 [21:12<00:00,  2.20it/s]


Epoch [8/10], Loss: 3.8374
Epoch 8/10 - Perplexity: 71.249013
EarlyStopping counter: 1 out of 5


100%|██████████| 2803/2803 [21:13<00:00,  2.20it/s]


Epoch [9/10], Loss: 3.8346
Epoch 9/10 - Perplexity: 71.963848
EarlyStopping counter: 2 out of 5


100%|██████████| 2803/2803 [21:12<00:00,  2.20it/s]


Epoch [10/10], Loss: 3.7444
Epoch 10/10 - Perplexity: 72.932845
EarlyStopping counter: 3 out of 5


  model.load_state_dict(torch.load('checkpoint.pth'))


<All keys matched successfully>

# Step 3 generating text

Regular test sentence using argmax

In [23]:
test_sentence = "he lives in san"

encoded_sentence = [vocab_builder.get_token_id(word) for word in test_sentence.split(" ")]

output = model(torch.tensor(encoded_sentence).to(device))

# Predict
prediction = torch.argmax(output[-1])

print(vocab_builder.get_token_str(prediction.item()))

francisco


Random algorithm

In [36]:
from torch.distributions import Categorical
def random_sampling(model, prompt, max_length, temperature, topk):
    # First, encode the input
    encoded_prompt = [vocab_builder.get_token_id(word) for word in prompt.split(" ")]

    logits = model(torch.tensor(encoded_prompt).to(device))

    # Apply temperature
    softmax = torch.nn.Softmax()
    tempered_logits = softmax(logits / temperature)

    # Apply topk
    # From https://gist.github.com/bsantraigi/5752667525d88d375207f099bd78818b
    indices_to_remove = logits < torch.topk(logits, topk, dim=1)[0][..., -1, None]
    tempered_logits[indices_to_remove] = -np.Inf

    # Sample from the distribution
    distribution = Categorical(logits=tempered_logits)
    prediction = distribution.sample()

    encoded_prompt.append(prediction[-1].item())

    end_of_sentence = (prediction[-1].item() == 1)
    words_generated = 1

    # Repeat with its own outputs:
    while (words_generated < max_length) and not end_of_sentence:

        # The logits
        logits = model(torch.tensor(encoded_prompt).to(device))

        # Apply temperature
        tempered_logits = softmax(logits / temperature)

        # Apply topk
        indices_to_remove = logits < torch.topk(logits, topk, dim=1)[0][..., -1, None]
        tempered_logits[indices_to_remove] = -np.Inf

        # Sample from the distribution
        distribution = Categorical(logits=tempered_logits)
        prediction = distribution.sample()
        encoded_prompt.append(prediction[-1].item())

        # Check if end of sentence and update word counter
        if (prediction[-1].item() == 1):
            end_of_sentence = True
        words_generated += 1
    return [vocab_builder.get_token_str(word) for word in encoded_prompt]

# Test it
print(random_sampling(model, "he lives in san", 1, 1, 1)) # Sanity check
print(random_sampling(model, "he lives in san", 1, 0.000001, 10)) # Sanity check
print(random_sampling(model, "he lives in san", 30, 0.5, 5))
print(random_sampling(model, "which is very", 30, 1, 5))
print(random_sampling(model, "which is very", 30, 2, 10))
print(random_sampling(model, "and here is another interesting fact about", 30, 0.5, 10))
print(random_sampling(model, "and here is another interesting fact about", 30, 0.5, 3))
print(random_sampling(model, "and here is another interesting fact about", 30, 2, 100))

['he', 'lives', 'in', 'san', 'francisco']
['he', 'lives', 'in', 'san', 'francisco']
['he', 'lives', 'in', 'san', 'francisco', '.', 'in', 'the', 'same', 'month', ',', 'UNKNOWN', "'s", 'father', 'died', 'in', 'an', 'accident', '.', 'he', 'also', 'UNKNOWN', 'a', 'new', 'family', 'for', 'the', 'family', '.', 'END']
['which', 'is', 'very', 'close', '.', 'in', 'the', 'past', ',', 'UNKNOWN', 'is', 'UNKNOWN', '.', 'he', 'was', 'also', 'a', 'member', 'who', 'had', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', ',', 'and', 'his', 'wife', 'was', 'murdered', 'by', 'the', 'UNKNOWN', ',']
['which', 'is', 'very', 'well', 'received', 'by', 'its', 'use', ',', 'but', 'rather', 'the', 'UNKNOWN', '.', 'this', 'allows', 'them', 'with', 'a', 'higher', 'chance', 'to', 'have', 'to', 'have', 'any', 'more', 'effect', ',', 'such', 'UNKNOWN', '.', 'a']
['and', 'here', 'is', 'another', 'interesting', 'fact', 'about', 'that', ',', 'in', 'order', 'to', 'prove', 'what', '``', 'a', 'certain', 'part', 'or', 'the', 'UNKNOWN', 'of', '