In [210]:
import re

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch._tensor import Tensor
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

import os
import pandas as pd
import math

# Deep Learning for NLP - Lab Assignment 2

## Loading Data


In [211]:
DATA_PATH = "../data/google" # Folder path for the train/test/dev .tsv files

train_data_raw = pd.read_csv(os.path.join(DATA_PATH, "train.tsv"), sep='\t', header=None)[0].tolist()[:5000] # Necessary to take a subset of the dataset due to computation constraints
test_data_raw = pd.read_csv(os.path.join(DATA_PATH, "test.tsv"), sep='\t', header=None)[0].tolist()[:500]
dev_data_raw = pd.read_csv(os.path.join(DATA_PATH, "dev.tsv"), sep='\t', header=None)[0].tolist()[:500]

In [212]:
def clean_str(string: str, tolower:bool=True) -> str:
    """
    Tokenization/string cleaning.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    if tolower:
        string = string.lower()
    return string.strip()


train_data, test_data, dev_data = [], [], []

for sentence in train_data_raw:
    train_data.append(clean_str(sentence))
for sentence in dev_data_raw:
    dev_data.append(clean_str(sentence))
for sentence in test_data_raw:
    test_data.append(clean_str(sentence))

## Creating vocabulary


In [213]:
class WordDict:
    """Word dictionnary class.
    """
    # constructor, words must be a set containing all words
    def __init__(self, words:set) -> None:
        """Initialize a word dictionnary

        Args:
            words (set): set of all words in a dataset
        """
        assert type(words) == set
        self.word_to_idx = {word: idx for idx, word in enumerate(words)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

    def word_to_id(self, word:str) -> int:
        """Return the integer associated with a word.

        Args:
            word (str): word

        Returns:
            int: index of the word in the vocabulary
        """
        return self.word_to_idx[word]
    
    def id_to_word(self, idx:int) -> str:
        """Return the word associated with an integer.

        Args:
            idx (int): integer

        Returns:
            str: word at that index in the word dictionnary
        """
        return self.idx_to_word[idx]
    
    def __len__(self) -> int:
        """Compute length of the dictionnary

        Returns:
            int: length of the dictionnary of words
        """
        return len(self.word_to_idx)

In [214]:
train_words = set()

for sentence in train_data:
    train_words.update(sentence.split(" "))

train_words.update(["<bos>", "<eos>", "<unk>", "<pad>"])

word_dict = WordDict(train_words)

print("Number of words :", len(word_dict))
print(list(word_dict.idx_to_word.items())[0:5]) # Excerpt of the idx_to_word dictionnary

Number of words : 8159
[(0, 'caught'), (1, 'corpus'), (2, 'dragged'), (3, 'freaking'), (4, 'racism')]


## Neural N-Gram model


In [215]:
class NGramDataset(Dataset):
    """Dataset for training the N-gram model, based on the Pytorch Dataset class.
    """
    def __init__(self, sentences:list[str], vocab:WordDict, context_size:int) -> None:
        """N-gram Dataset to use with the DataLoader feature of torch during training

        Args:
            sentences (list[str]): List of sentences
            vocab (WordDict): _description_
            context_size (int): _description_
        """
        super().__init__()
        self.data = []
        self.vocab = vocab
        self.context_size = context_size

        for sentence in sentences:
            tokens = ["<bos>"] * context_size + sentence.split(" ") + ["<eos>"] # Pad the sentence and extract words. We pad the sentence with context_size <bos> tokens for generating the first word.
            indices = [vocab.word_to_id(word) if word in vocab.word_to_idx.keys() else vocab.word_to_id('<unk>') for word in tokens] # Tokenize the sentence using the WordDict
            for i in range(context_size, len(indices)):
                self.data.append((torch.tensor(indices[i-context_size:i]),
                                              torch.tensor(indices[i]))) # Append the training data with a tuple ([word_1, word_2 ,...], word_n)

    def __len__(self) -> int:
        """Compute len of dataset (necessary)

        Returns:
            int: length of the dataset
        """
        return len(self.data)
    
    def __getitem__(self, idx:int) -> tuple[Tensor, Tensor]:
        """Return a tuple of tensor, containing encodings of the n-grams and next word to predict

        Args:
            idx (int): index of the item in the dataset

        Returns:
            tuple[Tensor, Tensor]: Tuple with n-gram data and next word in the sentence
        """
        return self.data[idx]

In [216]:
# Define the context size (e.g., 2 for bigrams)
CONTEXT_SIZE = 5  # For n-grams, context_size = n - 1

# Dataset
ngram_train_dataset = NGramDataset(train_data, word_dict, CONTEXT_SIZE)
ngram_dev_dataset = NGramDataset(dev_data, word_dict, CONTEXT_SIZE)
ngram_test_dataset = NGramDataset(test_data, word_dict, CONTEXT_SIZE)

print("Train_dataset length :", len(ngram_train_dataset))
print("Training item example :", ngram_train_dataset[123]) # Example of item in the training data
print("Training sentence example :", " ".join([word_dict.id_to_word(id) for id in ngram_train_dataset[123][0].tolist()]), "; Target word example :", word_dict.id_to_word(ngram_train_dataset[123][1].item()))

Train_dataset length : 74083
Training item example : (tensor([4547, 1234,  943, 2189, 2597]), tensor(2550))
Training sentence example : youtube and outrage drama is ; Target word example : super


In [217]:
class NeuralNGramModel(nn.Module):
    """Class for the Neural N-gram module, based on the Pytorch based class.
    """
    def __init__(self, vocab_size:int, embed_size:int, context_size:int, hidden_size:int) -> None:
        """Initialize the Neural N-gram model

        Args:
            vocab_size (int): Number of words in the dictionnary
            embed_size (int): Embedding size
            context_size (int): Number 
            hidden_size (int): _description_
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.fc1 = nn.Linear(embed_size * context_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x:Tensor) -> Tensor:
        """Forward pass of the model

        Args:
            x (Tensor): input of the model (batched), size (batch_size, context_size)

        Returns:
            Tensor: output of the model, in the form of a (vocab_size, x.shape(1)) tensor
        """

        embeds = self.embeddings(x) # (batch_size, context_size, embed_size)
        embeds = embeds.view(embeds.size(0), -1) # (batch_size, embed_size * context_size)

        hidden = self.dropout(F.relu(self.fc1(embeds))) # (batch_size, hidden_size)
        output = self.fc2(hidden) # (batch_size, vocab_size)
        
        return output


In [218]:
# Model and training parameters
NGRAM_EMBED_SIZE = 128
NGRAM_HIDDEN_SIZE = 64
NGRAM_BATCH_SIZE = 32
NGRAM_EPOCHS = 10

# DataLoader
ngram_train_loader = DataLoader(ngram_train_dataset, batch_size=NGRAM_BATCH_SIZE, shuffle=True)
ngram_dev_loader = DataLoader(ngram_dev_dataset, batch_size=NGRAM_BATCH_SIZE)

# Initialize Model, Loss, Optimizer
vocab_size = len(word_dict)
ngram_model = NeuralNGramModel(vocab_size, NGRAM_EMBED_SIZE, CONTEXT_SIZE, NGRAM_HIDDEN_SIZE)
ngram_criterion = nn.CrossEntropyLoss(ignore_index=word_dict.word_to_id('<pad>')) # We can see the problem as a multi-label classification problem, classyfying among all possibles words in the vocab
ngram_optimizer = optim.Adam(ngram_model.parameters())

# Training Loop

for epoch in range(NGRAM_EPOCHS):
    ngram_model.train()
    ngram_train_loss = 0

    # Train loop
    for context, target in ngram_train_loader:

        # Forward pass
        output = ngram_model(context)
        loss = ngram_criterion(output, target)

        # Backpropagation
        ngram_optimizer.zero_grad()
        loss.backward()
        ngram_optimizer.step()

        ngram_train_loss += loss.item()
    
    # Dev loop
    ngram_model.eval()
    ngram_dev_loss = 0
    with torch.no_grad():
        for context, target in ngram_dev_loader:

            # Forward pass
            output = ngram_model(context)
            loss = ngram_criterion(output, target)
            ngram_dev_loss += loss.item()

    ngram_train_loss /= len(ngram_train_loader)
    ngram_dev_loss /= len(ngram_dev_loader)

    print(f"Epoch [{epoch+1}/{NGRAM_EPOCHS}], "
            f"Train Loss: {ngram_train_loss:.2f}, "
            f"Dev Loss: {ngram_dev_loss:.2f}")

Epoch [1/10], Train Loss: 6.68, Dev Loss: 6.50
Epoch [2/10], Train Loss: 6.23, Dev Loss: 6.54
Epoch [3/10], Train Loss: 5.99, Dev Loss: 6.70
Epoch [4/10], Train Loss: 5.79, Dev Loss: 6.99
Epoch [5/10], Train Loss: 5.62, Dev Loss: 7.25
Epoch [6/10], Train Loss: 5.47, Dev Loss: 7.70
Epoch [7/10], Train Loss: 5.34, Dev Loss: 8.09
Epoch [8/10], Train Loss: 5.24, Dev Loss: 8.46
Epoch [9/10], Train Loss: 5.15, Dev Loss: 8.93
Epoch [10/10], Train Loss: 5.07, Dev Loss: 9.40


## Sentence Generation


In [219]:
# Generation example
MAX_LEN = 25

ngram_model.eval()

context = ["<bos>"] * CONTEXT_SIZE


generated_sequence = context[:]
while generated_sequence[-1] != "<eos>" and len(generated_sequence) <= MAX_LEN:
    context_indices = torch.tensor([word_dict.word_to_id(word) for word in generated_sequence[-CONTEXT_SIZE:]], dtype = torch.long).unsqueeze(0)
    with torch.no_grad():
        output = ngram_model(context_indices)
        probabilities = torch.softmax(output, dim=-1)

    predicted_index = torch.argmax(probabilities, dim=-1).item()
    predicted_word = word_dict.id_to_word(predicted_index)
    generated_sequence.append(predicted_word)

print("Deterministic generated sentence :", " ".join(generated_sequence[CONTEXT_SIZE:-1]))

Deterministic generated sentence : i have the


In [220]:
# Generation example

ngram_model.eval()

context = ["<bos>"] * CONTEXT_SIZE
context_indices = torch.tensor([word_dict.word_to_id(word) for word in context], dtype = torch.long).unsqueeze(0)

for _ in range(10):
    generated_sequence = context[:]
    while generated_sequence[-1] != "<eos>" and len(generated_sequence) <= MAX_LEN:
        context_indices = torch.tensor([word_dict.word_to_id(word) for word in generated_sequence[-CONTEXT_SIZE:]], dtype = torch.long).unsqueeze(0)
        with torch.no_grad():
            output = ngram_model(context_indices)
            probabilities = torch.softmax(output, dim=-1)

        predicted_index = torch.multinomial(probabilities.squeeze(), num_samples=1).item()
        predicted_word = word_dict.id_to_word(predicted_index)
        generated_sequence.append(predicted_word)

    print("Random generated sequence :", " ".join(generated_sequence[CONTEXT_SIZE:-1]))

Random generated sequence : sorry for seeing one and no brushing from players people do you got guidance
Random generated sequence : no , but the women makeup , , before d do to the talent
Random generated sequence : that wouldn t lose the vehicles , so i dont see an pills ! an ago !
Random generated sequence : won that water started lists
Random generated sequence : it does n't dumb the considering is delusional about lot
Random generated sequence : name ! , i'm seen !
Random generated sequence : so so game that in back ever sometimes language
Random generated sequence : that won i used some have men is make going with stuff
Random generated sequence : very lol awesome , that , when all control
Random generated sequence : you ?


## LSTM-based Autoregressive Model


In [221]:
class LSTMDataset(Dataset):
    """Dataset for training the LSTM model, based on the Pytorch Dataset class.
    """
    def __init__(self, sentences:list[str], vocab:WordDict) -> None:
        """_summary_

        Args:
            sentences (list[str]): List od sentences.
            vocab (WordDict): _description_
        """
        super().__init__()
        self.data = []
        self.vocab = vocab

        for sentence in sentences:
            tokens = ["<bos>"] + sentence.split(" ") + ["<eos>"]
            indices = [vocab.word_to_id(word) if word in vocab.word_to_idx.keys() else vocab.word_to_id('<unk>') for word in tokens] # Tokenize the sentence using the WordDict
            
            self.data.append((torch.tensor(indices),
                              torch.tensor(indices[1:]+[self.vocab.word_to_id('<pad>')])))
            
    def __len__(self) -> int:
        """Compute len of dataset (necessary)

        Returns:
            int: length of the dataset
        """
        return len(self.data)
    
    def __getitem__(self, idx:int) -> tuple[Tensor, Tensor]:
        """Return a tuple of tensor, containing encodings of the n-grams and next word to predict

        Args:
            idx (int): index of the item in the dataset

        Returns:
            tuple[Tensor, Tensor]: Tuple with sentence and shifted sentence for generation
        """
        return self.data[idx]


def collate_fn(batch:list[tuple]) -> tuple[Tensor, Tensor]:
    """Custom collate function for dynamically padding sentences

    Args:
        batch (list[tuple]): batch of a sentence Dataset

    Returns:
        tuple[Tensor, Tensor]: Dynamicallt padded sentences
    """
    sentences, shifted_sentences = zip(*batch)

    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=word_dict.word_to_id('<pad>'))
    padded_shifted_sentences = pad_sequence(shifted_sentences, batch_first=True, padding_value=word_dict.word_to_id('<pad>'))

    return padded_sentences, padded_shifted_sentences

In [222]:
# Dataset
lstm_train_dataset = LSTMDataset(train_data, word_dict)
lstm_dev_dataset = LSTMDataset(dev_data, word_dict)

print("Train_dataset length :", len(lstm_train_dataset))
print("Training item example :", lstm_train_dataset[123]) # Example of item in the training data
print("Training sentence example :", " ".join([word_dict.id_to_word(id) for id in lstm_train_dataset[123][0].tolist()]), " ; Target sentence example :", " ".join([word_dict.id_to_word(id) for id in lstm_train_dataset[123][1].tolist()]))

Train_dataset length : 5000
Training item example : (tensor([7046, 3383, 6047, 1204, 4540, 7763, 3047, 1079, 7365, 6040]), tensor([3383, 6047, 1204, 4540, 7763, 3047, 1079, 7365, 6040, 5820]))
Training sentence example : <bos> three words , no subtlety dude stop seriously <eos>  ; Target sentence example : three words , no subtlety dude stop seriously <eos> <pad>


In [223]:
class LSTMModel(nn.Module):
    """Class for the LSTM module, based on the Pytorch based class.
    """
    def __init__(self, vocab_size:int, embed_size:int, hidden_size:int, dropout_prob:float=0.3) -> None:
        """Initialize the LSTM

        Args:
            vocab_size (int): Number of words in the dictionnary
            embed_size (int): Embedding size
            hidden_size (int): Size of the hidden layer
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

        self.dropout_prob = dropout_prob

    def variational_dropout(self, input:Tensor) -> Tensor:
        if self.training: # Apply variational dropout only during training
            mask = (torch.rand_like(input) > self.dropout_prob).float().to(input.device)
            mask = mask.div_(1.0 - self.dropout_prob)
            return input * mask
        return input # If not in training, return input

    def forward(self, x:Tensor) -> Tensor:
        """Forward pass of the model

        Args:
            x (Tensor): input of the model (batched), size (batch_size, context_size)

        Returns:
            Tensor: output of the model, in the form of a (vocab_size, x.shape(1)) tensor
        """

        embeds = self.embeddings(x) # (batch_size, seq_len, embed_size)
        embeds = self.variational_dropout(embeds) # (batch_size, seq_len, embed_size)

        output, _ = self.lstm(embeds) # (batch_size, seq_len, hidden_size)
        output = self.variational_dropout(output) # (batch_size, seq_len, hidden_size)

        output = self.fc(output) # (batch_size, seq_len, vocab_size)

        
        return output


In [224]:
# Model and training parameters
LSTM_EMBED_SIZE = 128
LSTM_HIDDEN_SIZE = 64
LSTM_BATCH_SIZE = 32
LSTM_EPOCHS = 20

# DataLoader
lstm_train_loader = DataLoader(lstm_train_dataset, batch_size=LSTM_BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
lstm_dev_loader = DataLoader(lstm_dev_dataset, batch_size=LSTM_BATCH_SIZE, collate_fn=collate_fn)

# Initialize Model, Loss, Optimizer
vocab_size = len(word_dict)
lstm_model = LSTMModel(vocab_size, LSTM_EMBED_SIZE, LSTM_HIDDEN_SIZE)
lstm_criterion = nn.CrossEntropyLoss(ignore_index=word_dict.word_to_id('<pad>')) # We can see the problem as a multi-label classification problem, classyfying among all possibles words in the vocab
lstm_optimizer = optim.Adam(lstm_model.parameters())

# Training Loop

for epoch in range(LSTM_EPOCHS):
    lstm_model.train()
    lstm_train_loss = 0

    # Train loop
    for context, target in lstm_train_loader:

        # Forward pass
        output = lstm_model(context)

        # Reshape for loss calculation
        output = output.view(-1, vocab_size) # (batch_size *  seq_len, vocab_size)
        target = target.view(-1) # (batch_size * seq_len)

        # Compute loss
        loss = lstm_criterion(output, target)

        # Backpropagation
        lstm_optimizer.zero_grad()
        loss.backward()
        lstm_optimizer.step()

        lstm_train_loss += loss.item()
    
    # Dev loop
    lstm_model.eval()
    lstm_dev_loss = 0
    with torch.no_grad():
        for context, target in lstm_dev_loader:

            # Forward pass
            output = lstm_model(context)

            # Reshape for loss calculation
            output = output.view(-1, vocab_size) # (batch_size *  seq_len, vocab_size)
            target = target.view(-1) # (batch_size * seq_len)

            # Compute loss
            loss = lstm_criterion(output, target)

            lstm_dev_loss += loss.item()

    lstm_train_loss /= len(lstm_train_loader)
    lstm_dev_loss /= len(lstm_dev_loader)

    print(f"Epoch [{epoch+1}/{LSTM_EPOCHS}], "
            f"Train Loss: {lstm_train_loss:.2f}, "
            f"Dev Loss: {lstm_dev_loss:.2f}")

Epoch [1/20], Train Loss: 7.29, Dev Loss: 6.45
Epoch [2/20], Train Loss: 6.33, Dev Loss: 6.39
Epoch [3/20], Train Loss: 6.21, Dev Loss: 6.33
Epoch [4/20], Train Loss: 6.10, Dev Loss: 6.27
Epoch [5/20], Train Loss: 6.00, Dev Loss: 6.23
Epoch [6/20], Train Loss: 5.91, Dev Loss: 6.19
Epoch [7/20], Train Loss: 5.84, Dev Loss: 6.15
Epoch [8/20], Train Loss: 5.77, Dev Loss: 6.14
Epoch [9/20], Train Loss: 5.71, Dev Loss: 6.11
Epoch [10/20], Train Loss: 5.65, Dev Loss: 6.10
Epoch [11/20], Train Loss: 5.59, Dev Loss: 6.10
Epoch [12/20], Train Loss: 5.54, Dev Loss: 6.09
Epoch [13/20], Train Loss: 5.50, Dev Loss: 6.09
Epoch [14/20], Train Loss: 5.45, Dev Loss: 6.09
Epoch [15/20], Train Loss: 5.41, Dev Loss: 6.08
Epoch [16/20], Train Loss: 5.37, Dev Loss: 6.08
Epoch [17/20], Train Loss: 5.34, Dev Loss: 6.08
Epoch [18/20], Train Loss: 5.30, Dev Loss: 6.09
Epoch [19/20], Train Loss: 5.28, Dev Loss: 6.09
Epoch [20/20], Train Loss: 5.23, Dev Loss: 6.10


## Sentence Generation


In [225]:
# Generation example

lstm_model.eval()

context = ["<bos>"]
context_indices = torch.tensor([word_dict.word_to_id(word) for word in context], dtype = torch.long).unsqueeze(0)

MAX_LEN = 25

generated_sequence = context[:]
while generated_sequence[-1] != "<eos>" and len(generated_sequence) <= MAX_LEN:
    context_indices = torch.tensor([word_dict.word_to_id(word) for word in generated_sequence], dtype = torch.long).unsqueeze(0)
    with torch.no_grad():
        output = lstm_model(context_indices)[:, -1]
        probabilities = torch.softmax(output, dim=-1)

    predicted_index = torch.argmax(probabilities, dim=-1).item()
    predicted_word = word_dict.id_to_word(predicted_index)
    generated_sequence.append(predicted_word)

print("Deterministic generated sentence :", " ".join(generated_sequence[1:-1]))

Deterministic generated sentence : i m not a lot of the same


In [226]:
# Generation example

lstm_model.eval()

context = ["<bos>"]
context_indices = torch.tensor([word_dict.word_to_id(word) for word in context], dtype = torch.long).unsqueeze(0)


for _ in range(10):
    generated_sequence = context[:]
    while generated_sequence[-1] != "<eos>" and len(generated_sequence) <= MAX_LEN:
        context_indices = torch.tensor([word_dict.word_to_id(word) for word in generated_sequence], dtype = torch.long).unsqueeze(0)
        with torch.no_grad():
            output = lstm_model(context_indices)[:, -1]
            probabilities = torch.softmax(output, dim=-1)

        predicted_index = torch.multinomial(probabilities.squeeze(), num_samples=1).item()
        predicted_word = word_dict.id_to_word(predicted_index)
        generated_sequence.append(predicted_word)

    print("Random generated sentence :", " ".join(generated_sequence[1:-1]))

Random generated sentence : name hopefully bud , it 's awful , it 's allowed but small cute to 2018 the people fell
Random generated sentence : it was n't did no a means on getting enough knowledge participants this was saying though
Random generated sentence : wow , i saw the world ca
Random generated sentence : i think it was kind of luck in the hires when you 're gainfully only that is a like an island women ghosting would
Random generated sentence : i'm the authors of 35 sorry you re wo n't see so enough haha better in it
Random generated sentence : drugs are have awful in a for name name
Random generated sentence : we are interest
Random generated sentence : the fuck sunny
Random generated sentence : hello mate and spending very conservative ! !
Random generated sentence : false is okay i hope it makes me feel less


## Perplexity


In [227]:
class Perplexity:
    """Perplexity computation
    """
    def __init__(self) -> None:
        """Init method.
        """
        self.reset()
    def reset(self) -> None:
        """Rset method.
        """
        self.log_sum = 0
        self.total_words = 0
        self.log_sum_list = []

    def add_sentence(self, log_probs) -> None:
        """Compute values for one sentence and store it in the class.
        """
        self.log_sum += log_probs
        self.total_words += 1
        self.log_sum_list += [self.log_sum]

    def compute_perplexity(self) -> float:
        """Compute full Perplexity

        Returns:
            float: Final perplexity
        """
        return math.exp(-self.log_sum / self.total_words)

In [228]:
ngram_model.eval()
perplexity_object = Perplexity()

# Dataset & DataLoader
ngram_test_dataset = NGramDataset(train_data, word_dict, CONTEXT_SIZE)
ngram_test_loader = DataLoader(ngram_test_dataset, batch_size=NGRAM_BATCH_SIZE)

with torch.no_grad():
    for context, target in ngram_test_loader:
        output = ngram_model(context)
        probs = torch.log_softmax(output, dim=1)
        for i in range(len(target)):
            perplexity_object.add_sentence(probs[i, target[i]].item())

perplexity = perplexity_object.compute_perplexity()

print(f"N-Gram Model Perplexity : {perplexity:.2f}")

N-Gram Model Perplexity : 96.34


In [229]:
unk_count = sum(1 for _, next_word in ngram_test_dataset if next_word.item() == word_dict.word_to_id("<unk>"))
print(f"Number of unknown words in test set: {unk_count}")

Number of unknown words in test set: 0


In [230]:
import numpy as np

for i in np.random.choice(len(ngram_train_dataset), 10):
    context, next_word = ngram_train_dataset[i]  
    with torch.no_grad():
        probs = torch.softmax(ngram_model(context.unsqueeze(0)), dim=1)
        prob = probs[0, next_word].item()
        max_prob = torch.max(probs[0])
        max_word = torch.argmax(probs[0])
        context_sentence = " ".join([word_dict.id_to_word(id) for id in context.tolist()])
        print(f"Context: {context_sentence} ; True next word: {word_dict.id_to_word(next_word.item())} ; Probability: {prob} ; Maximum Probability: {max_prob.item()} ; Predicted word: {word_dict.id_to_word(max_word.item())}")
        if prob == 0:
            print(f"Zero probability for context: {context} and next word: {next_word}")


Context: , but other than that ; True next word: <eos> ; Probability: 0.05028929561376572 ; Maximum Probability: 0.05028929561376572 ; Predicted word: <eos>
Context: little improvements , something slightly ; True next word: different ; Probability: 0.0010384854394942522 ; Maximum Probability: 0.050557348877191544 ; Predicted word: <eos>
Context: i called them turd gen ; True next word: , ; Probability: 0.1950800120830536 ; Maximum Probability: 0.23016342520713806 ; Predicted word: <eos>
Context: oh , totally mistaken thanks ; True next word: for ; Probability: 0.256400465965271 ; Maximum Probability: 0.256400465965271 ; Predicted word: for
Context: in a lot further than ; True next word: i ; Probability: 0.01578563265502453 ; Maximum Probability: 0.03158121928572655 ; Predicted word: <eos>
Context: do n't understand humor imagine ; True next word: living ; Probability: 0.0006788246682845056 ; Maximum Probability: 0.05318064242601395 ; Predicted word: the
Context: <bos> <bos> <bos> <bo