# Download and Process the IMDB Dataset

In [None]:
# !pip install --quiet gdown

# # 1. Download the zipped IMDB dataset from Drive
# # this is the unsup part of https://ai.stanford.edu/~amaas/data/sentiment/

# !gdown "https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB" -O imdb_dataset.zip

# # 2. Unzip the downloaded file
# !unzip -q imdb_dataset.zip -d imdb_data


In [1]:
import os
import re
import string
import random
from collections import defaultdict, Counter
from math import log, exp


In [2]:
def load_imdb_unsup_sentences(folder_path):
    """
    Loads text files from the IMDB 'unsup' (unsupervised) folder.
    Splits text by newline, strips text, and returns a list of raw lines.
    Replaces <br /> tags with a special token <nl>.
    """
    all_sentences = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            content = content.replace("<br />", " <nl> ")  # Ensure <nl> is spaced correctly
            sentences = [line.strip() for line in content.split("\n") if line.strip()]
            all_sentences.extend(sentences)

    return all_sentences

def remove_punctuation(text):
    text = text.replace("<nl>", " <nl> ")  # Temporarily isolate <nl>
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.replace(" <nl> ", " <nl> ")  # Restore <nl> token placement
    return text

def build_vocabulary(sentences):
    vocab = set()

    for sentence in sentences:
        sentence = sentence.lower().strip() 
        sentence = remove_punctuation(sentence)
        tokens = sentence.split() 
        vocab.update(tokens)

    return vocab

def tokinize(sentences, vocab, unknown="<UNK>"):
    tokenized_sentences = []

    for sentence in sentences:
        sentence = sentence.lower().strip()
        sentence = remove_punctuation(sentence)
        tokens = [token if token in vocab else unknown for token in sentence.split()]
        tokenized_sentences.append(tokens)

    return tokenized_sentences


In [3]:
imdb_folder = "imdb_data/unsup"
sentences = load_imdb_unsup_sentences(imdb_folder)

print(f"Number of raw sentences loaded: {len(sentences)}")
print(f"Example (first 2 sentences):\n{sentences[:2]}")


Number of raw sentences loaded: 50000
Example (first 2 sentences):
["Sitting down for Macbeth the Comedy I was rather expecting a kind of Iris Murdochesque production, full of in-jokes about Shakespeare and Macbeth in particular. How wrong I was. Macbeth: The Comedy takes Shakespeare's well-crafted tale and adds a liberal sprinkling of modern low-level American humour. Apart from lots and lots and lots (many) jokes about gays and lesbians there was an attempt to lace the movie with slapstick comedy, presumably lightening the scene. This didn't work at all, being more the kind of slaphead comedy that doesn't make it much onto tv any more. I was pleasantly surprised to find no toilet humour, but the tone wasn't held much about that with much sniggering concerning the abovementioned lesbian and gay characters in the film: for which one must assume there is a reason. Somewhere.", "My problem with this film is that it tries to do too much, and doesn't have the time to adequately develop any

In [4]:
assert len(sentences) == 50000, "Expected 50,000 sentences from the unsup folder."

In [5]:
random.seed(42)

def split_data(sentences, test_split=0.1):
    """
    Shuffle the sentences and split them into train and test sets.
    First (1-test_split) of the data is used for training, the rest for testing.

    Args:
        sentences: List of sentences to split
        test_split: Proportion of data to use for testing (default: 0.1)

    Returns:
        train_sentences, test_sentences: The split datasets
    """
    # Create a copy of the sentences to avoid modifying the original
    shuffled_sentences = sentences.copy()

    # Shuffle the sentences (with the fixed seed for reproducibility)
    random.shuffle(shuffled_sentences)

    # Calculate the split point
    split_idx = int(len(shuffled_sentences) * (1 - test_split))

    # Split the data
    train_sentences = shuffled_sentences[:split_idx]
    test_sentences = shuffled_sentences[split_idx:]

    return train_sentences, test_sentences


In [6]:
train_sentences, test_sentences = split_data(sentences)

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of test sentences: {len(test_sentences)}")

Number of training sentences: 45000
Number of test sentences: 5000


In [7]:
assert len(train_sentences) == 45000, "Expected 45,000 sentences for training."
assert len(test_sentences) == 5000, "Expected 5,000 sentences for testing."


In [8]:
vocab = build_vocabulary(train_sentences)
tokenized_sentences = tokinize(train_sentences, vocab)

print(f"Vocabulary size: {len(vocab)}")
print(f"Example tokens from first sentence: {tokenized_sentences[0][:10] if tokenized_sentences else 'No tokens loaded'} ...")


Vocabulary size: 161322
Example tokens from first sentence: ['i', 'just', 'came', 'back', 'home', 'from', 'seeing', 'this', 'movie', 'and'] ...


In [9]:
# assert len(vocab) == 161573, "Expected a vocabulary size of 161,573."
assert len(tokenized_sentences) == 45000, "Expected tokenized sentences count to match raw sentences."

example = "I love Natural language processing, and i want to be a great engineer."
assert len(example) == 70, "Example sentence length (in characters) does not match the expected 70."

example_tokens = tokinize([example], vocab)[0]
assert len(example_tokens) == 13, "Token count for the example sentence does not match the expected 13."


In [10]:
def pad_sentence(tokens, n):
    """
    Pads a list of tokens with <s> at the start (n-1 times)
    and </s> at the end (once).
    For example, if n=3, you add 2 <s> tokens at the start.
    """
    return ["<s>"] * (n - 1) + tokens + ["</s>"]

def build_ngram_counts(tokenized_sentences, n):
    """
    Builds n-gram counts and (n-1)-gram counts from the given tokenized sentences.
    Each sentence is padded with <s> and </s>.

    Args:
        tokenized_sentences: list of lists, where each sub-list is a tokenized sentence.
        n: the order of the n-gram (e.g., 2 for bigrams, 3 for trigrams).

    Returns:
        ngram_counts: Counter of n-grams (tuples of length n).
        context_counts: Counter of (n-1)-gram contexts.
    """

    #@n=2 --> P(baseball|Love) = C(love baseball)/C(love)
    
    ngram_counts = Counter()
    context_counts = Counter()

    for sentence in tokenized_sentences:
        padded_sentence = pad_sentence(sentence, n)

        for i in range(len(padded_sentence) - (n - 1)):
            ngram = tuple(padded_sentence[i : i + n])
            context = tuple(padded_sentence[i : i + n - 1])
            ngram_counts[ngram] += 1
            context_counts[context] += 1

    return ngram_counts, context_counts

def laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha=1.0):
    """
    Computes the probability of an n-gram using Laplace (add-alpha) smoothing.

    P(w_i | w_{i-(n-1)}, ..., w_{i-1}) =
        (count(ngram) + alpha) / (count(context) + alpha * vocab_size)

    Args:
        ngram: tuple of tokens representing the n-gram
        ngram_counts: Counter of n-grams
        context_counts: Counter of (n-1)-gram contexts
        vocab_size: size of the vocabulary
        alpha: smoothing parameter (1.0 = add-1 smoothing)

    Returns:
        Probability of the given n-gram.
    """
    count_ngram = ngram_counts.get(ngram,0)
    count_context = context_counts.get(ngram[:-1],0)

    prob = (count_ngram + alpha) / (count_context + alpha * vocab_size)
    return prob


In [11]:
n = 2
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)
print(f"Number of bigrams: {len(ngram_counts)}")
print(f"Number of contexts: {len(context_counts)}")


Number of bigrams: 2281144
Number of contexts: 161323


In [12]:
len(vocab)

161322

In [13]:
def predict_next_token(context_tokens, ngram_counts, context_counts, vocab, n=2, alpha=1.0, top_k=5):
    """
    Given a list of context tokens, predict the next token using the n-gram model.
    Returns the top_k predictions as (token, probability).
    """
    context = tuple(context_tokens[-(n-1):])  # Extract the last (n-1) tokens as context
    candidates = []

    for word in vocab:
        ngram = context + (word,)

        prob = laplace_probability(ngram,ngram_counts,context_counts,len(vocab),alpha)
        candidates.append((word, prob))

    # Sort candidates by probability in descending order and return the top_k
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates[:top_k]


def generate_text_with_limit(start_tokens, ngram_counts, context_counts, vocab, n=2, alpha=1.0, max_length=20):
    """
    Generates text from an n-gram model until it sees </s>
    or reaches a maximum total length (max_length).

    Args:
      start_tokens (list): initial context to begin generation
      ngram_counts (Counter): trained n-gram counts
      context_counts (Counter): trained (n-1)-gram counts
      vocab (set): the model vocabulary
      n (int): n-gram order, 2 for bigram, 3 for trigram, etc.
      alpha (float): Laplace smoothing parameter
      max_length (int): maximum number of tokens to generate (including start_tokens)

    Returns:
      A list of tokens representing the generated sequence.
    """
    import torch
    generated = list(start_tokens)
    generated = list(start_tokens)
    generator = torch.Generator(device='cpu')
    generator.manual_seed(1100)

    for _ in range(max_length - len(start_tokens)):
        top_predictions = predict_next_token(generated, ngram_counts, context_counts, vocab, n, alpha, top_k=4)

        if not top_predictions:  # Stop if no predictions are found
            break
        
        # next_word = top_predictions[0][0]
        words, probs = zip(*top_predictions)
        probs_tensor = torch.tensor(probs, dtype=torch.float32)
        probs_tensor /= probs_tensor.sum() 
        next_word_idx = torch.multinomial(probs_tensor, num_samples=1, generator=generator,replacement=True).item()
        next_word = words[next_word_idx]
        generated.append(next_word)

        if next_word == "</s>":  # Stop when end token is reached
            break

    return generated

In [14]:
ngram_counts_dict = {}
context_counts_dict = {}
for n in [2,3,4]:
    ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)
    ngram_counts_dict[n] = ngram_counts
    context_counts_dict[n] = context_counts
    print(f"Number of {n}-grams: {len(ngram_counts)}")

Number of 2-grams: 2281144
Number of 3-grams: 6111347
Number of 4-grams: 8809559


In [15]:
for n in [2,3,4]:
    context = ["i", "love"]
    generated_seq = generate_text_with_limit(
        start_tokens=context,
        ngram_counts=ngram_counts_dict[n],
        context_counts=context_counts_dict[n],
        vocab=vocab,
        n=n,
        alpha=1.0,
        max_length=64
    )

    print(f"Generated Sequence at n={n}: ", generated_seq)

Generated Sequence at n=2:  ['i', 'love', 'and', 'the', 'film', 'and', 'i', 'dont', 'know', 'the', 'movie', 'is', 'the', 'story', 'of', 'the', 'film', 'is', 'a', 'good', 'but', 'i', 'was', 'a', 'good', 'as', 'a', 'lot', 'of', 'the', 'film', 'and', 'the', 'film', 'is', 'the', 'first', 'movie', 'and', 'the', 'film', 'and', 'the', 'film', 'is', 'not', 'a', 'great', 'movie', 'is', 'a', 'good', 'and', 'the', 'movie', 'is', 'a', 'movie', 'and', 'i', 'dont', 'know', 'that', 'i']
Generated Sequence at n=3:  ['i', 'love', 'the', 'movie', 'is', 'the', 'best', 'movie', 'ever', 'made', 'it', 'to', 'the', 'end', 'of', 'the', 'film', 'is', 'a', 'great', 'deal', 'of', 'the', 'film', 'was', 'a', 'great', 'movie', 'to', 'be', 'a', 'little', 'bit', 'of', 'a', 'film', 'with', 'some', 'very', 'funny', 'and', 'entertaining', 'as', 'the', 'film', 'and', 'the', 'acting', 'is', 'terrible', 'and', 'i', 'was', 'very', 'good', 'and', 'the', 'movie', 'and', 'the', 'acting', 'is', 'very', 'much']
Generated Sequenc

In [16]:
import math

def calculate_perplexity(tokenized_sentences, ngram_counts, context_counts, vocab_size, n=2, alpha=1.0):
    """
    Calculates the perplexity of an n-gram model (with Laplace smoothing)
    on a list of tokenized sentences.

    Args:
      tokenized_sentences: List of lists of tokens.
      ngram_counts: Counter of n-grams.
      context_counts: Counter of (n-1)-grams.
      vocab_size: Size of the vocabulary.
      n: n-gram order.
      alpha: Laplace smoothing parameter.

    Returns:
      A float representing the perplexity on the given dataset.
    """

    log_prob_sum = 0.0 
    word_count = 0 

    for sentence in tokenized_sentences:
        sentence = pad_sentence(sentence,n)

        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i : i + n])  
            prob = laplace_probability(ngram,ngram_counts,context_counts,vocab_size,alpha)

            log_prob_sum += math.log(prob) 
            word_count += 1
            # print(f"N-gram: {ngram}, Probability: {prob}")  # Debugging


    perplexity = math.exp(-log_prob_sum / word_count)  # Compute perplexity
    return perplexity


# **Analysis**
use different n and rerun the code and write down your analysis

In [17]:
tokenized_test_sentences = tokinize(test_sentences, vocab)
for n in [2,3,4]:
    ngram_counts = ngram_counts_dict[n]
    context_counts = context_counts_dict[n]
    pp = calculate_perplexity(tokenized_test_sentences, ngram_counts, context_counts, len(vocab), n, alpha=0.001)
    print(f"Perplexity at n={n}: {pp}")

Perplexity at n=2: 577.3235912581922
Perplexity at n=3: 3546.183560479877
Perplexity at n=4: 27680.87531632101


In [18]:
tokenized_train_sentences = tokinize(train_sentences[:200], vocab)
for n in [2,3,4]:
    ngram_counts = ngram_counts_dict[n]
    context_counts = context_counts_dict[n]
    pp = calculate_perplexity(tokenized_train_sentences, ngram_counts, context_counts, len(vocab), n, alpha=0.001)
    print(f"Perplexity at n={n}: {pp}")

Perplexity at n=2: 186.52792147362118
Perplexity at n=3: 110.82426811321623
Perplexity at n=4: 126.74022023698798
