# Download and Process the IMDB Dataset

In [31]:
!pip install --quiet gdown

# 1. Download the zipped IMDB dataset from Drive
# this is the unsup part of https://ai.stanford.edu/~amaas/data/sentiment/

!gdown "https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB" -O imdb_dataset.zip

# 2. Unzip the downloaded file
!unzip -q imdb_dataset.zip -d imdb_data

Downloading...
From (original): https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB
From (redirected): https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB&confirm=t&uuid=1c97ca0d-b779-4ce4-9ce1-da694e46d5e2
To: /content/imdb_dataset.zip
100% 44.7M/44.7M [00:00<00:00, 94.0MB/s]
replace imdb_data/unsup/0_0.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace imdb_data/unsup/10000_0.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [32]:
import os
import re
import string
import random
from collections import defaultdict, Counter
import math
from math import log, exp

In [33]:

def load_imdb_unsup_sentences(folder_path):
    """
    Loads text files from the IMDB 'unsup' (unsupervised) folder.
    split text by newline, strips text, and returns a list of raw lines.
    replace <br /> tags with special token <nl> token.
    """
    all_sentences = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            content = content.replace("<br />", " <nl> ")  # Ensure <nl> is spaced correctly
            sentences = [line.strip() for line in content.split("\n") if line.strip()]
            all_sentences.extend(sentences)

    return all_sentences

def remove_punctuation(text):
    """
    Removes punctuation from the text,
    but keeps <nl> tokens intact.
    """

    text = text.replace("<nl>", " <nl> ")  # Temporarily isolate <nl>
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.replace(" <nl> ", " <nl> ")  # Restore <nl> token placement
    return text

def build_vocabulary(sentences):
    """
    lower each sentence,
    Splits each sentence on whitespace, removes punctuation,
    and builds a set of unique tokens (vocabulary).
    """
    vocab = set()

    for sentence in sentences:
        sentence = sentence.lower().strip()
        sentence = remove_punctuation(sentence)
        tokens = sentence.split()
        vocab.update(tokens)

    return vocab

def tokinize(sentences, vocab, unknown="<UNK>"):
    """
    lower each sentence,
    Splits each sentence on whitespace, removes punctuation,
    and replaces tokens not in the vocabulary with unknowen token.
    Returns the list of tokenized sentences.
    """
    tokenized_sentences = []

    for sentence in sentences:
        sentence = sentence.lower().strip()
        sentence = remove_punctuation(sentence)
        tokens = [token if token in vocab else unknown for token in sentence.split()]
        tokenized_sentences.append(tokens)

    return tokenized_sentences

In [34]:
imdb_folder = "imdb_data/unsup"
sentences = load_imdb_unsup_sentences(imdb_folder)

print(f"Number of raw sentences loaded: {len(sentences)}")
print(f"Example (first 2 sentences):\n{sentences[:2]}")

Number of raw sentences loaded: 50000
Example (first 2 sentences):
['At the beginning of this film, there\'s a tight shot on Brooke Shields\' baby face: she\'s watching something with interest and we hear a woman moaning just in front of her. Since we all know what "Pretty Baby" is about, one is to assume the child is watching some sexual act with curiosity. Actually, it\'s just the opposite. This is writer-director Louis Malle\'s clever way of laughing at the viewer, saying "You have the dirty mind, not I." It\'s a very smart way to begin to the picture, but little else occupied my mind after it got going. Why would Keith Carradine\'s colorless older man want to marry a pubescent prostitute? Nobody here is saying, especially not Carradine (who has one sullen expression to express every emotion). The photography and background scoring are gorgeous, however the story and characters provide no passion, no emotion. The film is like a stylish painting, but one full of dullards. *1/2 from *

In [35]:
assert len(sentences) == 50000, "Expected 50,000 sentences from the unsup folder."

In [36]:
random.seed(42)

def split_data(sentences, test_split=0.1):
    """
    shuffle the sentences
    split them into train and test sets (first 1-test_split of the data is the training)
    return the train and test sets
    """
    sentences_copy = sentences.copy()
    random.shuffle(sentences_copy)
    split_point = int(len(sentences_copy) * (1 - test_split))

    # Split the data
    train_sentences = sentences_copy[:split_point]
    test_sentences = sentences_copy[split_point:]

    return train_sentences, test_sentences

In [37]:
train_sentences, test_sentences = split_data(sentences)

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of test sentences: {len(test_sentences)}")

Number of training sentences: 45000
Number of test sentences: 5000


In [38]:
assert len(train_sentences) == 45000, "Expected 45,000 sentences for training."
assert len(test_sentences) == 5000, "Expected 5,000 sentences for testing."


In [39]:
vocab = build_vocabulary(train_sentences)
tokenized_sentences = tokinize(train_sentences, vocab)

print(f"Vocabulary size: {len(vocab)}")
print(f"Example tokens from first sentence: {tokenized_sentences[0][:10] if tokenized_sentences else 'No tokens loaded'} ...")


Vocabulary size: 161573
Example tokens from first sentence: ['i', 'watched', 'this', 'movie', 'just', 'for', 'the', 'sake', 'of', 'a'] ...


In [40]:
# assert len(vocab) == 161292, "Expected a vocabulary size of 171,591." #skip for replication problems
assert len(tokenized_sentences) == 45000, "Expected tokenized sentences count to match raw sentences."

example = "I love Natural language processing, and i want to be a great engineer."
assert len(example) == 70, "Example sentence length (in characters) does not match the expected 70."

example_tokens = tokinize([example], vocab)[0]
assert len(example_tokens) == 13, "Token count for the example sentence does not match the expected 13."


In [41]:
def pad_sentence(tokens, n):
    """
    Pads a list of tokens with <s> at the start (n-1 times)
    and </s> at the end (once).
    For example, if n=3, you add 2 <s> tokens at the start.
    """
    padded = ["<s>"] * (n - 1) + tokens + ["</s>"]
    return padded

def build_ngram_counts(tokenized_sentences, n, vocab=None):
    """
    Builds n-gram counts and (n-1)-gram counts from the given tokenized sentences.
    Each sentence is padded with <s> and </s>.

    Args:
        tokenized_sentences: list of lists, where each sub-list is a tokenized sentence.
        n: the order of the n-gram (e.g., 2 for bigrams, 3 for trigrams).
        vocab: set of known words. If provided, you can choose to handle out-of-vocab tokens.

    Returns:
        ngram_counts: Counter of n-grams (tuples of length n).
        context_counts: Counter of (n-1)-gram contexts.
    """
    #@n=2 --> P(baseball|Love) = C(love baseball)/C(love)
    ngram_counts = Counter()
    context_counts = Counter()

    for tokens in tokenized_sentences:
        # Pad the sentence
        padded_tokens = pad_sentence(tokens, n)

        # Count n-grams and (n-1)-grams
        for i in range(len(padded_tokens) - n + 1):
            # Extract the n-gram
            ngram = tuple(padded_tokens[i:i+n])
            ngram_counts[ngram] += 1

            # Extract the context (first n-1 tokens of the n-gram)
            context = tuple(padded_tokens[i:i+n-1])
            context_counts[context] += 1

    return ngram_counts, context_counts

def laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha=1.0):
    """
    Computes the probability of an n-gram using Laplace (add-alpha) smoothing.

    P(w_i | w_{i-(n-1)}, ..., w_{i-1}) =
        (count(ngram) + alpha) / (count(context) + alpha * vocab_size)

    Args:
        ngram: tuple of tokens representing the n-gram
        ngram_counts: Counter of n-grams
        context_counts: Counter of (n-1)-gram contexts
        vocab_size: size of the vocabulary
        alpha: smoothing parameter (1.0 = add-1 smoothing)

    Returns:
        Probability of the given n-gram.
    """

    # ngram_count = ngram_counts[ngram]
    # context_count = context_counts[context]
    count_ngram = ngram_counts.get(ngram,0)
    context = ngram[:-1]
    count_context = context_counts.get(context,0)

    # Calculate probability using Laplace smoothing
    prob = (count_ngram + alpha) / (count_context + alpha * vocab_size)

    return prob

In [42]:
n = 2
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n, vocab=vocab)
print(f"Number of bigrams: {len(ngram_counts)}")
print(f"Number of contexts: {len(context_counts)}")

Number of bigrams: 2281027
Number of contexts: 161574


In [43]:
from math import log, exp

def predict_next_token(
    context_tokens,
    ngram_counts,
    context_counts,
    vocab,
    n=2,
    alpha=1.0,
    top_k=5
):
    """
    Given a list of context tokens, predict the next token using the n-gram model.
    Returns the top_k predictions as (token, probability).
    """
    if len(context_tokens) >= n-1:
        context = tuple(context_tokens[-(n-1):])
    else:
        padding = ["<s>"] * (n-1 - len(context_tokens))
        context = tuple(padding + context_tokens)

    candidates = []
    vocab_size = len(vocab)

    for token in vocab:
        ngram = context + (token,)
        prob = laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha)
        candidates.append((token, prob))

    candidates.sort(key=lambda x: x[1], reverse=True)

    return candidates[:top_k]


def generate_text_with_limit(
    start_tokens,
    ngram_counts,
    context_counts,
    vocab,
    n=2,
    alpha=1.0,
    max_length=20
):
    """
    Generates text from an n-gram model until it sees </s>
    or reaches a maximum total length (max_length).

    Args:
      start_tokens (list): initial context to begin generation
      ngram_counts (Counter): trained n-gram counts
      context_counts (Counter): trained (n-1)-gram counts
      vocab (set): the model vocabulary
      n (int): n-gram order, 2 for bigram, 3 for trigram, etc.
      alpha (float): Laplace smoothing parameter
      max_length (int): maximum number of tokens to generate (including start_tokens)

    Returns:
      A list of tokens representing the generated sequence.
    """

    generated = start_tokens.copy()
    while len(generated) < max_length:
        next_predictions = predict_next_token(generated,ngram_counts,context_counts,vocab,n,alpha)

        next_token = next_predictions[0][0]

        if next_token == "</s>":
            break

        # Add the token to the generated sequence
        generated.append(next_token)

    return generated

In [44]:
def calculate_perplexity(tokenized_sentences, ngram_counts, context_counts, vocab_size, n=2, alpha=1.0):
    """
    Calculates the perplexity of an n-gram model (with Laplace smoothing)
    on a list of tokenized sentences.

    Args:
      tokenized_sentences: List of lists of tokens.
      ngram_counts: Counter of n-grams.
      context_counts: Counter of (n-1)-grams.
      vocab_size: Size of the vocabulary.
      n: n-gram order.
      alpha: Laplace smoothing parameter.

    Returns:
      A float representing the perplexity on the given dataset.
    """

    log_prob_sum = 0.0
    word_count = 0

    for sentence in tokenized_sentences:
        sentence = pad_sentence(sentence,n)

        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i : i + n])
            prob = laplace_probability(ngram,ngram_counts,context_counts,vocab_size,alpha)

            log_prob_sum += math.log(prob)
            word_count += 1
            # print(f"N-gram: {ngram}, Probability: {prob}")


    perplexity = math.exp(-log_prob_sum / word_count)
    return perplexity


# **Analysis**
use different n and rerun the code and write down your analysis

In [45]:
ngram_counts_dict = {}
context_counts_dict = {}
for n in [2,3,4]:
    ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)
    ngram_counts_dict[n] = ngram_counts
    context_counts_dict[n] = context_counts
    print(f"Number of {n}-grams: {len(ngram_counts)}")

Number of 2-grams: 2281027
Number of 3-grams: 6110588
Number of 4-grams: 8810202


In [46]:
for n in [2, 3, 4]:
    ngram_counts = ngram_counts_dict[n]
    context_counts = context_counts_dict[n]

    context = ["i", "love"]
    generated_seq = generate_text_with_limit(
        start_tokens=context,
        ngram_counts=ngram_counts,
        context_counts=context_counts,
        vocab=vocab,
        n=n,
        alpha=1.0,
        max_length=30
    )
    print(f"Generated Sequence (n={n}): {' '.join(generated_seq)}")

Generated Sequence (n=2): i love with the film is a lot of the film is a lot of the film is a lot of the film is a lot of the film is
Generated Sequence (n=3): i love the film is a very good and the film is a very good and the film is a very good and the film is a very good and
Generated Sequence (n=4): i love this movie and i was not disappointed nl nl i think the film is a bit of a stretch to think that this is a very good film


In [47]:
tokenized_test_sentences = tokinize(test_sentences, vocab)
for n in [2,3,4]:
    ngram_counts = ngram_counts_dict[n]
    context_counts = context_counts_dict[n]
    pp = calculate_perplexity(tokenized_test_sentences, ngram_counts, context_counts, len(vocab), n, alpha=0.001)
    print(f"Perplexity at n={n}: {pp}")

Perplexity at n=2: 583.385285945396
Perplexity at n=3: 3615.619015805097
Perplexity at n=4: 28014.547715826964


In [48]:
tokenized_train_sentences = tokinize(train_sentences[:200], vocab)
for n in [2,3,4]:
    ngram_counts = ngram_counts_dict[n]
    context_counts = context_counts_dict[n]
    pp = calculate_perplexity(tokenized_train_sentences, ngram_counts, context_counts, len(vocab), n, alpha=0.001)
    print(f"Perplexity at n={n}: {pp}")

Perplexity at n=2: 180.84731654987715
Perplexity at n=3: 106.72718856907129
Perplexity at n=4: 123.4175463277665
