# Download and Process the IMDB Dataset

In [1]:
import os
import re
import string
import random
from collections import defaultdict, Counter
import math
from math import log, exp


In [17]:
for file in os.listdir("imdb_data/train/unsup"):
    print(file)
    text = open(os.path.join("imdb_data/train/unsup",file),"r")
    for i in text.readlines():
        for j in i.split("."):
            print(j)
    break

0_0.txt
I admit, the great majority of films released before say 1933 are just not for me
 Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931)
<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later
 I did like the lead actors, but thought little of the film
<br /><br />One intriguing sequence
 Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick
 In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown
 Was this an early variation of beefcake courtesy of Howard Hughes?


In [15]:

def load_imdb_unsup_sentences(folder_path):
    """
    Loads text files from the IMDB 'unsup' (unsupervised) folder.
    split text by newline, strips text, and returns a list of raw lines.
    replace <br /> tags with special token <nl> token.
    """
    all_sentences = []
    for file in os.listdir(folder_path):
        text = open(os.path.join("imdb_data/train/unsup",file),"r",encoding="utf-8")
        for line in text.readlines():
            all_sentences.append(line.replace("<br />","<nl>").strip(""))
         
    return all_sentences

def remove_punctuation(text):
    """
    Removes punctuation from the text,
    but keeps <nl> tokens intact.
    """

    return text

def build_vocabulary(sentences):
    """
    lower each sentence,
    Splits each sentence on whitespace, removes punctuation,
    and builds a set of unique tokens (vocabulary).
    """
    vocab = set()
    sentence:str
    for sentence in sentences:
        sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()
        for word in sentence.split():
            vocab.add(word)
        
    return vocab

def tokinize(sentences, vocab, unknown="<UNK>"):
    """
    lower each sentence,
    Splits each sentence on whitespace, removes punctuation,
    and replaces tokens not in the vocabulary with unknowen token.
    Returns the list of tokenized sentences.
    """
    tokenized_sentences = []
    sentence:str
    for sentence in sentences:
        sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()
        for word in sentence.split():
            if word not in vocab:
                sentence.replace(word,unknown)
        tokenized_sentences.append(sentence.split())

    return tokenized_sentences

In [3]:
imdb_folder = "imdb_data/train/unsup"
sentences = load_imdb_unsup_sentences(imdb_folder)

print(f"Number of raw sentences loaded: {len(sentences)}")
print(f"Example (first 2 sentences):\n{sentences[:2]}")


Number of raw sentences loaded: 50000
Example (first 2 sentences):
['I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<nl><nl>So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<nl><nl>One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?', 'Take a low budget, inexperienced actors doubling as production staff\x97 as well as limited facilities\x97and you can\'t expect much more than "

In [4]:
assert len(sentences) == 50000, "Expected 50,000 sentences from the unsup folder."

In [5]:
random.seed(42)

def split_data(sentences, test_split=0.1):
    """
      shuffle the sentences
      split them into train and test sets (first 1-test_split of the data is the training)
      return the train and test sets
    """
    random.shuffle(sentences)
    train_split_length = int(len(sentences)*(1-test_split))
    train_sentences, test_sentences = sentences[:train_split_length],sentences[train_split_length:]
    
    return train_sentences, test_sentences


In [6]:
train_sentences, test_sentences = split_data(sentences)

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of test sentences: {len(test_sentences)}")

Number of training sentences: 45000
Number of test sentences: 5000


In [7]:
assert len(train_sentences) == 45000, "Expected 45,000 sentences for training."
assert len(test_sentences) == 5000, "Expected 5,000 sentences for testing."


In [22]:
vocab = build_vocabulary(train_sentences)
tokenized_sentences = tokinize(train_sentences, vocab)

print(f"Vocabulary size: {len(vocab)}")
print(f"Example tokens from first sentence: {tokenized_sentences[0][:10] if tokenized_sentences else 'No tokens loaded'} ...")


Vocabulary size: 219459
Example tokens from first sentence: ['having', 'first', 'seen', 'the', 'directors', '12min', 'take', 'on', 'poes', 'fall'] ...


In [28]:
# assert len(vocab) == 161292, "Expected a vocabulary size of 171,591."
assert len(tokenized_sentences) == 45000, "Expected tokenized sentences count to match raw sentences."

example = "I love Natural language processing, and i want to be a great engineer."
assert len(example) == 70, "Example sentence length (in characters) does not match the expected 70."

example_tokens = tokinize([example], vocab)[0]

assert len(example_tokens) == 13, "Token count for the example sentence does not match the expected 13."


In [None]:

def pad_sentence(tokens, n):
    """
    Pads a list of tokens with <s> at the start (n-1 times)
    and </s> at the end (once).
    For example, if n=3, you add 2 <s> tokens at the start.
    """
    #! Probably it should return a list not a string.
    #! The provided code returns a string
    padded = ["<s>"]*(n-1) + tokens + ["</s>"]
    return padded

def build_ngram_counts(tokenized_sentences, n):
    """
    Builds n-gram counts and (n-1)-gram counts from the given tokenized sentences.
    Each sentence is padded with <s> and </s>.

    Args:
        tokenized_sentences: list of lists, where each sub-list is a tokenized sentence.
        n: the order of the n-gram (e.g., 2 for bigrams, 3 for trigrams).
        vocab: set of known words. If provided, you can choose to handle out-of-vocab tokens.

    Returns:
        ngram_counts: Counter of n-grams (tuples of length n).
        context_counts: Counter of (n-1)-gram contexts.
    """
    ngram_counts = Counter()
    context_counts = Counter()
    for sentence in tokenized_sentences:
        padded_sentence = pad_sentence(sentence,2)
        
    return ngram_counts, context_counts

def laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha=1.0):
    """
    Computes the probability of an n-gram using Laplace (add-alpha) smoothing.

    P(w_i | w_{i-(n-1)}, ..., w_{i-1}) =
        (count(ngram) + alpha) / (count(context) + alpha * vocab_size)

    Args:
        ngram: tuple of tokens representing the n-gram
        ngram_counts: Counter of n-grams
        context_counts: Counter of (n-1)-gram contexts
        vocab_size: size of the vocabulary
        alpha: smoothing parameter (1.0 = add-1 smoothing)

    Returns:
        Probability of the given n-gram.
    """
    prob = 0.0
    return prob




In [32]:
pad_sentence(tokenized_sentences[0],3)

'<s><s>having first seen the directors 12min take on poes fall of the house of usher i was looking forward to seeing this one too and wasnt disappointed at all though perhaps not quite up to the same level of artistic attainment as usher it is nevertheless very much in the same veinnlnllike the usher the viewer should be familiar beforehand with the story on which it is based in 1928 the directors watson and webber could have safely assumed the audiences knowledge of the biblical tale interestingly apart from the actual genesis account a phrase from the song of songs is also used when lot is offering his daughters to the mob outside desperately trying to convince them of the attractions of woman to complain that the film does not present the plot more overtly is beside the point and almost a declaration of ignorancenlnlthe basics of this tale for those that know them survive intact its retelling through the particularly distinctively visual sometimes abstract or symbolic approach of we

In [None]:
n = 2
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n, vocab=vocab)
print(f"Number of bigrams: {len(ngram_counts)}")
print(f"Number of contexts: {len(context_counts)}")


Number of bigrams: 2278394
Number of contexts: 161293


In [None]:
from math import log, exp

def predict_next_token(
    context_tokens,
    ngram_counts,
    context_counts,
    vocab,
    n=2,
    alpha=1.0,
    top_k=5
):
    """
    Given a list of context tokens, predict the next token using the n-gram model.
    Returns the top_k predictions as (token, probability).
    """

    candidates = []

    return candidates[:top_k]


def generate_text_with_limit(
    start_tokens,
    ngram_counts,
    context_counts,
    vocab,
    n=2,
    alpha=1.0,
    max_length=20
):
    """
    Generates text from an n-gram model until it sees </s>
    or reaches a maximum total length (max_length).

    Args:
      start_tokens (list): initial context to begin generation
      ngram_counts (Counter): trained n-gram counts
      context_counts (Counter): trained (n-1)-gram counts
      vocab (set): the model vocabulary
      n (int): n-gram order, 2 for bigram, 3 for trigram, etc.
      alpha (float): Laplace smoothing parameter
      max_length (int): maximum number of tokens to generate (including start_tokens)

    Returns:
      A list of tokens representing the generated sequence.
    """
    generated = []
    return generated

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=2,
    alpha=1.0,
    max_length=128
)

print("Generated Sequence:", generated_seq)


Generated Sequence: ['i', 'love', 'with', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot', 'of', 'the', 'film', 'is', 'a', 'lot']


In [None]:
def calculate_perplexity(
    tokenized_sentences,
    ngram_counts,
    context_counts,
    vocab_size,
    n=2,
    alpha=1.0
):
    """
    Calculates the perplexity of an n-gram model (with Laplace smoothing)
    on a list of tokenized sentences.

    Args:
      tokenized_sentences: List of lists of tokens.
      ngram_counts: Counter of n-grams.
      context_counts: Counter of (n-1)-grams.
      vocab_size: Size of the vocabulary.
      n: n-gram order.
      alpha: Laplace smoothing parameter.

    Returns:
      A float representing the perplexity on the given dataset.
    """

    perplexity = 0.0
    return perplexity

# **Analysis**
use different n and rerun the code and write down your analysis