In [2]:
#important libraries

import re
import requests
from collections import defaultdict, Counter
import math
import pandas as pd
import numpy as np
import string

## Question 1: Implementing an N-Gram Language Model and Testing Perplexity [20]

In [3]:
def preprocess_text(text):
    """Normalize raw text into a list of lowercase alphabetic tokens.

    The text is lowercased, every character other than ``a``-``z`` or
    whitespace is deleted (deleted, not replaced by a space, so "don't"
    becomes "dont"), and the remainder is split on runs of whitespace.

    Args:
        text: Raw input string.

    Returns:
        List of token strings; empty when nothing survives the cleaning.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return letters_only.split()

def load_data_from_url(url, timeout=30):
    """Download a text resource and return its preprocessed tokens.

    Args:
        url: Address of the plain-text resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The token list produced by ``preprocess_text`` on the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status: otherwise the body of an error page
    # (e.g. "404: Not Found") would be tokenized and silently used as data.
    response.raise_for_status()
    return preprocess_text(response.text)

# Training corpus: download and tokenize.
train_url = "https://raw.githubusercontent.com/Kushal-Chandani/NLP-Homeworks/refs/heads/main/Homework2/Data/Question1/train.txt"
train_data = load_data_from_url(train_url)

# Test corpus: downloaded and tokenized the same way.
test_url = "https://raw.githubusercontent.com/Kushal-Chandani/NLP-Homeworks/refs/heads/main/Homework2/Data/Question1/test.txt"
test_data = load_data_from_url(test_url)

In [4]:
# Sanity check: show the tokenized training corpus produced by preprocessing.
print(f"Train data: {train_data}")

Train data: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'the', 'fox', 'is', 'quick', 'and', 'the', 'dog', 'is', 'lazy', 'quick', 'brown', 'foxes', 'are', 'smarter', 'than', 'lazy', 'dogs', 'dogs', 'are', 'loyal', 'but', 'sometimes', 'lazy', 'the', 'quick', 'brown', 'fox', 'is', 'too', 'clever', 'for', 'the', 'lazy', 'dog', 'foxes', 'and', 'dogs', 'do', 'not', 'always', 'get', 'along', 'a', 'clever', 'fox', 'can', 'easily', 'trick', 'a', 'lazy', 'dog']


In [5]:
# Sanity check: show the tokenized test corpus produced by preprocessing.
print(f"Test data: {test_data}")

Test data: ['the', 'fox', 'and', 'the', 'dog', 'are', 'friends', 'sometimes', 'the', 'quick', 'fox', 'plays', 'with', 'the', 'lazy', 'dog', 'foxes', 'are', 'clever', 'and', 'dogs', 'are', 'loyal', 'a', 'lazy', 'dog', 'can', 'be', 'tricky', 'to', 'train']


In [6]:
def generate_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens as a tuple.

    Args:
        tokens: Sliceable sequence of tokens.
        n: Window length.

    Returns:
        List of tuples ordered by start position; empty when ``tokens`` holds
        fewer than ``n`` items.
    """
    window_count = len(tokens) - n + 1
    windows = []
    for start in range(window_count):
        windows.append(tuple(tokens[start:start + n]))
    return windows

def init_ngram_model(n):
    """Create an empty, untrained n-gram model.

    Args:
        n: Order of the model (length of each n-gram).

    Returns:
        Dict with four entries:
            "n": the order passed in.
            "ngram_counts": ``defaultdict(Counter)`` mapping a context to the
                counts of the words observed after it.
            "context_counts": ``defaultdict(int)`` mapping a context to its
                total number of occurrences.
            "vocab": empty ``set`` that will hold the known word types.
    """
    model = {"n": n}
    model["ngram_counts"] = defaultdict(Counter)
    model["context_counts"] = defaultdict(int)
    model["vocab"] = set()
    return model

def train_ngram_model(model, tokens):
    """Accumulate n-gram statistics from ``tokens`` into ``model`` in place.

    For every n-gram the first ``n - 1`` tokens form the context and the last
    token is the predicted word. Counts are added to whatever the model
    already holds, so calling this repeatedly keeps accumulating.

    Args:
        model: Model dict with "n", "ngram_counts", "context_counts", "vocab".
        tokens: Sequence of training tokens.

    Returns:
        None. ``model`` is updated in place.
    """
    # Register every token type, not only those that end an n-gram. For n > 1
    # a word seen only within the first n - 1 positions would otherwise be
    # missing from the vocabulary, shrinking |V| in the smoothing denominator
    # and making it differ between model orders.
    model["vocab"].update(tokens)

    for ngram in generate_ngrams(tokens, model["n"]):
        context, word = ngram[:-1], ngram[-1]
        model["ngram_counts"][context][word] += 1
        model["context_counts"][context] += 1

def get_prob(model, context, word):
    """Return the add-one (Laplace) smoothed probability of ``word`` after ``context``.

    Computed as ``(count(context, word) + 1) / (count(context) + |vocab|)``.

    Lookups go through ``.get`` so that querying an unseen context does not
    insert empty entries into the model's ``defaultdict`` tables; evaluating
    the model therefore leaves it unchanged.

    Args:
        model: Model dict holding "ngram_counts", "context_counts", "vocab".
        context: Tuple of preceding tokens (``()`` for a unigram model).
        word: Token whose probability is requested.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: If the vocabulary is empty and ``context`` has no
            recorded occurrences (i.e. the model is untrained).
    """
    followers = model["ngram_counts"].get(context)
    pair_count = followers.get(word, 0) if followers is not None else 0
    context_total = model["context_counts"].get(context, 0)
    return (pair_count + 1) / (context_total + len(model["vocab"]))

In [7]:
def calculate_perplexity(model, test_tokens):
    """Compute the perplexity of ``model`` on a token sequence.

    Perplexity is ``exp(-(1/N) * sum(log P(word | context)))`` where the sum
    runs over the N n-grams of ``test_tokens`` and each probability is
    obtained from ``get_prob``.

    Args:
        model: Trained model dict (must contain "n" plus the count tables
            read by ``get_prob``).
        test_tokens: Sequence of evaluation tokens.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If ``test_tokens`` has fewer than ``n`` tokens, in which
            case there is no n-gram to score and perplexity is undefined.
    """
    n = model["n"]
    # Validate up front: with no n-grams the average below would divide by
    # zero and surface as an opaque ZeroDivisionError.
    if len(test_tokens) < n:
        raise ValueError(
            f"Cannot compute {n}-gram perplexity: need at least {n} "
            f"token(s), got {len(test_tokens)}"
        )

    ngrams = generate_ngrams(test_tokens, n)

    # Sum log-probabilities instead of multiplying raw probabilities so long
    # sequences do not underflow to zero.
    log_prob_sum = 0
    for ngram in ngrams:
        log_prob_sum += math.log(get_prob(model, ngram[:-1], ngram[-1]))

    return math.exp(-log_prob_sum / len(ngrams))

In [8]:
# Train one model per order on the training tokens and report its perplexity
# on the test tokens.
for n in (1, 2, 3):
    model = init_ngram_model(n)  # fresh model so counts never mix across orders
    train_ngram_model(model, train_data)
    perplexity = calculate_perplexity(model, test_data)
    print(f"{n}-Gram Perplexity: {perplexity}")

1-Gram Perplexity: 30.458558025975183
2-Gram Perplexity: 22.415903584375318
3-Gram Perplexity: 26.756721385458494


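#The results are shown above: the bigram model has the lowest perplexity (about 22.4), followed by the trigram (about 26.8) and unigram (about 30.5) models.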
