Lab Assignment 5: N-gram Language Model
•	Implement unigram, bigram, and trigram models using NLTK.
•	Train on a small text dataset and compute probabilities of word sequences.
•	Use Laplace smoothing to handle unseen words.

In [None]:
# Step 1: Install and import libraries
# The leading "!" is the notebook shell escape, so this line runs pip as a
# shell command; -U upgrades NLTK if an older version is already installed.
!pip install -U nltk

In [2]:
import nltk
import math
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter, defaultdict

# Fetch the Punkt tokenizer data that word_tokenize relies on.
nltk.download('punkt_tab')

# Step 2: Tiny training corpus (extend it, or read the text from a file)
text = """
Natural language processing enables computers to understand human language.
It involves linguistics and machine learning. Language models are essential in NLP.
"""

# Step 3: Lowercase + tokenize the corpus, then wrap it in boundary tokens.
# NOTE(review): both boundary markers are the empty string, so the model
# cannot tell the start of the text from its end -- confirm whether distinct
# markers such as '<s>' / '</s>' were intended. compute_sequence_prob pads
# its input with the same marker, so the two must be changed together.
tokens = ['', *word_tokenize(text.lower()), '']

# Step 4: Materialize the n-grams of order 1, 2 and 3 ...
unigrams, bigrams, trigrams = (list(ngrams(tokens, n)) for n in (1, 2, 3))

# ... and tally how often each n-gram tuple occurs.
unigram_counts, bigram_counts, trigram_counts = map(
    Counter, (unigrams, bigrams, trigrams)
)

# Distinct token types (the boundary marker included); V is the add-one
# smoothing term used by the probability functions.
vocab = set(tokens)
V = len(vocab)

# Step 5: Define Probability Functions with Laplace Smoothing

def unigram_prob(word):
    """Return the add-one (Laplace) smoothed unigram probability of `word`.

    Computes (count(word) + 1) / (N + V), where N is the total number of
    unigram occurrences recorded in the module-level `unigram_counts` and V
    is the module-level vocabulary size. A word that never occurred has a
    count of 0 and therefore still receives the probability 1 / (N + V).
    """
    total_tokens = sum(unigram_counts.values())
    occurrences = unigram_counts[(word,)]  # Counter yields 0 for unseen keys
    return (occurrences + 1) / (total_tokens + V)

def bigram_prob(w1, w2):
    """Return the add-one smoothed conditional probability P(w2 | w1).

    Computes (count(w1, w2) + 1) / (count(w1) + V) from the module-level
    `bigram_counts`, `unigram_counts` and vocabulary size `V`. Unseen pairs
    and unseen contexts count as 0, so the result is never zero.
    """
    pair_count = bigram_counts[(w1, w2)]
    context_count = unigram_counts[(w1,)]
    return (pair_count + 1) / (context_count + V)

def trigram_prob(w1, w2, w3):
    """Return the add-one smoothed conditional probability P(w3 | w1, w2).

    Computes (count(w1, w2, w3) + 1) / (count(w1, w2) + V) from the
    module-level `trigram_counts`, `bigram_counts` and vocabulary size `V`.
    Unseen trigrams and unseen contexts count as 0, so the result is never
    zero.
    """
    triple_count = trigram_counts[(w1, w2, w3)]
    context_count = bigram_counts[(w1, w2)]
    return (triple_count + 1) / (context_count + V)

# Step 6: Compute Probabilities of Sample Sequences

def compute_sequence_prob(sequence):
    """Print the unigram, bigram and trigram probability of `sequence`.

    The sequence is lowercased, tokenized with `word_tokenize`, and padded
    with one boundary token on each side. For each model order the smoothed
    probabilities of all its n-grams are combined and printed both as a
    plain probability and as a natural-log probability.

    Args:
        sequence: The raw text to score.

    Returns:
        None. The results are written to standard output only.
    """
    # NOTE(review): the boundary marker is the empty string on both sides,
    # matching how the training tokens are padded -- confirm whether distinct
    # start/end markers were intended, and change both places together.
    padded = [''] + word_tokenize(sequence.lower()) + ['']

    print(f"\nSequence: {sequence}")

    # Log-probabilities are summed rather than multiplying raw probabilities:
    # for a long sequence the product underflows to 0.0, and math.log(0.0)
    # then raises ValueError. The plain probability is recovered with
    # math.exp purely for display (it may legitimately print as 0).

    # Unigram
    uni_logprob = math.fsum(math.log(unigram_prob(w)) for w in padded)
    print(f"Unigram Probability: {math.exp(uni_logprob):.10f} | LogProb: {uni_logprob:.4f}")

    # Bigram
    bi_logprob = math.fsum(
        math.log(bigram_prob(w1, w2)) for w1, w2 in ngrams(padded, 2)
    )
    print(f"Bigram Probability: {math.exp(bi_logprob):.10f} | LogProb: {bi_logprob:.4f}")

    # Trigram
    tri_logprob = math.fsum(
        math.log(trigram_prob(w1, w2, w3)) for w1, w2, w3 in ngrams(padded, 3)
    )
    print(f"Trigram Probability: {math.exp(tri_logprob):.10f} | LogProb: {tri_logprob:.4f}")

# Step 7: Score a couple of sample inputs with all three models
for sample in ("language models are essential", "computers learn language"):
    compute_sequence_prob(sample)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Sequence: language models are essential
Unigram Probability: 0.0000000267 | LogProb: -17.4379
Bigram Probability: 0.0000013611 | LogProb: -13.5072
Trigram Probability: 0.0000178884 | LogProb: -10.9314

Sequence: computers learn language
Unigram Probability: 0.0000003139 | LogProb: -14.9741
Bigram Probability: 0.0000039212 | LogProb: -12.4491
Trigram Probability: 0.0001079797 | LogProb: -9.1336
