In [6]:

#  Q1: N-gram Models


from collections import defaultdict
from google.colab import files

# ✅ Upload tokenized dataset (Assignment 1 output)
uploaded = files.upload()
filename = list(uploaded.keys())[0]

with open(filename, "r") as f:
    tokens = f.read().lower().split()

print("Total tokens:", len(tokens))
print("Unique words:", len(set(tokens)))

# Function to build n-grams
def build_ngram(tokens, n):
    ngram_counts = defaultdict(int)
    context_counts = defaultdict(int)
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        context = tuple(tokens[i:i+n-1]) if n > 1 else ()
        ngram_counts[ngram] += 1
        context_counts[context] += 1
    return ngram_counts, context_counts

# Build all 4 models
unigrams, unigram_context = build_ngram(tokens, 1)
bigrams, bigram_context = build_ngram(tokens, 2)
trigrams, trigram_context = build_ngram(tokens, 3)
quadrigrams, quadrigram_context = build_ngram(tokens, 4)

print("Top 10 unigrams:", list(unigrams.items())[:10])
print("Top 10 bigrams:", list(bigrams.items())[:10])
print("Top 10 trigrams:", list(trigrams.items())[:10])
print("Top 10 quadrigrams:", list(quadrigrams.items())[:10])


Saving q3_data.txt to q3_data (1).txt
Total tokens: 28323
Unique words: 1368
Top 10 unigrams: [(('1.',), 1), (('ગાંધીનગરમાં',), 36), (('હાલમાં',), 209), (('રાજકારણ',), 52), (('ક્ષેત્રે',), 1000), (('શાળા',), 42), (('વિકાસ',), 101), (('સમિતિએ',), 42), (('સ્થગિત',), 45), (('થયું;',), 98)]
Top 10 bigrams: [(('1.', 'ગાંધીનગરમાં'), 1), (('ગાંધીનગરમાં', 'હાલમાં'), 9), (('હાલમાં', 'રાજકારણ'), 9), (('રાજકારણ', 'ક્ષેત્રે'), 52), (('ક્ષેત્રે', 'શાળા'), 42), (('શાળા', 'વિકાસ'), 42), (('વિકાસ', 'સમિતિએ'), 42), (('સમિતિએ', 'સ્થગિત'), 2), (('સ્થગિત', 'થયું;'), 45), (('થયું;', 'તે'), 11)]
Top 10 trigrams: [(('1.', 'ગાંધીનગરમાં', 'હાલમાં'), 1), (('ગાંધીનગરમાં', 'હાલમાં', 'રાજકારણ'), 1), (('હાલમાં', 'રાજકારણ', 'ક્ષેત્રે'), 9), (('રાજકારણ', 'ક્ષેત્રે', 'શાળા'), 3), (('ક્ષેત્રે', 'શાળા', 'વિકાસ'), 42), (('શાળા', 'વિકાસ', 'સમિતિએ'), 42), (('વિકાસ', 'સમિતિએ', 'સ્થગિત'), 2), (('સમિતિએ', 'સ્થગિત', 'થયું;'), 2), (('સ્થગિત', 'થયું;', 'તે'), 4), (('થયું;', 'તે', 'ઉપરાંત'), 11)]
Top 10 quadrigrams: [(('1.', 'ગાં

In [7]:

#  Q2: Smoothing Techniques


V = len(set(tokens))  # Vocabulary size

def prob_ngram(ngram, ngram_counts, context_counts, method="none", k=1):
    count_ngram = ngram_counts.get(ngram, 0)
    context = ngram[:-1]
    count_context = context_counts.get(context, 0)

    if method == "none":   # No smoothing
        return count_ngram / count_context if count_context > 0 else 0

    elif method == "add1": # Add-One (Laplace)
        return (count_ngram + 1) / (count_context + V)

    elif method == "addk": # Add-K
        return (count_ngram + k) / (count_context + k * V)

    elif method == "add_type": # Add Token Type
        return (count_ngram + V) / (count_context + V * V)

# ✅ Example: probability of ("the","election") bigram
print("No smoothing:", prob_ngram(("the","election"), bigrams, bigram_context, "none"))
print("Add-One smoothing:", prob_ngram(("the","election"), bigrams, bigram_context, "add1"))
print("Add-K smoothing (k=0.5):", prob_ngram(("the","election"), bigrams, bigram_context, "addk", k=0.5))
print("Add-Type smoothing:", prob_ngram(("the","election"), bigrams, bigram_context, "add_type"))


No smoothing: 0
Add-One smoothing: 0.0007309941520467836
Add-K smoothing (k=0.5): 0.0007309941520467836
Add-Type smoothing: 0.0007309941520467836


In [8]:

#  Q3: Sentence Probabilities


import random

def sentence_prob(sentence, n, ngram_counts, context_counts, method="add1", k=1):
    words = sentence.lower().split()
    total_prob = 1.0
    for i in range(len(words) - n + 1):
        ngram = tuple(words[i:i+n])
        p = prob_ngram(ngram, ngram_counts, context_counts, method, k)
        total_prob *= p
    return total_prob

# ✅ Test sentence
test_sentence = "the election was held yesterday"

print("\nSentence:", test_sentence, "\n")
print("Unigram (Add-1):", sentence_prob(test_sentence, 1, unigrams, unigram_context, "add1"))
print("Bigram   (Add-1):", sentence_prob(test_sentence, 2, bigrams, bigram_context, "add1"))
print("Trigram  (Add-1):", sentence_prob(test_sentence, 3, trigrams, trigram_context, "add1"))
print("Quadgram (Add-1):", sentence_prob(test_sentence, 4, quadrigrams, quadrigram_context, "add1"))

# ✅ Apply on 1000 random sentences
sentences = []
for i in range(1000):
    start = random.randint(0, len(tokens)-10)
    sent = " ".join(tokens[start:start+10])
    sentences.append(sent)

for n, (counts, context) in [(1, (unigrams, unigram_context)),
                             (2, (bigrams, bigram_context)),
                             (3, (trigrams, trigram_context)),
                             (4, (quadrigrams, quadrigram_context))]:
    results = [sentence_prob(s, n, counts, context, method="add1") for s in sentences]
    avg_prob = sum(results) / len(results)
    print(f"Average Probability (n={n}, Add-1): {avg_prob}")



Sentence: the election was held yesterday 

Unigram (Add-1): 4.33386995128763e-23
Bigram   (Add-1): 2.855325411700374e-13
Trigram  (Add-1): 3.906085163206112e-10
Quadgram (Add-1): 5.343524503265961e-07
Average Probability (n=1, Add-1): 3.549542059491244e-21
Average Probability (n=2, Add-1): 1.9959012137597614e-11
Average Probability (n=3, Add-1): 3.0419143237664583e-11
Average Probability (n=4, Add-1): 7.597090215993513e-11
