In [1]:


import random
import re
from collections import Counter, defaultdict
import math

In [3]:

# 1. Load and Split the Dataset

with open("/content/Guj_3000.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]

random.shuffle(sentences)

# Create splits
val_set = sentences[:1000]
test_set = sentences[1000:2000]
train_set = sentences[2000:]

print(f"Train: {len(train_set)}, Val: {len(val_set)}, Test: {len(test_set)}")

Train: 1000, Val: 1000, Test: 1000


In [4]:

# 2. Preprocessing

def tokenize(text):
    return re.findall(r'\w+', text.lower())

train_tokens = [tokenize(s) for s in train_set]
val_tokens = [tokenize(s) for s in val_set]
test_tokens = [tokenize(s) for s in test_set]

# Vocabulary
vocab = set([w for sent in train_tokens for w in sent])
V = len(vocab)
print("Vocabulary Size:", V)

Vocabulary Size: 953


In [5]:

# 3. Build N-gram Models

def build_ngrams(tokens_list, n):
    counts = Counter()
    for sent in tokens_list:
        sent = ["<s>"]*(n-1) + sent + ["</s>"]
        for i in range(len(sent)-n+1):
            ngram = tuple(sent[i:i+n])
            counts[ngram] += 1
    return counts

unigrams = build_ngrams(train_tokens, 1)
bigrams = build_ngrams(train_tokens, 2)
trigrams = build_ngrams(train_tokens, 3)
quadgrams = build_ngrams(train_tokens, 4)

print("Example 5 unigrams:", list(unigrams.items())[:5])

Example 5 unigrams: [(('529',), 2), (('આણ',), 26), (('દમ',), 121), (('હ',), 719), (('લમ',), 242)]


In [6]:

# 4. Good Turing Smoothing

def good_turing_probs(ngram_counts, n, V):
    N = sum(ngram_counts.values())
    Nc = Counter(ngram_counts.values())
    N1 = Nc[1]

    # Seen probabilities
    probs = {}
    for ng, c in ngram_counts.items():
        c_star = ( (c+1) * (Nc[c+1]/Nc[c]) ) if (c+1) in Nc else c
        probs[ng] = c_star / N

    # Unseen probability
    total_seen = len(ngram_counts)
    if n == 1:
        unseen_prob = N1 / N / (V - len(unigrams))
    else:
        unseen_prob = N1 / N / ((V**n) - total_seen)

    return probs, unseen_prob

uni_probs, uni_unseen = good_turing_probs(unigrams, 1, V)
bi_probs, bi_unseen = good_turing_probs(bigrams, 2, V)
tri_probs, tri_unseen = good_turing_probs(trigrams, 3, V)
quad_probs, quad_unseen = good_turing_probs(quadgrams, 4, V)

In [7]:

# 5. Sentence Probability

def sentence_prob(tokens, probs, unseen_prob, n):
    tokens = ["<s>"]*(n-1) + tokens + ["</s>"]
    log_prob = 0
    for i in range(len(tokens)-n+1):
        ng = tuple(tokens[i:i+n])
        p = probs.get(ng, unseen_prob)
        log_prob += math.log(p)
    return log_prob

# Example validation computation
print("Sentence Probability Example:")
print(val_set[0])
print(sentence_prob(tokenize(val_set[0]), bi_probs, bi_unseen, 2))

Sentence Probability Example:
459. ભુજમાં આ મહિને વિજ્ઞાન ક્ષેત્રે મેટ્રોલોજીકલ સેન્ટરએ અહેવાલ બહાર પાડ્યો; પછી 23 મુદ્દાઓ પર ભાર મૂકાયો અને જનજાગૃતિ અભિયાન તેજ બન્યું. પ્રોજેક્ટ માટે ૨૦ કરોડ રૂપિયાની ફાળવણી કરવામાં આવી છે.
-478.90136008304427


In [8]:

# 6. Deleted Interpolation (Quadrigram)

def deleted_interpolation(train_tokens):
    lambdas = [0.25, 0.25, 0.25, 0.25]  # initial equal weights
    # TODO: implement EM algorithm to tune lambdas on held-out data
    return lambdas

lambdas = deleted_interpolation(train_tokens)
print("Interpolation weights:", lambdas)

Interpolation weights: [0.25, 0.25, 0.25, 0.25]
