In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import random
import re
from math import log
import matplotlib.pyplot as plt

In [2]:
# Load the tokenized Hindi data
with open('tokenized_hi.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

# Remove newline characters and filter out empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
print(f"Total sentences in dataset: {len(sentences)}")

# 1. Create data splits using random sampling
random.seed(42)  # For reproducibility
random.shuffle(sentences)

# Create splits
validation_set = sentences[:1000]
test_set = sentences[1000:2000]
training_set = sentences[2000:]

print(f"Validation set: {len(validation_set)} sentences")
print(f"Test set: {len(test_set)} sentences")
print(f"Training set: {len(training_set)} sentences")
print(f"Total: {len(validation_set) + len(test_set) + len(training_set)} sentences")

Total sentences in dataset: 141536
Validation set: 1000 sentences
Test set: 1000 sentences
Training set: 139536 sentences
Total: 141536 sentences


In [3]:
class LanguageModel:
    def __init__(self, n):
        self.n = n  # n-gram size
        self.ngrams = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocabulary = set()
        
    def preprocess_sentence(self, sentence):
        """Add start and end tokens to sentence"""
        tokens = sentence.split()
        if self.n > 1:
            # Add start tokens
            padded_tokens = ['<s>'] * (self.n - 1) + tokens + ['</s>']
        else:
            padded_tokens = tokens + ['</s>']
        return padded_tokens
    
    def train(self, sentences):
        """Train the language model on sentences"""
        for sentence in sentences:
            tokens = self.preprocess_sentence(sentence)
            self.vocabulary.update(tokens)
            
            # Generate n-grams
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i + self.n])
                self.ngrams[ngram] += 1
                
                # For context counts (n-1 grams)
                if self.n > 1:
                    context = ngram[:-1]
                    self.context_counts[context] += 1
                else:
                    # For unigram, context is total count
                    self.context_counts[('',)] += 1
    
    def get_probability(self, ngram):
        """Get probability of n-gram"""
        if self.n == 1:
            total_count = sum(self.ngrams.values())
            return self.ngrams[ngram] / total_count if total_count > 0 else 0
        else:
            context = ngram[:-1]
            context_count = self.context_counts[context]
            return self.ngrams[ngram] / context_count if context_count > 0 else 0

In [4]:
class GoodTuringLanguageModel(LanguageModel):
    def __init__(self, n):
        super().__init__(n)
        self.frequency_counts = defaultdict(int)  # N_c: count of counts
        self.good_turing_probs = {}
        self.unseen_prob = 0
        
    def train(self, sentences):
        """Train the language model and compute Good-Turing probabilities"""
        super().train(sentences)
        self._compute_frequency_counts()
        self._compute_good_turing_probabilities()
    
    def _compute_frequency_counts(self):
        """Compute frequency of frequencies (N_c)"""
        # Count how many n-grams appear c times
        for count in self.ngrams.values():
            self.frequency_counts[count] += 1
    
    def _compute_good_turing_probabilities(self):
        """Compute Good-Turing smoothed probabilities"""
        total_ngrams = sum(self.ngrams.values())
        N = len(self.ngrams)  # Total number of seen n-grams
        
        # N1 = number of n-grams that occur exactly once
        N1 = self.frequency_counts[1]
        
        # Calculate probability for unseen n-grams
        if self.n == 1:
            # For unigram: V - U (vocabulary size - unique seen unigrams)
            V = len(self.vocabulary)
            U = len(self.ngrams)
            num_unseen = V - U
        else:
            # For n>1: V^n - N
            V = len(self.vocabulary)
            num_unseen = V**self.n - N
        
        if num_unseen > 0:
            self.unseen_prob = N1 / (total_ngrams * num_unseen)
        else:
            self.unseen_prob = 0
        
        # Calculate Good-Turing probabilities for seen n-grams
        for ngram, count in self.ngrams.items():
            if count + 1 in self.frequency_counts:
                N_c = self.frequency_counts[count]
                N_c_plus_1 = self.frequency_counts[count + 1]
                c_star = (count + 1) * N_c_plus_1 / N_c
            else:
                c_star = count
            
            self.good_turing_probs[ngram] = c_star / total_ngrams
    
    def get_good_turing_probability(self, ngram):
        """Get Good-Turing smoothed probability"""
        if ngram in self.good_turing_probs:
            return self.good_turing_probs[ngram]
        else:
            return self.unseen_prob
    
    def sentence_good_turing_probability(self, sentence):
        """Calculate Good-Turing probability of a sentence"""
        tokens = self.preprocess_sentence(sentence)
        prob = 1.0
        
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i + self.n])
            ngram_prob = self.get_good_turing_probability(ngram)
            prob *= ngram_prob
        
        return prob
    
    def sentence_good_turing_log_probability(self, sentence):
        """Calculate Good-Turing log probability of a sentence"""
        tokens = self.preprocess_sentence(sentence)
        log_prob = 0.0
        
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i + self.n])
            ngram_prob = self.get_good_turing_probability(ngram)
            if ngram_prob > 0:
                log_prob += log(ngram_prob)
            else:
                return float('-inf')
        
        return log_prob

In [5]:
# Train all four language models with Good-Turing smoothing
print("Training Good-Turing language models...")

# 1. Unigram Model
unigram_gt = GoodTuringLanguageModel(n=1)
unigram_gt.train(training_set)
print(f"Unigram GT model: {len(unigram_gt.ngrams)} unique unigrams, unseen prob: {unigram_gt.unseen_prob:.2e}")

# 2. Bigram Model
bigram_gt = GoodTuringLanguageModel(n=2)
bigram_gt.train(training_set)
print(f"Bigram GT model: {len(bigram_gt.ngrams)} unique bigrams, unseen prob: {bigram_gt.unseen_prob:.2e}")

# 3. Trigram Model
trigram_gt = GoodTuringLanguageModel(n=3)
trigram_gt.train(training_set)
print(f"Trigram GT model: {len(trigram_gt.ngrams)} unique trigrams, unseen prob: {trigram_gt.unseen_prob:.2e}")

# 4. Quadrigram Model
quadrigram_gt = GoodTuringLanguageModel(n=4)
quadrigram_gt.train(training_set)
print(f"Quadrigram GT model: {len(quadrigram_gt.ngrams)} unique quadrigrams, unseen prob: {quadrigram_gt.unseen_prob:.2e}")

print(f"Vocabulary size: {len(unigram_gt.vocabulary)}")
print("Good-Turing training completed!")

Training Good-Turing language models...
Unigram GT model: 103071 unique unigrams, unseen prob: 0.00e+00
Bigram GT model: 929486 unique bigrams, unseen prob: 1.98e-11
Trigram GT model: 2037829 unique trigrams, unseen prob: 5.09e-16
Quadrigram GT model: 2613887 unique quadrigrams, unseen prob: 6.86e-21
Vocabulary size: 103071
Good-Turing training completed!


In [6]:
# Evaluate Good-Turing models on validation and test sets
def evaluate_good_turing_models(sentences, models, model_names):
    """Evaluate Good-Turing models on sentences"""
    results = []
    
    for sentence in sentences:
        sentence_results = {'sentence': sentence}
        
        for model, name in zip(models, model_names):
            try:
                log_prob = model.sentence_good_turing_log_probability(sentence)
                sentence_results[name] = log_prob
            except Exception as e:
                sentence_results[name] = float('-inf')
        
        results.append(sentence_results)
    
    return results

# Define models and names
gt_models = [unigram_gt, bigram_gt, trigram_gt, quadrigram_gt]
gt_model_names = ['Unigram_GT', 'Bigram_GT', 'Trigram_GT', 'Quadrigram_GT']

print("Evaluating Good-Turing models on validation set...")
validation_results = evaluate_good_turing_models(validation_set, gt_models, gt_model_names)
validation_df = pd.DataFrame(validation_results)

print("Evaluating Good-Turing models on test set...")
test_results = evaluate_good_turing_models(test_set, gt_models, gt_model_names)
test_df = pd.DataFrame(test_results)

print("Evaluation completed!")
print(f"Validation results shape: {validation_df.shape}")
print(f"Test results shape: {test_df.shape}")

Evaluating Good-Turing models on validation set...
Evaluating Good-Turing models on test set...
Evaluation completed!
Validation results shape: (1000, 5)
Test results shape: (1000, 5)


In [7]:
# 3. Show frequency table for top 100 frequencies for each model
def create_frequency_table(model, model_name, top_n=100):
    """Create frequency table showing C, N_C, and C* for top frequencies"""
    
    # Get frequency counts (N_c)
    freq_counts = dict(model.frequency_counts)
    
    # Calculate C* (Good-Turing adjusted counts)
    total_ngrams = sum(model.ngrams.values())
    c_star_values = {}
    
    for c in freq_counts.keys():
        if c + 1 in freq_counts:
            N_c = freq_counts[c]
            N_c_plus_1 = freq_counts[c + 1]
            c_star = (c + 1) * N_c_plus_1 / N_c
        else:
            c_star = c
        c_star_values[c] = c_star
    
    # Create table
    table_data = []
    for c in sorted(freq_counts.keys())[:top_n]:
        table_data.append({
            'C (MLE)': c,
            'N_C': freq_counts[c],
            'C*': round(c_star_values[c], 4)
        })
    
    return pd.DataFrame(table_data)

print("="*80)
print("FREQUENCY TABLES FOR TOP 100 FREQUENCIES")
print("="*80)

for model, name in zip(gt_models, gt_model_names):
    print(f"\n{name} Model:")
    print("-" * 40)
    freq_table = create_frequency_table(model, name)
    print(freq_table.head(20))  # Show first 20 rows
    print(f"... (showing first 20 of {len(freq_table)} entries)")
    
    # Save full table
    freq_table.to_csv(f'{name}_frequency_table.csv', index=False)
    print(f"Full table saved to {name}_frequency_table.csv")

FREQUENCY TABLES FOR TOP 100 FREQUENCIES

Unigram_GT Model:
----------------------------------------
    C (MLE)    N_C       C*
0         1  53838   0.5038
1         2  13563   1.4955
2         3   6761   2.4411
3         4   4126   3.4695
4         5   2863   4.6420
5         6   2215   5.1260
6         7   1622   6.5845
7         8   1335   7.3146
8         9   1085   8.9862
9        10    975   9.0821
10       11    805  10.4348
11       12    700  11.6814
12       13    629  13.1542
13       14    591  12.6142
14       15    497  14.2938
15       16    444  14.6644
16       17    383  19.7859
17       18    421  16.7435
18       19    371  17.0350
19       20    316  21.1994
... (showing first 20 of 100 entries)
Full table saved to Unigram_GT_frequency_table.csv

Bigram_GT Model:
----------------------------------------
    C (MLE)     N_C       C*
0         1  667192   0.3318
1         2  110688   1.2080
2         3   44571   2.1982
3         4   24494   3.1336
4         5   1535

In [8]:
class DeletedInterpolationQuadrigram:
    def __init__(self):
        self.unigram_model = None
        self.bigram_model = None
        self.trigram_model = None
        self.quadrigram_model = None
        self.lambdas = [0.25, 0.25, 0.25, 0.25]  # Initial equal weights
        
    def train(self, sentences):
        """Train all component models"""
        print("Training component models for deleted interpolation...")
        
        self.unigram_model = LanguageModel(1)
        self.unigram_model.train(sentences)
        
        self.bigram_model = LanguageModel(2)
        self.bigram_model.train(sentences)
        
        self.trigram_model = LanguageModel(3)
        self.trigram_model.train(sentences)
        
        self.quadrigram_model = LanguageModel(4)
        self.quadrigram_model.train(sentences)
        
        print("Component models trained.")
    
    def get_deleted_count(self, model, ngram):
        """Get count with current ngram deleted (for deleted interpolation)"""
        if ngram in model.ngrams:
            return max(0, model.ngrams[ngram] - 1)
        return 0
    
    def get_deleted_context_count(self, model, context):
        """Get context count with current context deleted"""
        if context in model.context_counts:
            return max(0, model.context_counts[context] - 1)
        return 0
    
    def compute_deleted_probability(self, ngram):
        """Compute probability using deleted interpolation"""
        if len(ngram) != 4:
            raise ValueError("This method is for quadrigrams only")
        
        # Get components
        quadrigram = ngram
        trigram = ngram[1:]
        bigram = ngram[2:]
        unigram = (ngram[3],)
        
        # Quadrigram probability (deleted)
        quad_context = quadrigram[:-1]
        quad_count = self.get_deleted_count(self.quadrigram_model, quadrigram)
        quad_context_count = self.get_deleted_context_count(self.quadrigram_model, quad_context)
        p4 = quad_count / quad_context_count if quad_context_count > 0 else 0
        
        # Trigram probability (deleted)
        tri_context = trigram[:-1]
        tri_count = self.get_deleted_count(self.trigram_model, trigram)
        tri_context_count = self.get_deleted_context_count(self.trigram_model, tri_context)
        p3 = tri_count / tri_context_count if tri_context_count > 0 else 0
        
        # Bigram probability (deleted)
        bi_context = bigram[:-1]
        bi_count = self.get_deleted_count(self.bigram_model, bigram)
        bi_context_count = self.get_deleted_context_count(self.bigram_model, bi_context)
        p2 = bi_count / bi_context_count if bi_context_count > 0 else 0
        
        # Unigram probability (deleted)
        uni_count = self.get_deleted_count(self.unigram_model, unigram)
        total_count = sum(self.unigram_model.ngrams.values()) - 1
        p1 = uni_count / total_count if total_count > 0 else 0
        
        # Interpolated probability
        prob = (self.lambdas[0] * p1 + 
                self.lambdas[1] * p2 + 
                self.lambdas[2] * p3 + 
                self.lambdas[3] * p4)
        
        return prob
    
    def optimize_lambdas(self, validation_sentences, max_iterations=10):
        """Optimize lambda parameters using EM algorithm on validation set"""
        print("Optimizing lambda parameters...")
        
        # Extract all quadrigrams from validation set
        validation_quadrigrams = []
        for sentence in validation_sentences:
            tokens = ['<s>'] * 3 + sentence.split() + ['</s>']
            for i in range(len(tokens) - 3):
                quadrigram = tuple(tokens[i:i + 4])
                validation_quadrigrams.append(quadrigram)
        
        for iteration in range(max_iterations):
            print(f"Iteration {iteration + 1}/{max_iterations}")
            
            # E-step: compute expectations
            lambda_numerators = [0, 0, 0, 0]
            total_count = 0
            
            for quadrigram in validation_quadrigrams:
                # Get component probabilities
                trigram = quadrigram[1:]
                bigram = quadrigram[2:]
                unigram = (quadrigram[3],)
                
                # Component probabilities (using deleted interpolation concept)
                quad_context = quadrigram[:-1]
                quad_count = self.get_deleted_count(self.quadrigram_model, quadrigram)
                quad_context_count = self.get_deleted_context_count(self.quadrigram_model, quad_context)
                p4 = quad_count / quad_context_count if quad_context_count > 0 else 0
                
                tri_context = trigram[:-1]
                tri_count = self.get_deleted_count(self.trigram_model, trigram)
                tri_context_count = self.get_deleted_context_count(self.trigram_model, tri_context)
                p3 = tri_count / tri_context_count if tri_context_count > 0 else 0
                
                bi_context = bigram[:-1]
                bi_count = self.get_deleted_count(self.bigram_model, bigram)
                bi_context_count = self.get_deleted_context_count(self.bigram_model, bi_context)
                p2 = bi_count / bi_context_count if bi_context_count > 0 else 0
                
                uni_count = self.get_deleted_count(self.unigram_model, unigram)
                total_uni_count = sum(self.unigram_model.ngrams.values()) - 1
                p1 = uni_count / total_uni_count if total_uni_count > 0 else 0
                
                # Current interpolated probability
                current_prob = (self.lambdas[0] * p1 + 
                               self.lambdas[1] * p2 + 
                               self.lambdas[2] * p3 + 
                               self.lambdas[3] * p4)
                
                if current_prob > 0:
                    # Compute expectations (responsibilities)
                    lambda_numerators[0] += (self.lambdas[0] * p1) / current_prob
                    lambda_numerators[1] += (self.lambdas[1] * p2) / current_prob
                    lambda_numerators[2] += (self.lambdas[2] * p3) / current_prob
                    lambda_numerators[3] += (self.lambdas[3] * p4) / current_prob
                    total_count += 1
            
            # M-step: update lambdas
            if total_count > 0:
                new_lambdas = [num / total_count for num in lambda_numerators]
                
                # Check convergence
                change = sum(abs(new_lambdas[i] - self.lambdas[i]) for i in range(4))
                self.lambdas = new_lambdas
                
                print(f"  Lambdas: {[f'{l:.4f}' for l in self.lambdas]}, Change: {change:.6f}")
                
                if change < 1e-6:
                    print("  Converged!")
                    break
        
        print(f"Final lambdas: {[f'{l:.4f}' for l in self.lambdas]}")
    
    def sentence_probability(self, sentence):
        """Calculate sentence probability using deleted interpolation"""
        tokens = ['<s>'] * 3 + sentence.split() + ['</s>']
        prob = 1.0
        
        for i in range(len(tokens) - 3):
            quadrigram = tuple(tokens[i:i + 4])
            quadrigram_prob = self.compute_deleted_probability(quadrigram)
            if quadrigram_prob <= 0:
                return 0
            prob *= quadrigram_prob
        
        return prob
    
    def sentence_log_probability(self, sentence):
        """Calculate log probability of sentence"""
        tokens = ['<s>'] * 3 + sentence.split() + ['</s>']
        log_prob = 0.0
        
        for i in range(len(tokens) - 3):
            quadrigram = tuple(tokens[i:i + 4])
            quadrigram_prob = self.compute_deleted_probability(quadrigram)
            if quadrigram_prob <= 0:
                return float('-inf')
            log_prob += log(quadrigram_prob)
        
        return log_prob

In [9]:
# 4. Implement deleted interpolation for quadrigram model
print("="*80)
print("DELETED INTERPOLATION QUADRIGRAM MODEL")
print("="*80)

# Create and train deleted interpolation model
deleted_interp_model = DeletedInterpolationQuadrigram()
deleted_interp_model.train(training_set)

# Optimize parameters using validation set
deleted_interp_model.optimize_lambdas(validation_set[:100])  # Use subset for efficiency

print("\nEvaluating deleted interpolation model on validation set...")
di_validation_results = []
for sentence in validation_set:
    try:
        log_prob = deleted_interp_model.sentence_log_probability(sentence)
        di_validation_results.append({'sentence': sentence, 'Deleted_Interpolation': log_prob})
    except:
        di_validation_results.append({'sentence': sentence, 'Deleted_Interpolation': float('-inf')})

di_validation_df = pd.DataFrame(di_validation_results)

print("\nEvaluating deleted interpolation model on test set...")
di_test_results = []
for sentence in test_set:
    try:
        log_prob = deleted_interp_model.sentence_log_probability(sentence)
        di_test_results.append({'sentence': sentence, 'Deleted_Interpolation': log_prob})
    except:
        di_test_results.append({'sentence': sentence, 'Deleted_Interpolation': float('-inf')})

di_test_df = pd.DataFrame(di_test_results)

print("Deleted interpolation evaluation completed!")

DELETED INTERPOLATION QUADRIGRAM MODEL
Training component models for deleted interpolation...
Component models trained.
Optimizing lambda parameters...
Iteration 1/10
  Lambdas: ['0.3402', '0.4156', '0.1816', '0.0626'], Change: 0.511687
Iteration 2/10
  Lambdas: ['0.3367', '0.4904', '0.1488', '0.0241'], Change: 0.149567
Iteration 3/10
  Lambdas: ['0.3292', '0.5273', '0.1321', '0.0114'], Change: 0.073681
Iteration 4/10
  Lambdas: ['0.3251', '0.5456', '0.1232', '0.0061'], Change: 0.036709
Iteration 5/10
  Lambdas: ['0.3231', '0.5550', '0.1184', '0.0035'], Change: 0.018733
Iteration 6/10
  Lambdas: ['0.3222', '0.5599', '0.1158', '0.0021'], Change: 0.009792
Iteration 7/10
  Lambdas: ['0.3217', '0.5625', '0.1145', '0.0013'], Change: 0.005198
Iteration 8/10
  Lambdas: ['0.3215', '0.5639', '0.1139', '0.0008'], Change: 0.002771
Iteration 9/10
  Lambdas: ['0.3214', '0.5646', '0.1135', '0.0005'], Change: 0.001468
Iteration 10/10
  Lambdas: ['0.3213', '0.5650', '0.1134', '0.0003'], Change: 0.0007

In [10]:
# Performance analysis and comparison
print("="*80)
print("PERFORMANCE ANALYSIS")
print("="*80)

# Good-Turing models performance
print("\nGOOD-TURING MODELS PERFORMANCE:")
print("-" * 50)

for col in ['Unigram_GT', 'Bigram_GT', 'Trigram_GT', 'Quadrigram_GT']:
    # Validation set
    val_valid = validation_df[col][validation_df[col] != float('-inf')]
    val_coverage = len(val_valid) / len(validation_df) * 100
    val_mean = val_valid.mean() if len(val_valid) > 0 else 0
    
    # Test set
    test_valid = test_df[col][test_df[col] != float('-inf')]
    test_coverage = len(test_valid) / len(test_df) * 100
    test_mean = test_valid.mean() if len(test_valid) > 0 else 0
    
    print(f"{col}:")
    print(f"  Validation: {val_coverage:.1f}% coverage, mean log prob: {val_mean:.4f}")
    print(f"  Test: {test_coverage:.1f}% coverage, mean log prob: {test_mean:.4f}")

# Deleted interpolation performance
di_val_valid = di_validation_df['Deleted_Interpolation'][di_validation_df['Deleted_Interpolation'] != float('-inf')]
di_val_coverage = len(di_val_valid) / len(di_validation_df) * 100
di_val_mean = di_val_valid.mean() if len(di_val_valid) > 0 else 0

di_test_valid = di_test_df['Deleted_Interpolation'][di_test_df['Deleted_Interpolation'] != float('-inf')]
di_test_coverage = len(di_test_valid) / len(di_test_df) * 100
di_test_mean = di_test_valid.mean() if len(di_test_valid) > 0 else 0

print(f"\nDELETED INTERPOLATION QUADRIGRAM:")
print(f"  Validation: {di_val_coverage:.1f}% coverage, mean log prob: {di_val_mean:.4f}")
print(f"  Test: {di_test_coverage:.1f}% coverage, mean log prob: {di_test_mean:.4f}")
print(f"  Final lambda parameters: {[f'{l:.4f}' for l in deleted_interp_model.lambdas]}")

# Sample results
print("\n" + "="*80)
print("SAMPLE RESULTS")
print("="*80)

print("\nFirst 5 validation sentences with all model probabilities:")
sample_val = validation_df.head(5)
sample_di_val = di_validation_df.head(5)

for i in range(5):
    print(f"\nSentence {i+1}: {sample_val.iloc[i]['sentence']}")
    print("-" * 60)
    for col in ['Unigram_GT', 'Bigram_GT', 'Trigram_GT', 'Quadrigram_GT']:
        score = sample_val.iloc[i][col]
        score_str = f"{score:.4f}" if score != float('-inf') else "-inf"
        print(f"{col:<15}: {score_str}")
    
    di_score = sample_di_val.iloc[i]['Deleted_Interpolation']
    di_score_str = f"{di_score:.4f}" if di_score != float('-inf') else "-inf"
    print(f"{'Deleted_Interp':<15}: {di_score_str}")

PERFORMANCE ANALYSIS

GOOD-TURING MODELS PERFORMANCE:
--------------------------------------------------
Unigram_GT:
  Validation: 75.0% coverage, mean log prob: -141.2295
  Test: 76.6% coverage, mean log prob: -140.5559
Bigram_GT:
  Validation: 100.0% coverage, mean log prob: -324.3017
  Test: 100.0% coverage, mean log prob: -324.5032
Trigram_GT:
  Validation: 100.0% coverage, mean log prob: -581.4157
  Test: 100.0% coverage, mean log prob: -582.0854
Quadrigram_GT:
  Validation: 100.0% coverage, mean log prob: -891.9291
  Test: 100.0% coverage, mean log prob: -890.9431

DELETED INTERPOLATION QUADRIGRAM:
  Validation: 63.9% coverage, mean log prob: -102.8221
  Test: 68.0% coverage, mean log prob: -105.5530
  Final lambda parameters: ['0.3213', '0.5650', '0.1134', '0.0003']

SAMPLE RESULTS

First 5 validation sentences with all model probabilities:

Sentence 1: मंत्री मिश्रा को बताने के बाद भी अतिथि शिक्षको का बेतन आज तक नहीं मिला।
-------------------------------------------------------

In [11]:
# Save all results
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Save Good-Turing results
validation_df.to_csv('good_turing_validation_results.csv', index=False, encoding='utf-8')
test_df.to_csv('good_turing_test_results.csv', index=False, encoding='utf-8')

# Save Deleted Interpolation results
di_validation_df.to_csv('deleted_interpolation_validation_results.csv', index=False, encoding='utf-8')
di_test_df.to_csv('deleted_interpolation_test_results.csv', index=False, encoding='utf-8')

print("✓ Good-Turing validation results saved to: good_turing_validation_results.csv")
print("✓ Good-Turing test results saved to: good_turing_test_results.csv")
print("✓ Deleted interpolation validation results saved to: deleted_interpolation_validation_results.csv")
print("✓ Deleted interpolation test results saved to: deleted_interpolation_test_results.csv")

# Summary
print("\n" + "="*80)
print("IMPLEMENTATION SUMMARY")
print("="*80)
print("✓ Created data splits:")
print(f"  - Training set: {len(training_set)} sentences")
print(f"  - Validation set: {len(validation_set)} sentences")
print(f"  - Test set: {len(test_set)} sentences")
print()
print("✓ Implemented Good-Turing smoothing for:")
print("  - Unigram Model")
print("  - Bigram Model")
print("  - Trigram Model")
print("  - Quadrigram Model")
print()
print("✓ Generated frequency tables (C, N_C, C*) for all models")
print()
print("✓ Implemented Deleted Interpolation for Quadrigram model")
print(f"  - Optimized lambda parameters: {[f'{l:.4f}' for l in deleted_interp_model.lambdas]}")
print()
print("✓ Evaluated all models on validation and test sets")
print("✓ Computed sentence probabilities using smoothed models")


SAVING RESULTS
✓ Good-Turing validation results saved to: good_turing_validation_results.csv
✓ Good-Turing test results saved to: good_turing_test_results.csv
✓ Deleted interpolation validation results saved to: deleted_interpolation_validation_results.csv
✓ Deleted interpolation test results saved to: deleted_interpolation_test_results.csv

IMPLEMENTATION SUMMARY
✓ Created data splits:
  - Training set: 139536 sentences
  - Validation set: 1000 sentences
  - Test set: 1000 sentences

✓ Implemented Good-Turing smoothing for:
  - Unigram Model
  - Bigram Model
  - Trigram Model
  - Quadrigram Model

✓ Generated frequency tables (C, N_C, C*) for all models

✓ Implemented Deleted Interpolation for Quadrigram model
  - Optimized lambda parameters: ['0.3213', '0.5650', '0.1134', '0.0003']

✓ Evaluated all models on validation and test sets
✓ Computed sentence probabilities using smoothed models
