In [1]:
import random
import re
import warnings
from collections import Counter, defaultdict
from math import log

import numpy as np
import pandas as pd

In [2]:
# Load the tokenized Hindi data; each non-blank line is treated as one sentence.
with open('tokenized_hi.txt', 'r', encoding='utf-8') as f:
    # Strip every line exactly once and drop the ones that are empty afterwards.
    sentences = [line for line in (raw.strip() for raw in f) if line]

print(f"Total sentences in dataset: {len(sentences)}")
print("First 5 sentences:")
# Slice rather than index so a corpus with fewer than 5 sentences cannot raise IndexError.
for i, sentence in enumerate(sentences[:5], start=1):
    print(f"{i}: {sentence}")

Total sentences in dataset: 141536
First 5 sentences:
1: लोगों को बिलों संबंधी सुविधा देना ही उनका काम
2: इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी , जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किया था।
3: हालांकि तब पार्टी पर देवीलाल की मजबूत पकड़ के चलते पार्टी टूटने से बच गई थी।
4: 1989 में देवीलाल केन्द्र की राजनीति में सक्रिय हो गए थे और उनके उपप्रधानमंत्री बनने के पश्चात् उनके तीन बेटों जगदीश सिंह , रणजीत सिंह और ओमप्रकाश चौटाला में से रणजीत और ओमप्रकाश के बीच हरियाणा में उनकी राजनीतिक विरासत को लेकर जंग शुरू हो गई थी।
5: उन परिस्थितियों में देवीलाल ने कड़ा निर्णय लेते हुए पार्टी की बागडोर ओमप्रकाश चौटाला के हवाले कर दी थी , जिसके बाद रणजीत की बगावत का असर पार्टी , संगठन और उनकी सरकार पर भी पड़ा था।


In [3]:
class LanguageModel:
    """Unsmoothed (maximum-likelihood) n-gram language model.

    Counts n-grams over whitespace-tokenized sentences and estimates the
    probability of an n-gram as ``count(ngram) / count(context)``.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        ngrams: Maps each n-gram tuple seen in training to its count.
        context_counts: Maps each (n-1)-token context tuple to the number of
            training n-grams that begin with it. A unigram model has no
            context, so it keeps its total token count under ``('',)``.
        vocabulary: Every token seen in training, including the padding
            symbols added by ``preprocess_sentence`` ('</s>' always, '<s>'
            when n > 1).
    """

    # Key under which a unigram model accumulates its total token count.
    _UNIGRAM_CONTEXT = ('',)

    def __init__(self, n):
        """Create an empty model of order ``n``."""
        self.n = n  # n-gram size
        self.ngrams = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocabulary = set()

    def preprocess_sentence(self, sentence):
        """Split ``sentence`` on whitespace and add boundary tokens.

        Returns:
            A list of tokens ending in '</s>'. When n > 1 the list is also
            prefixed with n-1 '<s>' tokens so the first real token has a
            full-length context.
        """
        tokens = sentence.split()
        if self.n > 1:
            return ['<s>'] * (self.n - 1) + tokens + ['</s>']
        return tokens + ['</s>']

    def _iter_ngrams(self, tokens):
        """Yield every contiguous window of ``self.n`` tokens as a tuple."""
        for start in range(len(tokens) - self.n + 1):
            yield tuple(tokens[start:start + self.n])

    def _context_of(self, ngram):
        """Return the key of ``context_counts`` that normalizes ``ngram``."""
        return ngram[:-1] if self.n > 1 else self._UNIGRAM_CONTEXT

    def train(self, sentences):
        """Accumulate n-gram and context counts from ``sentences``.

        Counts are added to whatever the model already holds, so calling
        ``train`` repeatedly keeps extending the same model.
        """
        for sentence in sentences:
            tokens = self.preprocess_sentence(sentence)
            self.vocabulary.update(tokens)

            for ngram in self._iter_ngrams(tokens):
                self.ngrams[ngram] += 1
                self.context_counts[self._context_of(ngram)] += 1

    def get_probability(self, ngram):
        """Return the maximum-likelihood probability of ``ngram``.

        Args:
            ngram: Tuple of ``self.n`` tokens.

        Returns:
            ``count(ngram) / count(context)``, or 0.0 when the context was
            never seen in training (or the model is untrained).
        """
        # Use .get() rather than indexing: indexing a defaultdict inserts the
        # missing key, so merely scoring unseen text would grow and alter the
        # trained model.
        context_count = self.context_counts.get(self._context_of(ngram), 0)
        if context_count <= 0:
            return 0.0
        # For unigrams the context count is the running total maintained by
        # train(), which avoids re-summing every count on each lookup.
        return self.ngrams.get(ngram, 0) / context_count

    def sentence_probability(self, sentence):
        """Return the product of the n-gram probabilities of ``sentence``.

        Returns 0 as soon as any n-gram has zero probability. Long sentences
        can underflow; ``sentence_log_probability`` avoids that.
        """
        prob = 1.0
        for ngram in self._iter_ngrams(self.preprocess_sentence(sentence)):
            ngram_prob = self.get_probability(ngram)
            if ngram_prob == 0:
                return 0  # If any n-gram has zero probability
            prob *= ngram_prob
        return prob

    def sentence_log_probability(self, sentence):
        """Return the natural-log probability of ``sentence``.

        Returns ``float('-inf')`` as soon as any n-gram has zero probability.
        """
        log_prob = 0.0
        for ngram in self._iter_ngrams(self.preprocess_sentence(sentence)):
            ngram_prob = self.get_probability(ngram)
            if ngram_prob == 0:
                return float('-inf')  # Log of zero is negative infinity
            log_prob += log(ngram_prob)
        return log_prob

In [4]:
class SmoothedLanguageModel(LanguageModel):
    """N-gram language model with additive (add-k style) smoothing.

    All three supported schemes share one formula,
    ``(count(ngram) + a) / (count(context) + a * V)``, where V is
    ``len(self.vocabulary)`` and the pseudo-count ``a`` depends on
    ``smoothing_type``:

    * ``'add_one'``        -> a = 1 (Laplace)
    * ``'add_k'``          -> a = ``self.k``
    * ``'add_token_type'`` -> a = V (algebraically add-k with k = V)

    Any other ``smoothing_type`` falls back to the unsmoothed estimate of
    ``LanguageModel.get_probability``.

    NOTE(review): for n > 1 the vocabulary also contains the '<s>' padding
    symbol, so V is one larger than the number of tokens that can actually be
    predicted; confirm whether that is intended.
    """

    def __init__(self, n, smoothing_type='add_one', k=1):
        """Create an empty smoothed model.

        Args:
            n: Order of the model.
            smoothing_type: 'add_one', 'add_k' or 'add_token_type'.
            k: Pseudo-count used by 'add_k' smoothing.

        Raises:
            ValueError: If ``k`` is negative (it would yield negative
                "probabilities").
        """
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k!r}")
        super().__init__(n)
        self.smoothing_type = smoothing_type
        self.k = k  # For add-k smoothing

    def get_smoothed_probability(self, ngram):
        """Return the probability of ``ngram`` under the configured smoothing."""
        if self.smoothing_type == 'add_one':
            return self._add_one_smoothing(ngram)
        if self.smoothing_type == 'add_k':
            return self._add_k_smoothing(ngram)
        if self.smoothing_type == 'add_token_type':
            return self._add_token_type_smoothing(ngram)
        # Unknown scheme: keep the historical fallback to the unsmoothed estimate.
        return self.get_probability(ngram)

    def _additive_smoothing(self, ngram, pseudo_count):
        """Return ``(count + a) / (context_count + a * V)`` for ``a = pseudo_count``.

        Returns 0.0 instead of raising ZeroDivisionError when the denominator
        is zero (e.g. a = 0 with an unseen context, or an untrained model).
        """
        # .get() instead of indexing: indexing a defaultdict inserts the key,
        # so scoring unseen n-grams would silently grow the trained tables.
        if self.n == 1:
            # train() keeps the running unigram total under ('',); reading it
            # avoids summing every count on each lookup.
            context_count = self.context_counts.get(('',), 0)
        else:
            context_count = self.context_counts.get(ngram[:-1], 0)

        denominator = context_count + pseudo_count * len(self.vocabulary)
        if denominator == 0:
            return 0.0
        return (self.ngrams.get(ngram, 0) + pseudo_count) / denominator

    def _add_one_smoothing(self, ngram):
        """Add-one (Laplace) smoothing."""
        return self._additive_smoothing(ngram, 1)

    def _add_k_smoothing(self, ngram):
        """Add-k smoothing with k = ``self.k``."""
        return self._additive_smoothing(ngram, self.k)

    def _add_token_type_smoothing(self, ngram):
        """Add-token-type smoothing: the pseudo-count is the vocabulary size."""
        return self._additive_smoothing(ngram, len(self.vocabulary))

    def _sentence_ngrams(self, sentence):
        """Yield the padded n-grams of ``sentence`` in order."""
        tokens = self.preprocess_sentence(sentence)
        for start in range(len(tokens) - self.n + 1):
            yield tuple(tokens[start:start + self.n])

    def sentence_smoothed_probability(self, sentence):
        """Return the product of smoothed n-gram probabilities of ``sentence``.

        Long sentences can underflow; prefer
        ``sentence_smoothed_log_probability`` for comparisons.
        """
        prob = 1.0
        for ngram in self._sentence_ngrams(sentence):
            prob *= self.get_smoothed_probability(ngram)
        return prob

    def sentence_smoothed_log_probability(self, sentence):
        """Return the natural-log smoothed probability of ``sentence``.

        Returns ``float('-inf')`` if any n-gram probability is not positive.
        """
        log_prob = 0.0
        for ngram in self._sentence_ngrams(sentence):
            ngram_prob = self.get_smoothed_probability(ngram)
            if ngram_prob <= 0:
                return float('-inf')
            log_prob += log(ngram_prob)
        return log_prob

In [5]:
# Fit one unsmoothed model per n-gram order (1 through 4) on the full corpus.
print("Training language models...")

# (order, display name, plural noun used in the progress line)
_base_specs = [
    (1, "Unigram", "unigrams"),
    (2, "Bigram", "bigrams"),
    (3, "Trigram", "trigrams"),
    (4, "Quadrigram", "quadrigrams"),
]

_base_models = []
for _order, _label, _plural in _base_specs:
    _model = LanguageModel(n=_order)
    _model.train(sentences)
    print(f"{_label} model: {len(_model.ngrams)} unique {_plural}")
    _base_models.append(_model)

# Later cells refer to the models by these names.
unigram_model, bigram_model, trigram_model, quadrigram_model = _base_models

print(f"Vocabulary size: {len(unigram_model.vocabulary)}")
print("Training completed!")

Training language models...
Unigram model: 103893 unique unigrams
Bigram model: 939355 unique bigrams
Trigram model: 2063410 unique trigrams
Quadrigram model: 2649242 unique quadrigrams
Vocabulary size: 103893
Training completed!


In [6]:
# Build the smoothed counterparts: three smoothing schemes for every n-gram order.
print("Creating smoothed language models...")


def _fit_smoothed_trio(order):
    """Train the add-one, add-k (k=0.5) and add-token-type models of one order.

    Returns the three trained models in that order.
    """
    add_one = SmoothedLanguageModel(n=order, smoothing_type='add_one')
    add_one.train(sentences)

    add_k = SmoothedLanguageModel(n=order, smoothing_type='add_k', k=0.5)
    add_k.train(sentences)

    add_token = SmoothedLanguageModel(n=order, smoothing_type='add_token_type')
    add_token.train(sentences)

    return add_one, add_k, add_token


# Smoothed unigram, bigram, trigram and quadrigram models, in that order.
unigram_add_one, unigram_add_k, unigram_add_token = _fit_smoothed_trio(1)
bigram_add_one, bigram_add_k, bigram_add_token = _fit_smoothed_trio(2)
trigram_add_one, trigram_add_k, trigram_add_token = _fit_smoothed_trio(3)
quadrigram_add_one, quadrigram_add_k, quadrigram_add_token = _fit_smoothed_trio(4)

print("Smoothed models created successfully!")

Creating smoothed language models...
Smoothed models created successfully!


In [7]:
# Select 1000 random sentences for testing
# NOTE(review): the sample is drawn from `sentences`, the same list every model
# in the earlier cells was trained on. Each test n-gram has therefore been seen
# in training, which is why even the unsmoothed models never score -inf and why
# higher orders look better. Hold the test sentences out *before* training to
# measure generalization.
random.seed(42)  # For reproducibility
test_sentences = random.sample(sentences, min(1000, len(sentences)))
print(f"Selected {len(test_sentences)} sentences for testing")
print("Sample test sentences:")
# Slice rather than index so a sample with fewer than 3 sentences cannot raise IndexError.
for i, sentence in enumerate(test_sentences[:3], start=1):
    print(f"{i}: {sentence}")

Selected 1000 sentences for testing
Sample test sentences:
1: रालोसपा के प्रदेश अध्यक्ष भूदेव चौधरी के पाला बदलकर राजद में सोमवार को शामिल होने के बारे में पूछे जाने पर कुशवाहा ने कहा कि मैंने दूसरे किनारे तक पहुंचने के इरादे से समुद्र में अपनी नाव डाली है पर बीच में छोड़कर जो जाना चाहते हैं तो उनका स्वागत है।
2: नए नियम के फायदे
3: अगर राहुल कंवल सवालों का जवाब हमें देते हैं तो इस खबर को उनके पक्ष के साथ अपडेट किया जाएगा।


In [8]:
# Function to evaluate models on test sentences
def evaluate_models(test_sentences, models, model_names):
    """Score every test sentence under every model.

    A model exposing ``sentence_smoothed_log_probability`` is scored with it;
    any other model is scored with ``sentence_log_probability``.

    Args:
        test_sentences: Iterable of sentence strings.
        models: Sequence of model objects.
        model_names: Sequence of result keys, one per model, in the same order.

    Returns:
        A list with one dict per sentence: ``{'sentence': <text>,
        <model name>: <log probability>, ...}``. A score is ``float('-inf')``
        when the model raised a numeric error on that sentence.

    Raises:
        ValueError: If ``models`` and ``model_names`` differ in length (``zip``
            would otherwise silently drop the extras).
    """
    models = list(models)
    model_names = list(model_names)
    if len(models) != len(model_names):
        raise ValueError(
            f"got {len(models)} models but {len(model_names)} model names"
        )

    # Resolve each model's scoring method once instead of once per sentence.
    scorers = []
    for model, name in zip(models, model_names):
        if hasattr(model, 'sentence_smoothed_log_probability'):
            scorers.append((name, model.sentence_smoothed_log_probability))
        else:
            scorers.append((name, model.sentence_log_probability))

    results = []
    for sentence in test_sentences:
        sentence_results = {'sentence': sentence}
        for name, score in scorers:
            try:
                sentence_results[name] = score(sentence)
            except (ArithmeticError, ValueError) as exc:
                # Best effort for numeric failures only (math domain errors,
                # division by zero). Programming errors propagate instead of
                # being disguised as "-inf" scores.
                warnings.warn(
                    f"{name}: scoring failed with {type(exc).__name__}: {exc}; "
                    "recording -inf",
                    RuntimeWarning,
                    stacklevel=2,
                )
                sentence_results[name] = float('-inf')
        results.append(sentence_results)

    return results

# Define all models and their names
# Single source of truth pairing each results-column name with its model, so the
# two parallel lists derived below cannot drift out of alignment.
_named_models = [
    # Original models
    ('Unigram', unigram_model),
    ('Bigram', bigram_model),
    ('Trigram', trigram_model),
    ('Quadrigram', quadrigram_model),
    # Add-one smoothed models
    ('Unigram_Add1', unigram_add_one),
    ('Bigram_Add1', bigram_add_one),
    ('Trigram_Add1', trigram_add_one),
    ('Quadrigram_Add1', quadrigram_add_one),
    # Add-k smoothed models
    ('Unigram_AddK', unigram_add_k),
    ('Bigram_AddK', bigram_add_k),
    ('Trigram_AddK', trigram_add_k),
    ('Quadrigram_AddK', quadrigram_add_k),
    # Add token type smoothed models
    ('Unigram_AddToken', unigram_add_token),
    ('Bigram_AddToken', bigram_add_token),
    ('Trigram_AddToken', trigram_add_token),
    ('Quadrigram_AddToken', quadrigram_add_token),
]

all_models = [model for _, model in _named_models]
model_names = [name for name, _ in _named_models]

print("Evaluating all models on test sentences...")
print("This may take a few minutes...")

Evaluating all models on test sentences...
This may take a few minutes...


In [9]:
# Score every test sentence under every model (one result dict per sentence).
results = evaluate_models(test_sentences, all_models, model_names)

# Convert results to a DataFrame: one row per sentence, one column per model
# plus 'sentence'. pandas is already imported in the notebook's import cell, so
# it is not re-imported here.
df_results = pd.DataFrame(results)
print("Evaluation completed!")
print(f"Results DataFrame shape: {df_results.shape}")
print("\nFirst few results:")
print(df_results.head())

Evaluation completed!
Results DataFrame shape: (1000, 17)

First few results:
                                            sentence     Unigram      Bigram  \
0  रालोसपा के प्रदेश अध्यक्ष भूदेव चौधरी के पाला ... -371.224642 -239.755235   
1                                   नए नियम के फायदे  -34.191738  -25.064741   
2  अगर राहुल कंवल सवालों का जवाब हमें देते हैं तो... -147.843928  -91.292771   
3             शशांक की दो महीने पहले ही शादी हुई है।  -65.330563  -40.877413   
4                      किंतु मामले में कुछ नहीं हुआ।  -44.776374  -27.317377   

      Trigram  Quadrigram  Unigram_Add1  Bigram_Add1  Trigram_Add1  \
0 -105.535552  -54.829386   -372.390006  -431.561550   -514.437669   
1  -16.956102  -12.841139    -34.334650   -43.445464    -48.454023   
2  -53.351091  -22.865604   -148.354561  -166.553783   -207.680047   
3  -30.945912  -17.420991    -65.581545   -72.999099    -98.262664   
4  -22.218029  -15.849293    -44.983329   -49.302828    -64.636086   

   Quadrigram_Add1  

In [10]:
# Per-model summary statistics over the sentences that received a finite score.
_banner = "=" * 80
print(_banner)
print("LANGUAGE MODEL PERFORMANCE ANALYSIS")
print(_banner)

# Every column except the sentence text holds one model's log probabilities.
model_columns = [col for col in df_results.columns if col != 'sentence']
_n_rows = len(df_results)

for col in model_columns:
    _scores = df_results[col]
    # A score of -inf marks a sentence the model assigned zero probability.
    valid_scores = _scores[_scores != float('-inf')]
    _n_valid = len(valid_scores)

    print(f"\n{col}:")
    print(f"  Valid predictions: {_n_valid}/{_n_rows} ({_n_valid/_n_rows*100:.1f}%)")

    if _n_valid == 0:
        print("  No valid predictions")
        continue

    _stats = (
        ("Mean", valid_scores.mean()),
        ("Std", valid_scores.std()),
        ("Min", valid_scores.min()),
        ("Max", valid_scores.max()),
    )
    for _stat_name, _stat_value in _stats:
        print(f"  {_stat_name} log probability: {_stat_value:.4f}")

LANGUAGE MODEL PERFORMANCE ANALYSIS

Unigram:
  Valid predictions: 1000/1000 (100.0%)
  Mean log probability: -167.0437
  Std log probability: 148.3488
  Min log probability: -2009.3965
  Max log probability: -8.5496

Bigram:
  Valid predictions: 1000/1000 (100.0%)
  Mean log probability: -100.7951
  Std log probability: 90.5417
  Min log probability: -1266.0692
  Max log probability: -7.5296

Trigram:
  Valid predictions: 1000/1000 (100.0%)
  Mean log probability: -48.0706
  Std log probability: 40.0302
  Min log probability: -608.9864
  Max log probability: -7.5296

Quadrigram:
  Valid predictions: 1000/1000 (100.0%)
  Mean log probability: -22.5874
  Std log probability: 13.8057
  Min log probability: -190.0451
  Max log probability: -7.5296

Unigram_Add1:
  Valid predictions: 1000/1000 (100.0%)
  Mean log probability: -167.2119
  Std log probability: 148.3774
  Min log probability: -2014.7825
  Max log probability: -8.6133

Bigram_Add1:
  Valid predictions: 1000/1000 (100.0%)
  Mea

In [11]:
# Compare how many sentences each model family could score (score != -inf).
print("\n" + "="*80)
print("COMPARISON: ORIGINAL vs SMOOTHED MODELS")
print("="*80)

# Result-column names grouped by smoothing family; position i in every list
# belongs to the same n-gram order.
original_models = ['Unigram', 'Bigram', 'Trigram', 'Quadrigram']
add_one_models = ['Unigram_Add1', 'Bigram_Add1', 'Trigram_Add1', 'Quadrigram_Add1']
add_k_models = ['Unigram_AddK', 'Bigram_AddK', 'Trigram_AddK', 'Quadrigram_AddK']
add_token_models = ['Unigram_AddToken', 'Bigram_AddToken', 'Trigram_AddToken', 'Quadrigram_AddToken']


def _count_valid(column):
    """Return how many rows of df_results have a score other than -inf in `column`."""
    scores = df_results[column]
    return len(scores[scores != float('-inf')])


print("\nPERCENTAGE OF VALID PREDICTIONS:")
print("-" * 50)
total = len(df_results)
_families = zip(
    ['Unigram', 'Bigram', 'Trigram', 'Quadrigram'],
    original_models,
    add_one_models,
    add_k_models,
    add_token_models,
)
for ngram_type, _orig_col, _add1_col, _addk_col, _token_col in _families:
    orig_valid = _count_valid(_orig_col)
    add1_valid = _count_valid(_add1_col)
    addk_valid = _count_valid(_addk_col)
    token_valid = _count_valid(_token_col)

    print(f"{ngram_type}:")
    print(f"  Original:     {orig_valid}/{total} ({orig_valid/total*100:.1f}%)")
    print(f"  Add-One:      {add1_valid}/{total} ({add1_valid/total*100:.1f}%)")
    print(f"  Add-K:        {addk_valid}/{total} ({addk_valid/total*100:.1f}%)")
    print(f"  Add-Token:    {token_valid}/{total} ({token_valid/total*100:.1f}%)")
    print()


COMPARISON: ORIGINAL vs SMOOTHED MODELS

PERCENTAGE OF VALID PREDICTIONS:
--------------------------------------------------
Unigram:
  Original:     1000/1000 (100.0%)
  Add-One:      1000/1000 (100.0%)
  Add-K:        1000/1000 (100.0%)
  Add-Token:    1000/1000 (100.0%)

Bigram:
  Original:     1000/1000 (100.0%)
  Add-One:      1000/1000 (100.0%)
  Add-K:        1000/1000 (100.0%)
  Add-Token:    1000/1000 (100.0%)

Trigram:
  Original:     1000/1000 (100.0%)
  Add-One:      1000/1000 (100.0%)
  Add-K:        1000/1000 (100.0%)
  Add-Token:    1000/1000 (100.0%)

Quadrigram:
  Original:     1000/1000 (100.0%)
  Add-One:      1000/1000 (100.0%)
  Add-K:        1000/1000 (100.0%)
  Add-Token:    1000/1000 (100.0%)



In [12]:
# Show examples of sentence probabilities
print("\n" + "="*80)
print("SAMPLE SENTENCE PROBABILITY CALCULATIONS")
print("="*80)

# Show results for first 5 test sentences
sample_results = df_results.head(5)

# Build the header with the same column widths as the data rows; the previous
# hand-padded header used a wider first column, so the table was misaligned.
table_header = f"{'Model Type':<14} | {'Add-One':<12} | {'Add-K':<12} | Add-Token"

for idx, row in sample_results.iterrows():
    print(f"\nSentence {idx+1}: {row['sentence']}")
    print("-" * 60)

    # Show results for each smoothing technique across all n-grams
    print(table_header)
    print("-" * 60)

    for ngram in ['Unigram', 'Bigram', 'Trigram', 'Quadrigram']:
        # The '.4f' format already renders float('-inf') as "-inf", so no
        # special case is needed for sentences a model could not score.
        add_one_str = f"{row[f'{ngram}_Add1']:.4f}"
        add_k_str = f"{row[f'{ngram}_AddK']:.4f}"
        add_token_str = f"{row[f'{ngram}_AddToken']:.4f}"

        print(f"{ngram:<14} | {add_one_str:<12} | {add_k_str:<12} | {add_token_str}")

    print()


SAMPLE SENTENCE PROBABILITY CALCULATIONS

Sentence 1: रालोसपा के प्रदेश अध्यक्ष भूदेव चौधरी के पाला बदलकर राजद में सोमवार को शामिल होने के बारे में पूछे जाने पर कुशवाहा ने कहा कि मैंने दूसरे किनारे तक पहुंचने के इरादे से समुद्र में अपनी नाव डाली है पर बीच में छोड़कर जो जाना चाहते हैं तो उनका स्वागत है।
------------------------------------------------------------
Model Type        | Add-One      | Add-K        | Add-Token
------------------------------------------------------------
Unigram        | -372.3900    | -371.8078    | -590.9135
Bigram         | -431.5615    | -406.5328    | -600.1768
Trigram        | -514.4377    | -488.8869    | -600.6138
Quadrigram     | -552.1246    | -529.4567    | -600.6573


Sentence 2: नए नियम के फायदे
------------------------------------------------------------
Model Type        | Add-One      | Add-K        | Add-Token
------------------------------------------------------------
Unigram        | -34.3346     | -34.2638     | -56.1186
Bigram         |

In [13]:
# Save results to CSV for further analysis
output_file = 'language_model_results.csv'
df_results.to_csv(output_file, index=False, encoding='utf-8')
print(f"Results saved to {output_file}")

# Summary of the implementation. The test-set size and k are read from the live
# objects rather than hardcoded, so the summary cannot disagree with what was
# actually run (the sample is min(1000, len(sentences)) sentences).
print("\n" + "="*80)
print("IMPLEMENTATION SUMMARY")
print("="*80)
print("✓ Built 4 Language Models:")
print("  - Unigram Model")
print("  - Bigram Model")
print("  - Trigram Model")
print("  - Quadrigram Model")
print()
print("✓ Implemented 3 Smoothing Techniques:")
print("  - Add-One Smoothing (Laplace)")
print(f"  - Add-K Smoothing (k={unigram_add_k.k})")
print("  - Add Token Type Smoothing")
print()
print(f"✓ Evaluated on {len(test_sentences)} random sentences from the dataset")
print("✓ Calculated log probabilities to avoid numerical underflow")
print("✓ Applied smoothing to all 4 n-gram models")
print()
print(f"Total dataset size: {len(sentences)} sentences")
print(f"Vocabulary size: {len(unigram_model.vocabulary)} unique tokens")
print(f"Test set size: {len(test_sentences)} sentences")

Results saved to language_model_results.csv

IMPLEMENTATION SUMMARY
✓ Built 4 Language Models:
  - Unigram Model
  - Bigram Model
  - Trigram Model
  - Quadrigram Model

✓ Implemented 3 Smoothing Techniques:
  - Add-One Smoothing (Laplace)
  - Add-K Smoothing (k=0.5)
  - Add Token Type Smoothing

✓ Evaluated on 1000 random sentences from the dataset
✓ Calculated log probabilities to avoid numerical underflow
✓ Applied smoothing to all 4 n-gram models

Total dataset size: 141536 sentences
Vocabulary size: 103893 unique tokens
Test set size: 1000 sentences
