In [5]:
# importing required libraries
import re
from collections import Counter, defaultdict
import random
import math
import requests
import os

## 1. N-Gram

In [6]:
def generate_ngrams(text, n):
    # added padding with '#' characters to handle the start of sequences
    padded_text = '#' * (n-1) + text
    ngrams = []
    for i in range(len(padded_text) - n + 1):
        ngram = tuple(padded_text[i:i+n])
        ngrams.append(ngram)
    return ngrams

In [7]:
text = "hello world"

# generate and display bigrams
bigrams = generate_ngrams(text, 2)
print("Character-Level Bigrams:", bigrams)

Character-Level Bigrams: [('#', 'h'), ('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o'), ('o', ' '), (' ', 'w'), ('w', 'o'), ('o', 'r'), ('r', 'l'), ('l', 'd')]


In [8]:
def build_ngram_model(corpus, n):
    model = defaultdict(Counter)
    ngrams = generate_ngrams(corpus, n)
    
    # build the model
    for ngram in ngrams:
        context = ngram[:-1]
        char = ngram[-1]
        model[context][char] += 1
        
    # convert counts to probabilities
    for context in model:
        total_count = sum(model[context].values())
        for char in model[context]:
            model[context][char] = model[context][char] / total_count
    
    return model

## 2. Smoothing

In [9]:
def add_smoothing(model, vocabulary_size, alpha=1.0):
    smoothed_model = defaultdict(Counter)
    for prefix, char_counts in model.items():
        total_count = sum(char_counts.values()) + alpha * vocabulary_size
        for char in char_counts:
            smoothed_model[prefix][char] = (char_counts[char] + alpha) / total_count
        for char in range(vocabulary_size):
            if char not in char_counts:
                smoothed_model[prefix][char] = alpha / total_count
    return smoothed_model

## 3. Generating Text Using the N-Gram Model

In [15]:
def generate_text(model, n, start_text, length=100):
    current_text = list(start_text)
    
    # generate characters
    for _ in range(length):
        context = tuple(current_text[-(n-1):]) if len(current_text) >= n-1 else tuple('#' * (n-1 - len(current_text)) + ''.join(current_text))
        
        if context not in model:
            break
        
        # get probability distribution for next character
        char_dist = model[context]
        
        # sample next character
        chars, probs = zip(*char_dist.items())
        next_char = random.choices(chars, weights=probs)[0]
        
        # append to generated text
        current_text.append(next_char)
        
    return ''.join(current_text)

In [16]:
text = "hello world this is a sample text for testing the n-gram model"

bigram_model = build_ngram_model(text, 2)

generated = generate_text(bigram_model, 2, "he", 10)
print(f"Generated text: {generated}")

Generated text: hexthe thele


## 4. Evaluating the Language Model

In [17]:
def calculate_perplexity(model, n, test_text):
    ngrams = generate_ngrams(test_text, n)
    log_prob = 0
    total_ngrams = len(ngrams)
    
    for ngram in ngrams:
        context = ngram[:-1]
        char = ngram[-1]
        
        if context in model and char in model[context]:
            prob = model[context][char]
            log_prob += -1 * math.log2(prob)
        else:
            return float('inf')
    return 2 ** (log_prob / total_ngrams)

In [18]:
training_corpus = """
The quick brown fox jumps over the lazy dog. 
She sells seashells by the seashore. 
How much wood would a woodchuck chuck if a woodchuck could chuck wood? 
To be or not to be, that is the question. 
All that glitters is not gold. 
A journey of a thousand miles begins with a single step. 
Actions speak louder than words. 
Beauty is in the eye of the beholder. 
Every cloud has a silver lining. 
Fortune favors the bold and brave. 
Life is like a box of chocolates. 
The early bird catches the worm. 
Where there's smoke, there's fire. 
Time heals all wounds and teaches all things. 
Knowledge is power, and power corrupts. 
Practice makes perfect, but nobody's perfect. 
The pen is mightier than the sword. 
When in Rome, do as the Romans do. 
A picture is worth a thousand words. 
Better late than never, but never late is better.
Experience is the best teacher of all things.
Laughter is the best medicine for the soul.
Music soothes the savage beast within us.
Nothing ventured, nothing gained in life.
The grass is always greener on the other side.
"""

# clean the corpus
training_corpus = ''.join(c.lower() for c in training_corpus if c.isalnum() or c.isspace())

In [20]:
training_corpus

'\nthe quick brown fox jumps over the lazy dog \nshe sells seashells by the seashore \nhow much wood would a woodchuck chuck if a woodchuck could chuck wood \nto be or not to be that is the question \nall that glitters is not gold \na journey of a thousand miles begins with a single step \nactions speak louder than words \nbeauty is in the eye of the beholder \nevery cloud has a silver lining \nfortune favors the bold and brave \nlife is like a box of chocolates \nthe early bird catches the worm \nwhere theres smoke theres fire \ntime heals all wounds and teaches all things \nknowledge is power and power corrupts \npractice makes perfect but nobodys perfect \nthe pen is mightier than the sword \nwhen in rome do as the romans do \na picture is worth a thousand words \nbetter late than never but never late is better\nexperience is the best teacher of all things\nlaughter is the best medicine for the soul\nmusic soothes the savage beast within us\nnothing ventured nothing gained in life\n

In [21]:
def build_models(corpus):
    models = {}
    for n in [2, 3, 4]:
        models[n] = build_ngram_model(corpus, n)
    return models

models = build_models(training_corpus)

In [22]:
def evaluate_samples(models, num_samples=10, sample_length=40):
    results = defaultdict(list)
    
    for n, model in models.items():
        print(f"\n=== {n}-gram Model Evaluation ===")
        
        start_text = training_corpus[:n-1]
        for i in range(num_samples):
            generated = generate_text(model, n, start_text, sample_length)
            perplexity = calculate_perplexity(model, n, generated)
            
            print(f"\nSample {i+1}:")
            print(f"Text: {generated}")
            print(f"Perplexity: {perplexity:.2f}")
            
            results[n].append({
                'text': generated,
                'perplexity': perplexity
            })
        
        avg_perplexity = sum(sample['perplexity'] for sample in results[n]) / len(results[n])
        print(f"\nAverage Perplexity for {n}-gram model: {avg_perplexity}:.2f")
        
    return results

In [24]:
results = evaluate_samples(models)

print("\n=== Overall Statistics ===")
for n in models.keys():
    perplexities = [sample['perplexity'] for sample in results[n]]
    min_prep = min(perplexities)
    max_prep = max(perplexities)
    avg_prep = sum(perplexities) / len(perplexities)
    
    print(f"\n{n}-gram Model Statistics:")
    print(f"Minimum Perplexity: {min_prep:.2f}")
    print(f"Maximum Perplexity: {max_prep:.2f}")
    print(f"Average Perplexity: {avg_prep:.2f}")


=== 2-gram Model Evaluation ===

Sample 1:
Text: 
winicather s ince pioro p lwo thack fowo
Perplexity: 8.28

Sample 2:
Text: 
ndy ioldchtox qud ld ir wha thethte insh
Perplexity: 8.40

Sample 3:
Text: 
ectermisins atha ts hun ferneanotereeswh
Perplexity: 9.67

Sample 4:
Text: 
bex the t thes tuinglore ownsagisen mir 
Perplexity: 7.62

Sample 5:
Text: 
be tofer s inglitthed 
nted thean 
whe t
Perplexity: 6.25

Sample 6:
Text: 
f s boucthell
ttud sisa s 
the
tees 
azy
Perplexity: 7.34

Sample 7:
Text: 
th s 
e werogllord s hexpthe 
sty theras
Perplexity: 7.15

Sample 8:
Text: 
s tha dis ponghttinord 
muche thangr
be 
Perplexity: 7.02

Sample 9:
Text: 
he prtell wous ber mathols agermins chay
Perplexity: 7.87

Sample 10:
Text: 
t llox cke s n erd sindche l cte byellot
Perplexity: 8.07

Average Perplexity for 2-gram model: 7.7661008231753375:.2f

=== 3-gram Model Evaluation ===

Sample 1:
Text: 
to 
shors peaughtions ney bodys ned 
a be
Perplexity: 3.29

Sample 2:
Text: 
the beging ver li