In [1]:
import random
from collections import Counter, defaultdict
import random
import math
import requests
import os

In [2]:
def generate_ngrams(text, n):
    """
    Generate n-grams (character-level) from a given text.

    Parameters:
    text (str): Input text
    n (int): Size of the n-grams

    Returns:
    list: A list of n-grams as tuples
    """
    # Added padding with '#' characters to handle the start of sequences
    padded_text = "#" * (n - 1) + text
    ngrams = []
    for i in range(len(padded_text) - n + 1):
        ngram = tuple(padded_text[i:i+n])
        ngrams.append(ngram)
    return ngrams

generate_ngrams("I love machine",3)

[('#', '#', 'I'),
 ('#', 'I', ' '),
 ('I', ' ', 'l'),
 (' ', 'l', 'o'),
 ('l', 'o', 'v'),
 ('o', 'v', 'e'),
 ('v', 'e', ' '),
 ('e', ' ', 'm'),
 (' ', 'm', 'a'),
 ('m', 'a', 'c'),
 ('a', 'c', 'h'),
 ('c', 'h', 'i'),
 ('h', 'i', 'n'),
 ('i', 'n', 'e')]

In [3]:
def build_ngram_model(corpus, n):
    """
    Build an n-gram language model from the corpus.

    Parameters:
    corpus (str): Text corpus for building the model
    n (int): Size of the n-grams

    Returns:
    dict: A probability distribution for each context
    """
    # Initialize the model
    model = defaultdict(Counter)

    # Generate n-grams
    ngrams = generate_ngrams(corpus, n)

    # Build the model
    for ngram in ngrams:
        context = ngram[:-1]  # all but the last character
        char = ngram[-1]      # the last character
        model[context][char] += 1

    # Convert counts to probabilities
    for context in model:
        total_count = sum(model[context].values())
        for char in model[context]:
            model[context][char] = model[context][char] / total_count

    return model

In [4]:
def add_smoothing(model, vocabulary_size, alpha=1.0):
    """
    Apply smoothing to an n-gram model.

    Parameters:
    model (defaultdict): N-gram model.
    vocabulary_size (int): Total number of unique characters in the vocabulary.
    alpha (float): Smoothing parameter (default is 1.0).

    Returns:
    defaultdict: Smoothed n-gram model.
    """
    smoothed_model = defaultdict(Counter)
    for prefix, char_counts in model.items():
        total_count = sum(char_counts.values()) + alpha * vocabulary_size
        for char in char_counts:
            smoothed_model[prefix][char] = (char_counts[char] + alpha) / total_count
        for char in range(vocabulary_size):
            if char not in char_counts:
                smoothed_model[prefix][char] = alpha / total_count
    return smoothed_model

#### Generating texts using N-Gram model.

In [20]:
def generate_text(model, n, start_text, length = 100):
    """
    Generate text using N-Gram model.
    Parameters : 
    model(dict) : trained n-gram model
    n (int size) : size of n-gram model
    start_text (str) : initial text to start generation
    length(int) : number of characters to generate
    Returns : Generated Text (str)
    """
    #Initialise start_text
    current_text = list(start_text)
    
    #Generate characters
    for _ in range(length):
        context = tuple(current_text[-n+1:]) if len(current_text) >= n-1 else tuple('#' * (n-1-len(current_text))+''.join(current_text))
        if context not in model:
            break
        char_dist = model[context]
        #Sample next character
        chars,probs = zip(*char_dist.items())
        next_char = random.choices(chars, weights=probs)[0]
        #Append to current text
        current_text.append(next_char)
    return "".join(current_text)

In [40]:
##Sample text
text = "hello world this is a sample text for testing the n-gram model."
#Build the n-gram model
bigram_model = build_ngram_model(text,2)
##Generate text
generated = generate_text(bigram_model,2,"he",10)
print(f"Generated Text : {generated}")

Generated Text : he text amo 


#### Evaluation of model
##### Perplexity

In [41]:
def calculate_perplexity(model, n , test_text):
    """
    Calculate the perplexity of a given text using an n-gram model.

    Parameters:
    model (dict): N-gram model.
    n (int): Size of the n-gram model.
    test_text (str): Text to calculate perplexity for.

    Returns:
    float: Perplexity value.
    """
    ngrams = generate_ngrams(test_text,n)
    log_prob = 0
    total_ngrams = len(ngrams)
    for ngram in ngrams:
        context = ngram[:-1]
        char = ngram[-1]
        if context in model and char in model[context]:
            prob = model[context][char]
            log_prob += -1 * math.log2(prob)
        else:
            return float('inf') #Return infinity for unseen n-grams
        return 2**(log_prob/total_ngrams)


In [43]:
## Training corpus
training_corpus = """
The quick brown fox jumps over the lazy dog.
She sells seashells by the seashore.
How much wood would a woodchuck chuck if a woodchuck could chuck wood?
To be or not to be, that is the question.
All that glitters is not gold.
A journey of a thousand miles begins with a single step.
Actions speak louder than words.
Beauty is in the eye of the beholder.
Every cloud has a silver lining.
Fortune favors the bold and brave.
Life is like a box of chocolates.
The early bird catches the worm.
Where there's smoke, there's fire.
Time heals all wounds and teaches all things.
Knowledge is power, and power corrupts.
Practice makes perfect, but nobody's perfect.
The pen is mightier than the sword.
When in Rome, do as the Romans do.
A picture is worth a thousand words.
Better late than never, but never late is better.
Experience is the best teacher of all things.
Laughter is the best medicine for the soul.
Music soothes the savage beast within us.
"""

training_corpus = ''.join(c.lower() for c in training_corpus if c.isalnum() or c.isspace())

In [44]:
training_corpus

'\nthe quick brown fox jumps over the lazy dog\nshe sells seashells by the seashore\nhow much wood would a woodchuck chuck if a woodchuck could chuck wood\nto be or not to be that is the question\nall that glitters is not gold\na journey of a thousand miles begins with a single step\nactions speak louder than words\nbeauty is in the eye of the beholder\nevery cloud has a silver lining\nfortune favors the bold and brave\nlife is like a box of chocolates\nthe early bird catches the worm\nwhere theres smoke theres fire\ntime heals all wounds and teaches all things\nknowledge is power and power corrupts\npractice makes perfect but nobodys perfect\nthe pen is mightier than the sword\nwhen in rome do as the romans do\na picture is worth a thousand words\nbetter late than never but never late is better\nexperience is the best teacher of all things\nlaughter is the best medicine for the soul\nmusic soothes the savage beast within us\n'

In [45]:
## build models of different orders 
def build_models(corpus):
    models = {}
    for n in [1,2,3]:
        models[n] = build_ngram_model(corpus,n)
        return models
#Build the models
models = build_models(training_corpus)

In [61]:
## Generate samples and calculate perplexity
def evaluate_samples(models, num_samples=10, sample_length = 40):
    """
    Evaluate multiple n-gram models by generating text samples and calculating their perplexity scores.
    
    Parameters :
    models : dict
    Dictionay where keys are n-gram model size (eg. 2 for bigram model)
    and values are trained n-gram models.

    num_smaples : int, optional(default = 10)
    Number of text samples to generate for each n-gram model.

    sample_length : int, optional(default = 40)
    length of each generated text sample in characters

    """
    results = defaultdict(list)
    for n, model in models.items():
        print(f"\n=== {n}-gram Model Evaluation ===")

    # Generate multiple samples
        start_text = training_corpus[:n-1]
        for i in range(num_samples):
        # Generate sample
            generated = generate_text(model, n, start_text, sample_length)
        
        # Calculate perplexity
            perplexity = calculate_perplexity(model, n, generated)
        
            print(f"\nSample {i+1}:")
            print(f"Text: {generated}")
            print(f"Perplexity: {perplexity:.2f}")  
            
            results[n].append({
                'text' : generated,
                'perplexity' : perplexity
            })

        # Calculate average perplexity for this n-gram model

        avg_perplexity = sum(sample['perplexity'] for sample in results[n]) / len(results[n])
        
        print(f"\nAverage Perplexity for {n}-gram Model: {avg_perplexity:.2f}")
    return results  

In [62]:
# Evaluate samples
results = evaluate_samples(models)

# Calculate statistics for each model
print("\n== Overall Statistics ==")
for n in models.keys():
    perplexities = [sample['perplexity'] for sample in results[n]]
    min_perp = min(perplexities)
    max_perp = max(perplexities)
    avg_perp = sum(perplexities) / len(perplexities)
    
    print(f"\n{n}-gram Model Statistics:")
    print(f"Minimum Perplexity: {min_perp:.2f}")
    print(f"Maximum Perplexity: {max_perp:.2f}")
    print(f"Average Perplexity: {avg_perp:.2f}")


=== 1-gram Model Evaluation ===

Sample 1:
Text: o
Perplexity: 15.00

Sample 2:
Text: b
Perplexity: 48.16

Sample 3:
Text: o
Perplexity: 15.00

Sample 4:
Text: g
Perplexity: 76.25

Sample 5:
Text: a
Perplexity: 19.06

Sample 6:
Text:  
Perplexity: 6.14

Sample 7:
Text: l
Perplexity: 26.14

Sample 8:
Text: h
Perplexity: 17.94

Sample 9:
Text: c
Perplexity: 32.68

Sample 10:
Text: m
Perplexity: 76.25

Average Perplexity for 1-gram Model: 33.26

== Overall Statistics ==

1-gram Model Statistics:
Minimum Perplexity: 6.14
Maximum Perplexity: 76.25
Average Perplexity: 33.26
