In [1]:
import re
from collections import Counter, defaultdict
import random
import math
import requests
import os

In [2]:

def generate_ngrams(text, n):
    """
    Split text into overlapping character n-grams.

    The text is prefixed with n-1 '#' characters so that the first real
    character also appears as the last element of an n-gram.

    Parameters:
    text (str): Text to split
    n (int): Size of the n-grams

    Returns:
    list: Tuples of n characters, in left-to-right order
    """
    source = '#' * (n - 1) + text
    window_count = len(source) - n + 1
    return [tuple(source[start:start + n]) for start in range(window_count)]

In [4]:
# Demo: split a short string into character-level n-grams.
# NOTE(review): n=3 is passed here, so every tuple holds three characters
# (trigrams) even though the variable name and the printed label say "bigrams".
text = "Jai Hind"
bigrams = generate_ngrams(text, 3)
print("Character-Level Bigrams:", bigrams)

Character-Level Bigrams: [('#', '#', 'J'), ('#', 'J', 'a'), ('J', 'a', 'i'), ('a', 'i', ' '), ('i', ' ', 'H'), (' ', 'H', 'i'), ('H', 'i', 'n'), ('i', 'n', 'd')]


In [5]:
def build_ngram_model(corpus, n):
    """
    Estimate next-character probabilities from a text corpus.

    Every n-gram of the corpus is split into a context (its first n-1
    characters) and the character that follows it. Occurrences are counted
    per context and then divided by the context's total, so the values
    stored for each context sum to 1.

    Parameters:
    corpus (str): Text corpus for building the model
    n (int): Size of the n-grams

    Returns:
    defaultdict: Maps each context tuple to a Counter of
        {next character: relative frequency}
    """
    table = defaultdict(Counter)

    # Pass 1: tally how often each character follows each context.
    for gram in generate_ngrams(corpus, n):
        prefix, follower = gram[:-1], gram[-1]
        table[prefix][follower] += 1

    # Pass 2: turn the tallies into relative frequencies, in place.
    for followers in table.values():
        denominator = sum(followers.values())
        for symbol, occurrences in followers.items():
            followers[symbol] = occurrences / denominator

    return table

In [6]:
def add_smoothing(model, vocabulary_size, alpha=1.0, vocabulary=None):
    """
    Apply additive (add-alpha) smoothing to an n-gram model.

    For every context, each character's value is increased by alpha and the
    result is divided by (sum of the context's values + alpha * vocabulary_size).
    Characters of the vocabulary that were never seen after a context receive
    alpha / total, so they no longer have zero probability.

    The values in `model` are treated as counts. If the model already stores
    probabilities (each context summing to 1), alpha is weighed against 1
    rather than against the number of observations, which smooths far more
    aggressively -- pass raw counts for conventional add-alpha smoothing.

    Parameters:
    model (defaultdict): N-gram model mapping context -> {char: count}.
    vocabulary_size (int): Total number of unique characters in the vocabulary.
        Used in the denominator; if it is larger than the number of known
        symbols, the remaining mass is left for characters never observed.
    alpha (float): Smoothing parameter (default is 1.0).
    vocabulary (iterable, optional): The characters that make up the
        vocabulary. When omitted, the characters that occur anywhere in
        `model` are used.

    Returns:
    defaultdict: Smoothed n-gram model.
    """
    if vocabulary is None:
        # Collect every character observed after any context; dict.fromkeys
        # de-duplicates while keeping a deterministic (first-seen) order.
        symbols = list(dict.fromkeys(
            char for char_counts in model.values() for char in char_counts
        ))
    else:
        symbols = list(dict.fromkeys(vocabulary))

    smoothed_model = defaultdict(Counter)
    for prefix, char_counts in model.items():
        total_count = sum(char_counts.values()) + alpha * vocabulary_size
        for char in char_counts:
            smoothed_model[prefix][char] = (char_counts[char] + alpha) / total_count
        # Give unseen *characters* their share. The keys must be vocabulary
        # symbols, not integer indices, or sampling would yield non-characters.
        for char in symbols:
            if char not in char_counts:
                smoothed_model[prefix][char] = alpha / total_count
    return smoothed_model

In [7]:
def generate_text(model, n, start_text, length=100):
    """
    Generate text using the n-gram model.

    Starting from `start_text`, the last n-1 characters produced so far form
    the context; the next character is sampled from the model's distribution
    for that context. If fewer than n-1 characters exist yet, the context is
    left-padded with '#'. Generation stops early when the context is unknown
    to the model or has no candidate characters.

    Sampling uses the global `random` state, so results vary between runs
    unless the caller seeds it.

    Parameters:
    model (dict): Trained n-gram model (context tuple -> {char: weight})
    n (int): Size of the n-grams (must be >= 1)
    start_text (str): Initial text to start generation
    length (int): Maximum number of characters to generate

    Returns:
    str: start_text followed by the generated characters

    Raises:
    ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    context_size = n - 1
    current_text = list(start_text)

    for _ in range(length):
        if context_size == 0:
            # Unigram model: the context is empty. Slicing with [-0:] would
            # return the whole text instead, so handle this case explicitly.
            context = ()
        elif len(current_text) >= context_size:
            context = tuple(current_text[-context_size:])
        else:
            # Not enough history yet: pad on the left with '#'.
            padding = ['#'] * (context_size - len(current_text))
            context = tuple(padding + current_text)

        # If context not in model, stop generating
        if context not in model:
            break

        char_dist = model[context]
        if not char_dist:
            # A context with no candidates cannot be sampled from.
            break

        # Sample next character proportionally to its weight
        chars, probs = zip(*char_dist.items())
        next_char = random.choices(chars, weights=probs)[0]

        current_text.append(next_char)

    return ''.join(current_text)

In [14]:
# Sample text
text = "Aryton Senna was the greatest racer the world could ever see in F1, Ken Miles was the overpowered man for sure"

# Build a bigram model (n=2: each context is the single preceding character)
bigram_model = build_ngram_model(text, 2)

# Generate text: start from "ar" and sample up to 10 more characters.
# generate_text stops early if it reaches a context the model has never seen,
# and it samples with the `random` module; no seed is set in this cell, so the
# printed result is expected to differ between runs.
generated = generate_text(bigram_model, 2, "ar", 10)
print(f"Generated text: {generated}")

Generated text: are t the F1


In [15]:
def calculate_perplexity(model, n, test_text):
    """
    Calculate perplexity of the model on test text.

    Perplexity is 2 raised to the average negative log2-probability that the
    model assigns to each character of `test_text` given its n-1 preceding
    characters (the text is '#'-padded on the left by generate_ngrams).

    Parameters:
    model (dict): Trained n-gram model (context tuple -> {char: probability})
    n (int): Size of the n-grams
    test_text (str): Text to evaluate on

    Returns:
    float: Perplexity score. float('inf') is returned when the text cannot be
        scored: it yields no n-grams (e.g. it is empty), or it contains an
        n-gram that is unseen or has probability 0 in the model.
    """
    ngrams = generate_ngrams(test_text, n)
    if not ngrams:
        # Nothing to average over; avoid dividing by zero below.
        return float('inf')

    neg_log_prob = 0.0
    for ngram in ngrams:
        context = ngram[:-1]
        char = ngram[-1]

        # Membership is checked before indexing so that a defaultdict model
        # is not grown with empty entries for unseen contexts.
        if context in model and char in model[context]:
            prob = model[context][char]
        else:
            prob = 0

        if prob <= 0:
            # Unseen or zero-probability n-gram: log2 is undefined there.
            return float('inf')

        neg_log_prob -= math.log2(prob)

    return 2 ** (neg_log_prob / len(ngrams))

In [16]:
# First, let's create a more substantial training corpus
# (one short sentence per line; the literal starts and ends with a newline)
training_corpus = """
The quick brown fox jumps over the lazy dog.
She sells seashells by the seashore.
How much wood would a woodchuck chuck if a woodchuck could chuck wood?
To be or not to be, that is the question.
All that glitters is not gold.
A journey of a thousand miles begins with a single step.
Actions speak louder than words.
Beauty is in the eye of the beholder.
Every cloud has a silver lining.
Fortune favors the bold and brave.
Life is like a box of chocolates.
The early bird catches the worm.
Where there's smoke, there's fire.
Time heals all wounds and teaches all things.
Knowledge is power, and power corrupts.
Practice makes perfect, but nobody's perfect.
The pen is mightier than the sword.
When in Rome, do as the Romans do.
A picture is worth a thousand words.
Better late than never, but never late is better.
Experience is the best teacher of all things.
Laughter is the best medicine for the soul.
Music soothes the savage beast within us.
Nothing ventured, nothing gained in life.
The grass is always greener on the other side.
"""

# Clean the corpus: lowercase everything and keep only alphanumeric and
# whitespace characters. Newlines count as whitespace and are kept; punctuation
# is dropped, so e.g. "there's" becomes "theres". The name is rebound to the
# cleaned text, so the raw corpus is no longer available after this line.
training_corpus = ''.join(c.lower() for c in training_corpus if c.isalnum() or c.isspace())