An N-gram model uses the frequency of N-grams in a text to calculate the probabiliy of he next word in a sequence



# Creating a triagram based language model 
Why?

Cause Trigram provides a balance betweeen simplicity and contextual accuracy by considering two precding words.

## Objective
1. Create a trigram lang model from  a given corpus
2. Use the model to predict the nxt word in a given sequence
3. Evaluate the model performance using perplexity as a metric

In [2]:
import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist
from collections import defaultdict, Counter
import math 
import random

In [3]:
nltk.download("gutenberg")
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /home/joe/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /home/joe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
#Loading a sample text

from nltk.corpus import gutenberg
text = gutenberg.words('carroll-alice.txt')

In [6]:
#Preprocessing the text: Lowrrcase and tokenization

sentences = nltk.sent_tokenize(' '.join(text))
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]

print(f"Number of sentences: {len(tokenized_sentences)}")
print(f"Sample sentences: {tokenized_sentences[0]}")

Number of sentences: 1629
Sample sentences: ['[', 'alice', "'", 's', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', '1865', ']', 'chapter', 'i', '.']


## Building the trigram model
### Steps
1. Generate the trigrams from the text
2. Count the occurence of each trigram and bigram
3. Calculate the conditional probability of each trigram

In [8]:
# Generate trigram and count their frequncies

trigram_count = defaultdict(Counter)
bigram_counts = Counter()

for sentence in tokenized_sentences:
    trigrams = list(ngrams(sentence, 3))
    bigrams = list(ngrams(sentence, 2))

    for trigram in trigrams:
        trigram_count[(trigram[0], trigram[1])][trigram[2]] += 1

    for bigram in bigrams:
        bigram_counts[bigram] += 1

print(f"Number of unique bigrams: {len(bigram_counts)}")
print(f"Number of unique trigrams: {len(trigram_count)}")

Number of unique bigrams: 14163
Number of unique trigrams: 13932


In [11]:
# Predicting the next word

def predict_next_word(w1, w2, trigram_counts):
    """Predict the next word, based on tigram probabilities"""

    next_words_probs = trigram_counts.get((w1, w2), {})
    if not next_words_probs:
        return  "Unknown"
    return max(next_words_probs, key=next_words_probs.get)


w1, w2 = "alice", "was"
precidted_word = predict_next_word(w1, w2, trigram_count)

print(f"Given words: '{w1} {w2}', Predicted next word: '{precidted_word}'")

Given words: 'alice was', Predicted next word: 'not'


In [14]:
# Evaluating the model with perplexity

def calculate_perpleity(test_sentences, trigram_counts, bigram_counts):
    """Calculate the perplexity of the trigram model on test data."""

    total_log_prob = 0
    total_words = 0

    for sentence in test_sentences:
        trigrams = list(ngrams(sentence, 3))
        for trigram in trigrams:
            w1, w2, w3 = trigram
            trigram_prob = (trigram_counts[(w1, w2)][w3] / bigram_counts[(w1, w2)]
                            if (w1, w2) in bigram_counts else 1e-6)
            total_log_prob += math.log2(trigram_prob)
            total_words += 1

    perplexity = 2 ** (-total_log_prob / total_words)
    return perplexity


# Evaluate the model on a subset of the data
test_sentences = tokenized_sentences[:100]
perplexity = calculate_perpleity(test_sentences, trigram_count, bigram_counts)
print(f"Perplexity of the trigram model:  {perplexity}")


Perplexity of the trigram model:  2.923313470850842


In [None]:
# Impovement of the above code


# def calculate_perplexity(test_sentences, trigram_counts, bigram_counts, vocab_size, alpha=1.0):
#     """
#     Calculate the perplexity of a trigram language model using Laplace smoothing.
    
#     Parameters:
#     - test_sentences: List of tokenized sentences
#     - trigram_counts: Dictionary of trigram frequencies
#     - bigram_counts: Dictionary of bigram frequencies
#     - vocab_size: Total number of unique words in the dataset
#     - alpha: Smoothing parameter (default=1 for Laplace smoothing)
    
#     Returns:
#     - perplexity: Measure of how well the model predicts unseen data
#     """

#     total_log_prob = 0
#     total_words = 0

#     for sentence in test_sentences:
#         trigrams = list(ngrams(sentence, 3))

#         for trigram in trigrams:
#             w1, w2, w3 = trigram
            
#             # Apply Laplace Smoothing to avoid zero probabilities
#             trigram_freq = trigram_counts[(w1, w2)][w3] + alpha
#             bigram_freq = bigram_counts[(w1, w2)] + (alpha * vocab_size)

#             trigram_prob = trigram_freq / bigram_freq  # Smoothed probability

#             total_log_prob += math.log2(trigram_prob)
#             total_words += 1

#     # Prevent division by zero
#     if total_words == 0:
#         return float('inf')

#     perplexity = 2 ** (-total_log_prob / total_words)
#     return perplexity

# # Example Usage:
# test_sentences = tokenized_sentences[:100]
# vocab_size = len(set(word for sentence in tokenized_sentences for word in sentence))  # Unique word count
# perplexity = calculate_perplexity(test_sentences, trigram_count, bigram_counts, vocab_size)

# print(f"Perplexity of the improved trigram model: {perplexity}")