<a href="https://colab.research.google.com/github/Gousepasha789/Information-Retrieval-System/blob/main/Implement_N_gram_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import nltk
import string
from nltk.tokenize import word_tokenize
from collections import Counter

# Sample text: a three-sentence passage about the IRS used as the toy corpus
# for the n-gram steps below. The triple-quoted literal begins and ends with
# a newline character.
irs_text = """
The Internal Revenue Service (IRS) is responsible for tax collection in the United States.
It ensures compliance with tax laws and provides guidelines for taxpayers.
Taxpayers can claim deductions for education, medical expenses, and mortgage interest.
"""

# Normalise raw text before tokenisation
def preprocess_irs_text(text):
    """Lower-case *text*, blank out most punctuation and fuse the agency name.

    Every character of ``string.punctuation`` except ``,`` and ``.`` is
    replaced by a single space; commas and full stops are left in place.
    After that, the phrase ``internal revenue service`` is rewritten as the
    single word ``internal_revenue_service``. The underscore is introduced
    only after the punctuation pass, so it is not blanked out again.

    Returns the normalised string.
    """
    kept_marks = {',', '.'}
    # One translation table: each punctuation mark we do not keep maps to ' '.
    blank_out = str.maketrans(
        {mark: ' ' for mark in string.punctuation if mark not in kept_marks}
    )
    cleaned = text.lower().translate(blank_out)
    return cleaned.replace('internal revenue service', 'internal_revenue_service')

# Preprocessing: normalise the sample text (lower-case it, blank out most
# punctuation, fuse the agency name into one word), then split it into tokens.
processed_irs_text = preprocess_irs_text(irs_text)
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# downloaded beforehand — confirm the runtime environment provides it.
tokens = word_tokenize(processed_irs_text)

# Show the token list that every later n-gram step is built from.
print("Tokens:", tokens)

# Build the n-grams of a token sequence
def generate_ngrams(tokens, n):
    """Return the n-grams of *tokens* as a list of tuples.

    The sliding window itself is delegated to ``nltk.ngrams``; its result is
    materialised into a list so it can be counted and iterated repeatedly.
    """
    windows = nltk.ngrams(tokens, n)
    return [*windows]

# Build the 1-, 2- and 3-gram lists from the same token sequence.
unigrams, bigrams, trigrams = (
    generate_ngrams(tokens, order) for order in (1, 2, 3)
)

# Count how often each distinct n-gram occurs.
unigram_freq, bigram_freq, trigram_freq = map(
    Counter, (unigrams, bigrams, trigrams)
)

# Report the three frequency tables, one per n-gram order.
print("\nUnigram Frequencies:", unigram_freq)
print("\nBigram Frequencies:", bigram_freq)
print("\nTrigram Frequencies:", trigram_freq)

# Function to calculate n-gram probability
def calculate_ngram_probability(ngram, ngram_freq, n):
    """Return the relative-frequency probability of *ngram*.

    The probability is ``count(ngram) / count(n-grams sharing its context)``,
    where the context is the first ``n - 1`` items of *ngram*. For ``n == 1``
    the context is empty, so the denominator is the sum of all counts in
    *ngram_freq*.

    Args:
        ngram: Sequence of words; it is converted to a tuple for lookup.
        ngram_freq: Mapping from n-gram tuples to counts (a ``Counter`` or a
            plain ``dict``).
        n: Order of the model (1 = unigram, 2 = bigram, 3 = trigram, ...).
            Any positive order is supported.

    Returns:
        The probability as a float, or ``0.0`` when no n-gram in
        *ngram_freq* shares the context (including an empty table).

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    ngram = tuple(ngram)
    context = ngram[:n - 1]

    # Denominator: total count of every n-gram that starts with the context.
    # With n == 1 the slice is always the empty tuple, so every entry counts.
    context_total = sum(
        count for gram, count in ngram_freq.items() if gram[:n - 1] == context
    )
    if context_total <= 0:
        # Unseen context or empty table: report zero instead of dividing by 0.
        return 0.0

    # .get keeps unseen n-grams at zero even when a plain dict is passed.
    return ngram_freq.get(ngram, 0) / context_total

# Example probability calculation: with n=3 this is the share of trigrams
# beginning with ('taxpayers', 'can') that continue with 'claim'.
sequence = ['taxpayers', 'can', 'claim']
print("\nProbability of trigram:", calculate_ngram_probability(sequence, trigram_freq, 3))

# Function to predict the next word based on n-gram frequencies
def predict_next_word(context, trigram_freq):
    """Return the most frequent continuation of *context*, or ``None``.

    An entry of *trigram_freq* is a candidate when it is exactly one item
    longer than *context* and starts with it; the candidate's last item is
    the proposed next word. With a two-word context and a trigram table this
    is ordinary trigram prediction, but a one-word context with a bigram
    table (and so on) works the same way.

    Args:
        context: Sequence of preceding words.
        trigram_freq: Mapping from n-gram tuples to counts.

    Returns:
        The last word of the highest-count matching n-gram. When several
        candidates share the highest count, the one that appears first in
        *trigram_freq* wins. ``None`` if nothing matches.
    """
    context_tuple = tuple(context)
    wanted_length = len(context_tuple) + 1

    candidates = [
        (gram[-1], count)
        for gram, count in trigram_freq.items()
        if len(gram) == wanted_length and gram[:-1] == context_tuple
    ]
    if not candidates:
        return None

    # max() keeps the first of equally-large items, so ties resolve in the
    # table's iteration order without sorting the whole candidate list.
    best_word, _ = max(candidates, key=lambda pair: pair[1])
    return best_word

# Predicting the next word: look up the most frequent third word among the
# trigrams that start with ('taxpayers', 'can'); the result is None when no
# trigram has that prefix.
context = ['taxpayers', 'can']
predicted_word = predict_next_word(context, trigram_freq)
print("\nPredicted next word:", predicted_word)


Tokens: ['the', 'internal_revenue_service', 'irs', 'is', 'responsible', 'for', 'tax', 'collection', 'in', 'the', 'united', 'states', '.', 'it', 'ensures', 'compliance', 'with', 'tax', 'laws', 'and', 'provides', 'guidelines', 'for', 'taxpayers', '.', 'taxpayers', 'can', 'claim', 'deductions', 'for', 'education', ',', 'medical', 'expenses', ',', 'and', 'mortgage', 'interest', '.']

Unigram Frequencies: Counter({('for',): 3, ('.',): 3, ('the',): 2, ('tax',): 2, ('and',): 2, ('taxpayers',): 2, (',',): 2, ('internal_revenue_service',): 1, ('irs',): 1, ('is',): 1, ('responsible',): 1, ('collection',): 1, ('in',): 1, ('united',): 1, ('states',): 1, ('it',): 1, ('ensures',): 1, ('compliance',): 1, ('with',): 1, ('laws',): 1, ('provides',): 1, ('guidelines',): 1, ('can',): 1, ('claim',): 1, ('deductions',): 1, ('education',): 1, ('medical',): 1, ('expenses',): 1, ('mortgage',): 1, ('interest',): 1})

Bigram Frequencies: Counter({('the', 'internal_revenue_service'): 1, ('internal_revenue_service