In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
import math

# Fetch NLTK's 'punkt' tokenizer resource before any tokenizing is done.
nltk.download('punkt')

# Toy corpus from which the n-gram tables are estimated.
sentences = [
    "I love natural language processing",
    "I love programming",
    "Natural language processing is fascinating",
    "Programming is fun and rewarding",
]

# Lowercase each sentence first so that e.g. "Natural" and "natural" are
# counted as the same word, then split it into a list of tokens.
tokenized_sentences = [word_tokenize(text.lower()) for text in sentences]

# Function to generate n-grams
def generate_ngrams(tokenized_sentences, n):
    """Collect the n-grams of every sentence into a single flat list.

    N-grams are produced one sentence at a time, so no n-gram spans the
    boundary between two sentences.

    Args:
        tokenized_sentences: Iterable of token sequences, one per sentence.
        n: Order of the n-grams; passed straight through to ``ngrams``.

    Returns:
        A list holding the n-grams of all sentences, in sentence order.
    """
    return [
        gram
        for tokens in tokenized_sentences
        for gram in ngrams(tokens, n)
    ]

# Function to calculate probabilities
def calculate_ngram_probabilities(ngrams_list, n_minus_1_grams=None):
    """Estimate relative-frequency probabilities for a list of n-grams.

    Args:
        ngrams_list: N-gram tuples observed in the corpus (duplicates are
            what gets counted).
        n_minus_1_grams: Optional (n-1)-gram tuples from the same corpus.
            When given and non-empty, each n-gram's count is divided by the
            count of its prefix (the n-gram without its last token). When
            omitted, ``None`` or empty, each count is divided by the total
            number of n-grams instead.

    Returns:
        A dict mapping every distinct n-gram to its probability, with keys
        in first-seen order.

    Raises:
        ZeroDivisionError: If a non-empty ``n_minus_1_grams`` does not
            contain the prefix of one of the n-grams.
    """
    counts = Counter(ngrams_list)

    # No usable context: plain relative frequency over all n-grams.
    if not n_minus_1_grams:
        total = sum(counts.values())
        return {gram: seen / total for gram, seen in counts.items()}

    # Conditional estimate: count(n-gram) / count(prefix).
    context_counts = Counter(n_minus_1_grams)
    return {
        gram: seen / context_counts[gram[:-1]]
        for gram, seen in counts.items()
    }

# Build the unigram, bigram and trigram lists from the tokenized corpus
unigrams, bigrams, trigrams = (
    generate_ngrams(tokenized_sentences, order) for order in (1, 2, 3)
)

# Unigrams are normalised by the total count; bigrams and trigrams are
# conditioned on the counts of the next-lower order
unigram_probabilities = calculate_ngram_probabilities(unigrams)
bigram_probabilities = calculate_ngram_probabilities(
    bigrams, n_minus_1_grams=unigrams
)
trigram_probabilities = calculate_ngram_probabilities(
    trigrams, n_minus_1_grams=bigrams
)

# Print each table, one n-gram per line, rounded to four decimals
print("\nUnigram Probabilities:")
for unigram, prob in unigram_probabilities.items():
    print(f"{unigram}: {prob:.4f}")

print("\nBigram Probabilities:")
for bigram, prob in bigram_probabilities.items():
    print(f"{bigram}: {prob:.4f}")

print("\nTrigram Probabilities:")
for trigram, prob in trigram_probabilities.items():
    print(f"{trigram}: {prob:.4f}")


# Function to calculate sentence probability
def calculate_sentence_probability(sentence, n, probabilities, *,
                                   unseen_probability=0.0001):
    """Score a sentence as the product of its n-gram probabilities.

    The sentence is lowercased and tokenized with ``word_tokenize``, split
    into n-grams of order ``n``, and the probability of each n-gram is looked
    up in ``probabilities`` and multiplied into the result.

    Args:
        sentence: Raw sentence text.
        n: Order of the n-grams to score with; it should match the order of
            the keys in ``probabilities``.
        probabilities: Mapping from n-gram tuple to probability.
        unseen_probability: Value used for any n-gram missing from
            ``probabilities``. This is a fixed floor, not a smoothing scheme:
            the table itself is not renormalised. Defaults to 0.0001.

    Returns:
        The product of the looked-up probabilities as a float. A sentence
        that yields no n-grams (fewer than ``n`` tokens) gives the empty
        product, 1.0.
    """
    tokens = word_tokenize(sentence.lower())

    # Start from a float so every path returns the same type, including the
    # case where the sentence is too short to produce any n-gram.
    probability = 1.0
    for ngram in ngrams(tokens, n):
        probability *= probabilities.get(ngram, unseen_probability)

    return probability


# Sentence to score under each of the three models
test_sentence = "I love programming"

# One row per model: the label to print, the n-gram order, and its table
for label, order, table in (
    ("\nSentence Probability (Unigram):", 1, unigram_probabilities),
    ("Sentence Probability (Bigram):", 2, bigram_probabilities),
    ("Sentence Probability (Trigram):", 3, trigram_probabilities),
):
    print(label, calculate_sentence_probability(test_sentence, order, table))



Unigram Probabilities:
('i',): 0.1111
('love',): 0.1111
('natural',): 0.1111
('language',): 0.1111
('processing',): 0.1111
('programming',): 0.1111
('is',): 0.1111
('fascinating',): 0.0556
('fun',): 0.0556
('and',): 0.0556
('rewarding',): 0.0556

Bigram Probabilities:
('i', 'love'): 1.0000
('love', 'natural'): 0.5000
('natural', 'language'): 1.0000
('language', 'processing'): 1.0000
('love', 'programming'): 0.5000
('processing', 'is'): 0.5000
('is', 'fascinating'): 0.5000
('programming', 'is'): 0.5000
('is', 'fun'): 0.5000
('fun', 'and'): 1.0000
('and', 'rewarding'): 1.0000

Trigram Probabilities:
('i', 'love', 'natural'): 0.5000
('love', 'natural', 'language'): 1.0000
('natural', 'language', 'processing'): 1.0000
('i', 'love', 'programming'): 0.5000
('language', 'processing', 'is'): 0.5000
('processing', 'is', 'fascinating'): 1.0000
('programming', 'is', 'fun'): 1.0000
('is', 'fun', 'and'): 1.0000
('fun', 'and', 'rewarding'): 1.0000

Sentence Probability (Unigram): 0.0013717421124828

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DSATM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
