In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter
import numpy as np
from gensim.models import Phrases

In [None]:
# Sample corpus: five short English sentences from which the token and bigram
# counts in the following cells are built.
# NOTE: "I’m" in the fourth sentence uses a typographic apostrophe (’) rather
# than an ASCII one; in the recorded token frequency table it shows up as the
# separate tokens '’' and 'm'.
corpus = [
    "Thank you so much for your help.",
    "I really appreciate your help.",
    "Excuse me, do you know what time it is?",
    "I’m really sorry for not inviting you.",
    "I really like your watch."
]

In [None]:
# Function to preprocess and tokenize the text
def preprocess_tokenize(corpus):
    """Lower-case each sentence of ``corpus`` and split it into word tokens.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, in the same order; each entry
        is whatever ``nltk.word_tokenize`` returns for the lower-cased sentence.
    """
    token_lists = []
    for raw_sentence in corpus:
        lowered = raw_sentence.lower()
        token_lists.append(nltk.word_tokenize(lowered))
    return token_lists

In [None]:
# Function to generate and display unique token frequency table
def generate_token_frequency_table(tokenized_corpus):
    """Print how many times each distinct token occurs in the corpus.

    Tokens are counted across all sentences together. The function prints a
    header line followed by the ``Counter`` of token counts; it returns nothing.

    Args:
        tokenized_corpus: Iterable of sentences, each an iterable of tokens.
    """
    token_counts = Counter()
    for sentence_tokens in tokenized_corpus:
        for token in sentence_tokens:
            token_counts[token] += 1

    print("Token Frequency Table:")
    print(token_counts)

In [None]:
# Function to generate bigrams and compute their frequency
def generate_bigrams(tokenized_corpus):
    """Count adjacent token pairs (bigrams) within each sentence of the corpus.

    Bigrams are formed inside each sentence only. Flattening the corpus first
    would also pair the last token of one sentence with the first token of the
    next, counting word pairs that never occurred together in the text and
    inflating the totals used when the counts are turned into probabilities.

    Args:
        tokenized_corpus: Iterable of sentences, each an iterable of tokens.

    Returns:
        collections.Counter mapping ``(first_token, second_token)`` tuples to
        the number of times that pair occurs. Sentences with fewer than two
        tokens contribute nothing; an empty corpus yields an empty Counter.
    """
    bigram_frequency = Counter()
    for sentence in tokenized_corpus:
        # Materialise the sentence so that non-sliceable iterables work as well.
        tokens = list(sentence)
        bigram_frequency.update(zip(tokens, tokens[1:]))
    return bigram_frequency

In [None]:
# Function to calculate probability of a sentence using user-defined functions
def calculate_sentence_probability(sentence, bigram_frequency):
    """Score a sentence with add-one (Laplace) smoothed bigram counts.

    The sentence is lower-cased and tokenized with ``nltk.word_tokenize``. Each
    adjacent token pair contributes the factor ``(count + 1) / (N + B)``, where
    ``count`` is the pair's count in ``bigram_frequency`` (0 if unseen), ``N``
    is the total number of bigram occurrences and ``B`` the number of distinct
    bigrams. The factors are multiplied together.

    NOTE: each factor is the smoothed relative frequency of the bigram itself;
    it is not the conditional probability P(w_i | w_{i-1}), which would divide
    by the count of the preceding word instead.

    Args:
        sentence: Sentence string to score.
        bigram_frequency: Mapping from ``(token, token)`` tuples to counts,
            e.g. a ``collections.Counter``. Any mapping with ``get``,
            ``values`` and ``len`` works; it is never modified.

    Returns:
        The product of the per-bigram factors. ``1`` when the sentence has
        fewer than two tokens (there is nothing to multiply), and ``0.0`` when
        ``bigram_frequency`` is empty, since an untrained model gives no basis
        for a score (the smoothing denominator would be zero).
    """
    tokenized_sentence = nltk.word_tokenize(sentence.lower())
    sentence_bigrams = list(zip(tokenized_sentence, tokenized_sentence[1:]))

    probability = 1
    if not sentence_bigrams:
        return probability

    # Depends only on the model, so compute it once rather than per bigram.
    denominator = sum(bigram_frequency.values()) + len(bigram_frequency)
    if denominator == 0:
        return 0.0

    for bigram in sentence_bigrams:
        # .get() keeps lookups side-effect free (a defaultdict would otherwise
        # grow a new key per unseen bigram) and also works for plain dicts.
        probability *= (bigram_frequency.get(bigram, 0) + 1) / denominator
    return probability

In [None]:
# Function to calculate probability of a sentence using Gensim package
def calculate_sentence_probability_gensim(sentence, bigram_model):
    """Sum the gensim phrase scores of the bigram phrases found in a sentence.

    The sentence is lower-cased and tokenized with ``nltk.word_tokenize``, and
    the scores of the phrases that ``bigram_model`` detects in it are added up.

    Indexing the model (``bigram_model[tokens]``) only returns the token list
    with detected phrases merged into single tokens; it carries no scores, so
    the scores are requested from the model's phrase-scoring API instead.

    NOTE: the result is a sum of collocation scores, not a normalised
    probability; it is 0 whenever the model detects no phrase in the sentence.

    Args:
        sentence: Sentence string to score.
        bigram_model: A trained ``gensim.models.Phrases`` model (or any object
            exposing ``find_phrases(sentences)`` or ``export_phrases(sentences)``).

    Returns:
        Sum of the scores of the detected phrases, or ``0`` if there are none.
    """
    tokenized_sentence = nltk.word_tokenize(sentence.lower())
    sentences = [tokenized_sentence]

    if hasattr(bigram_model, "find_phrases"):
        # gensim >= 4: find_phrases returns {phrase: score} for the given sentences.
        phrase_scores = list(bigram_model.find_phrases(sentences).values())
    else:
        # gensim 3.x: export_phrases(sentences) yields (phrase, score) pairs.
        phrase_scores = [score for _, score in bigram_model.export_phrases(sentences)]

    return sum(phrase_scores)


In [None]:
# Step 1: Preprocessing and tokenization
# nltk.word_tokenize needs the Punkt tokenizer data. Older NLTK releases load
# it from the 'punkt' package, newer ones from 'punkt_tab', so both are
# requested; a package that is already present is simply reported as up to date.
# (nltk itself is imported in the first cell of the notebook.)
nltk.download('punkt')
nltk.download('punkt_tab')
tokenized_corpus = preprocess_tokenize(corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Step 2: Print the frequency of every distinct token in the tokenized corpus.
# The helper prints the table itself and returns nothing, so no value is captured here.
generate_token_frequency_table(tokenized_corpus)

Token Frequency Table:
Counter({'.': 4, 'you': 3, 'your': 3, 'i': 3, 'really': 3, 'for': 2, 'help': 2, 'thank': 1, 'so': 1, 'much': 1, 'appreciate': 1, 'excuse': 1, 'me': 1, ',': 1, 'do': 1, 'know': 1, 'what': 1, 'time': 1, 'it': 1, 'is': 1, '?': 1, '’': 1, 'm': 1, 'sorry': 1, 'not': 1, 'inviting': 1, 'like': 1, 'watch': 1})


In [None]:
# Step 3: Count the bigrams (adjacent token pairs) of the tokenized corpus.
# These counts are what calculate_sentence_probability uses to score sentences.
bigram_frequency = generate_bigrams(tokenized_corpus)

In [None]:
# Step 4: Sample sentences whose probability will be computed.
# All four start with "I really" and end with "your garden". The words "garden"
# and "appropriate" do not occur in the sample corpus, so every sentence
# contains at least one bigram that was never observed.
test_sentences = [
    "I really like your garden",
    "I really sorry for your garden",
    "I really appropriate your garden",
    "I really appreciate your garden"
]

In [None]:
# Score every test sentence with the hand-written bigram model
# (calculate_sentence_probability over the bigram_frequency counts) and print
# one result line per sentence.
print("\nProbability of sentences using user-defined functions:")
for sentence in test_sentences:
    probability = calculate_sentence_probability(sentence, bigram_frequency)
    print(f"Sentence: '{sentence}', Probability: {probability}")



Probability of sentences using user-defined functions:
Sentence: 'I really like your garden', Probability: 3.5968876850239016e-07
Sentence: 'I really sorry for your garden', Probability: 9.465493907957636e-09
Sentence: 'I really appropriate your garden', Probability: 8.992219212559754e-08
Sentence: 'I really appreciate your garden', Probability: 3.5968876850239016e-07


In [None]:
# Score every test sentence with gensim: train a Phrases model on the tokenized
# corpus, then print one result line per sentence.
# NOTE(review): Phrases is constructed with the library's default parameters.
# With only five short sentences the defaults may be too strict for any bigram
# to be detected as a phrase (the recorded output shows no merged tokens) —
# confirm, and consider passing min_count / threshold explicitly.
print("\nProbability of sentences using Gensim package:")
bigram_model = Phrases(tokenized_corpus)
for sentence in test_sentences:
    probability = calculate_sentence_probability_gensim(sentence, bigram_model)
    print(f"Sentence: '{sentence}', Probability: {probability}")


Probability of sentences using Gensim package:
['i', 'really', 'like', 'your', 'garden']
Sentence: 'I really like your garden', Probability: 0
['i', 'really', 'sorry', 'for', 'your', 'garden']
Sentence: 'I really sorry for your garden', Probability: 0
['i', 'really', 'appropriate', 'your', 'garden']
Sentence: 'I really appropriate your garden', Probability: 0
['i', 'really', 'appreciate', 'your', 'garden']
Sentence: 'I really appreciate your garden', Probability: 0
