In [27]:
# Report the installed NLTK release so the outputs in this notebook can be
# tied to a specific library version.
import nltk
print(nltk.__version__)

3.8.1


In [32]:
from collections import Counter

from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
# from nltk.lm.preprocessing import flatten

In [28]:
# Make sure the 'punkt' resource is available (the download is skipped when it
# is already up to date), then lowercase the sample sentence and split it into
# word tokens. `tokens` is reused by the n-gram demo cells that follow.
nltk.download('punkt')
text = "I read a book by Danielle"
tokens = nltk.tokenize.word_tokenize(text.lower())
tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['i', 'read', 'a', 'book', 'by', 'danielle']

In [33]:
# Adjacent token pairs (2-grams) of the unpadded sentence; bigrams() is lazy,
# so it is wrapped in list() to display the pairs.
list(bigrams(tokens))

[('i', 'read'),
 ('read', 'a'),
 ('a', 'book'),
 ('book', 'by'),
 ('by', 'danielle')]

In [34]:
# Same pairs as bigrams(tokens), produced through the general n-gram helper.
list(ngrams(tokens, n=2)) # n = number of tokens in each gram

[('i', 'read'),
 ('read', 'a'),
 ('a', 'book'),
 ('book', 'by'),
 ('by', 'danielle')]

In [35]:
# Every n-gram of length 1 up to max_len, grouped by starting token.
list(everygrams(tokens, max_len=3)) # max_len caps the longest n-gram produced

[('i',),
 ('i', 'read'),
 ('i', 'read', 'a'),
 ('read',),
 ('read', 'a'),
 ('read', 'a', 'book'),
 ('a',),
 ('a', 'book'),
 ('a', 'book', 'by'),
 ('book',),
 ('book', 'by'),
 ('book', 'by', 'danielle'),
 ('by',),
 ('by', 'danielle'),
 ('danielle',)]

In [36]:
# Add explicit sentence-boundary symbols on both sides of the token list.
from nltk.util import pad_sequence
list(pad_sequence(tokens, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=2))
# n is the n-gram order: for 2-grams each side is padded once, for 3-grams
# twice, and so on (n - 1 pad symbols per side).

['<s>', 'i', 'read', 'a', 'book', 'by', 'danielle', '</s>']

In [37]:
# Pad first, then pair up adjacent tokens, so the sentence start and end each
# appear inside a bigram: ('<s>', first word) and (last word, '</s>').
padded_sent = list(pad_sequence(tokens, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=2))
list(ngrams(padded_sent, n=2)) # bigram

[('<s>', 'i'),
 ('i', 'read'),
 ('read', 'a'),
 ('a', 'book'),
 ('book', 'by'),
 ('by', 'danielle'),
 ('danielle', '</s>')]

In [39]:
# Shorthand that yields the same padded sequence as the explicit
# pad_sequence(...) call used earlier, with "<s>" / "</s>" as the pad symbols.
from nltk.lm.preprocessing import pad_both_ends
list(pad_both_ends(tokens, n=2))

['<s>', 'i', 'read', 'a', 'book', 'by', 'danielle', '</s>']

In [40]:
# One-step equivalent of pad-then-pair: bigrams over the padded token stream.
list(bigrams(pad_both_ends(tokens, n=2)))

[('<s>', 'i'),
 ('i', 'read'),
 ('read', 'a'),
 ('a', 'book'),
 ('book', 'by'),
 ('by', 'danielle'),
 ('danielle', '</s>')]

In [41]:
# Unigrams of the padded sentence: max_len=1 restricts everygrams to single tokens.
from nltk.util import everygrams
# NOTE: despite its name, padded_bigrams holds the padded *token* list
# (['<s>', ..., '</s>']), not bigram tuples.
padded_bigrams = list(pad_both_ends(tokens, n=2))
list(everygrams(padded_bigrams, max_len=1))

[('<s>',),
 ('i',),
 ('read',),
 ('a',),
 ('book',),
 ('by',),
 ('danielle',),
 ('</s>',)]

In [42]:
# Unigrams and bigrams of the padded sentence, interleaved by starting token.
list(everygrams(padded_bigrams, max_len=2))

[('<s>',),
 ('<s>', 'i'),
 ('i',),
 ('i', 'read'),
 ('read',),
 ('read', 'a'),
 ('a',),
 ('a', 'book'),
 ('book',),
 ('book', 'by'),
 ('by',),
 ('by', 'danielle'),
 ('danielle',),
 ('danielle', '</s>'),
 ('</s>',)]

In [43]:
# Build the training input as a list of sentences, where each sentence is a
# list of lowercased tokens. Here the list contains a single sentence.
import nltk
nltk.download('punkt')
text = "I read a book by Danielle"
# Tokenize the text, then lowercase every token.
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(text)))]
print(tokenized_text)

[['i', 'read', 'a', 'book', 'by', 'danielle']]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
# Preprocess the tokenized text for bigram (n = 2) language modelling
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE # Maximum Likelihood Estimation

n = 2 # Highest n-gram order used by the model (2 = bigrams)
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

model = MLE(n) # Train a bigram maximum likelihood estimation model (order n = 2, not 3).
model.fit(train_data, padded_sents) # model building

In [46]:
# Display the fitted model object; the output is only its default repr.
model

<nltk.lm.models.MLE at 0x26e2c2c8b90>

### Unsmoothed Bigram model (Method 1)

In [8]:
# Function for calculating unsmoothed bigram probabilities
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Estimate unsmoothed (maximum-likelihood) bigram probabilities.

    Each probability is P(word2 | word1) = count(word1, word2) / count(word1).

    Args:
        listOfBigrams: iterable of (word1, word2) tuples to score.
        unigramCounts: mapping word -> number of occurrences in the corpus.
        bigramCounts: mapping (word1, word2) -> number of occurrences.

    Returns:
        dict mapping every bigram in ``listOfBigrams`` to its probability.
        A bigram whose first word has no unigram count gets 0.0 instead of
        triggering a ZeroDivisionError.
    """
    listOfProb = {}
    for bigram in listOfBigrams:
        context_count = unigramCounts.get(bigram[0], 0)
        if context_count == 0:
            # Unseen context word: the ratio is undefined, so treat the
            # bigram as impossible rather than dividing by zero.
            listOfProb[bigram] = 0.0
        else:
            listOfProb[bigram] = bigramCounts.get(bigram, 0) / context_count
    return listOfProb

# Given corpus: three training sentences, each already wrapped in the
# <s> (sentence start) and </s> (sentence end) boundary markers.
corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>"
]

# Function to extract bigrams from a list of sentences
def extract_bigrams(sentences):
    """Return every pair of adjacent whitespace-separated tokens.

    Sentences are processed in order and pairs never span two sentences.
    Boundary markers such as <s> and </s> are ordinary tokens here.
    """
    pairs = []
    for line in sentences:
        tokens = line.split()  # <s> and </s> survive the split as tokens
        # Zipping the token list with itself shifted by one yields the
        # (token, next_token) tuples in their original order.
        pairs.extend(zip(tokens, tokens[1:]))
    return pairs

# Count unigrams and bigrams
unigrams = [word for sentence in corpus for word in sentence.split()]
# Stored as `corpus_bigrams` rather than `bigrams` so that this cell does not
# overwrite the nltk.util.bigrams function imported earlier in the notebook.
corpus_bigrams = extract_bigrams(corpus)

# Counter tallies every item in a single pass (list.count inside a
# comprehension rescans the whole list for each item) and supports .get()
# just like the plain dicts that calcBigramProb reads from.
unigram_counts = Counter(unigrams)
bigram_counts = Counter(corpus_bigrams)

# Sentence for which we want to calculate the probability
sentence = "<s> I read a book by Danielle </s>"
sentence_bigrams = extract_bigrams([sentence])

# Calculate the probability using the provided function
bigram_probabilities = calcBigramProb(sentence_bigrams, unigram_counts, bigram_counts)

# Compute the probability of the sentence as the product of its bigram
# probabilities; a single zero-probability bigram makes the product 0.
sentence_probability = 1
for bigram in sentence_bigrams:
    sentence_probability *= bigram_probabilities.get(bigram, 0)

# Print the probability
print("Probability of the sentence:", sentence_probability)


Probability of the sentence: 0.07407407407407407


### Unsmoothed Bigram model (Method 2)

In [1]:
# Function to calculate bigram probability
def bigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Compute unsmoothed bigram probabilities count(w1, w2) / count(w1).

    Args:
        listOfBigrams: iterable of (word1, word2) tuples to score.
        unigramCounts: mapping word -> corpus frequency.
        bigramCounts: mapping (word1, word2) -> corpus frequency.

    Returns:
        dict keyed by bigram with the estimated probability as value.
        When word1 was never counted the probability is reported as 0.0,
        so an unseen context no longer raises ZeroDivisionError.
    """
    listOfProb = {}
    for bigram in listOfBigrams:
        numerator = bigramCounts.get(bigram, 0)
        denominator = unigramCounts.get(bigram[0], 0)
        # Guard the division: a zero denominator means the context word is
        # absent from the training counts.
        listOfProb[bigram] = numerator / denominator if denominator else 0.0
    return listOfProb

# Function to read corpus from a text file and filter specific lines
def read_and_filter_corpus(file_path, lines_to_include, encoding="utf-8"):
    """Read a text file and return only the requested lines.

    Args:
        file_path: path of the corpus file.
        lines_to_include: iterable of 0-based line indices to keep. Negative
            indices count from the end of the file, as with list indexing.
        encoding: text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default encoding.

    Returns:
        list of the selected lines, in the order requested. Each line keeps
        its trailing newline, exactly as returned by ``readlines()``.

    Raises:
        IndexError: if a requested index is outside the file.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        corpus = file.readlines()
    try:
        return [corpus[line_number] for line_number in lines_to_include]
    except IndexError as exc:
        # Same exception type as plain list indexing, with enough context
        # to see which file was too short.
        raise IndexError(
            f"line index out of range: {file_path!r} has {len(corpus)} lines"
        ) from exc

# Function to extract bigrams from a list of sentences
def extract_bigrams(sentences):
    """Collect the consecutive word pairs of each sentence into one list.

    Every sentence is split on whitespace, so the <s> and </s> markers are
    kept as tokens. Pairs are emitted sentence by sentence, left to right.
    """
    tokenized = (text.split() for text in sentences)
    return [
        (tokens[k], tokens[k + 1])
        for tokens in tokenized
        for k in range(len(tokens) - 1)
    ]

# Load the training corpus from disk
corpus_file_path = "Data_3.txt"
lines_to_include = [2, 3, 4]  # 0-based indices of the lines that form the training corpus
corpus = read_and_filter_corpus(corpus_file_path, lines_to_include)

# Count unigrams and bigrams
unigrams = [word for sentence in corpus for word in sentence.split()]
# Kept under `corpus_bigrams` instead of `bigrams` so the nltk.util.bigrams
# function imported earlier in the notebook stays usable after this cell runs.
corpus_bigrams = extract_bigrams(corpus)

# Counter counts each item in one pass and offers the same .get() lookup
# that bigramProb uses on its count arguments.
unigram_counts = Counter(unigrams)
bigram_counts = Counter(corpus_bigrams)

# Calculate the probability of the sentence below
sentence = "<s> I read a book by Danielle </s>"
sentence_bigrams = extract_bigrams([sentence])

# Calculate the probability using the function created above
bigram_probabilities = bigramProb(sentence_bigrams, unigram_counts, bigram_counts)

# Compute the probability of the sentence (product of its bigram probabilities)
sentence_probability = 1
for bigram in sentence_bigrams:
    sentence_probability *= bigram_probabilities.get(bigram, 0)

# Print the probability
print("Probability of the sentence:", sentence_probability)


Probability of the sentence: 0.07407407407407407


### Smoothed Bigram model

In [14]:
def smoothedBigramProb(listOfBigrams, unigramCounts, bigramCounts, vocabulary_size):
    """Return add-one (Laplace) smoothed probabilities for the given bigrams.

    For a bigram (word1, word2) the estimate is
        (count(word1, word2) + 1) / (count(word1) + vocabulary_size)
    where counts missing from the mappings are taken as 0.

    Args:
        listOfBigrams: iterable of (word1, word2) tuples to score.
        unigramCounts: mapping word -> count.
        bigramCounts: mapping (word1, word2) -> count.
        vocabulary_size: number of distinct word types (V in the formula).

    Returns:
        dict mapping each bigram to its smoothed probability.
    """
    def _laplace(pair):
        # The +1 in the numerator gives unseen bigrams a non-zero
        # probability; adding V to the denominator compensates for it.
        seen_pair = bigramCounts.get(pair, 0)
        seen_context = unigramCounts.get(pair[0], 0)
        return (seen_pair + 1) / (seen_context + vocabulary_size)

    return {pair: _laplace(pair) for pair in listOfBigrams}

# Training corpus: the same three sentences used for the unsmoothed models,
# each wrapped in the <s> / </s> sentence-boundary markers.
corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>"
]

# Function to extract bigrams from a list of sentences
def extract_bigrams(sentences):
    """Build the list of (previous_word, word) pairs for each sentence.

    Tokens come from a whitespace split, which keeps <s> and </s> as words.
    The first token of a sentence has no predecessor, so it starts no pair
    and pairs never cross a sentence boundary.
    """
    result = []
    for text in sentences:
        previous = None
        for position, current in enumerate(text.split()):
            if position > 0:
                result.append((previous, current))
            previous = current
    return result

# Count unigrams and bigrams
unigrams = [word for sentence in corpus for word in sentence.split()]
# Bound to `corpus_bigrams`, not `bigrams`, so the nltk.util.bigrams function
# imported earlier in the notebook is not replaced by a list.
corpus_bigrams = extract_bigrams(corpus)
vocabulary_size = len(set(unigrams))  # Vocabulary size V for Laplace smoothing (10 for this corpus, <s> and </s> included)

# Counter builds the frequency tables in one pass and keeps the .get()
# interface that smoothedBigramProb relies on.
unigram_counts = Counter(unigrams)
bigram_counts = Counter(corpus_bigrams)

# Sentence for which we want to calculate the probability
sentence = "<s> I read a book by Danielle </s>"
sentence_bigrams = extract_bigrams([sentence])

# Calculate the smoothed probability using the provided function
smoothed_bigram_probabilities = smoothedBigramProb(sentence_bigrams, unigram_counts, bigram_counts, vocabulary_size)

# Compute the probability of the sentence (product of its smoothed bigram probabilities)
sentence_probability = 1
for bigram in sentence_bigrams:
    sentence_probability *= smoothed_bigram_probabilities.get(bigram, 1/(vocabulary_size))  # Default to 1/V if bigram not found

# Print the probability
print("Probability of the sentence:", sentence_probability)


Probability of the sentence: 1.0101357919757919e-05
