In [2]:
from nltk.util import bigrams
from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import everygrams

In [3]:
# 1. Read the file
file_path = 'Dataset/Data_3.txt'
with open(file_path, 'r') as f:
    # ONLY take lines that contain both <s> and </s>
    lines = [line.strip() for line in f.readlines() if '<s>' in line and '</s>' in line]

for line in lines:
    print(line)

<s> He read a book </s>
<s> I read a different book </s>
<s> He read a book by Danielle </s>
<s> I read a different book by Danielle </s>


In [4]:
# 2. Keep sentences as lists of words
# We use all lines for training except the last line (last line is the target)
tokenized_sentences = [line.split() for line in lines[:-1]]
unique_words = set(word for sent in tokenized_sentences for word in sent)

print(f"Total Tokens: {sum(len(sent) for sent in tokenized_sentences)}")
print(f"Unique Vocabulary: {unique_words}")

Total Tokens: 21
Unique Vocabulary: {'Danielle', 'read', 'I', 'book', 'a', 'different', 'by', '<s>', '</s>', 'He'}


In [5]:
n = 2
train_data = [list(everygrams(sent, max_len=n)) for sent in tokenized_sentences]

# 3. Generate Vocab Data manually
vocab_data = [word for sent in tokenized_sentences for word in sent]

# 4. Initialize and Fit the model
model = MLE(n)
model.fit(train_data, vocab_data)

In [6]:
print("--- Unigram Counts ---")
for word in model.vocab:
    print(f"Count({word}): {model.counts[word]}")

--- Unigram Counts ---
Count(<s>): 3
Count(He): 2
Count(read): 3
Count(a): 3
Count(book): 3
Count(</s>): 3
Count(I): 1
Count(different): 1
Count(by): 1
Count(Danielle): 1
Count(<UNK>): 0


In [7]:
# Print specific bigram counts for the corpus sentences
print("\n--- Bigram Counts for Corpus Sentences ---")
for w1, w2 in set(bigrams([word for sent in tokenized_sentences for word in sent])):
    # model.counts[[w1]][w2] retrieves the count of w2 following w1 
    count = model.counts[[w1]][w2]
    print(f"Count({w1}, {w2}): {count}")


--- Bigram Counts for Corpus Sentences ---
Count(He, read): 2
Count(<s>, I): 1
Count(different, book): 1
Count(book, </s>): 2
Count(by, Danielle): 1
Count(a, different): 1
Count(Danielle, </s>): 1
Count(</s>, <s>): 0
Count(I, read): 1
Count(read, a): 3
Count(book, by): 1
Count(<s>, He): 2
Count(a, book): 2


In [8]:
# Re-prepare generators for Laplace
train_data_lap = [list(everygrams(sent, max_len=n)) for sent in tokenized_sentences]
model_laplace = Laplace(n)
model_laplace.fit(train_data_lap, vocab_data)

In [9]:
list(model_laplace.vocab)

['<s>',
 'He',
 'read',
 'a',
 'book',
 '</s>',
 'I',
 'different',
 'by',
 'Danielle',
 '<UNK>']

In [12]:
# 3. Calculate Sentence Probability
target = lines[-1].strip().split()
print(f"Target Sentence: {' '.join(target)}")
target_bgs = list(bigrams(target))
print(f"Target Bigrams: {target_bgs}\n")

def get_prob(model, bgs, smoothed=False):
    p = 1.0
    # Use the number of unique words excluding any <UNK>
    # This should be 10 for your specific corpus
    V_manual = 10 
    
    for w1, w2 in bgs:
        count_bg = model.counts[[w1]][w2]
        count_uni = model.counts[w1]
        
        if smoothed:
            # Manually apply Laplace: (C(bg) + 1) / (C(unigram) + 10)
            score = (count_bg + 1) / (count_uni + V_manual)
            print(f"P({w2}|{w1}): Count={count_bg}+1, Denom={count_uni}+{V_manual} -> {score:.4f}")
        else:
            score = model.score(w2, [w1])
            print(f"P({w2}|{w1}): Count={count_bg}, Denom={count_uni} -> {score:.4f}")
        p *= score
        
    return p

# Call the function
print("Unsmoothed")
u_p = get_prob(model, target_bgs, smoothed=False)
print(f"\nUnsmoothed Probability: {u_p:.5f}")

print("\nLaplace Smooth ")
s_p = get_prob(model_laplace, target_bgs, smoothed=True)
print(f"\nSmoothed Probability: {s_p:.10f}")

Target Sentence: <s> I read a different book by Danielle </s>
Target Bigrams: [('<s>', 'I'), ('I', 'read'), ('read', 'a'), ('a', 'different'), ('different', 'book'), ('book', 'by'), ('by', 'Danielle'), ('Danielle', '</s>')]

Unsmoothed
P(I|<s>): Count=1, Denom=3 -> 0.3333
P(read|I): Count=1, Denom=1 -> 1.0000
P(a|read): Count=3, Denom=3 -> 1.0000
P(different|a): Count=1, Denom=3 -> 0.3333
P(book|different): Count=1, Denom=1 -> 1.0000
P(by|book): Count=1, Denom=3 -> 0.3333
P(Danielle|by): Count=1, Denom=1 -> 1.0000
P(</s>|Danielle): Count=1, Denom=1 -> 1.0000

Unsmoothed Probability: 0.03704

Laplace Smooth 
P(I|<s>): Count=1+1, Denom=3+10 -> 0.1538
P(read|I): Count=1+1, Denom=1+10 -> 0.1818
P(a|read): Count=3+1, Denom=3+10 -> 0.3077
P(different|a): Count=1+1, Denom=3+10 -> 0.1538
P(book|different): Count=1+1, Denom=1+10 -> 0.1818
P(by|book): Count=1+1, Denom=3+10 -> 0.1538
P(Danielle|by): Count=1+1, Denom=1+10 -> 0.1818
P(</s>|Danielle): Count=1+1, Denom=1+10 -> 0.1818

Smoothed Probab