In [1]:
from nltk import ngrams, FreqDist, word_tokenize
import pandas as pd
import re

def preprocess_text(text):
    """Return *text* lowercased, stripped of punctuation, with whitespace squeezed.

    Every character that is not an ASCII letter, an ASCII digit or whitespace
    is deleted, then each run of whitespace is replaced by a single space.
    Leading/trailing whitespace is collapsed to one space but not trimmed.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alnum_only)

def calculate_probabilities(text, smoothing_method='mle', delta=0.5):
    """Estimate bigram probabilities for *text* under a smoothing scheme.

    The text is normalised with ``preprocess_text``, tokenised with
    ``word_tokenize`` and converted into adjacent-word bigrams. Probabilities
    are relative to the total number of bigram tokens (joint estimates, not
    conditioned on the first word).

    Args:
        text: Raw input text.
        smoothing_method: One of ``'mle'``, ``'add-one'``, ``'add-delta'``,
            ``'good-turing'`` or ``'held-out'``.
        delta: Pseudo-count added to every observed bigram; only used by
            ``'add-delta'``.

    Returns:
        A dict mapping ``(w1, w2)`` tuples to their estimated probability.
        For ``'held-out'`` the first half of the tokens is the training set
        and only bigrams occurring in the second (held-out) half are keyed.

    Raises:
        ValueError: If ``smoothing_method`` is not one of the supported names.
    """
    supported = ('mle', 'add-one', 'add-delta', 'good-turing', 'held-out')
    if smoothing_method not in supported:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError(
            f"Unknown smoothing_method {smoothing_method!r}; "
            f"expected one of {supported}"
        )

    words = word_tokenize(preprocess_text(text))
    v = len(set(words))  # vocabulary size (distinct tokens)
    fidst = FreqDist(ngrams(words, 2))
    total = fidst.N()  # number of bigram tokens

    if smoothing_method == 'mle':
        return {pair: count / total for pair, count in fidst.items()}

    if smoothing_method == 'add-one':
        # NOTE(review): the denominator adds V (vocabulary size); for a joint
        # bigram distribution the number of bins is usually V**2 -- confirm
        # which is intended before changing.
        return {pair: (count + 1) / (total + v) for pair, count in fidst.items()}

    if smoothing_method == 'add-delta':
        # NOTE(review): same question about V vs V**2 as for 'add-one'.
        return {
            pair: (count + delta) / (total + delta * v)
            for pair, count in fidst.items()
        }

    if smoothing_method == 'good-turing':
        # freq_of_freq[r] == N_r, the number of bigram types seen r times.
        freq_of_freq = FreqDist(fidst.values())
        smoothed_probabilities = {}
        for pair, count in fidst.items():
            n_next = freq_of_freq[count + 1]
            if n_next:
                c_star = (count + 1) * (n_next / freq_of_freq[count])
            else:
                # N_{r+1} is zero, so the Good-Turing adjusted count is
                # undefined; keep the raw count instead of dividing by N_1
                # (which may itself be zero).
                c_star = count
            smoothed_probabilities[pair] = c_star / total
        return smoothed_probabilities

    # smoothing_method == 'held-out'
    split_index = len(words) // 2
    fidst_train = FreqDist(ngrams(words[:split_index], 2))
    fidst_held = FreqDist(ngrams(words[split_index:], 2))
    T = fidst_held.N()  # number of bigram tokens in the held-out half
    freq_of_freq_train = FreqDist(fidst_train.values())

    # N_0: bigram types never seen in training, out of the V**2 possible ones.
    unseen_types = v * v - len(fidst_train)

    # T_r: total held-out occurrences of bigrams whose training count is r.
    held_out_mass = {}
    for pair, held_count in fidst_held.items():
        r = fidst_train[pair]  # 0 when the bigram is absent from training
        held_out_mass[r] = held_out_mass.get(r, 0) + held_count

    held_out_probabilities = {}
    for pair in fidst_held:
        r = fidst_train[pair]
        n_r = freq_of_freq_train[r] if r else unseen_types
        # P_ho = T_r / (N_r * T). n_r >= 1 here: for r > 0 the bigram itself
        # is a training type with count r; for r == 0 the bigram is one of
        # the unseen types.
        held_out_probabilities[pair] = held_out_mass[r] / (T * n_r)
    return held_out_probabilities

def main():
    """Print bigram probabilities of ``input_smoothing.txt`` for each method.

    Reads the whole file as UTF-8, then for every supported smoothing method
    prints a header line followed by one ``bigram probability`` line per
    entry returned by ``calculate_probabilities`` (called with its default
    ``delta``).
    """
    with open("input_smoothing.txt", encoding="utf-8") as source:
        corpus = source.read()

    for technique in ('mle', 'add-one', 'add-delta', 'good-turing', 'held-out'):
        print(f"\nSmoothing Method: {technique}")
        estimates = calculate_probabilities(corpus, technique)
        for pair in estimates:
            print(pair, estimates[pair])

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()



Smoothing Method: mle
('title', 'the') 0.0013908205841446453
('the', 'evolution') 0.004172461752433936
('evolution', 'of') 0.004172461752433936
('of', 'technology') 0.006954102920723227
('technology', 'from') 0.0013908205841446453
('from', 'stone') 0.0013908205841446453
('stone', 'tools') 0.006954102920723227
('tools', 'to') 0.005563282336578581
('to', 'artificial') 0.0013908205841446453
('artificial', 'intelligence') 0.006954102920723227
('intelligence', 'introduction') 0.0013908205841446453
('introduction', 'technology') 0.0013908205841446453
('technology', 'has') 0.0013908205841446453
('has', 'been') 0.0013908205841446453
('been', 'an') 0.0013908205841446453
('an', 'integral') 0.0013908205841446453
('integral', 'part') 0.0013908205841446453
('part', 'of') 0.0013908205841446453
('of', 'human') 0.0013908205841446453
('human', 'civilization') 0.0013908205841446453
('civilization', 'shaping') 0.0013908205841446453
('shaping', 'our') 0.0013908205841446453
('our', 'societies') 0.00139082