In [6]:
import math
import nltk
from nltk.tokenize import word_tokenize
from nltk import bigrams
from collections import Counter

# ------------------------
# Load Corpus
# ------------------------
filepath = "corpus.txt"
# latin-1 assigns a character to every byte value, so decoding cannot fail.
with open(filepath, mode='r', encoding='latin-1') as file:
    corpus = file.read()

# ------------------------
# Tokenization
# ------------------------
# Case is folded before tokenizing so that all counts are case-insensitive.
tokens = word_tokenize(corpus.lower())
bi_grams = list(bigrams(tokens))        # adjacent (w1, w2) token pairs

elements = Counter()                    # unigram counts
elements.update(tokens)
bigrams_count = Counter()               # bigram counts
bigrams_count.update(bi_grams)

# ------------------------
# User Input
# ------------------------
# Normalise the words the same way as the corpus (lower-case) and drop any
# surrounding whitespace: an entry such as "dog " would otherwise never equal
# a corpus token and would silently produce zero counts.
noun = input("Enter the noun: ").strip().lower()
prep = input("Enter the preposition: ").strip().lower()
verb = input("Enter the verb: ").strip().lower()

# ------------------------
# Counts
# ------------------------
n = elements.get(noun, 0)                   # occurrences of the noun
v = elements.get(verb, 0)                   # occurrences of the verb
p_n = bigrams_count.get((noun, prep), 0)    # noun immediately followed by prep
p_v = bigrams_count.get((verb, prep), 0)    # verb immediately followed by prep

print("\nCounts:")
print(f"{noun} = {n}")
print(f"{verb} = {v}")
print(f"({noun}, {prep}) = {p_n}")
print(f"({verb}, {prep}) = {p_v}")

# ------------------------
# λ Calculation with probabilities
# ------------------------
def cal_lambda(p_v, p_n, n, v, *, prep_label=None, verb_label=None,
               noun_label=None):
    """Return the log2 attachment score λ for a preposition.

    λ = log2(P(prep|verb) * (1 - P(prep|noun)) / P(prep|noun)), where each
    probability is the add-one smoothed estimate (count + 1) / (total + 2).
    A positive score favours the verb, a negative score favours the noun.
    The intermediate probabilities are printed as a trace.

    Args:
        p_v: Count of the (verb, preposition) bigram.
        p_n: Count of the (noun, preposition) bigram.
        n: Count of the noun.
        v: Count of the verb.
        prep_label: Preposition shown in the printed trace.
        verb_label: Verb shown in the printed trace.
        noun_label: Noun shown in the printed trace.
            When a label is omitted, the module-level ``prep`` / ``verb`` /
            ``noun`` variable is used if it exists, otherwise a generic
            placeholder of the same name.

    Returns:
        The score as a float, or 0.0 when the verb or the noun has a zero
        count (no evidence, so neither attachment is preferred).

    Raises:
        ValueError: If ``p_n`` exceeds ``n`` (inconsistent counts), because
            the ratio passed to the logarithm is then not positive.
    """
    # The labels only affect the printed trace, never the score. Falling back
    # to the module globals keeps existing four-argument calls printing
    # exactly what they printed before.
    module_scope = globals()
    if prep_label is None:
        prep_label = module_scope.get("prep", "prep")
    if verb_label is None:
        verb_label = module_scope.get("verb", "verb")
    if noun_label is None:
        noun_label = module_scope.get("noun", "noun")

    # Laplace smoothing; a word with a zero count keeps probability 0 so it
    # can be reported as "no evidence" below.
    prob_v = (p_v + 1) / (v + 2) if v > 0 else 0
    prob_n = (p_n + 1) / (n + 2) if n > 0 else 0

    print("\nProbability Calculations:")
    print(f"P({prep_label}|{verb_label}) = ({p_v} + 1) / ({v} + 2) = {prob_v:.4f}")
    print(f"P({prep_label}|{noun_label}) = ({p_n} + 1) / ({n} + 2) = {prob_n:.4f}")

    # prob_n == 0 would divide by zero and prob_v == 0 would take log2(0);
    # both mean a word has a zero count, so no attachment is preferred.
    if prob_v == 0 or prob_n == 0:
        return 0.0

    return math.log2(prob_v * (1 - prob_n) / prob_n)

# Score the attachment, then report which side the sign of λ favours:
# positive -> verb, negative -> noun, anything else -> no attachment.
lamda = cal_lambda(p_v, p_n, n, v)
print(f"\nλ = {lamda:.4f}")

if lamda > 0:
    verdict = "➡ PREPOSITION IS ATTACHED WITH VERB"
elif lamda < 0:
    verdict = "➡ PREPOSITION IS ATTACHED WITH NOUN"
else:
    verdict = "➡ PREPOSITION IS NOT ATTACHED"
print(verdict)


Enter the noun:  dog
Enter the preposition:  over
Enter the verb:  jumps



Counts:
dog = 1
jumps = 1
(dog, over) = 0
(jumps, over) = 1

Probability Calculations:
P(over|jumps) = (1 + 1) / (1 + 2) = 0.6667
P(over|dog) = (0 + 1) / (1 + 2) = 0.3333

λ = 0.4150
➡ PREPOSITION IS ATTACHED WITH VERB
