In [1]:
import pandas as pd
import pickle
import math
import re


In [3]:
# Load the pickled n-gram count models produced by the previous assignment.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# open these files if they come from a trusted source.
# NOTE(review): the output recorded in this notebook reports a unigram model
# of size 1 against 8264 bigram entries, which suggests the unigram pickle may
# not be a flat token -> count mapping. Confirm its structure before using it.
with open("../ass4/unigram_model.pkl", "rb") as unigram_file:
    unigram_counts = pickle.load(unigram_file)

with open("../ass4/bigram_model.pkl", "rb") as bigram_file:
    bigram_counts = pickle.load(bigram_file)

print(
    "Models loaded (unigram size:",
    len(unigram_counts),
    ", bigram size:",
    len(bigram_counts),
    ")",
)

Models loaded (unigram size: 1 , bigram size: 8264 )


In [4]:
def get_count(model, key):
    """Return the count stored for ``key`` in an n-gram ``model`` mapping.

    Parameters
    ----------
    model : mapping
        Maps n-gram keys to a count. The stored value may be a plain int,
        a dict holding the count under ``"count"``, or a tuple whose first
        element is the count.
    key : str or tuple
        The n-gram to look up. A bare string is treated as a unigram.

    Returns
    -------
    The stored count, or 0 when the key is absent or the stored value is in
    none of the recognised forms.

    Notes
    -----
    Unigram keys are tried in both spellings, ``(token,)`` and ``token``,
    so the lookup works whether the model was keyed by 1-tuples or by raw
    strings. The 1-tuple form is tried first for a string key, and the key
    as given is tried first otherwise, so any lookup that already succeeded
    keeps returning the same value.
    """
    missing = object()  # sentinel: distinguishes "absent" from a stored falsy value

    if isinstance(key, str):
        candidates = [(key,), key]
    elif isinstance(key, tuple) and len(key) == 1:
        candidates = [key, key[0]]
    else:
        candidates = [key]

    value = 0
    for candidate in candidates:
        found = model.get(candidate, missing)
        if found is not missing:
            value = found
            break

    if isinstance(value, int):
        return value
    if isinstance(value, dict):
        return value.get("count", 0)
    if isinstance(value, tuple):
        # An empty tuple carries no count; avoid IndexError on value[0].
        return value[0] if value else 0
    return 0

In [5]:
# Corpus size: the sum of the counts of every key in the unigram model.
# pmi() uses it as the normalising constant.
N = sum(
    get_count(unigram_counts, unigram_key)
    for unigram_key in unigram_counts.keys()
)

In [6]:
def extract_tokens(df):
    """Collect every string token found under ``df["sentences"]``.

    Each entry of ``df["sentences"]`` that is iterable is scanned for dict
    items carrying a ``"tokens"`` key; the string members of those token
    collections are appended, in order, to one flat list. Non-iterable
    entries, non-dict items, dicts without ``"tokens"``, and non-string
    tokens are all skipped.
    """
    collected = []
    for entry in df["sentences"]:
        can_iterate = isinstance(entry, (list, tuple)) or hasattr(entry, "__iter__")
        if not can_iterate:
            continue
        for element in entry:
            if not isinstance(element, dict) or "tokens" not in element:
                continue
            collected += [tok for tok in element["tokens"] if isinstance(tok, str)]
    return collected

In [7]:
# Read the validation and test splits. Later cells read the "sentence" column
# of each frame.
val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../ass5/validation.csv", "../ass5/test.csv")
)

In [8]:
# Flatten each split into a single list of whitespace-separated tokens.
# Missing sentences (NaN after read_csv) are dropped first: str(NaN) would
# otherwise contribute a spurious "nan" token to the list.
# NOTE(review): the lists are flat, so bigrams built from adjacent tokens
# downstream will also pair the last token of one sentence with the first
# token of the next. Confirm that this is intended.
val_tokens = [
    token
    for sentence in val_df["sentence"].dropna()
    for token in str(sentence).split()
]
test_tokens = [
    token
    for sentence in test_df["sentence"].dropna()
    for token in str(sentence).split()
]

print("Validation tokens:", len(val_tokens))
print("Test tokens:", len(test_tokens))

Validation tokens: 2239
Test tokens: 2234


In [9]:
def pmi(w1, w2):
    """Pointwise mutual information of the bigram ``(w1, w2)``.

    Computed as ``ln(c(w1, w2) * N / (c(w1) * c(w2)))`` using the counts in
    ``bigram_counts`` / ``unigram_counts`` and the corpus size ``N``.

    Returns
    -------
    float
        The PMI score (natural log), or ``-inf`` when it is undefined: the
        bigram is unseen, either word has no unigram count, or ``N`` is 0.
    """
    c_bigram = get_count(bigram_counts, (w1, w2))
    if c_bigram <= 0:
        return float("-inf")

    # Unigram lookups use 1-tuple keys.
    c_w1 = get_count(unigram_counts, (w1,))
    c_w2 = get_count(unigram_counts, (w2,))

    # A seen bigram with a missing unigram count means the models disagree.
    # Dividing by a tiny epsilon here would produce a huge, meaningless score,
    # and N == 0 would make math.log raise ValueError, so report the score as
    # undefined instead.
    if c_w1 <= 0 or c_w2 <= 0 or N <= 0:
        return float("-inf")

    # No smoothing term: with the guards above the denominator is non-zero.
    return math.log((c_bigram * N) / (c_w1 * c_w2))

In [10]:
def compute_pmi_table(tokens, split_name="Validation"):
    """Build a per-bigram PMI table for a token sequence and show its top 10.

    Parameters
    ----------
    tokens : sequence of str
        Tokens in order; every adjacent pair forms one bigram.
    split_name : str
        Label used in the printed heading.

    Returns
    -------
    pandas.DataFrame
        One row per distinct bigram (first occurrence kept, original row
        index preserved) with columns ``Bigram``, ``Count_bigram``,
        ``Count_w1``, ``Count_w2`` and ``PMI``. The frame is empty, but
        still has these columns, when ``tokens`` has fewer than two items.

    Notes
    -----
    Relies on ``display`` being available in the namespace, as it is inside
    an IPython/Jupyter session; it is not imported by this notebook.
    """
    columns = ["Bigram", "Count_bigram", "Count_w1", "Count_w2", "PMI"]
    records = [
        {
            "Bigram": (w1, w2),
            "Count_bigram": get_count(bigram_counts, (w1, w2)),
            "Count_w1": get_count(unigram_counts, w1),
            "Count_w2": get_count(unigram_counts, w2),
            "PMI": pmi(w1, w2),
        }
        for w1, w2 in zip(tokens[:-1], tokens[1:])
    ]

    # Naming the columns explicitly keeps the frame well-formed when there are
    # no bigrams; without it drop_duplicates(subset=["Bigram"]) raises KeyError
    # on a column-less frame.
    df = pd.DataFrame(records, columns=columns).drop_duplicates(subset=["Bigram"])
    print(f"\nTop 10 PMI scores for {split_name} set:")
    display(df.sort_values("PMI", ascending=False).head(10))
    return df

In [11]:
# Build one PMI table per split; each call also prints and displays that
# split's ten highest-scoring bigrams.
# NOTE(review): in the output recorded below every count is 0 and every PMI is
# -inf, i.e. none of these tokens was found in the loaded models. Check the
# models' key format before relying on these tables.
val_pmi = compute_pmi_table(tokens=val_tokens, split_name="Validation")
test_pmi = compute_pmi_table(tokens=test_tokens, split_name="Test")


Top 10 PMI scores for Validation set:


Unnamed: 0,Bigram,Count_bigram,Count_w1,Count_w2,PMI
2237,"(પ્રમુખ, છે)",0,0,0,-inf
0,"(જે, ધ્યાને)",0,0,0,-inf
1,"(ધ્યાને, લેતા)",0,0,0,-inf
2,"(લેતા, દેશભરમાં)",0,0,0,-inf
3,"(દેશભરમાં, નવીનતમ)",0,0,0,-inf
4,"(નવીનતમ, ટેકનોલોજીનો)",0,0,0,-inf
5,"(ટેકનોલોજીનો, ઉપયોગ)",0,0,0,-inf
6,"(ઉપયોગ, કરીને)",0,0,0,-inf
7,"(કરીને, એકીસાથે)",0,0,0,-inf
2221,"(છે, તેની)",0,0,0,-inf



Top 10 PMI scores for Test set:


Unnamed: 0,Bigram,Count_bigram,Count_w1,Count_w2,PMI
2231,"(પ્રસિદ્ધ, થાય)",0,0,0,-inf
0,"(રાત્રીના, સમયે)",0,0,0,-inf
1,"(સમયે, એકલ)",0,0,0,-inf
2,"(એકલ, દોકલ)",0,0,0,-inf
3,"(દોકલ, વ્યકિત)",0,0,0,-inf
4,"(વ્યકિત, દેખાય)",0,0,0,-inf
5,"(દેખાય, તો)",0,0,0,-inf
6,"(તો, તેના)",0,0,0,-inf
7,"(તેના, વાહનને)",0,0,0,-inf
2215,"(જોઈએ, તે)",0,0,0,-inf


In [12]:
# Persist both PMI tables as parquet files in the working directory; the
# DataFrame index is not written.
for pmi_table, parquet_path in (
    (val_pmi, "validation_pmi.parquet"),
    (test_pmi, "test_pmi.parquet"),
):
    pmi_table.to_parquet(parquet_path, index=False)

print("\nPMI tables saved")


PMI tables saved


In [13]:
# Diagnostic: check whether one validation token is a key of the unigram
# model, both as a 1-tuple and as a raw string.
sample_token = val_tokens[1]
sample_key = (sample_token,)
print("Example val token:", sample_token)
print("As tuple:", sample_key)
print("Unigram exists in model:", sample_key in unigram_counts)
print("Raw string exists:", sample_token in unigram_counts)

Example val token: ધ્યાને
As tuple: ('ધ્યાને',)
Unigram exists in model: False
Raw string exists: False


In [14]:
# Diagnostic: check whether one adjacent validation token pair is a key of the
# bigram model.
sample_bigram = (val_tokens[2], val_tokens[3])
print("Example val bigram:", sample_bigram)
print("Exists in bigram model:", sample_bigram in bigram_counts)

Example val bigram: ('લેતા', 'દેશભરમાં')
Exists in bigram model: False
