In [1]:
import math
from collections import Counter

In [2]:
def read_file(file_path):
    words = []
    with open(file_path, "r", encoding='utf-8') as file:
        for line in file:
            line_tokens = line.lower().split(" ")
            words += line_tokens
    return words

# # calculate word pairs pmi
def calculate_pmi(corpus, min_count=10):
    # # use Counter to count every single word occur times
    unigram_freq = Counter(corpus)
    # # count every word pair w1,w2 occur times
    bigram_freq = Counter(zip(corpus[:-1], corpus[1:]))
    # # count the size of the corpus N
    N = len(corpus)
    
    # # ignore the words which occur less than 10 times
    valid_words = {word for word, count in unigram_freq.items() if count >= min_count}
    
    pmi_values = {}
    for (w1, w2), bigram_count in bigram_freq.items():
        if w1 in valid_words and w2 in valid_words:
            c_w1 = unigram_freq[w1]
            c_w2 = unigram_freq[w2]
            c_w1_w2 = bigram_count
            pmi = math.log2((c_w1_w2 * N) / (c_w1 * c_w2))
            pmi_values[(w1, w2)] = pmi
    
    return pmi_values

In [3]:
file_path = './kingjamesbible_tokenized.txt'
corpus = read_file(file_path)
    
pmi_values = calculate_pmi(corpus)
    
# # sort the pmi in descending order
sorted_pmi = sorted(pmi_values.items(), key=lambda item: item[1], reverse=True)
    
top_20_pmi = sorted_pmi[:20]
bottom_20_pmi = sorted_pmi[-20:]
    
print("highest 20 pmi:")
for pair, pmi in top_20_pmi:
    print(f"{pair}: {pmi}")
    
print("lowest 20 pmi:")
for pair, pmi in bottom_20_pmi:
    print(f"{pair}: {pmi}")

highest 20 pmi:
('ill', 'favoured'): 14.674907888354861
('judas', 'iscariot'): 14.47029016874639
('curious', 'girdle'): 14.23433529696888
('brook', 'kidron'): 14.229282069242593
('poureth', 'contempt'): 14.160334715525103
('measuring', 'reed'): 14.107720089361681
('persecution', 'ariseth'): 14.022831191775168
('divers', 'colours'): 14.009775038949721
('mary', 'magdalene'): 13.9237725210525
('overflowing', 'scourge'): 13.759796785941374
('wreathen', 'chains'): 13.594737539670877
('fiery', 'furnace'): 13.575372214803947
('sharp', 'sickle'): 13.575372214803947
('committeth', 'adultery'): 13.556263391856241
('earthen', 'vessel'): 13.543663355076609
('perpetual', 'desolations'): 13.537404364604926
('golden', 'spoon'): 13.49676238010758
('bright', 'spot'): 13.472278721839842
('tenth', 'deals'): 13.464340902415202
('cunning', 'workman'): 13.437868691054012
lowest 20 pmi:
('into', ','): -7.246995554827266
(',', 'me'): -7.270431431411936
('this', 'and'): -7.29703574219293
('with', '.\n'): -7.30