In [None]:
import math
from collections import Counter

In [None]:
def read_file(file_path):
    """Read a UTF-8 text file and return its lowercased tokens.

    Each line is lowercased and split on runs of whitespace; the tokens of
    all lines are concatenated, in file order, into one flat list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of lowercase token strings (empty if the file has no tokens).
    """
    with open(file_path, "r", encoding='utf-8') as handle:
        # Flatten line-by-line so blank lines simply contribute no tokens.
        return [token for raw_line in handle for token in raw_line.lower().split()]

def calculate_pmi(corpus, min_count=10):
    """Score adjacent word pairs in ``corpus`` by pointwise mutual information.

    For every pair of consecutive tokens ``(w1, w2)`` the score is
    ``log2(p(w1, w2) / (p(w1) * p(w2)))``. All three probabilities are
    estimated as a raw count divided by ``len(corpus)`` (the pair count is
    also divided by the token total, not by the number of pairs).

    Args:
        corpus: Sequence of tokens; it must support ``len`` and slicing.
        min_count: A pair is scored only if both of its words occur at
            least this many times in ``corpus``.

    Returns:
        A dict mapping ``(w1, w2)`` tuples to their PMI value in bits.
    """
    # Frequencies of single words and of adjacent word pairs.
    word_counts = Counter(corpus)
    pair_counts = Counter(zip(corpus[:-1], corpus[1:]))
    n_tokens = len(corpus)

    # Words seen fewer than min_count times are excluded from scoring.
    frequent = {token for token in word_counts if word_counts[token] >= min_count}

    def pair_score(first, second, pair_count):
        # PMI of one pair from its joint count and the two marginal counts.
        p_first = word_counts[first] / n_tokens
        p_second = word_counts[second] / n_tokens
        p_joint = pair_count / n_tokens
        return math.log2(p_joint / (p_first * p_second))

    return {
        (first, second): pair_score(first, second, pair_count)
        for (first, second), pair_count in pair_counts.items()
        if first in frequent and second in frequent
    }

In [None]:
# Load the tokenized corpus and score every adjacent word pair.
file_path = './kingjamesbible_tokenized.txt'
corpus = read_file(file_path)
pmi_values = calculate_pmi(corpus)

# Rank the scored pairs from highest to lowest PMI, then take both ends.
sorted_pmi = sorted(pmi_values.items(), key=lambda item: item[1], reverse=True)
top_20_pmi, bottom_20_pmi = sorted_pmi[:20], sorted_pmi[-20:]

# Print each group under its heading, one "pair: score" line per entry.
for heading, ranked_pairs in (("top 20 pmi:", top_20_pmi), ("bottom 20 pmi:", bottom_20_pmi)):
    print(heading)
    for pair, pmi in ranked_pairs:
        print(f"{pair}: {pmi}")