# Pointwise mutual information

### Getting the words from the corpus

In [5]:

import nltk
from nltk.corpus import brown

# Read every token of the Brown corpus and keep only the ones that are
# purely alphanumeric. This drops punctuation tokens, and also any token
# that contains a non-alphanumeric character.
words = [token for token in brown.words() if token.isalnum()]

# report how many tokens are left after filtering
print("Number of words in the brown corpus: ", len(words))

# count the occurrences of each remaining token
fdist = nltk.FreqDist(words)

# keep every token occurrence whose word is seen ten or more times overall
# (this is a list of occurrences, so frequent words appear repeatedly)
high_freq_words = [token for token in words if fdist[token] >= 10]


# last expression of the cell: display the frequency distribution
fdist


Number of words in the brown corpus:  988331


FreqDist({'the': 62713, 'of': 36080, 'and': 27915, 'to': 25732, 'a': 21881, 'in': 19536, 'that': 10237, 'is': 10011, 'was': 9777, 'for': 8841, ...})

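### Calculating the PMI

For a bigram $(w_1, w_2)$ the score is $\mathrm{PMI}(w_1, w_2) = \log_2 \frac{P(w_1, w_2)}{P(w_1)\,P(w_2)}$. Only pairs in which both words occur at least ten times in the corpus are scored.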

In [9]:
import math

# Both words of a pair must occur at least this many times to be scored.
MIN_WORD_FREQ = 10
# Scores above this value are printed for inspection together with the
# probabilities that produced them.
PMI_ALERT_THRESHOLD = 40

# NOTE(review): `words` had its non-alphanumeric tokens removed before this
# step, so some of these bigrams pair words that were separated by
# punctuation in the original text -- confirm this is intended.
bigrams = list(nltk.bigrams(words))

# get frequency distribution of the bigrams
bigrams_fdist = nltk.FreqDist(bigrams)

# The totals do not change inside the loop, so compute them once.
total_bigrams = len(bigrams)
total_words = len(words)

# calculate the pmi for all the bigrams:
#     PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )
#
# Iterate over the *distinct* bigrams rather than over every occurrence:
# a bigram's score depends only on its counts, so visiting each repeat
# occurrence just recomputed and overwrote the same value. The frequency
# distribution yields its keys in first-occurrence order, so `pmi` ends up
# with the same keys, values and ordering as before.
pmi = {}
for bigram, bigram_count in bigrams_fdist.items():
    w1, w2 = bigram

    # skip pairs that contain a rare word
    if fdist[w1] < MIN_WORD_FREQ or fdist[w2] < MIN_WORD_FREQ:
        continue

    # bigram probability
    bigram_prob = bigram_count / total_bigrams

    # unigram probabilities
    w1_prob = fdist[w1] / total_words
    w2_prob = fdist[w2] / total_words

    score = math.log2(bigram_prob / (w1_prob * w2_prob))
    pmi[bigram] = score

    # report unusually large scores along with their inputs
    if score > PMI_ALERT_THRESHOLD:
        print("we have a big pmi: ", bigram, score)
        print("bigram prob: ", bigram_prob)
        print("w1 prob: ", w1_prob)
        print("w2 prob: ", w2_prob)

            

### Printing the pairs with the highest and lowest scores

In [10]:
# Number of pairs to show at each end of the ranking.
TOP_N = 20

# Rank every scored bigram from the highest PMI to the lowest. Each entry is
# a ((w1, w2), score) tuple.
sorted_pmi = sorted(pmi.items(), key=lambda item: item[1], reverse=True)

# Slices are used instead of fixed indices so the cell still runs (printing
# fewer rows) when fewer than TOP_N bigrams were scored, rather than raising
# IndexError.
print("{} word pairs with the highest pmi value: ".format(TOP_N))
for pair_and_score in sorted_pmi[:TOP_N]:
    print(pair_and_score)

# Walk the ranking from its tail so the lowest score is printed first.
print("\n{} word pairs with the lowest pmi value: ".format(TOP_N))
for pair_and_score in sorted_pmi[::-1][:TOP_N]:
    print(pair_and_score)

20 word pairs with the highest pmi value: 
(('Hong', 'Kong'), 16.45520460843694)
(('Viet', 'Nam'), 15.914636227074237)
(('Pathet', 'Lao'), 15.827173385823897)
(('Simms', 'Purdew'), 15.827173385823897)
(('7th', 'Cavalry'), 15.815100553523322)
(('El', 'Paso'), 15.592708132186875)
(('Herald', 'Tribune'), 15.562119812353451)
(('Lo', 'Shu'), 15.522318804295477)
(('Islands', 'Guam'), 15.48167681979813)
(('WTV', 'antigen'), 15.436588930269593)
(('Gray', 'Eyes'), 15.400063054244479)
(('Puerto', 'Rico'), 15.32967372635308)
(('Internal', 'Revenue'), 15.32967372635308)
(('decomposition', 'theorem'), 15.107281305016633)
(('Saxon', 'Shore'), 15.081746212909495)
(('anionic', 'binding'), 15.078134959357117)
(('carbon', 'tetrachloride'), 15.02927879301536)
(('Common', 'Market'), 15.007745631465719)
(('unwed', 'mothers'), 15.007745631465719)
(('Beverly', 'Hills'), 14.991804087596698)

20 word pairs with the lowest pmi value: 
(('a', 'the'), -10.439231740746157)
(('the', 'it'), -8.73673012210247)
(('the