In [1]:
from collections import Counter
import itertools
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the corpus: every line of cacm.txt is treated as one document and is
# split on whitespace into its tokens.
globalWordList = list()

# The context manager guarantees the handle is closed once reading is done
# (the name `file` stays bound afterwards, referring to the closed handle).
with open("cacm.txt", "r") as file:
    for line in file:
        wordlist = line.split()
        globalWordList.append(wordlist)
print(globalWordList[0:10])

[['preliminary', 'report', 'algebraic', 'language', 'december', 'perlis'], ['extraction', 'roots', 'digital', 'computers', 'december'], ['techniques', 'department', 'matrix', 'program', 'schemes', 'december'], ['engineering', 'programming', 'terminology', 'november'], ['square', 'root', 'approximations', 'november'], ['computers', 'procedures', 'november'], ['engineering', 'programming', 'terminology', 'october'], ['equivalence', 'transformation', 'program', 'schemes', 'october'], ['proposal', 'october'], ['engineering', 'programming', 'terminology', 'september']]


In [3]:
# Tally how often each token occurs across all documents.
word_counts = Counter()
for doc_tokens in globalWordList:
    word_counts.update(doc_tokens)

# Two-way vocabulary lookup. Ids are assigned in the order tokens were first
# seen, which is the iteration order of the Counter.
tok2indx = dict(zip(word_counts, range(len(word_counts))))
indx2tok = dict(enumerate(word_counts))
print('vocabulary size: {}'.format(len(word_counts)))
print('most common: {}'.format(word_counts.most_common(10)))

vocabulary size: 1845
most common: [('data', 950), ('january', 934), ('program', 843), ('programming', 790), ('time', 783), ('language', 774), ('method', 722), ('systems', 707), ('problem', 524), ('paper', 515)]


In [4]:
# Context window: number of neighbouring positions considered before and
# after each word.
back_window = 2
front_window = 2

# Count every ordered (word id, context id) pair whose positions lie within
# the window of one another inside the same document.
wordpair_counts = Counter()
for doc in globalWordList:
    doc_ids = [tok2indx[tok] for tok in doc]
    last_pos = len(doc) - 1
    for pos, center_id in enumerate(doc_ids):
        # Clamp the window to the document boundaries.
        lo = max(0, pos - back_window)
        hi = min(last_pos, pos + front_window)
        for ctx_pos in range(lo, hi + 1):
            if ctx_pos == pos:
                # A position is never paired with itself.
                continue
            wordpair_counts[(center_id, doc_ids[ctx_pos])] += 1
print('number of wordpairs: {}'.format(len(wordpair_counts)))
most_common = [
    (indx2tok[pair[0]], indx2tok[pair[1]], count)
    for pair, count in wordpair_counts.most_common(10)]
print('most common: {}'.format(most_common))

number of wordpairs: 199260
most common: [('language', 'programming', 159), ('programming', 'language', 159), ('time', 'sharing', 141), ('sharing', 'time', 141), ('programming', 'languages', 137), ('languages', 'programming', 137), ('information', 'retrieval', 116), ('retrieval', 'information', 116), ('language', 'language', 110), ('program', 'program', 106)]


In [5]:
# Assemble the word-word co-occurrence counts into a sparse matrix where
# entry (i, j) is the number of times token j appeared in the window of token i.
row_id = []
col_id = []
dat_values = []
for (tok1, tok2), wp_count in wordpair_counts.items():
    row_id.append(tok1)
    col_id.append(tok2)
    dat_values.append(wp_count)

# Pin the shape to the vocabulary size. Without an explicit shape scipy infers
# the dimensions from the largest index present, so a token that never
# co-occurs with anything (e.g. a one-word document) could fall outside the
# matrix and break lookups by token id.
wwcnt_mat = sparse.csr_matrix(
    (dat_values, (row_id, col_id)),
    shape=(len(tok2indx), len(tok2indx)))

In [6]:
def ww_sim(word, mat, topn):
    """Return the `topn` tokens whose rows in `mat` are most similar to `word`.

    Similarity is the cosine similarity between the row of `mat` that belongs
    to `word` and every row of `mat`. The query word is not excluded from the
    ranking, so it can appear in the result.

    Parameters
    ----------
    word : token to look up; must be a key of `tok2indx` (KeyError otherwise).
    mat : 2-D matrix (sparse CSR or sliceable array) indexed by token id.
    topn : number of tokens to return.

    Returns
    -------
    list of tokens ordered from most to least similar.
    """
    row = tok2indx[word]
    # CSR matrices expose getrow(); anything else is sliced so the query
    # stays two-dimensional, as cosine_similarity expects.
    query = mat.getrow(row) if isinstance(mat, sparse.csr_matrix) else mat[row:row+1, :]
    scores = cosine_similarity(mat, query).flatten()
    # Negate so that argsort's ascending order yields highest similarity first.
    ranked = np.argsort(-scores)[0:topn]
    return [indx2tok[best] for best in ranked]

In [7]:
# Five tokens ranked closest to 'programming' by cosine similarity over the
# raw co-occurrence count matrix.
ww_sim('programming', wwcnt_mat, 5)

['programming', 'nonprocedural', 'semantics', 'natural', 'language']

In [8]:
# Build a sparse matrix of (smoothed) pointwise mutual information for every
# observed (word, context) pair.
num_wordpairs = wwcnt_mat.sum()
# Sanity check: the matrix must hold exactly the counts that were collected.
assert sum(wordpair_counts.values()) == num_wordpairs

# for creating sparse matrices
row_id = []
col_id = []

pmi_dat_values = []    # pointwise mutual information

# Column totals: sum_over_words[j] == wwcnt_mat.getcol(j).sum()
# (summing along axis 0 collapses the rows).
sum_over_words = np.array(wwcnt_mat.sum(axis=0)).flatten()
# Row totals: sum_over_pairs[i] == wwcnt_mat.getrow(i).sum()
# (summing along axis 1 collapses the columns).
sum_over_pairs = np.array(wwcnt_mat.sum(axis=1)).flatten()

for (tok_word, tok_pair), wp_count in wordpair_counts.items():

    # Probabilities are estimated from counts with small additive offsets.
    # NOTE(review): the offsets 0.25 / 0.5 / 1 look like smoothing constants
    # chosen by the author; confirm their origin before tuning them.
    nwc = wp_count
    Pwc = (nwc + 0.25) / (1 + num_wordpairs)
    nw = sum_over_pairs[tok_word]
    Pw = (nw + 0.5) / (1 + num_wordpairs)
    nc = sum_over_words[tok_pair]
    Pc = (nc + 0.5) / (1 + num_wordpairs)

    #  pmi = log2 {P(w,c) / [P(w) P(c)]}
    pmi = np.log2(Pwc / (Pw * Pc))

    row_id.append(tok_word)
    col_id.append(tok_pair)
    pmi_dat_values.append(pmi)

# Pin the shape to the vocabulary size. Without an explicit shape scipy infers
# the dimensions from the largest index present, so a token that never
# co-occurs with anything could fall outside the matrix.
pmi_mat = sparse.csr_matrix(
    (pmi_dat_values, (row_id, col_id)),
    shape=(len(tok2indx), len(tok2indx)))


In [9]:
# Same query as before, but ranked by cosine similarity over the PMI matrix
# instead of the raw counts.
ww_sim('programming', pmi_mat, 5)

['programming', 'languages', 'language', 'systems', 'programs']

In [10]:
# Five tokens ranked closest to 'january' by cosine similarity over the PMI matrix.
ww_sim('january', pmi_mat, 5)

['january', 'december', 'august', 'april', 'october']