In [1]:
import os
import math
import re

In [2]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text.strip()

In [3]:
basepath = './aclImdb'
def read_corpus():
    for s in ('test', 'train'):
        for l in ('pos', 'neg'):
            path = os.path.join(basepath, s, l)
            for file in os.listdir(path):
                with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                    yield preprocessor(infile.read())

In [4]:
w2w_vec = dict()
w_vec = dict()
nneighbors = 4

In [5]:
for text in read_corpus():
    words = text.split()
    for wi in range(len(words)):
        w_vec[words[wi]] = w_vec.get(words[wi], 0) + 1
        for i in range(wi + 1, wi + 1 + nneighbors):
            if len(words) > i:
                w_pair = (words[wi], words[i])
                w2w_vec[w_pair] = w2w_vec.get(w_pair, 0) + 1 
        for i in range(wi - nneighbors, wi):
            if i > 0:
                w2w_vec[w_pair] = w2w_vec.get(w_pair, 0) + 1 

In [6]:
def pmi_scoring(w1, w2):
    score = w2w_vec[(w1,w2)] / (w_vec[w1] * w_vec[w2])
    return math.log2(score) if score else 0

In [7]:
def ppmi_scoring(w1,w2):
    return max(pmi_scoring(w1,w2), 0)

In [8]:
ppmi_vec = dict()
for w1, w2 in w2w_vec:
    if w1 not in ppmi_vec:
        ppmi_vec[w1] = dict()
    score = ppmi_scoring(w1,w2)
    if score:
        ppmi_vec[w1][w2] = score    

In [9]:
def cos_similarity(w1,w2):
    dot_w1_w2 = 0
    for w_other in ppmi_vec[w1]:
        if w_other in ppmi_vec[w2]:
            dot_w1_w2 += ppmi_vec[w1][w_other] * ppmi_vec[w2][w_other]
    vec_len_w1 = math.sqrt(sum([v**2 for v in ppmi_vec[w1].values()]))
    vec_len_w2 = math.sqrt(sum([v**2 for v in ppmi_vec[w2].values()]))
    return dot_w1_w2 / (vec_len_w1 * vec_len_w2)

In [10]:
ppmi_vec_filtered = list(filter(lambda x: ppmi_vec[x], ppmi_vec))

In [11]:
len(ppmi_vec_filtered)

1869

In [18]:
sim_w = list()
for w1 in ppmi_vec_filtered:
    for w2 in ppmi_vec_filtered:
        if w1 == w2:
            continue
        sim = cos_similarity(w1, w2)
        if sim: 
             sim_w.append((w1,w2,sim))

In [19]:
sim_w

[('_spiritited', '_toy', 1.0),
 ('_spiritited', 'away_', 1.0),
 ('omero', 'antonutti', 1.0),
 ('friedo', 'almghandi', 1.0),
 ('jhj', 'nashdnfhcka', 1.0),
 ('jhj', 'sakasdadj', 1.0),
 ('jhj', 'fhnkhad', 1.0),
 ('nargis', 'bagheri', 1.0),
 ('v1', 'v2', 1.0),
 ('v1', '20061114', 1.0),
 ('jpieczanski', 'sidwell', 1.0),
 ('jpieczanski', 'pieczanski', 1.0),
 ('kaempfen', 'selbst', 1.0),
 ('kaempfen', 'goetter', 1.0),
 ('ld_____________________________my',
  'ld______________________________________________my',
  1.0),
 ('ld_____________________________my', 'ld_________my', 1.0),
 ('924', '310', 1.0),
 ('dans', 'nos', 1.0),
 ('dans', 'toujours', 1.0),
 ('naaaaaaaaaaaaaaah', '555', 1.0),
 ('wollter', 'ahmed', 1.0),
 ('wollter', 'selldal', 1.0),
 ('selbst', 'kaempfen', 1.0),
 ('selbst', 'goetter', 1.0),
 ('_toy', '_spiritited', 1.0),
 ('_toy', 'away_', 1.0),
 ('nos', 'dans', 1.0),
 ('nos', 'toujours', 1.0),
 ('gli', 'occhi', 1.0),
 ('khandar', 'daar', 1.0),
 ('oonga', 'dhanno', 1.0),
 ('knifee'