In [134]:
from collections import Counter

import numpy as np

In [135]:
# Toy corpus: every document is a list of word tokens.
D = [
    "play play cricket football".split(),
    "play music".split(),
    "like singing".split(),
    "cricket very small insect".split(),
]

# Query document that gets compared against each document in D.
d = "want play cricket".split()

In [136]:
def hamming_distance(p1 : Counter,p2 : Counter) -> int:
    """Count the terms that appear in only one of the two bags of words.

    Only presence matters: a term counted twice in one bag and once in the
    other is still a match. The candidate terms are those of the Counter
    union ``p1 | p2``.
    """
    vocabulary = p1 | p2
    # A term of the union whose count is 0 on either side is a mismatch.
    return sum(1 for term in vocabulary if p1[term] == 0 or p2[term] == 0)

In [137]:
def euclidean_distance(p1 : Counter,p2 : Counter):
    """Euclidean distance between two bags of words, using raw term counts.

    Every term of the Counter union ``p1 | p2`` is one dimension; a term
    missing from a bag has count 0 there. The result is whatever
    ``np.sqrt`` returns for the summed squared differences.
    """
    squared_gaps = [(p1[term] - p2[term]) ** 2 for term in (p1 | p2)]

    total = 0
    for gap in squared_gaps:
        total += gap
    return np.sqrt(total)

In [150]:
def get_tf_format(document):
    """Return the term frequency of every distinct word in `document`.

    TF(word) = occurrences of the word / total number of words in the
    document. An empty document gives an empty dict.
    """
    total_words = len(document)
    return {
        word: occurrences / total_words
        for word, occurrences in Counter(document).items()
    }


def get_tf_idf_format(doc,docs):
    """Return the TF-IDF weight of every word of `doc`, measured against `docs`.

    TF comes from get_tf_format(doc). IDF is log10(n_docs / Cw), where Cw is
    the number of documents in `docs` that contain the word (no +1 smoothing
    in the denominator). Two special cases:

    * a word found in no document of `docs` is left out of the result;
    * a word found in every document gets a tiny constant IDF (eps) in place
      of log10(1) = 0.
    """
    eps = 0.000001
    n_docs = len(docs) # number of documents

    weights = {}
    for word, tf in get_tf_format(doc).items():
        # Cw: how many documents of the collection contain this word.
        containing = sum(1 for other in docs if word in other)

        if containing == 0:
            # new word -> ignore
            continue

        if containing == n_docs:
            idf = eps
        else:
            idf = np.log10(n_docs/containing)
        weights[word] = tf * idf

    return weights


# TF-IDF vectors for every corpus document and for the query document d,
# all weighted against the same corpus D.
docs_tf_idf = [get_tf_idf_format(tokens, D) for tokens in D]
d_tf_idf = get_tf_idf_format(d, D)

print(docs_tf_idf)
print(d_tf_idf)

[{'play': 0.1505149978319906, 'cricket': 0.0752574989159953, 'football': 0.1505149978319906}, {'play': 0.1505149978319906, 'music': 0.3010299956639812}, {'like': 0.3010299956639812, 'singing': 0.3010299956639812}, {'cricket': 0.0752574989159953, 'very': 0.1505149978319906, 'small': 0.1505149978319906, 'insect': 0.1505149978319906}]
{'play': 0.10034333188799373, 'cricket': 0.10034333188799373}


In [139]:
def norm(a):
    """Euclidean (L2) length of a sparse vector given as a {key: value} dict."""
    squares = [component * component for component in a.values()]

    # Plain left-to-right accumulation starting from a float zero.
    total = .0
    for square in squares:
        total += square
    return np.sqrt(total)


# p = dict({'moaz': 1 ,'mahmud': 1})
# print(f"norm({p})",norm(p))


def cosine_similarity(p1,p2):
    """Cosine similarity between two sparse vectors stored as {term: weight} dicts.

    Parameters
    ----------
    p1, p2 : dict
        Mappings from term to numeric weight. Only terms present in both
        mappings contribute to the dot product.

    Returns
    -------
    float
        dot(p1, p2) / (norm(p1) * norm(p2)), or 0.0 when that denominator is
        zero (for example when either mapping is empty), because the ratio
        is undefined there.
    """
    # Walk p1 in its own key order instead of iterating a set intersection:
    # set order of strings can change between runs, and with it the order of
    # the floating-point additions.
    dot = .0
    for term, weight in p1.items():
        if term in p2:
            dot += weight * p2[term]

    denominator = norm(p1) * norm(p2)
    if denominator == 0:
        # Zero-length vector: avoid 0/0 (nan plus a NumPy RuntimeWarning)
        # and report "no similarity" instead.
        return 0.0
    return dot / denominator

In [151]:
def find_dist(D,d,dist_function):
    """Print dist_function(d, doc) for every document of D, numbered from 1.

    For cosine_similarity the documents and the query are first turned into
    TF-IDF dicts (weighted against D); for any other function they are
    turned into raw-count Counters. Nothing is returned.
    """
    if dist_function == cosine_similarity:
        def vectorise(tokens):
            return get_tf_idf_format(tokens, D)
    else:
        vectorise = Counter

    docs = [vectorise(doc) for doc in D]
    test_doc = vectorise(d)

    for i, doc in enumerate(docs, start=1):
        print(f"{dist_function.__name__}(t,{i})", dist_function(test_doc,doc))
        
        
# Score the query d against every document of D under each measure in turn.
find_dist(D=D, d=d, dist_function=hamming_distance)
find_dist(D=D, d=d, dist_function=euclidean_distance)
find_dist(D=D, d=d, dist_function=cosine_similarity)

hamming_distance(t,1) 2
hamming_distance(t,2) 3
hamming_distance(t,3) 5
hamming_distance(t,4) 5
euclidean_distance(t,1) 1.7320508075688772
euclidean_distance(t,2) 1.7320508075688772
euclidean_distance(t,3) 2.23606797749979
euclidean_distance(t,4) 2.23606797749979
cosine_similarity(t,1) 0.7071067811865476
cosine_similarity(t,2) 0.3162277660168379
cosine_similarity(t,3) 0.0
cosine_similarity(t,4) 0.19611613513818404
