Hoàng Đình Hữu - MSSV: 20521384

## LEVENSHTEIN

In [2]:
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]


## HAMMING DISTANCE

In [3]:
def hamming(s1, s2):
    i ,count = 0, 0
    while(i < len(s1)):
        if(s1[i] != s2[i]): count += 1
        i += 1
    return count

## JACCARD

In [160]:
def jaccard(x,y):
  intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
  union_cardinality = len(set.union(*[set(x), set(y)]))
  return intersection_cardinality/float(union_cardinality)

In [161]:
s1 = 'Bottle'
s2 = 'Beatle'
for i in [levenshtein,hamming,jaccard]:
    print(i(s1,s2))
sentences = ["The bottle is empty","There is nothing in the bottle"]
s = [sent.lower().split(" ") for sent in sentences]
print(jaccard(s[0],s[1]))

2
2
0.6666666666666666
0.42857142857142855


## BAG OF WORDS

In [164]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
def unique(sequence):
    word = set()
    return [x for x in sequence if not (x in word or word.add(x))]
def Preprocess(s1,s2):
    s1 = s1.lower()
    s2 = s2.lower()
    tokens1 = s1.split()
    tokens2 = s2.split()
    vocab = unique(tokens1+tokens2)
    print(tokens1,tokens2)
    print(vocab,len(vocab))
    filtered = []
    for i in vocab: 
        # if i not in set(stopwords.words('english')):
            filtered.append(i)
    return tokens1,tokens2,filtered
def vectorize(tokens,filtered):
    vector=[]
    for i in filtered:
        vector.append(tokens.count(i))
    print(vector)
    return vector

In [165]:
a = 'I will go out with her tonight'
b = 'She hope that I will give her a present tonight'
t1,t2,filtered = Preprocess(a,b)
v1 = vectorize(t1,filtered)
v2 = vectorize(t2,filtered)
# print('Word Similarity using Levenshtein:',levenshtein(v1,v2))
# print('Word Similarity using Hamming Distance:',hamming(v1,v2))
# print('Word Similarity using Jaccard:',jaccard(v1,v2))

['i', 'will', 'go', 'out', 'with', 'her', 'tonight'] ['she', 'hope', 'that', 'i', 'will', 'give', 'her', 'a', 'present', 'tonight']
['i', 'will', 'go', 'out', 'with', 'her', 'tonight', 'she', 'hope', 'that', 'give', 'a', 'present'] 13
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]


## TF-IDF

In [166]:
import pandas as pd
import math
import nltk
# from nltk.corpus import stopwords
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return (tfDict)
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1))
    return(idfDict)
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return (tfidf)
def Preprocess_tf_idf(a,b):
    a = a.split()
    b = b.split()
    total= set(a).union(set(b))
    print(total)
    wordDictA = dict.fromkeys(total, 0) 
    wordDictB = dict.fromkeys(total, 0)
    for word in a:
        wordDictA[word]+=1  
    for word in b:
        wordDictB[word]+=1
    print('',wordDictA,'\n',wordDictB)
    tfa = computeTF(wordDictA, a)
    tfb = computeTF(wordDictB, b)
    filtered_sentence = []
    for word in wordDictA:
        w = str(word)
        filtered_sentence.append(w)
    idfs = computeIDF([wordDictA, wordDictB])
    idfa = computeTFIDF(tfa, idfs)
    idfb = computeTFIDF(tfb, idfs)
    return list(round(i,4) for i in idfa.values()),list(round(i,4) for i in idfb.values())

In [167]:
a = 'I will go out with her tonight'
b = 'She hope that I will give her a present tonight'
v1,v2 = Preprocess_tf_idf(a,b)
print('',v1,'\n',v2)

{'her', 'go', 'out', 'I', 'with', 'will', 'that', 'present', 'give', 'hope', 'She', 'tonight', 'a'}
 {'her': 1, 'go': 1, 'out': 1, 'I': 1, 'with': 1, 'will': 1, 'that': 0, 'present': 0, 'give': 0, 'hope': 0, 'She': 0, 'tonight': 1, 'a': 0} 
 {'her': 1, 'go': 0, 'out': 0, 'I': 1, 'with': 0, 'will': 1, 'that': 1, 'present': 1, 'give': 1, 'hope': 1, 'She': 1, 'tonight': 1, 'a': 1}
 [0.043, 0.043, 0.043, 0.043, 0.043, 0.043, 0.0, 0.0, 0.0, 0.0, 0.0, 0.043, 0.0] 
 [0.0301, 0.0, 0.0, 0.0301, 0.0, 0.0301, 0.0301, 0.0301, 0.0301, 0.0301, 0.0301, 0.0301, 0.0301]


## COSINE

In [168]:
from math import sqrt, pow, exp
def squared_sum(x):
    return round(sqrt(sum([a*a for a in x])),3)
def cos_similarity(x,y):
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = squared_sum(x)*squared_sum(y)
    return round(numerator/float(denominator),3)

a = "The bottle is empty"
b = "There is nothing in the bottle"
t1,t2,filtered = Preprocess(a,b)
v1 = vectorize(t1,filtered)
v2 = vectorize(t2,filtered)
cos_similarity(v1, v2)

['the', 'bottle', 'is', 'empty'] ['there', 'is', 'nothing', 'in', 'the', 'bottle']
['the', 'bottle', 'is', 'empty', 'there', 'nothing', 'in'] 7
[1, 1, 1, 1, 0, 0, 0]
[1, 1, 1, 0, 1, 1, 1]


0.612