# Jaccard Similarity

In [7]:
import string
# '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
def replace_punctuation(item):
    for i in string.punctuation.replace("''", ''):
        if i in item:
            item = item.replace(i, '')
    return item

tokenize = lambda doc: replace_punctuation(doc.lower()).split(" ")
# tokenize = lambda doc: doc.lower().split(" ")

In [8]:
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

tokenized_documents = [tokenize(d) for d in all_documents] # tokenized docs


In [9]:
def jaccard_similarity(query, document):
    print(set(query), set(document))
    intersection = set(query).intersection(set(document))
#     print(intersection)
    union = set(query).union(set(document))
    return len(intersection)/len(union)

jaccard_similarity(tokenized_documents[1],tokenized_documents[6])

{'at', 'endemic', 'confronting', 'china', 'about', 'violence', 'an', 'corruption', 'domestic', 'seems', 'serious', 'problem', 'last', 'and'} {'', 'is', 'hunting', 'a', 'while', 'he', 'always', 'horse', 'even', 'about', 'seems', 'serious', 'horses', 'things', 'deer', 'riding', 'so', 'crazy', 'putin', 'vladimir'}


0.0967741935483871

# TF-IDF
- Term Frequency 
- Inverse Document Frequency
- tf-idf = tf*idf

In [10]:
import math

def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
#     return math.log(1+(tokenized_document.count(term)/len(tokenized_document)))
    return (1 + math.log(tokenized_document.count(term))) if tokenized_document.count(term) else 0

def augmented_term_frequency(term, tokenized_document):
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

tfidf_representation = tfidf(all_documents)
print(tfidf_representation[0])
print(document_0)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0, 2.252762968495368, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.252762968495368, 0.0, 2.9459101490553135, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 2.9459101490553135, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 2.9459101490553135, 0.0, 3.8142592685777856, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.1277471639690138, 2.9459101490553135, 0.0, 0.0, 1.5596157879354227, 2.9459101490553135, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 1.336472236621213, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 1.8472978603872037]
China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy.


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)

sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
print(tfidf_representation[0])
print(sklearn_representation.toarray()[0].tolist())
print(document_0)
# print(sklearn_tfidf.vocabulary_)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0, 2.252762968495368, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.252762968495368, 0.0, 2.9459101490553135, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 2.9459101490553135, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 2.9459101490553135, 0.0, 3.8142592685777856, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.1277471639690138, 2.9459101490553135, 0.0, 0.0, 1.5596157879354227, 2.9459101490553135, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 0.0, 1.336472236621213, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.9459101490553135, 0.0, 0.0, 0.0, 1.8472978603872037]
[0.0, 0.3153972256619837, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.18627868225707217, 0.18627868225707217, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2435943187490687, 0.0, 0.0, 0.0, 0.25863023686268505, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2435943187490687, 0.0

# Cosine Similarity

In [12]:
def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude


In [13]:
tfidf_representation = tfidf(all_documents)
our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

skl_tfidf_comparisons = []
for count_0, doc_0 in enumerate(sklearn_representation.toarray()):
    for count_1, doc_1 in enumerate(sklearn_representation.toarray()):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

for x in zip(sorted(our_tfidf_comparisons, reverse = True), sorted(skl_tfidf_comparisons, reverse = True)):
    print(x)

((1.0000000000000002, 4, 4), (1.0000000000000002, 6, 6))
((1.0000000000000002, 2, 2), (1.0000000000000002, 5, 5))
((1.0, 5, 5), (1.0000000000000002, 4, 4))
((1.0, 3, 3), (1.0000000000000002, 0, 0))
((1.0, 0, 0), (1.0, 3, 3))
((0.9999999999999999, 6, 6), (1.0, 2, 2))
((0.9999999999999999, 1, 1), (1.0, 1, 1))
((0.3485485947359029, 4, 2), (0.34854859473590283, 4, 2))
((0.3485485947359029, 2, 4), (0.34854859473590283, 2, 4))
((0.1665505800294338, 6, 3), (0.16655058002943382, 6, 3))
((0.1665505800294338, 3, 6), (0.16655058002943382, 3, 6))
((0.14747371149061467, 5, 3), (0.14747371149061467, 5, 3))
((0.14747371149061467, 3, 5), (0.14747371149061467, 3, 5))
((0.14340231078604393, 3, 2), (0.1434023107860439, 3, 2))
((0.14340231078604393, 2, 3), (0.1434023107860439, 2, 3))
((0.12639252989744235, 3, 0), (0.12639252989744235, 3, 0))
((0.12639252989744235, 0, 3), (0.12639252989744235, 0, 3))
((0.1121220817608579, 6, 1), (0.11212208176085793, 6, 1))
((0.1121220817608579, 1, 6), (0.11212208176085793

In [14]:
def jaccard_similarity_bqc(str1, str2):
    set_1 = set(str1)
    set_2 = set(str2)
    set_3 = set_1.intersection(set_2)
    n = len(set_3)
    if len(set_1) == 0 or len(set_2) == 0:
        return 0
    if str1 == str2:
        return 1
    if (str1 in str2 and len(str1) >= 4)or (str2 in str1 and len(str2) >= 4):
        return 0.8
    sm_pr = n / float(len(set_1) + len(set_2) - n)
    if n == float(len(set_1) + len(set_2) - n):
        return 1
    else:
        return sm_pr

In [47]:
import jieba,math
from sklearn.feature_extraction.text import TfidfVectorizer

def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

def cosine_similarity_bqc(str1, str2):
    if len(str1) == 0 or len(str2) == 0:
        return 0.0
    if str1 == str2:
        return 1.0
    if (str1 in str2 and len(str1) >= 4)or (str2 in str1 and len(str2) >= 4):
        return 0.8
    tmp = [" ".join(jieba.cut(i)) for i in [str1, str2]]
    print(tmp)
    print(tmp[0]<tmp[1])
    sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)
    sklearn_representation = sklearn_tfidf.fit_transform(tmp)
    sm = cosine_similarity(sklearn_representation.toarray()[0].tolist(),sklearn_representation.toarray()[1].tolist())
    return sm

a = '秒贷金融'
b = '秒贷网'
print(cosine_similarity_bqc(a, b))
print(jaccard_similarity(a, b))
print(jaccard_similarity_bqc(a, b))

['秒 贷 金融', '秒 贷网']
True
0.0
{'贷', '融', '秒', '金'} {'贷', '秒', '网'}
0.4
0.4


In [39]:
a = set('东北福饺子')
b = set('东北福多味饺子馆')
print(a,b)
b>a

{'福', '东', '北', '子', '饺'} {'福', '多', '东', '馆', '北', '味', '子', '饺'}


True

In [48]:
3773 + 3855 + 2603 + 3755 + 3344 + 3568 + 2724 + 3196 + 2788

29606