In [25]:
tokens1 = '我们 的 目标 就 是 能够 使用 海量 用户 搜索 日志'
tokens2 = '在 海量 数据 里 挖掘 潜藏 的 查询 之间 的 结构 信息'
token3 = 't'

In [26]:
# textrank
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import networkx as nx

def textrank(sentences):
    """
    Given input text, split sentences and calc text rank score.
    :param sentences: input sentence list
    :return: a dictionary of (sentence index, sentence score)
    """
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return dict(((i, scores[i]) for i, s in enumerate(sentences)))


In [27]:
textrank([tokens1,tokens2,token3])

{0: 0.4651161545800183, 1: 0.4651161545800183, 2: 0.06976769083996354}

In [28]:
# Encoding local matching vector

In [29]:
IDF = {}

In [30]:
from scipy.linalg import norm
def cosine_sim(a, b):
    if len(b) < len(a):
        a, b = b, a
    res = 0
    for key, a_value in a.items():
        res += a_value * b.get(key, 0)
    if res == 0:
        return 0
    try:
        res = res / (norm(list(a.values())) * norm(list(b.values())))
    except ZeroDivisionError:
        res = 0
    return res

In [16]:
import random
idf_dict = {}
token = tokens1 + tokens2

for t in token.split():
    idf_dict[t] = random.uniform(0,2)

In [17]:
idf_dict

{'我们': 0.8475428640922156,
 '的': 1.7121881952212235,
 '目标': 1.4821940926689208,
 '就': 0.8199354451619345,
 '是': 1.8112978956401107,
 '能够': 1.2703013045679612,
 '使用': 1.7615470961490607,
 '海量': 1.1333676073051475,
 '用户': 1.8409108187581233,
 '搜索': 0.5745408255349125,
 '日志在': 0.6124637272657749,
 '数据': 1.2308230393632884,
 '里': 0.007054410825109825,
 '挖掘': 1.4386600190268888,
 '潜藏': 0.40411136261439706,
 '查询': 0.3333993802165316,
 '之间': 0.8665756696285858,
 '结构': 1.036542920893356,
 '信息': 1.5550139196068333}

## 1. 计算tf

<div>
<img src='imgs/tf.png' width='300' height='300'/>
</div>

## 2. 计算tf-idf

<div>
<img src='imgs/tfidf.png' width='500' height='500'/>
</div>

## 3. jaccard

<div>
<img src='imgs/jaccard.png' width='300' height='300'/>
</div>

## 4. ochiai

<div>
<img src='imgs/ochiai.png' width='300' height='300'/>
</div>

## 5. BM25

<div>
<img src='imgs/bm25.png' width='600' height='600'/>
</div>


In [32]:
# 计算tf

def gen_tf(text):
    """
    Given a segmented string, return a dict of tf.
    """
    tokens = text.split()
    total = len(tokens)
    tf_dict = {}
    for w in tokens:
        tf_dict[w] = tf_dict.get(w, 0.0) + 1.0
    for k in tf_dict:
        tf_dict[k] /= total
    return tf_dict

def tf_cos_sim(text1, text2):
    tf1 = gen_tf(text1)
    tf2 = gen_tf(text2)
    return cosine_sim(tf1, tf2)


tf_cos_sim(tokens1, tokens2)

0.2417468892076141

In [19]:
# 给定idf， 求计算tf-idf
def gen_tfidf(text, idf_dict):
    """
    Given a segmented string and idf dict, return a dict of tfidf.
    """
    tokens = text.split()
    total = len(tokens)
    tfidf_dict = {}
    for w in tokens:  # 求词频
        tfidf_dict[w] = tfidf_dict.get(w, 0.0) + 1.0
    for k in tfidf_dict:  # 除以总个数
        tfidf_dict[k] *= idf_dict.get(k, 0.0) / total
    return tfidf_dict


def tfidf_cos_sim(text1, text2, idf_dict):
    tfidf1 = gen_tfidf(text1, idf_dict)
    tfidf2 = gen_tfidf(text2, idf_dict)
    return cosine_sim(tfidf1, tfidf2)

tfidf_cos_sim(tokens1, tokens2, idf_dict)

0.352082200987131

In [20]:
# jaccard
def jaccard_common_words(text1, text2):
    str1 = set(str(text1).split())
    str2 = set(str(text2).split())
    if len(str1) == 0 or len(str2) == 0:
        return 0.0
    return float(len(str1 & str2)) / len(str1 | str2)

jaccard_common_words(tokens1, tokens2)

0.1

In [21]:
# ochiai
# 
import math
def ochiai_common_words(text1, text2):
    str1 = set(str(text1).split())
    str2 = set(str(text2).split())
    if len(str1) == 0 or len(str2) == 0:
        return 0.0
    return float(len(str1 & str2)) / math.sqrt(len(str1) * len(str2))

ochiai_common_words(tokens1, tokens2)

0.18181818181818182

In [24]:
# bm25
# 单词与query的部分通常省略掉
# 参考：https://www.programmersought.com/article/51573171512/

def cal_bm25_sim(tokens1, tokens2, idf_dict):
    ts1, ts2 = tokens1, tokens2
    if len(tokens1) > len(tokens2):
        ts1 = tokens2   # 短的作为query
        ts2 = tokens1
    
    freqs = {}  # 文档中的词频 tf_td
    for word in ts2:
        if word not in freqs:
            freqs[word] = 0
        freqs[word] += 1

    param_k1 = 1.5
    param_b = 0.75
    
    score1, score2 = 0.0, 0.0
    
    
    for word in ts1:
        if word not in freqs or word not in idf_dict:
            continue
        score1 += idf_dict[word] * (freqs[word] * (param_k1 + 1) / (
            freqs[word] + param_k1 * (1 - param_b + param_b * 1)))
        
    for word in ts2:
        if word not in freqs or word not in idf_dict:
            continue
        score2 += idf_dict[word] * (freqs[word] * (param_k1 + 1) / (
            freqs[word] + param_k1 * (1 - param_b + param_b * 1)))
        

    sim = score1 / score2 if score2 > 0 else 0
    sim = sim if sim <= 1.0 else 1.0
    return sim


cal_bm25_sim('海量 用户'.split(), '用户'.split(), idf_dict)

0.6189436747502947