In [None]:
!pip install textdistance
!pip install py2neo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py2neo
  Downloading py2neo-2021.2.3-py2.py3-none-any.whl (177 kB)
[K     |████████████████████████████████| 177 kB 4.0 MB/s 
Collecting interchange~=2021.0.4
  Downloading interchange-2021.0.4-py2.py3-none-any.whl (28 kB)
Collecting pansi>=2020.7.3
  Downloading pansi-2020.7.3-py2.py3-none-any.whl (10 kB)
Collecting monotonic
  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: pansi, monotonic, interchange, py2neo
Successfully installed interchange-2021.0.4 monotonic-1.6 pansi-2020.7.3 py2neo-2021.2.3


# 1 Token-level distance function

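We use the algorithms in `textdistance` to compare two strings at the token level (for plain strings, each character is one token). Calling an algorithm directly returns a similarity score, which is 1.0 for identical strings. An example is the `jaccard` algorithm: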

In [None]:
import textdistance as td 

In [None]:
td.jaccard('感冒','感,;sdf冒')

0.2857142857142857

## One Mention, Many Candidates
In practice, a single mention has to be compared against many candidate entities from a knowledge base, so we wrap a pairwise function in a one-to-many interface.

In [None]:
import numpy as np
def one2many(dist_func):
    """Lift a pairwise scorer into a one-mention-versus-many-candidates scorer.

    dist_func: callable taking (mention, candidate) and returning a score.
    return: a function (mention, candidates) -> np.ndarray whose i-th entry is
            dist_func(mention, candidates[i]); input order is preserved.
    """
    def new_func(mention:str, candidates: list):
        # Score the mention against each candidate in turn.
        scores = [dist_func(mention, entry) for entry in candidates]
        return np.array(scores)

    return new_func

In [None]:
jaccard_many = one2many(td.jaccard)

In [None]:
jaccard_many('感冒',['感冒','流行感冒','小儿感冒'])

array([1. , 0.5, 0.5])

# 2 Connect to Neo4j KG
Connect to the Neo4j knowledge graph (KG) and load the disease entities.

In [None]:
from py2neo import Graph
# Placeholder credential: supply the real password before running this cell
# (prefer an environment variable or getpass over committing it to the notebook).
password = None # set your password here
# Open the connection to the Neo4j instance; later cells run their queries through `graph`.
graph = Graph("neo4j+s://f54cadff.databases.neo4j.io:7687", auth=("neo4j", password))

In [None]:
def query_graph(entity_type:str):
    '''
    Fetch the `name` property of every node that carries the given label.

    The query is executed through the module-level `graph` connection.

    entity_type: node label to match.
    return: query results in List (one `n.name` value per matched node)
    '''
    # The label is embedded in the query text inside a backtick-quoted
    # identifier. Double any backtick it contains so the label cannot close
    # the quoting early and change the meaning of the query.
    safe_label = entity_type.replace('`', '``')
    cql = f'match (n:`{safe_label}`) return n.name'
    # Each record has a single column (n.name); keep only that value.
    return [record.values()[0] for record in graph.run(cql)]

In [None]:
result = query_graph('疾病')

In [None]:
len(result)

8807

In [None]:
result[0]

'放射性视网膜病变'

# 3 Compute similarity between mention and candidates

In [None]:
similarities = jaccard_many('感冒',result)

In [None]:
# Index of the highest-scoring candidate.
# NOTE(review): `id` shadows the Python builtin of the same name for the rest of the session.
id = np.argmax(similarities)
id

5686

In [None]:
result[id]

'感冒'

In [None]:
def topK_candidates(k:int, mention, candidates):
    """Return the k candidates most similar to `mention`, best match first.

    Similarity is computed with the module-level `jaccard_many` scorer.

    k: number of candidates to return. Values larger than the number of
       candidates are clamped; k <= 0 yields an empty list.
    mention: the string to match.
    candidates: indexable sequence of candidate strings.
    return: list of at most k candidates, sorted by descending similarity.
    """
    similarities = jaccard_many(mention, candidates)

    # Clamp k: np.argpartition raises when k exceeds the array length, and
    # k == 0 would make the `[-k:]` slice select every element.
    k = min(k, len(similarities))
    if k <= 0:
        return []

    # argpartition finds the k largest scores without a full sort; the
    # selected indices come back in no particular order.
    ids = np.argpartition(similarities, -k)[-k:]

    # Order the selected indices by the scores already computed instead of
    # re-scoring every selected candidate.
    ranked = sorted(ids, key=lambda i: similarities[i], reverse=True)
    return [candidates[i] for i in ranked]
    

In [None]:
topK_candidates(3,'感冒',result)

['感冒', '风寒感冒', '小儿感冒']

In [None]:
!pip install git+https://github.com/leoxiang66/text-similarity.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/leoxiang66/text-similarity.git
  Cloning https://github.com/leoxiang66/text-similarity.git to /tmp/pip-req-build-p_kjfvhm
  Running command git clone -q https://github.com/leoxiang66/text-similarity.git /tmp/pip-req-build-p_kjfvhm


In [None]:
from textsimilarity import textsimilarity as ts

In [None]:
ts.top_K_similarity_between_one_mention_and_many_candidates(10,'感冒',result)

['感冒',
 '风热感冒',
 '小儿感冒',
 '风寒感冒',
 '肠胃型感冒',
 '病毒性感冒',
 '流行性感冒',
 '郁冒',
 '副流行性感冒',
 '小儿流行性感冒']