In [1]:
import re
import string
import random

from tqdm.autonotebook import tqdm
from sentence_transformers import SentenceTransformer
from harvesttext import HarvestText
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from utils.io import read_pickle
from utils.sbert_class import PretrainedSBERT
from utils.milvus_client import MilvusHelper
from simcse import unsup_simcse


# Time spans in milliseconds (seconds * 1000). The `publish_time` values shown
# in this notebook are 13-digit epoch stamps, so they compare directly with these.
AWEEK = 7 * 24 * 60 * 60 * 1000    # one week
AMONTH = 2629743 * 1000            # roughly 30.44 days

  from tqdm.autonotebook import tqdm


In [2]:
# Load the news library through the project's `read_pickle` helper.
# NOTE(review): presumably this unpickles the file; pickle can execute arbitrary
# code on load, so only point it at trusted files.
library = read_pickle('data/small_library.pkl')
# Display one record to show the fields carried by each entry.
library[1]

{'title': '缓解膝关节炎疼痛，力量训练要加强度吗？JAMA发表长期临床试验数据',
 'publish_time': 1614229098000,
 'esid': '23a180035e76ee6419214b03d953565f'}

### Prepare pre-training data for simCSE

In [None]:
# Characters stripped from titles: full-width / CJK punctuation first, then
# ASCII punctuation from the standard library.
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛丨｜|｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
punc += string.punctuation

# The characters must be escaped before being placed inside a character class:
# unescaped, the "\]" pair coming from string.punctuation is parsed as an
# escaped "]" and a literal backslash is never matched.
# Compiled once so the pattern is not rebuilt for every title.
_PUNC_RE = re.compile("[%s]+" % re.escape(punc))


def remove_punc(s: str) -> str:
    """Return ``s`` with every character of ``punc`` and every ASCII space removed.

    Only the plain space character (" ") is stripped; other whitespace is kept.
    """
    s = _PUNC_RE.sub("", s)
    return s.replace(" ", "")

'''
First use some rules to detect possible duplicate titles.
Use the remaining article contents as the pre-training corpus.
'''
unique_ids = set()   # esids of the records kept as non-duplicates
title = set()        # normalized titles accepted so far


def check_dup(t: str) -> bool:
    """Tell whether ``t`` duplicates a title already stored in ``title``.

    ``t`` is a duplicate when it is already in the set, or when it contains /
    is contained in a stored title whose length differs by at most 3 characters.
    Every stored title is scanned, so one call is linear in the set size.
    """
    if t in title:
        return True
    return any(
        abs(len(t) - len(seen)) <= 3 and (t in seen or seen in t)
        for seen in title
    )

# Walk the library once; the first record seen for each (near-)identical
# normalized title is kept, later ones are skipped.
for record in tqdm(library):
    normalized = remove_punc(record['title'])
    if check_dup(normalized):
        continue
    title.add(normalized)
    unique_ids.add(record['esid'])
print(len(unique_ids))

In [None]:
# Load the raw library; the records are read below through their 'esid' and
# 'content' keys.
# NOTE(review): presumably a pickle load — only use trusted files.
raw = read_pickle('data/library_raw.pkl')
# HarvestText instance used below for text cleaning and sentence splitting.
ht = HarvestText()

def clean_and_cut(doc):
    """Clean one document and return its usable sentences.

    The text goes through ``ht.clean_text``, then a handful of stray
    characters and everything matched by ``ILLEGAL_CHARACTERS_RE`` are removed.
    It is split with ``ht.cut_sentences``; each sentence loses leading ``*``
    and surrounding whitespace. Sentences are dropped when they are shorter
    than 10 characters, start with "图", or contain "微信".
    """
    text = ht.clean_text(doc)
    for junk in ("\xa0", '▲'):
        text = text.replace(junk, "")
    text = ILLEGAL_CHARACTERS_RE.sub(r'', text)
    for junk in ('\u200b', '\ufeff', '\ue601'):
        text = text.replace(junk, '')

    kept = []
    for piece in ht.cut_sentences(text):
        sentence = piece.lstrip('*').strip()
        if len(sentence) < 10 or sentence.startswith("图") or '微信' in sentence:
            continue
        kept.append(sentence)
    return kept

# Collect sentences from every raw document whose esid survived title de-duplication.
train_sentences = []
for d in tqdm(raw):
    if d['esid'] not in unique_ids:
        continue
    train_sentences.extend(clean_and_cut(d['content']))

print(len(train_sentences))
# De-duplicate while keeping first-seen order. list(set(...)) would order the
# strings differently on every kernel start (string hashing is randomized),
# which makes the training corpus order irreproducible.
train_sentences = list(dict.fromkeys(train_sentences))
print(len(train_sentences))

### Define and pre-train the model

In [3]:
# Select the MPnet entry exposed by the project's PretrainedSBERT class.
model = PretrainedSBERT.MPnet
# Point the entry at the local directory that `sbert.save(...)` writes to further down.
# NOTE(review): this assigns an attribute on the object obtained from
# PretrainedSBERT.MPnet; if that object is shared, other users see the new path.
model.path = 'outputs/simcse-pom-mpnet'
# NOTE(review): the device index is hard-coded; 'cuda:3' must exist on the host.
sbert = SentenceTransformer(model.path, device='cuda:3')

In [None]:
# Run the project's unsupervised SimCSE routine on the collected sentences.
# The return value is discarded; presumably `sbert` is updated in place — TODO confirm.
unsup_simcse(sbert, train_sentences)

In [None]:
# Persist the model; this is the same directory `model.path` is set to when the model is loaded.
sbert.save('outputs/simcse-pom-mpnet')

### Encode titles

In [4]:
# Embed every title with ASCII spaces removed. Vectors are normalized by the
# encoder, and their order follows `library`.
title_texts = [record['title'].replace(' ', '') for record in library]
vecs = sbert.encode(
    title_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=128,
)

Batches:   0%|          | 0/112 [00:00<?, ?it/s]

### Save vectors into Milvus

In [5]:
# Open the project's Milvus helper with the embedding dimension of the chosen model.
# NOTE(review): clear_collection=True appears to drop and recreate the collection
# (the captured log prints "drop collection" then "create collection") — confirm
# before running against shared data.
milvus = MilvusHelper(dimension=model.dim, clear_collection=True)

# Insert all title vectors, keeping the ids returned by the helper, then build the index.
milvusids = milvus.insert(vecs)
milvus.create_index()

Status(code=0, message='Success')
drop collection Status(code=0, message='Delete collection successfully!')
create collection pom_news_title with size 768 Status(code=0, message='Create collection successfully!')
insert 14237 records Status(code=0, message='Add vectors successfully!')
create index Status(code=0, message='Build index successfully!')


In [6]:
# Record on each library entry the Milvus id returned at the same position ...
for position in range(len(library)):
    library[position]['milvusid'] = milvusids[position]

# ... and build the reverse lookup from Milvus id to the library entry.
milvusid_2_news = {}
for position, record in enumerate(library):
    milvusid_2_news[milvusids[position]] = record

In [7]:
# Display the first record to check that the 'milvusid' field was attached.
library[0]

{'title': '一劳永逸，预防所有冠状病毒感染？《自然》今日报道全新“纳米疫苗”',
 'publish_time': 1620685930000,
 'esid': 'ceb4cd13acf2094d8e85a9d03df3670d',
 'milvusid': 1625476390551559000}

### Searching

In [8]:
def result_filter(results, selfid, timestamp, score_threshold, window=None):
    '''
    Keep the search hits that score high enough and were published close in
    time to the query document.

    Inputs:
        results: hits for one query; each hit exposes `.id` and `.distance`.
            Scanning stops at the first hit whose `.distance` is below
            `score_threshold`, so hits are expected in descending score order
            (NOTE(review): confirm this matches the metric used by the index).
        selfid: milvus id of the query document; it is never returned.
        timestamp: publish time of the query document, in the same unit as AWEEK.
        score_threshold: minimum `.distance` a hit must reach.
        window: half-width of the accepted publish-time range (exclusive
            bounds); defaults to AWEEK when omitted.
    Returns: list of (doc, score) tuples, in the order the hits were given.
        Hits whose document has no publish time are skipped.
    '''
    if window is None:
        window = AWEEK

    matched = []
    for res in results:
        score = res.distance
        if score < score_threshold:
            # Everything after this hit is assumed to score lower still.
            break
        candidate = milvusid_2_news[res.id]
        cts = candidate['publish_time']
        # The bounds are computed only once a dated candidate is seen, so a
        # query without a timestamp fails only when a comparison is really needed.
        if cts and \
        timestamp - window < cts < timestamp + window and \
        candidate['milvusid'] != selfid:
            matched.append((candidate, score))
    return matched

In [9]:
# Use one fixed library entry as a smoke-test query.
idx = 199
# The search call is given plain Python lists, hence the conversion of the vector.
query = list(vecs[idx])
news = library[idx]
# One query vector is sent; result[0] is used downstream as the hits for that query.
result = milvus.search(top_k=100, query=[query])

In [10]:
# Show the query record, then every hit that passes the score / time-window filter.
print(news)
filtered = result_filter(
    result[0],
    news['milvusid'],
    news['publish_time'],
    0.6,
)
for candidate, score in filtered:
    print(candidate, score)

{'title': '《细胞》子刊：运动的好处能通过胎盘传给宝宝！科学家发现，小鼠运动会刺激胎盘分泌SOD3，改善子代肝脏代谢功能丨科学大发现', 'publish_time': 1618829921000, 'esid': '88e9e4ee39f780517fb66e45bfca5911', 'milvusid': 1625476390551559199}


In [11]:
ids = []
matches = []
# Visit the indices in random order, each at most once. Unlike drawing with
# replacement inside a `while` loop, this cannot pick the same document twice
# and always terminates, even when fewer than 10 documents have any match.
for idx in random.sample(range(len(vecs)), len(vecs)):
    if len(ids) >= 10:
        break
    news = library[idx]
    # Skip records that lack a publish time or a milvus id.
    if not news['publish_time'] or not news['milvusid']:
        continue
    query = list(vecs[idx])
    result = milvus.search(top_k=100, query=[query])
    filtered = result_filter(result[0], news['milvusid'], news['publish_time'], 0.6)
    if len(filtered) > 0:
        ids.append(idx)
        matches.append(filtered)

In [12]:
# Print each sampled query record followed by its matched candidates and their scores.
for query_idx, found in zip(ids, matches):
    print("----------------------")
    print(library[query_idx])
    for candidate, score in found:
        print(candidate, score)

----------------------
{'title': '人工智能用于药物研发，优势、潜力与挑战', 'publish_time': 1622304000000, 'esid': '1a370dec019a42fbe037f349366a79d9', 'milvusid': 1625476390551571542}
{'title': '人工智能在药物研发中的应用', 'publish_time': 1622162341000, 'esid': 'cc6b1a5d135428139e48b648b0d46397', 'milvusid': 1625476390551567116} 0.6512377858161926
----------------------
{'title': '君实生物PD-1/TGF-β 双功能融合蛋白获批临床', 'publish_time': 1620368903000, 'esid': '8db12714d18846c48bad435aaa8482ec', 'milvusid': 1625476390551572567}
{'title': '首个PD-1/TGFβ双抗获批临床：君实生物JS201', 'publish_time': 1620300792000, 'esid': '2dce3627ff67a2fbea38335c6d332d89', 'milvusid': 1625476390551579586} 0.7328601479530334
----------------------
{'title': '2021 AACR|蓝鸟报告溶瘤病毒可增强CAR-T疗法抗实体瘤疗效', 'publish_time': 1618460429000, 'esid': '46786b3080fc24929decbc8d9cf6a76a', 'milvusid': 1625476390551579560}
{'title': 'BioNews | Bluebird-PsiOxus：发布溶瘤病毒增强CAR-T细胞疗法在实体瘤中的疗效', 'publish_time': 1618919301000, 'esid': '80dbed72fc6069597044482e093adbaf', 'milvusid': 16254763905