In [None]:
!pip install jieba
!pip install openai
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install scipy
!pip install IPython

In [2]:
import os
import re
from typing import Optional

import jieba
import numpy as np
import openai
import pandas as pd
from IPython.display import display, HTML

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the notebook; the placeholder is kept as the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

In [10]:
def text_to_embed(text: str, source: Optional[str] = "openai"):
    """Embed ``text`` with the selected backend.

    Args:
        text: The text to embed.
        source: ``"openai"`` to call ``openai.Embedding.create`` with the
            ``text-embedding-ada-002`` engine, or ``"local"`` to use the
            module-level ``tokenizer`` and ``model``. ``None`` selects the
            default backend, ``"openai"``.

    Returns:
        For ``"openai"``, the ``"embedding"`` entry of the first item in the
        API response. For ``"local"``, the first row of the model's
        ``pooler_output`` converted to a NumPy array.

    Raises:
        ValueError: If ``source`` is not ``None``, ``"openai"`` or ``"local"``.
    """
    if source is None:
        # ``None`` used to pass validation but match no branch, so the function
        # silently returned ``None``; treat it as "use the default backend".
        source = "openai"
    if source not in ("openai", "local"):
        raise ValueError("source只能是'openai'或'local'")

    if source == "openai":
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002")
        return response["data"][0]["embedding"]

    # source == "local": `tokenizer` and `model` must exist at module level
    # by the time this branch is executed.
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True)
    return outputs.pooler_output.detach().numpy()[0]

# Retrieve the stored sentences that are most similar to a query.
def get_search_results(query, source, top_k=5):
    """Return the sentences in ``data`` most similar to ``query``.

    The query is embedded with ``text_to_embed`` and compared by cosine
    similarity against every vector in the module-level ``data["embed"]``
    column. The matching entries of ``data["sentence"]`` are returned.

    Args:
        query: The search text.
        source: Embedding backend, forwarded to ``text_to_embed``.
        top_k: Maximum number of sentences to return (default 5).

    Returns:
        A list of at most ``top_k`` sentences, most similar first. Empty when
        ``top_k`` is not positive or ``data`` has no rows.
    """
    if top_k <= 0 or len(data) == 0:
        return []

    query_vec = np.asarray(text_to_embed(query, source=source), dtype=float)
    corpus = np.array(data["embed"].tolist())

    denominator = np.linalg.norm(corpus, axis=1) * np.linalg.norm(query_vec)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = np.dot(corpus, query_vec) / denominator
    # A zero-norm vector yields NaN/inf, which argsort would rank first;
    # push such rows to the bottom instead.
    similarity = np.where(np.isfinite(similarity), similarity, -np.inf)

    ranked = np.argsort(similarity)[-top_k:][::-1]
    # `ranked` holds row positions, so select with .iloc; a label lookup only
    # works while `data` still has its default 0..n-1 index.
    return data["sentence"].iloc[ranked].tolist()

    

# Tokenization
def tokenize(text):
    """Segment ``text`` via ``jieba.cut_for_search`` and return its result unchanged."""
    return jieba.cut_for_search(text)

# Stop-word removal
def remove_stop_words(tokens, stop_words):
    """Return a list of the tokens not contained in ``stop_words``, in their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Highlight text
def highlight_text(text, words):
    """Wrap every occurrence of any of ``words`` in ``text`` with ``<mark>`` tags.

    All words are matched in a single left-to-right pass, so tags inserted for
    one word are never re-scanned for another. Replacing word by word would
    corrupt the markup whenever a word such as "a" or "mark" occurs inside an
    already inserted tag. When several words match at the same position the
    longest one wins. Empty words are ignored.

    Args:
        text: The text to annotate.
        words: Iterable of strings to highlight.

    Returns:
        ``text`` with each match replaced by ``<mark>match</mark>``.
    """
    # Longest first: regex alternation takes the first alternative that matches.
    candidates = sorted({word for word in words if word}, key=len, reverse=True)
    if not candidates:
        return text
    pattern = re.compile("|".join(re.escape(word) for word in candidates))
    return pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", text)

# Main entry point
def highlighted_result(query, stop_words, source):
    """Search for ``query``, highlight its tokens in every hit, print and return the hits.

    Args:
        query: The search text.
        stop_words: Tokens to exclude from highlighting.
        source: Embedding backend, forwarded to ``get_search_results``.

    Returns:
        The list of highlighted result strings, in the order returned by
        ``get_search_results``.
    """
    # Tokenize the query and drop stop words.
    filtered_tokens = remove_stop_words(tokenize(query), stop_words)

    highlighted_results = []
    for result in get_search_results(query, source):
        marked = highlight_text(result, filtered_tokens)
        # Fuse directly adjacent highlights into a single <mark> span.
        highlighted_results.append(marked.replace(r"</mark><mark>", ""))

    for i in range(len(highlighted_results)):
        print(f"第{i+1}个结果：\n{highlighted_results[i]}\n")

    return highlighted_results

def adjust_boundaries(text, start, end):
    """Widen ``[start, end)`` outwards to the surrounding ``<mark>`` / ``</mark>`` tags.

    ``start`` moves left until it sits on the first character of a ``<mark>``
    tag or reaches 0. ``end`` moves right until the text before it ends with
    ``</mark>`` or it reaches ``len(text)``.

    Returns:
        The adjusted ``(start, end)`` pair.
    """
    open_tag, close_tag = '<mark>', '</mark>'

    while start > 0 and not text.startswith(open_tag, start):
        start -= 1

    while end < len(text) and not text.endswith(close_tag, 0, end):
        end += 1

    return start, end

def adjust_end(text, start, end):
    """Cut the window off just before a second ``<mark>`` tag, if one is present.

    The slice ``text[start:end + 7]`` is scanned for ``<mark>``. If it contains
    at least two occurrences, the index in ``text`` where the second one begins
    is returned; otherwise ``end`` is returned unchanged.
    """
    tag = '<mark>'
    window = text[start:end + 7]

    first = window.find(tag)
    if first == -1:
        return end

    # Resume searching after the first tag, as non-overlapping matching does.
    second = window.find(tag, first + len(tag))
    if second == -1:
        return end

    return second + start

def get_highlighted_substr(text, num_chars=20):
    """Build a snippet containing each ``<mark>…</mark>`` span plus nearby context.

    For every ``<mark>…</mark>`` match, up to ``num_chars`` characters on each
    side of the tagged span are kept. A window that would start before the end
    of the previous snippet is glued directly onto it. Windows that do not
    touch are joined with ``' ... '``. Each window is cut off before the next
    ``<mark>`` tag (see ``adjust_end``), so that tag is emitted with its own
    match.

    Args:
        text: Text containing ``<mark>`` highlights.
        num_chars: Context characters to keep on each side of a highlight.

    Returns:
        The assembled snippet, or ``''`` when ``text`` has no highlight.
    """
    pattern = r'<mark>([^<]+)</mark>'
    separator = ' ... '
    pieces = []
    last_end = 0

    for match in re.finditer(pattern, text):
        # Expand from the inner text to the full <mark>...</mark> span.
        start, end = adjust_boundaries(text, match.start(1), match.end(1))

        start = max(0, start - num_chars)
        end = min(len(text), end + num_chars)

        # Avoid repeating text that the previous snippet already covers.
        contiguous = start < last_end
        if contiguous:
            start = last_end

        end = adjust_end(text, start, end)

        # The separator goes only BETWEEN disjoint snippets. The earlier
        # approach prefixed every snippet and then called .strip(" ... "),
        # which strips a character set and so also removed genuine leading or
        # trailing dots and spaces from the text.
        if pieces and not contiguous:
            pieces.append(separator)
        pieces.append(text[start:end])
        last_end = end

    return ''.join(pieces)


def show_highlightes(highlight, num_chars=30):
    """Display, as HTML, only the text surrounding the highlights of each result.

    Args:
        highlight: Iterable of strings containing ``<mark>`` highlights.
        num_chars: Context characters kept on each side of a highlight, passed
            to ``get_highlighted_substr`` (default 30).

    Results whose snippet is empty are skipped.
    """
    for highlighted in highlight:
        snippet = get_highlighted_substr(highlighted, num_chars)
        if snippet != '':
            display(HTML(snippet))


In [11]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
from argparse import Namespace
import sys
# Add the parent directory to the module search path.
sys.path.append("..")
# Load the tokenizer and model for "silk-road/luotuo-bert". These two module-level
# names are what text_to_embed(..., source="local") reads.
tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert")
# Settings forwarded to the model through the `model_args` keyword below.
model_args = Namespace(do_mlm=None, pooler_type="cls", temp=0.05, mlp_only_train=False, init_embeddings_model=None)
# NOTE(review): trust_remote_code=True loads the model with custom code and no
# `revision` is pinned; the library's own warning for this cell recommends passing
# an explicit revision so a newer upstream revision cannot change what runs here.
model = AutoModel.from_pretrained("silk-road/luotuo-bert", trust_remote_code=True, model_args=model_args)

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [14]:
import ast
# Stop words: one entry per line of the UTF-8 text file.
with open("../data/stop_word.txt", "r", encoding="utf-8") as f:
    stop_words = f.read().splitlines()

# Search corpus. Each "embed" cell is parsed from its text form into a Python
# object with ast.literal_eval.
data = pd.read_csv("../data/search_data.csv")
data["embed"] = data["embed"].apply(ast.literal_eval)

In [15]:
# Example query; source="local" embeds it with the locally loaded model instead of the OpenAI API.
user_input = "虎扑报道马刺的保罗-加索尔与球队正式签订协议，有哪些相关的新闻？"
# Prints each highlighted result and returns them as a list.
highlight = highlighted_result(user_input, stop_words, source="local")
# Render the text around the highlights of each result as HTML.
show_highlightes(highlight)

第1个结果：
<mark>虎扑</mark>体育7月14日讯。<mark>马刺</mark>官网宣布,<mark>保罗</mark>-加<mark>索尔</mark>与<mark>球队正式签订协议</mark>。根据之前的<mark>报道</mark>,加<mark>索尔</mark>与<mark>马刺签订</mark>的是一份为期2年3000万的<mark>协议</mark>。加<mark>索尔</mark>上赛季共打了72场比赛,场均31.8分钟,贡献16.5分11.0篮板4.1助攻2.0盖帽。

第2个结果：
2015-09-2809:03。新浪体育。显示图片下一站丹佛?北京时间9月28日,据雅<mark>虎</mark>体育<mark>报道</mark>,波特兰开拓者队已经与老将射手迈克-米勒达成了合同买断。一旦米勒成为自由球员之后,丹佛掘金队有意得到他。据雅<mark>虎</mark>体育得到的消息,如果迈克-米勒在规定的时间内没有被<mark>球队</mark>认领并成为一名自由球员,联盟里有一些<mark>球队</mark>将会对他感兴趣,其中就包括掘金队。目前,掘金队中已经有14个保障合同。事实上早在一年前,掘金队就在自由球员市场上猛追过米勒。但最终,米勒决定跟随勒布朗-詹姆斯加盟骑士队。今年7月,开拓者队在关于布兰登-海伍德的那笔交易中得到了迈克-米勒。作为2000年的NBA首轮5号新秀,迈克-米勒曾获得过2001年最佳新秀称号、2006年最佳第六人称号,并且曾跟随迈阿密热火队夺得过两次NBA总冠军。在长达15年的职业生涯里,米勒一直被视为一名关键时刻靠得住的射手,并且能够为争冠<mark>球队</mark>做出贡献。他的生涯场均数据是11.3分、4.4个篮板和2.7次助攻。

第3个结果：
<mark>虎扑</mark>体育9月3日讯。根据《太阳哨兵报》记者IraWinderman的<mark>报道</mark>,帕特-莱利本周接受采访时表示,交易得到沙奎尔-奥尼尔是他们队史上最大的交易收获。在2004年夏天,奥尼尔被湖人交易到了热火,随后他就与德维恩-韦德在2005-06赛季一起为热火拿到了他们队史上的首个总冠军。“我会这么说,我的意思也是这个,”莱利说,“获得奥尼尔比我们史上的任何收获都重要,其中也包括三巨头。”莱利称热火交易得到奥尼尔的重要