## 扩展词与扩展查询（进阶方向）

查询改写（Query Rewriting），又称查询扩展（Query Expansion），其应用方式是对原始Query拓展出与用户需求关联度高的改写词，多个改写词与用户搜索词一起做检索，从而用更好的表述，帮用户搜到更多符合要求的文本。

- 语义拓展：主要是同义词、下位词以及常见的大小写数字和繁简转化等，例如“理发”、“剪发”、“造型”、“发艺”、“美发”、“剪头”等等。
    - 用户表达和商家表达上的Gap：非语言上的同义。如用户表述口语化“学吉他”，商户描述书面化“吉他培训”；用户输入不完全匹配商户名：“希尔顿大酒店”（商家更常见的描述为“希尔顿酒店”）。
- 场景拓展：例如“摘草莓”在美团的搜索场景下，用户基于对平台的认知对应需求是“草莓园”。
- 其他漏召回问题：部分的多字少字、纠错等问题，如“房屋扫”对应“家政保洁”的需求；理论上查询改写可以通过增加改写词解决所有漏召回问题，诸如“冬日四件套”包括“冰糖葫芦、烤地瓜、炒栗子、热奶茶”这类有时效性的网红概念，也可以通过改写进行解决。

In [8]:
import time 
import jwt
import requests
import jieba
import re
from tqdm import tqdm
import json
import pdfplumber
from langchain.schema import Document
from rank_bm25 import BM25Okapi
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
 

def extract_page_text(filepath, max_len=256, overlap_len=100):
    """Read a PDF and return overlapping text chunks plus cleaned per-page text.

    Args:
        filepath: Path of the PDF, passed to ``pdfplumber.open``.
        max_len: Maximum number of characters in one chunk.
        overlap_len: Number of characters shared by consecutive chunks; the
            window advances by ``max_len - overlap_len`` characters.

    Returns:
        A ``(cleaned_chunks, page_content)`` tuple. ``cleaned_chunks`` is a
        list of ``Document`` objects whose ``metadata['page']`` is the 1-based
        page on which the chunk starts. ``page_content`` holds one string per
        PDF page, in page order; a page whose cleaned text has 10 characters
        or fewer, or contains a long run of dots, is replaced by a two-space
        placeholder so list positions stay aligned with page numbers.

    Raises:
        ValueError: If ``overlap_len >= max_len``; the window would never
            advance and chunking would not terminate.
    """
    step = max_len - overlap_len
    if step <= 0:
        raise ValueError('overlap_len must be smaller than max_len')

    page_content = []
    # The context manager closes the PDF even if extraction fails part-way.
    with pdfplumber.open(filepath) as pdf:
        for page in tqdm(pdf.pages):
            # Guard against extract_text() returning None (reported for pages
            # without extractable text in some pdfplumber versions).
            page_text = (page.extract_text() or '').strip()
            stripped_lines = [line.strip() for line in page_text.split('\n')]
            new_text = '\n'.join(stripped_lines)
            # Drop a 2-3 digit number at the start of a line (presumably a
            # printed page number - confirm against the source PDF).
            new_text = re.sub(r'\n\d{2,3}\s?', '\n', new_text)
            if len(new_text) > 10 and '..............' not in new_text:
                page_content.append(new_text)
            else:
                page_content.append('  ')

    # Chunking runs over the newline-free concatenation of all pages.
    flat_pages = [text.replace('\n', '') for text in page_content]
    all_str = ''.join(flat_pages)

    cleaned_chunks = []
    page_idx = 0
    page_end = len(flat_pages[0]) if flat_pages else 0
    for start in range(0, len(all_str), step):
        # Advance to the page that contains the chunk's first character.
        while start >= page_end:
            page_idx += 1
            page_end += len(flat_pages[page_idx])
        cur_s = all_str[start:start + max_len]
        if len(cur_s) > 10:
            cleaned_chunks.append(
                Document(page_content=cur_s, metadata={'page': page_idx + 1})
            )

    return cleaned_chunks, page_content
# Arguments: the real API key and the token lifetime in seconds.
def generate_token(apikey: str, exp_seconds: int):
    """Build an HS256-signed JWT from an ``"<id>.<secret>"`` API key.

    Args:
        apikey: Key made of exactly two dot-separated parts; the first is put
            in the payload as ``api_key``, the second is the signing secret.
        exp_seconds: Token lifetime in seconds.

    Returns:
        The encoded token as returned by ``jwt.encode``.

    Raises:
        Exception: ``("invalid apikey", <cause>)`` when ``apikey`` cannot be
            split into exactly two parts. The original error is chained as
            ``__cause__``.
    """
    try:
        key_id, secret = apikey.split(".")
    except (ValueError, AttributeError, TypeError) as e:
        raise Exception("invalid apikey", e) from e

    # Read the clock once so "exp" and "timestamp" are mutually consistent.
    # Both values are expressed in milliseconds.
    now_ms = int(round(time.time() * 1000))
    payload = {
        "api_key": key_id,
        "exp": now_ms + exp_seconds * 1000,
        "timestamp": now_ms,
    }
    return jwt.encode(
        payload,
        secret,
        algorithm="HS256",
        headers={"alg": "HS256", "sign_type": "SIGN"},
    )
def ask_glm(content, timeout=120):
    """Send one user message to the GLM-4 chat-completions endpoint.

    Args:
        content: Text placed in the single ``user`` message.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot block forever. Pass ``None`` to wait indefinitely.

    Returns:
        The decoded JSON body of the HTTP response. No status check is done,
        so an error payload is returned to the caller as-is.
    """
    import os  # local import: keeps this block self-contained

    # SECURITY: a credential should not live in source control. The key is
    # read from the ZHIPUAI_API_KEY environment variable when set; the literal
    # fallback only preserves the previous behaviour and should be rotated
    # and removed.
    api_key = os.environ.get(
        "ZHIPUAI_API_KEY",
        "f1a0b6c3d36d46d3eed74a6c7de3e9e4.pZ88EkbBscyHXXcJ",
    )

    url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        # A fresh token valid for 1000 seconds is generated for every call.
        'Authorization': generate_token(api_key, 1000),
    }

    data = {
        "model": "glm-4",
        "messages": [{"role": "user", "content": content}],
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    return response.json()



def get_answer_from_llm(question_idx,questions):
    """Ask GLM for paraphrases of each selected question and tag its best page.

    Relies on the module-level ``bm25``, ``pdf_content``, ``tokenizer``,
    ``rerank_model`` and ``ask_glm``.

    Args:
        question_idx: Iterable of positions in ``questions`` to process.
        questions: List of dicts, each with at least a ``"question"`` entry.
            The dicts are updated in place.

    Returns:
        The same ``questions`` list. Every processed entry gains
        ``'reference'`` (``'page_<n>'``) and ``'question_rewriting'`` (the raw
        model reply).
    """
    for query_idx in question_idx:
        entry = questions[query_idx]
        query_text = entry["question"]

        # BM25 recall: keep the three highest-scoring pages.
        doc_scores = bm25.get_scores(jieba.lcut(query_text))
        candidate_idxs = doc_scores.argsort()[-3:]

        # Rerank the candidates on (question, page text) pairs.
        pairs = [[query_text, pdf_content[page_idx]] for page_idx in candidate_idxs]
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        with torch.no_grad():
            inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
            logits = rerank_model(**inputs, return_dict=True).logits
            scores = logits.view(-1, ).float()
        best_page_idx = candidate_idxs[scores.cpu().numpy().argmax()]
        entry['reference'] = 'page_' + str(best_page_idx + 1)

        # The prompt only carries the question itself; no page text is sent.
        prompt = '''你是一个汽车维修和汽车销售的专家，将用户的提问改为含义相近当不相同的句子,将改写后的句子以list的形式返回，不要输出起他无关内容,举一个例子：[改写后的句子1,改写后的句子2...] ：
           问题：{0}
        '''.format(query_text)

        reply = ask_glm(prompt)
        entry['question_rewriting'] = reply['choices'][0]['message']['content']
        print(query_idx, entry)
    return questions

def get_questions_from_file(file_path):
    """Load a saved question list from ``file_path``.

    The file is parsed as JSON first. If that fails it is parsed as a Python
    literal with ``ast.literal_eval``, which covers files written as a
    ``repr`` of the list without executing arbitrary code.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        The decoded object (expected to be a list of question dicts).

    Raises:
        ValueError, SyntaxError: If the content is neither valid JSON nor a
            valid Python literal.
    """
    import ast  # local import: keeps this block self-contained

    with open(file_path, 'r', encoding='utf8') as f:
        raw = f.read()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return ast.literal_eval(raw)

def get_useless_question_idx(questions):
    """Return the positions of questions whose answer contains '无法回答'.

    Args:
        questions: Sequence of dicts, each with an ``'answer'`` string.

    Returns:
        List of integer positions, in ascending order.
    """
    # Collect the indices of the questions that could not be answered.
    return [
        position
        for position, item in enumerate(questions)
        if '无法回答' in item['answer']
    ]





# file_path = '/root/code/submit_task8_glm4.json'
# Load the question list. The context manager closes the handle, and the
# explicit encoding matches the other file reads in this notebook.
with open("./data/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)
filepath = './data/初赛训练数据集.pdf'
# Only the per-page text is needed here; the chunk list is discarded.
_, pdf_content = extract_page_text(filepath, max_len=256, overlap_len=100)


# Reranker loaded from a local checkpoint directory and moved to the GPU.
tokenizer = AutoTokenizer.from_pretrained('/root/code/quietnight/bge-reranker-large/')
rerank_model = AutoModelForSequenceClassification.from_pretrained('/root/code/quietnight/bge-reranker-large/')
rerank_model.cuda()


# BM25 index over jieba-tokenised pages; index i corresponds to page i + 1.
pdf_content_words = [jieba.lcut(x) for x in pdf_content]
bm25 = BM25Okapi(pdf_content_words)



100%|██████████| 354/354 [00:07<00:00, 49.91it/s]


In [4]:
import ast

# Read the saved questions without executing the file content: try JSON
# first, then fall back to a Python-literal parse for repr-style files.
with open('task9_questions.json', 'r', encoding='utf8') as up:
    raw_task9_questions = up.read()
try:
    task9_questions = json.loads(raw_task9_questions)
except json.JSONDecodeError:
    task9_questions = ast.literal_eval(raw_task9_questions)

In [58]:

def get_answer_from_llm(question_idx,questions):
    """Answer each selected question with GLM using retrieved pages as context.

    Relies on the module-level ``bm25``, ``pdf_content``, ``tokenizer``,
    ``rerank_model`` and ``ask_glm``.

    Args:
        question_idx: Iterable of positions in ``questions`` to process.
        questions: List of dicts with ``"question"``, ``"question_keywords"``
            and ``"question_rewriting"`` entries. The dicts are updated in
            place.

    Returns:
        The same ``questions`` list. Every processed entry gains
        ``'reference'`` (``'page_<n>'``) and ``'answer'`` (the model reply).
    """
    for query_idx in question_idx:
        entry = questions[query_idx]
        query_text = entry["question"]

        # BM25 ranking of all pages, ascending by score.
        doc_scores = bm25.get_scores(jieba.lcut(query_text))
        ranked_idxs = doc_scores.argsort()
        candidate_idxs = ranked_idxs[-3:]

        # Rerank the top three pages to choose the reference page.
        pairs = [[query_text, pdf_content[page_idx]] for page_idx in candidate_idxs]
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        with torch.no_grad():
            inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
            logits = rerank_model(**inputs, return_dict=True).logits
            scores = logits.view(-1, ).float()
        best_page_idx = candidate_idxs[scores.cpu().numpy().argmax()]
        entry['reference'] = 'page_' + str(best_page_idx + 1)

        # Context for the prompt: the five best BM25 pages, newlines removed.
        context_parts = [
            f'第{page_idx+1}页内容：' + pdf_content[page_idx].replace('\n', '') + '\n'
            for page_idx in ranked_idxs[-5:]
        ]
        context = ''.join(context_parts)

        prompt = '''你是一个汽车专家，帮我结合给定的资料与关键词和改写后的问题，回答所给的问题。如果问题无法从资料中获得，请输出:结合给定的资料，无法回答问题。
            资料:{0}

            关键词:{1}

            改写后的问题:{2}

            问题：{3}
                '''.format(
            context,
            entry["question_keywords"],
            entry["question_rewriting"],
            query_text,
        )

        reply = ask_glm(prompt)
        entry['answer'] = reply['choices'][0]['message']['content']
        print(query_idx, entry)
    return questions

In [None]:

def clean_question_rewrite(question_rewrite):
    """Normalise a model-produced rewrite list towards a parseable list literal.

    Full-width quotes, commas and question marks become their ASCII forms,
    adjacent ``]\\n[`` list boundaries are merged, and the echoed
    ``改写后的句子<n>：`` labels (n = 1..4) are stripped.

    Args:
        question_rewrite: Raw text returned by the model.

    Returns:
        The cleaned string.
    """
    # Applied strictly in this order: later rules depend on earlier ones.
    substitutions = (
        ('“', '"'),
        ('”', '"'),
        ('，', ','),
        ('？', '?'),
        (']\n[', ','),
        ('："', '"'),
        ('改写后的句子', ''),
        ('1"', '"'),
        ('2"', '"'),
        ('3"', '"'),
        ('4"', '"'),
    )
    cleaned = question_rewrite
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
import ast

# Clean every stored rewrite in place and record the indices whose cleaned
# text still does not parse as a Python literal. The text comes from the
# model, so it is parsed with ast.literal_eval rather than executed with eval.
bad_question = []
for idx, question in enumerate(task9_questions):
    question_rewrite = question['question_rewriting']
    cleaned_rewrite = clean_question_rewrite(question_rewrite)
    question['question_rewriting'] = cleaned_rewrite
    try:
        ast.literal_eval(cleaned_rewrite)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        bad_question.append(idx)
        # Show the raw model output so the failure can be inspected by hand.
        print(question_rewrite)



In [60]:
# Run get_answer_from_llm over every index of task9_questions and rebind the
# name to the list it returns.
task9_questions = get_answer_from_llm(range(len(task9_questions)),task9_questions )

0 {'question': '“前排座椅通风”的相关内容在第几页？', 'answer': '“前排座椅通风”的相关内容在第115页、第116页和第117页中都有提及。', 'reference': 'page_117', 'question_keywords': '前排座椅通风\n', 'question_rewriting': '前排座椅通风的相关内容在第115页、第116页和第117页中都有提及。'}
1 {'question': '"关于车辆的儿童安全座椅固定装置，在哪一页可以找到相关内容？"', 'answer': '关于车辆的儿童安全座椅固定装置的相关内容，可以在用户手册的第123页和第124页找到。这些页面提供了儿童安全座椅的安装说明、固定方法和相关安全警告。', 'reference': 'page_123', 'question_keywords': '儿童安全座椅 固定装置 相关内容', 'question_rewriting': '["车辆儿童安全座椅的安装说明在手册的哪一页有所介绍?","我想了解车辆中儿童安全座椅固定设备的信息,能告诉我具体在哪一页吗?","在用户手册的哪一部分可以查阅到车辆儿童安全座椅的固定方法?","我想查找关于汽车内置儿童座椅固定系统的说明,能指出手册中对应的页码吗?"]'}
2 {'question': '“打开前机舱盖”的相关信息在第几页？', 'answer': '"打开前机舱盖"的相关信息在第308页。', 'reference': 'page_307', 'question_keywords': '打开 前机舱盖 相关信息 页码', 'question_rewriting': '["请问在哪一页可以找到开启引擎盖的相关资料?","我想要了解如何打开汽车前盖的信息,这在手册的第几页?","能告诉我查看打开车辆前引擎盖步骤的页面号码是多少吗?","在哪一页可以查阅到关于车辆前部机盖开启的说明?"]'}
3 {'question': '“打开前机舱盖”这个操作在哪一页？', 'answer': '根据给定的资料，关于“打开前机舱盖”的操作说明可以在第308页找到。该页面详细描述了如何抬起、轻抬前机舱盖前边缘，并确保其完全关闭的步骤，同时也包含了关闭前机舱盖时的注意事项。', 'reference': 'page_

In [61]:
# Persist the answered questions as pretty-printed JSON; ensure_ascii=False
# keeps the Chinese text readable instead of \u-escaped.
with open('task10_glm4.json', 'w', encoding='utf8') as up:
    json.dump(task9_questions, up, ensure_ascii=False, indent=4)

In [None]:
# for query_idx in range(len(task9_questions[59:])):
#     query_idx = query_idx+59
#     prompt= '''你是一个汽车维修和汽车销售的专家，将用户的提问改为含义相近当不相同的句子,将改写后的句子以list的形式返回，不要输出起他无关内容,举一个例子：[改写后的句子1,改写后的句子2...] ：
#     问题：{0}
# '''.format(
#     # ''.join([f'第{i+1}页内容：' + pdf_content[i].replace('\n', '') + '\n' for i in doc_scores.argsort()[-3:]]) ,
#     task9_questions[query_idx]["question"]
# )
#     answer = ask_glm(prompt)['choices'][0]['message']['content']
#     task9_questions[query_idx]['question_rewriting'] = answer
#     print(query_idx,task9_questions[query_idx])


### 总结
- 重新改写后模型得分0.753

### 参考文献
- [美团搜索中查询改写技术的探索与实践](https://tech.meituan.com/2022/02/17/exploration-and-practice-of-query-rewriting-in-meituan-search.html)