In [17]:
import jieba
import pdfplumber
import json

In [18]:
# Load the question records. The context manager closes the file handle
# (the original left it open), and the explicit encoding keeps decoding
# independent of the platform default, matching the utf8 used when the
# results are written back out.
with open('./questions.json', encoding='utf8') as f:
    questions = json.load(f)

In [19]:
# Open the source PDF with pdfplumber; `pdf.pages` is iterated when the page
# text is extracted. NOTE(review): the handle is not closed in the visible cells.
pdf = pdfplumber.open('./初赛训练数据集.pdf')

In [20]:
# Extract the text of every page into a list of records:
#   'page_idx'     - 1-based page number, stored as a string
#   'page_content' - text returned by pdfplumber for that page
pdf_content = []

for page_number, page in enumerate(pdf.pages, start=1):
    pdf_content.append(
        {
            'page_idx': str(page_number),
            # Some pdfplumber releases return None for a page with no
            # extractable text; fall back to '' so that later tokenisation
            # always receives a str.
            'page_content': page.extract_text() or ''
        }
    )

In [21]:
# Spot-check one extracted record (list index 189, i.e. page_idx '190').
pdf_content[189]

{'page_idx': '190',
 'page_content': '驾驶辅助\n警告！ 停用高级智能驾驶\n■ 设置的跟车距离间隔较低，可能会导致发生意外情况时来不及反\n应。\n■ 您始终有责任始终保持安全的距离和速度，并在必要时及时采取\n制动措施。\n说明！\n□ 跟车距离间隔不是固定距离，与前车之间的实际距离会随车速变\n化而变化。\n□ 在湿滑路面行驶时，请选择更大的跟车距离间隔。\n□ 踩下加速踏板时，车辆将暂时加速；松开加速踏板后，车速将缓\n慢减速至目标车速。\n□ 若您开启了“巡航自动限速辅助”功能，系统可以将当前道路限 按下方向盘上的 按键或者踩下制动踏板，暂时停用LynkCo-Pilot功\n速牌上的限速值自动设定为目标车速。\n能， 图标变为白色，LynkCo-Pilot进入准备模式。\n190'}

In [22]:
import torch
import transformers

In [23]:
from transformers import  AutoTokenizer, AutoModelForSequenceClassification, AutoModel

In [24]:
# Re-ranker: the tokenizer and the sequence-classification model are both
# loaded from the same local checkpoint directory.
RERANK_MODEL_DIR = '../BAAI/bge-reranker-base/'
rerank_tokenizer = AutoTokenizer.from_pretrained(RERANK_MODEL_DIR)
rerank_model = AutoModelForSequenceClassification.from_pretrained(RERANK_MODEL_DIR)

In [25]:
# Embedding model: the tokenizer and the bare encoder are both loaded from
# the same local checkpoint directory.
EMBEDDING_MODEL_DIR = '../BAAI/bge-large-zh-v1.5/'
embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_DIR)
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_DIR)

In [26]:
# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [27]:
# Echo the selected device so the notebook output records where inference ran.
device

device(type='cuda', index=0)

In [28]:
# Move each model to the selected device and switch it to evaluation mode.
# `.to()` and `.eval()` both return the module, so the calls can be chained.
rerank_model.to(device).eval()
# Last expression of the cell: the notebook echoes the embedding model's repr.
embedding_model.to(device).eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [29]:
# Tokenise each page's text with jieba; the result is one token list per page,
# in the same order as `pdf_content`.
pdf_content_words = []
for page in pdf_content:
    pdf_content_words.append(jieba.lcut(page['page_content']))

In [30]:
from rank_bm25 import  BM25Okapi

In [31]:
# Build the BM25Okapi index over the tokenised pages; it is queried per
# question with `get_scores` during recall.
bm25 = BM25Okapi(pdf_content_words)

In [33]:
# Embed every question. The sentence vector is taken from position 0 along
# the sequence axis of the model's first output (the model repr printed when
# the models were moved to the device shows a BertModel, so this is
# presumably the [CLS] token).
question_vectors = []

with torch.no_grad():
    for question in questions:
        encoded_query = embedding_tokenizer(
            question['question'],
            truncation=True,
            padding=True,
            return_tensors='pt',
            max_length=512,
        )
        # Follow the shared `device` instead of a hard-coded .cuda(), so the
        # cell also runs when `device` resolved to the CPU.
        encoded_query = {k: v.to(device) for k, v in encoded_query.items()}
        output = embedding_model(**encoded_query)
        question_vectors.append(output[0][:, 0])

# Stack the per-question rows into one matrix. A separate list name is used
# above so `questions_embedding` only ever refers to the final tensor.
# torch.cat is the long-standing spelling that torch.concatenate aliases.
questions_embedding = torch.cat(question_vectors)
questions_embedding.shape

torch.Size([301, 1024])

In [34]:
# Tokenise the text of every page as one padded batch of PyTorch tensors,
# truncating each page to at most 512 tokens.
page_texts = [page['page_content'] for page in pdf_content]
encoded_docs = embedding_tokenizer(
    page_texts,
    truncation=True,
    padding=True,
    return_tensors='pt',
    max_length=512,
)

In [35]:
# Embed all pages in a single forward pass and keep, for each page, the
# vector at position 0 along the sequence axis of the model's first output
# (the same pooling as used for the questions).
# NOTE(review): every page goes through the model in one batch; if memory
# becomes a problem, feed the tokenised tensors through in slices instead.
with torch.no_grad():
    # Follow the shared `device` instead of a hard-coded .cuda(), so the
    # cell also runs when `device` resolved to the CPU.
    encoded_docs = {k: v.to(device) for k, v in encoded_docs.items()}
    outputs = embedding_model(**encoded_docs)
    docs_embedding = outputs[0][:, 0]

In [38]:
# Sanity check: shape of the page-embedding matrix.
docs_embedding.shape

torch.Size([354, 1024])

In [41]:
import torch.nn.functional as F

# Cosine similarity between every question and every page: L2-normalise the
# rows of both embedding matrices, then take all pairwise dot products.
questions_unit = F.normalize(questions_embedding, p=2, dim=1)
docs_unit = F.normalize(docs_embedding, p=2, dim=1)
query_2_doc_similarity = torch.matmul(questions_unit, docs_unit.t())

In [42]:
# Sanity check: shape of the question-to-page similarity matrix.
query_2_doc_similarity.shape

torch.Size([301, 354])

In [37]:
# Ask PyTorch to release cached, currently unused CUDA memory.
# NOTE(review): this cell's execution count (In [37]) is lower than that of
# the cells around it, so it was run out of order; confirm the notebook still
# works top-to-bottom on a fresh kernel.
torch.cuda.empty_cache() 

In [51]:
# Multi-route recall: BM25 (lexical, TF-IDF-style) recall is combined with embedding-based semantic recall, and the merged candidates are then re-ranked.


import numpy as np

# Two-route recall followed by re-ranking. For each question:
#   1. take the top-k pages by BM25 score (lexical route),
#   2. take the top-k pages by embedding similarity (semantic route),
#   3. score the de-duplicated union with the re-ranker,
#   4. store the best page as 'page_<1-based number>' under 'reference'.
RECALL_TOP_K = 3  # pages kept from each recall route (previously a literal 3)

# Copy the similarity matrix to host memory once; the transfer does not
# depend on the loop variable, so doing it per question was wasted work.
semantic_scores_all = query_2_doc_similarity.cpu().numpy()

for query_idx, item in enumerate(questions):
    question_text = item['question']

    lexical_page_scores = bm25.get_scores(jieba.lcut(question_text))
    lexical_top_page_indexes = lexical_page_scores.argsort()[-RECALL_TOP_K:]

    semantic_page_scores = semantic_scores_all[query_idx]
    semantic_top_page_indexes = semantic_page_scores.argsort()[-RECALL_TOP_K:]

    # A page found by both routes must only be scored once.
    candidates = list(set(np.hstack((semantic_top_page_indexes, lexical_top_page_indexes))))
    pairs = [[question_text, pdf_content[page_idx]['page_content']] for page_idx in candidates]

    inputs = rerank_tokenizer(pairs, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # One logit per (question, page) pair, flattened to a 1-D score vector.
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

    best_match_index = candidates[scores.cpu().numpy().argmax()]
    # Candidate indexes are 0-based list positions; the label is 1-based.
    item['reference'] = 'page_' + str(best_match_index + 1)


# Serialise before opening the output file: opening with 'w' truncates it,
# so a serialisation error raised afterwards would leave an empty or partial
# submission behind. The text written is the same as json.dump would produce.
submission_json = json.dumps(questions, ensure_ascii=False, indent=4)
with open('./submit_5_multi_recall.json', 'w', encoding='utf8') as f:
    f.write(submission_json)

