In [1]:
# This task requires identifying the intent of a user's question: decide whether to answer it with RAG or with a general-knowledge answer. There are no labeled samples, so we simply compute the similarity between each query and the docs and then set an empirical threshold.

In [2]:
from transformers import AutoModel, AutoTokenizer
import torch

In [3]:
# Load the BGE encoder from the local checkpoint directory, then move its
# parameters onto the current CUDA device.
bge_model = AutoModel.from_pretrained('../BAAI/bge-large-zh-v1.5/')
bge_model = bge_model.cuda()

In [4]:
# Load the tokenizer from the same local checkpoint directory as the model,
# so the vocabulary matches the encoder weights.
tokenizer = AutoTokenizer.from_pretrained('../BAAI/bge-large-zh-v1.5/')

In [5]:
# Switch the encoder to evaluation mode so its Dropout layers are inactive
# during inference; the call returns the module, whose repr is displayed below.
bge_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [6]:
import pdfplumber

In [7]:
# Extract the text of every page of the training PDF into a list (one string
# per page). The context manager closes the underlying file handle as soon as
# extraction is finished instead of leaking it for the life of the kernel.
with pdfplumber.open('./初赛训练数据集.pdf') as pdf:
    # Some pdfplumber versions return None for a page with no extractable
    # text; normalise that to '' so the tokenizer downstream always gets a str.
    pdf_content = [page.extract_text() or '' for page in pdf.pages]

In [8]:
import json

In [9]:
# Load the question list. The file is opened in binary mode so that json
# detects the encoding itself (UTF-8/16/32) rather than relying on the
# platform's locale default, and the `with` block closes the handle afterwards.
with open('./questions.json', 'rb') as questions_file:
    questions = json.load(questions_file)

In [13]:
# Compute one embedding per PDF page.
#
# Each page is tokenized on its own (batch size 1) and truncated to at most
# 512 tokens, so only the beginning of a long page contributes to its vector.
# The page embedding is the hidden state at the first token position of the
# model's first output.
#
# The loop variable is named `page_text` rather than `input` so the builtin
# `input` is not overwritten for the rest of the kernel session.
# `encoded_input` and `output` keep their names because later inspection
# cells read them after the loop.

embedding_dict = dict()

with torch.no_grad():  # inference only: no autograd graph is needed
    for i, page_text in enumerate(pdf_content):
        encoded_input = tokenizer(page_text, truncation=True, padding=True, max_length=512, return_tensors='pt')
        # Move every tokenizer tensor to the GPU, where the model lives.
        encoded_input = {k: v.cuda() for k, v in encoded_input.items()}
        output = bge_model(**encoded_input)
        # Keyed by page index; each value has shape (1, hidden_size).
        embedding_dict[i] = output[0][:, 0]

In [15]:
# Inspect the shape of the first model output from the most recent model call
# (the last page of the embedding loop, if the cells were run in order).
output[0].shape

torch.Size([1, 97, 1024])

In [18]:
# Inspect the attention mask of the most recently tokenized input.
encoded_input['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')

In [22]:
# Join the per-page vectors along the first axis into one matrix; the dict
# preserves insertion order, so row k corresponds to page index k.
doc_embedding = torch.cat(tuple(embedding_dict.values()), dim=0)


In [23]:
# Expect (number of PDF pages, hidden size).
doc_embedding.shape

torch.Size([354, 1024])

In [24]:
# Encode every question with the same tokenizer settings used for the pages,
# collecting the first-token hidden state of each one.
query_embedding = []

with torch.no_grad():
    for entry in questions:
        query = entry['question']
        encoded_input = tokenizer(query, max_length=512, truncation=True, padding=True, return_tensors='pt')
        encoded_input = {name: tensor.cuda() for name, tensor in encoded_input.items()}
        output = bge_model(**encoded_input)
        first_token_state = output[0][:, 0]
        query_embedding.append(first_token_state)

In [25]:
# Join the per-question vectors along the first axis into one matrix.
# Note: this rebinds `query_embedding` from a list of tensors to a single tensor.
query_embedding = torch.cat(query_embedding, dim=0)

In [26]:
# Expect (number of questions, hidden size).
query_embedding.shape

torch.Size([301, 1024])

In [29]:
import torch.nn.functional as F

In [30]:
# Cosine similarity between every question and every page: L2-normalise the
# rows of both matrices, then take all pairwise dot products.
query_unit = F.normalize(query_embedding, p=2, dim=1)
doc_unit = F.normalize(doc_embedding, p=2, dim=1)
QA_similarity = torch.matmul(query_unit, doc_unit.T)

In [31]:
# Rows index questions, columns index PDF pages.
QA_similarity.shape

torch.Size([301, 354])

In [32]:
# Smallest similarity over all (question, page) pairs, as a 0-dim tensor.
min_similarity = QA_similarity.min()

In [33]:
# Display the global minimum similarity.
min_similarity

tensor(0.0021, device='cuda:0')

In [34]:
# Largest similarity over all (question, page) pairs, as a 0-dim tensor.
max_similarity = QA_similarity.max()

In [35]:
# Display the global maximum similarity.
max_similarity

tensor(0.8799, device='cuda:0')

In [38]:
# Empirical cut-off placed 40% of the way from the global minimum to the
# global maximum similarity. The 0.4 ratio is a hand-picked heuristic, not a
# value fitted on labeled data.
threshold = min_similarity + 0.4 * (max_similarity - min_similarity)

In [39]:
# Display the resulting similarity threshold.
threshold

tensor(0.3532, device='cuda:0')