In [1]:
from datasets import load_dataset
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from sklearn.metrics import f1_score
import numpy as np
import faiss
import torch
import pandas as pd
from tqdm import tqdm

import json
import os


## Load Dataset

In [2]:
# 1. SQuAD 데이터 로드 (훈련 세트 기준, validation도 가능)
dataset = load_dataset("squad", split="train")

# 2. 고유한 context 문서 집합 생성 (retrieval corpus로 사용)
unique_contexts = list(set(dataset["context"]))
corpus_df = pd.DataFrame({"doc_id": list(range(len(unique_contexts))), "text": unique_contexts})

# 3. QA 쌍 구성 (질문, 정답, 해당 문서)
qa_data = []
context_to_id = {context: idx for idx, context in enumerate(unique_contexts)}

for item in dataset:
    question = item["question"]
    answer = item["answers"]["text"][0] if item["answers"]["text"] else ""
    context = item["context"]
    doc_id = context_to_id[context]
    qa_data.append({
        "question": question,
        "answer": answer,
        "doc_id": doc_id,
        "context": context
    })

qa_pairs = pd.DataFrame(qa_data)

# 4. 결과 미리 보기
print("Corpus 예시:")
print(corpus_df.head())

print("\nQA 쌍 예시:")
print(qa_pairs.head())

Corpus 예시:
   doc_id                                               text
0       0  While Japan had a large number of submarines, ...
1       1  Other critics, such as Francis Fukuyama, note ...
2       2  During the 1990s after NAFTA was signed, indus...
3       3  Pre-sectarian Buddhism is the earliest phase o...
4       4  As the Industrial Revolution spread across Eur...

QA 쌍 예시:
                                            question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                                    answer  doc_id  \
0               Saint Bernadette Soubirous    7437   
1                a copper statue of Christ    7437   
2                        the Main Building    7437   
3  a Marian place of prayer and reflection    7437   
4     

In [3]:
# # 1. corpus 저장 (Retrieval 문서들)
# corpus_records = corpus_df.to_dict(orient="records")
# with open("dataset/squad_rag_corpus2.json", "w", encoding="utf-8") as f:
#     json.dump(corpus_records, f, ensure_ascii=False, indent=2)

# # 2. QA 쌍 저장 (질문-정답-문서 매핑)
# qa_records = qa_pairs.to_dict(orient="records")
# with open("dataset/squad_rag_qa_pairs2.json", "w", encoding="utf-8") as f:
#     json.dump(qa_records, f, ensure_ascii=False, indent=2)

# print("✅ JSON 파일 저장 완료:")
# print("- squad_rag_corpus.json")
# print("- squad_rag_qa_pairs.json")

## Save embedded vector

In [4]:

# 3. Load DPR model and tokenizer (use multi-qa)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-multiset-base")
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mode

In [5]:
ctx_encoder.eval()
q_encoder.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctx_encoder.to(device)
q_encoder.to(device)


DPRQuestionEncoder(
  (question_encoder): DPREncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_feature

In [6]:
batch_size = 32
ctx_embeddings = []

for i in tqdm(range(0, len(corpus_df), batch_size), desc="Encoding contexts"):
    batch_texts = corpus_df["text"].iloc[i:i+batch_size].tolist()
    batch_texts = [str(t).strip() for t in batch_texts]

    inputs = ctx_tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = ctx_encoder(**inputs)
        emb_batch = output.pooler_output.cpu().numpy()  # or output.last_hidden_state[:, 0]
        ctx_embeddings.append(emb_batch)

ctx_embeddings = np.vstack(ctx_embeddings)

Encoding contexts: 100%|██████████| 591/591 [01:52<00:00,  5.26it/s]


In [7]:
batch_size = 32  # 필요에 따라 조정 가능
q_embeddings = []

questions = qa_pairs["question"].tolist()

for i in tqdm(range(0, len(questions), batch_size), desc="Encoding questions"):
    batch_questions = questions[i:i+batch_size]
    batch_questions = [str(q).strip() for q in batch_questions]

    inputs = q_tokenizer(batch_questions, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = q_encoder(**inputs)
        emb_batch = output.pooler_output.cpu().numpy()  # or output.last_hidden_state[:, 0]
        q_embeddings.append(emb_batch)

q_embeddings = np.vstack(q_embeddings)

Encoding questions:   0%|          | 0/2738 [00:00<?, ?it/s]

Encoding questions: 100%|██████████| 2738/2738 [00:45<00:00, 59.99it/s]


In [8]:
# 4. 저장
embedding_dir = "/mnt/aix7101/jeong/aix_project"
if not os.path.exists(embedding_dir):
    os.makedirs(embedding_dir)
    print(f"📁 Created directory: {embedding_dir}")

ctx_path = os.path.join(embedding_dir, "dpr_ctx_embeddings_multiqa.npy")
q_path = os.path.join(embedding_dir, "dpr_q_embeddings_multiqa.npy")

np.save(ctx_path, ctx_embeddings)
np.save(q_path, q_embeddings)

print(f"✅ Context embeddings saved to: {ctx_path}")
print(f"✅ Question embeddings saved to: {q_path}")

✅ Context embeddings saved to: /mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings_multiqa.npy
✅ Question embeddings saved to: /mnt/aix7101/jeong/aix_project/dpr_q_embeddings_multiqa.npy


In [9]:
# from nltk.tokenize import sent_tokenize
# import numpy as np
# from tqdm import tqdm

# batch_size = 16  # GPU 상황에 따라 조정
# ctx_sentence_embeddings = []

# for doc in tqdm(corpus_df["text"], desc="Encoding multi-sentence contexts"):
#     # 1. 문서 내 문장 분리
#     sentences = sent_tokenize(doc)
#     doc_embeddings = []

#     # 2. 문장들을 배치로 처리
#     for i in range(0, len(sentences), batch_size):
#         batch_sents = sentences[i:i+batch_size]
#         inputs = ctx_tokenizer(
#             batch_sents,
#             return_tensors="pt",
#             truncation=True,
#             padding=True,
#             max_length=128
#         )
#         inputs = {k: v.to(device) for k, v in inputs.items()}

#         with torch.no_grad():
#             output = ctx_encoder(**inputs)
#             emb_batch = output.pooler_output.cpu().numpy()  # or last_hidden_state[:, 0]
#             doc_embeddings.append(emb_batch)

#     # 3. 문서 하나에 대한 (문장 수, dim) 배열 생성
#     doc_embeddings = np.vstack(doc_embeddings)
#     ctx_sentence_embeddings.append(doc_embeddings)
    
# # 4. 문서별 문장 수가 달라 3D 배열로 만들고 싶을 경우
# max_len = max(e.shape[0] for e in ctx_sentence_embeddings)
# dim = ctx_sentence_embeddings[0].shape[1]

# padded_embeddings = np.zeros((len(ctx_sentence_embeddings), max_len, dim))
# for i, emb in enumerate(ctx_sentence_embeddings):
#     padded_embeddings[i, :emb.shape[0], :] = emb

In [10]:
# # 4. 저장
# embedding_dir = "/mnt/aix7101/jeong/aix_project"
# if not os.path.exists(embedding_dir):
#     os.makedirs(embedding_dir)
#     print(f"📁 Created directory: {embedding_dir}")

# sentence_ctx_path = os.path.join(embedding_dir, "dpr_m_ctx_embeddings_multiqa.npy")
# ctx_sentence_embeddings = np.array(ctx_sentence_embeddings, dtype=object)
# np.save(sentence_ctx_path, ctx_sentence_embeddings, allow_pickle=True)

# print(f"✅ Context Sentence embeddings saved to: {sentence_ctx_path}")

## BM25

In [12]:
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import pandas as pd
import numpy as np

def compute_bm25_recall(qa_pairs: pd.DataFrame, corpus_df: pd.DataFrame, k: int = 5) -> float:
    """
    BM25 기반 Recall@k 계산 함수
    
    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍이 포함된 데이터프레임 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        k (int): top-k 문서 중 정답이 포함되는지 평가할 k 값
        
    Returns:
        float: Recall@k
    """
    # 1. 토크나이즈된 문서 리스트 생성
    tokenized_corpus = [doc.split() for doc in corpus_df["text"]]
    
    # 2. BM25 인덱스 구성
    bm25 = BM25Okapi(tokenized_corpus)
    
    hit_count = 0

    # 3. 각 질문에 대해 BM25 top-k 문서 검색
    for _, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating BM25 Recall@K"):
        question = row["question"]
        gt_doc_id = row["doc_id"]

        tokenized_query = question.split()
        scores = bm25.get_scores(tokenized_query)

        # 상위 k개의 문서 인덱스 추출
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()

        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 BM25 Recall@{k}: {recall_at_k:.4f}")
    return recall_at_k

In [13]:
recall_bm25 = compute_bm25_recall(qa_pairs, corpus_df, k=3)

Evaluating BM25 Recall@K:   0%|          | 123/87599 [00:05<1:02:03, 23.49it/s]


KeyboardInterrupt: 

## DPR

In [6]:
def compute_dpr_recall(qa_pairs, corpus_df, ctx_emb_path, q_emb_path, k=5):
    """
    저장된 임베딩 파일을 기반으로 top-k 문서 중 정답 문서가 포함되는 비율(Recall@k)을 계산합니다.
    
    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍이 포함된 데이터프레임 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        ctx_emb_path (str): 문서 임베딩이 저장된 .npy 경로
        q_emb_path (str): 질문 임베딩이 저장된 .npy 경로
        k (int): top-k 문서 중 정답이 포함되는지 평가할 k 값
        
    Returns:
        float: Recall@k
    """
    
    # 1. 임베딩 로드
    ctx_embeddings = np.load(ctx_emb_path)
    q_embeddings = np.load(q_emb_path)

    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    hit_count = 0

    # 2. 각 질문에 대해 유사한 top-k 문서 검색
    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating Recall@K"):
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]

        # 문서들과의 유사도 (cosine 유사도 대신 dot-product 사용)
        scores = np.dot(ctx_embeddings, q_emb)

        # top-k 인덱스
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()
        # print(topk_indices)
        # print(gt_doc_id)
        # 정답 문서가 top-k에 포함되는지 확인
        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 Recall@{k}: {recall_at_k:.4f}")
    return recall_at_k

In [7]:
recall_dpr = compute_dpr_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings_multiqa.npy",
    q_emb_path="/mnt/aix7101/jeong/aix_project/dpr_q_embeddings_multiqa.npy",
    k=3
)


Evaluating Recall@K: 100%|██████████| 87599/87599 [01:01<00:00, 1427.80it/s]

📌 Recall@3: 0.0002





## DPR-m

In [25]:
def compute_dprm_recall(
    qa_pairs: pd.DataFrame,
    corpus_df: pd.DataFrame,
    ctx_emb_path: str,
    q_emb_path: str,
    k: int = 5,
    aggregation: str = "mean",
) -> float:
    """
    문장 단위의 문서 임베딩을 사용하여 DPR-m 방식의 Recall@k 계산.

    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        ctx_emb_path (str): 문장 단위 문서 임베딩 저장 경로 (.npy, shape: [num_docs, num_sents, dim])
        q_emb_path (str): 질문 임베딩 저장 경로 (.npy, shape: [num_queries, dim])
        k (int): Recall@k
        aggregation (str): 'max' 또는 'mean' 방식으로 문서 유사도 집계
        
    Returns:
        float: Recall@k
    """
    
    # 1. 임베딩 로드
    ctx_embeddings = np.load(ctx_emb_path, allow_pickle=True)  # object 배열
    q_embeddings = np.load(q_emb_path)

    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    hit_count = 0

    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating DPR-m Recall@K"):
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]  # (dim,)

        # 각 문서에 대해 문장 임베딩과 q_emb의 유사도 계산
        scores = []
        for doc_sents in ctx_embeddings:
            sent_scores = np.dot(doc_sents, q_emb)  # (num_sents,)
            if aggregation == "max": # 유사도가 제일 높은 문장이 있는 것으로 할지
                score = np.max(sent_scores)
            elif aggregation == "mean": # 전체적인 문장의 평균으로 계산할지
                score = np.mean(sent_scores)
            else:
                raise ValueError("aggregation은 'max' 또는 'mean'이어야 합니다.")
            scores.append(score)

        scores = np.array(scores)
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()

        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 DPR-m Recall@{k} ({aggregation} aggregation): {recall_at_k:.4f}")
    return recall_at_k

In [36]:
compute_dprm_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_m_ctx_embeddings_multiqa.npy",
    q_emb_path="/mnt/aix7101/jeong/aix_project/dpr_q_embeddings_multiqa.npy",
    k=3,
    aggregation="max"
)

Evaluating DPR-m Recall@K: 100%|██████████| 87599/87599 [1:55:38<00:00, 12.63it/s]

📌 DPR-m Recall@3 (max aggregation): 0.6796





0.6795739677393577

## hybrid (bm25 + DPR)

In [37]:
def compute_hybrid_recall(
    qa_pairs: pd.DataFrame,
    corpus_df: pd.DataFrame,
    ctx_emb_path: str,
    q_emb_path: str,
    bm25_top_n: int = 100,
    k: int = 5
) -> float:
    """
    BM25 + DPR hybrid retrieval 기반 Recall@k 계산

    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        ctx_emb_path (str): DPR 문서 임베딩 경로 (.npy, shape: [num_docs, dim])
        q_emb_path (str): DPR 질문 임베딩 경로 (.npy, shape: [num_queries, dim])
        bm25_top_n (int): BM25로 먼저 선택할 후보 문서 개수
        k (int): 최종 DPR top-k에서 정답 포함 여부 평가

    Returns:
        float: Recall@k
    """
    # 1. 임베딩 불러오기
    ctx_embeddings = np.load(ctx_emb_path)     # shape: (num_docs, dim)
    q_embeddings = np.load(q_emb_path)         # shape: (num_queries, dim)
    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    # 2. BM25 인덱스 구성
    tokenized_corpus = [doc.split() for doc in corpus_df["text"]]
    bm25 = BM25Okapi(tokenized_corpus)

    hit_count = 0

    # 3. 각 질문에 대해 hybrid retrieval 수행
    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating Hybrid Recall@K"):
        question = row["question"]
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]  # (dim,)

        # (1) BM25 후보 추출
        tokenized_query = question.split()
        bm25_scores = bm25.get_scores(tokenized_query)
        bm25_top_indices = np.argsort(bm25_scores)[::-1][:bm25_top_n]

        # (2) DPR 유사도 계산 (bm25 후보에 한해)
        candidate_ctx_embs = ctx_embeddings[bm25_top_indices]  # (bm25_top_n, dim)
        dpr_scores = np.dot(candidate_ctx_embs, q_emb)         # (bm25_top_n,)

        # (3) DPR 기반 top-k 문서 선택
        topk_local_indices = np.argsort(dpr_scores)[::-1][:k]
        topk_doc_indices = [bm25_top_indices[i] for i in topk_local_indices]
        topk_doc_ids = corpus_df.iloc[topk_doc_indices]["doc_id"].tolist()

        # (4) 정답 포함 여부 확인
        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 Hybrid Recall@{k} (BM25 top-{bm25_top_n} + DPR top-{k}): {recall_at_k:.4f}")
    return recall_at_k

In [38]:
compute_hybrid_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings_multiqa.npy",
    q_emb_path="/mnt/aix7101/jeong/aix_project/dpr_q_embeddings_multiqa.npy",
    bm25_top_n=300,
    k=5
)

Evaluating Hybrid Recall@K:   5%|▌         | 4609/87599 [02:37<52:21, 26.41it/s]  

## Custom Retrieval 구성 요소
1. 문장에서 keyword 추출 (phrase 단위로 추출할 수 있는 방법이 있는지)
2. 추출한 keyword와의 score도 함께 계산
4. query만으로 추출한 recall@k
5. keyword만으로 추출한 recall@k
6. 둘을 hybrid하는 것도 ㄱㅊ

### keyword extract function

In [11]:
import spacy
import pandas as pd

# 1. spaCy 영어 모델 로드
nlp = spacy.load("en_core_web_sm")

# 2. 의문사 리스트 정의
WH_WORDS = {"what", "who", "whom", "where", "when", "why", "how"}

# 3. keyphrase 추출 함수 정의
def extract_keyphrases_spacy(question: str):
    doc = nlp(question.lower())
    keyphrases = set()

    wh_word = None
    for token in doc:
        if token.text in WH_WORDS:
            wh_word = token.text
            break

    for chunk in doc.noun_chunks:
        if any(not token.is_stop and token.pos_ in {"NOUN", "PROPN"} for token in chunk):
            keyphrases.add(chunk.text.strip())

    # 의문사에 따른 힌트 키워드 추가
    if wh_word:
        hint_map = {
            "who": "person",
            "where": "location",
            "when": "time",
            "why": "reason",
            "how": "method",
        }
        hint = hint_map.get(wh_word)
        if hint:
            keyphrases.add(hint)

    return list(keyphrases)

### use keybert

In [12]:
from keybert import KeyBERT
from typing import List
import re

# KeyBERT 모델 초기화 (기본적으로 'all-MiniLM-L6-v2' 사용)
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

def extract_keyphrases_keybert(question: str, top_n: int = 5, diversity: bool = False) -> List[str]:
    """
    KeyBERT 기반 keyphrase 추출 함수 (의문사 힌트 없음)

    Args:
        question (str): 입력 질문
        top_n (int): 추출할 키프레이즈 개수
        diversity (bool): MMR(Minimal Marginal Relevance) 사용 여부

    Returns:
        List[str]: 추출된 키프레이즈 리스트
    """
    question_clean = re.sub(r"[^\w\s]", "", question.lower())  # 간단한 전처리

    if diversity:
        keyphrases = kw_model.extract_keywords(
            question_clean,
            keyphrase_ngram_range=(1, 3),
            stop_words='english',
            use_mmr=True,
            diversity=0.7,
            top_n=top_n
        )
    else:
        keyphrases = kw_model.extract_keywords(
            question_clean,
            keyphrase_ngram_range=(1, 3),
            stop_words='english',
            top_n=top_n
        )

    return [phrase for phrase, _ in keyphrases]

1. keyphrase-based pre-filtering
- keyphrase를 추출
- 각 keyphrase를 embedding하고 corpus와의 유사도 계산을 통해 후보 100개씩 추출
2. query-to-context matching
- 전체 corpus가 아닌 후보 corpus와만 비교해서 최종 recall@k를 계산

In [13]:
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import normalize
import torch

def compute_dpr_hybrid_keyphrase_recall(
    qa_pairs,
    corpus_df,
    ctx_emb_path,
    extract_keyphrases_fn,
    top_n_per_keyphrase=50,
    final_top_k=5,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """
    DPR 인코더를 사용한 키프레이즈 기반 Hybrid Retrieval Recall@K 계산

    Args:
        qa_pairs (pd.DataFrame): ['question', 'doc_id']
        corpus_df (pd.DataFrame): ['doc_id', 'text']
        ctx_embeddings (np.ndarray): 문서 임베딩 (shape: [num_docs, dim])
        ctx_tokenizer, ctx_encoder: DPR context 인코더
        q_tokenizer, q_encoder: DPR query 인코더
        extract_keyphrases_fn (function): 키프레이즈 추출 함수
        top_n_per_keyphrase (int): 키프레이즈 당 후보 문서 수
        final_top_k (int): 최종 선택할 문서 수
        device (str): 'cuda' or 'cpu'
        
    Returns:
        float: Recall@k
    """
    hit_count = 0
    ctx_embeddings = np.load(ctx_emb_path, allow_pickle=True)  
    ctx_embeddings = normalize(ctx_embeddings)


    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Custom Retrieval Recall@K"):
        question = row["question"]
        gt_doc_id = row["doc_id"]

        # 1. 키프레이즈 추출
        keyphrases = extract_keyphrases_fn(question)
        if not keyphrases:
            continue

        # 2. 키프레이즈 임베딩
        phrase_embs = []
        for phrase in keyphrases:
            inputs = ctx_tokenizer(phrase, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                emb = ctx_encoder(**inputs).pooler_output[0].cpu().numpy()
            phrase_embs.append(emb)
        phrase_embs = normalize(np.stack(phrase_embs))

        # 3. 키워드 별 상위 문서 수집
        candidate_indices = set()
        for emb in phrase_embs:
            scores = np.dot(ctx_embeddings, emb)
            top_indices = np.argsort(scores)[::-1][:top_n_per_keyphrase]
            candidate_indices.update(top_indices)

        if not candidate_indices:
            continue

        # 4. 쿼리 임베딩
        q_inputs = q_tokenizer(question, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
        q_inputs = {k: v.to(device) for k, v in q_inputs.items()}
        with torch.no_grad():
            query_emb = q_encoder(**q_inputs).pooler_output[0].cpu().numpy()
        query_emb = normalize(query_emb.reshape(1, -1))[0]

        # 5. 후보 문서 재랭킹
        candidate_indices = list(candidate_indices)
        candidate_embs = ctx_embeddings[candidate_indices]
        rerank_scores = np.dot(candidate_embs, query_emb)

        top_k_indices = np.argsort(rerank_scores)[::-1][:final_top_k]
        top_k_doc_ids = corpus_df.iloc[[candidate_indices[i] for i in top_k_indices]]["doc_id"].tolist()

        if gt_doc_id in top_k_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 Custom Retrieval Keyphrase-based Recall@{final_top_k}: {recall_at_k:.4f}")
    return recall_at_k

In [14]:
recall_spacy = compute_dpr_hybrid_keyphrase_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings_multiqa.npy",
    extract_keyphrases_fn=extract_keyphrases_spacy,  # 앞서 정의한 spaCy 기반 함수
    top_n_per_keyphrase=100,
    final_top_k=5
)

Custom Retrieval Recall@K: 100%|██████████| 87599/87599 [1:15:37<00:00, 19.31it/s]

📌 Custom Retrieval Keyphrase-based Recall@5: 0.4982





In [None]:
# recall 값 저장
with open("recall_spacy_result.txt", "w") as f:
    f.write(f"Recall: {recall_spacy}\n")

: 

In [None]:
recall_keybert = compute_dpr_hybrid_keyphrase_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings_multiqa.npy",
    extract_keyphrases_fn=extract_keyphrases_keybert,  # keybert 기반 extraction
    top_n_per_keyphrase=100,
    final_top_k=5
)

Custom Retrieval Recall@K:  67%|██████▋   | 59100/87599 [1:43:19<1:15:10,  6.32it/s]

In [None]:
# recall 값 저장
with open("recall_keybert_result.txt", "w") as f:
    f.write(f"Recall: {recall_keybert}\n")

In [None]:
# recall 값 저장
with open("recall_result_test.txt", "w") as f:
    f.write(f"Recall: {recall_dpr}\n")

In [None]:
# use keybert

In [None]:
#-- retrieval

In [None]:
#-- custom checking code
