In [None]:
from datasets import load_dataset
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from sklearn.metrics import f1_score
import numpy as np
import faiss
import torch
import pandas as pd
from tqdm import tqdm

import json
import os


## Load Dataset

In [None]:
# 1. SQuAD 데이터 로드 (훈련 세트 기준, validation도 가능)
dataset = load_dataset("squad", split="train")

# 2. 고유한 context 문서 집합 생성 (retrieval corpus로 사용)
unique_contexts = list(set(dataset["context"]))
corpus_df = pd.DataFrame({"doc_id": list(range(len(unique_contexts))), "text": unique_contexts})

# 3. QA 쌍 구성 (질문, 정답, 해당 문서)
qa_data = []
context_to_id = {context: idx for idx, context in enumerate(unique_contexts)}

for item in dataset:
    question = item["question"]
    answer = item["answers"]["text"][0] if item["answers"]["text"] else ""
    context = item["context"]
    doc_id = context_to_id[context]
    qa_data.append({
        "question": question,
        "answer": answer,
        "doc_id": doc_id,
        "context": context
    })

qa_pairs = pd.DataFrame(qa_data)

# 4. 결과 미리 보기
print("Corpus 예시:")
print(corpus_df.head())

print("\nQA 쌍 예시:")
print(qa_pairs.head())

Corpus 예시:
   doc_id                                               text
0       0  The U.S. Social Security Administration (SSA),...
1       1  Arnold Alois Schwarzenegger (/ˈʃwɔːrtsənˌɛɡər/...
2       2  In 2006, the Sister City Program of the City o...
3       3  By 1840, the Market Hall and Sheds, where fres...
4       4  Some commentators have defined reverse discrim...

QA 쌍 예시:
                                            question  \
0  To whom did the Virgin Mary allegedly appear i...   
1  What is in front of the Notre Dame Main Building?   
2  The Basilica of the Sacred heart at Notre Dame...   
3                  What is the Grotto at Notre Dame?   
4  What sits on top of the Main Building at Notre...   

                                    answer  doc_id  \
0               Saint Bernadette Soubirous   14556   
1                a copper statue of Christ   14556   
2                        the Main Building   14556   
3  a Marian place of prayer and reflection   14556   
4     

In [None]:
# 1. corpus 저장 (Retrieval 문서들)
corpus_records = corpus_df.to_dict(orient="records")
with open("dataset/squad_rag_corpus2.json", "w", encoding="utf-8") as f:
    json.dump(corpus_records, f, ensure_ascii=False, indent=2)

# 2. QA 쌍 저장 (질문-정답-문서 매핑)
qa_records = qa_pairs.to_dict(orient="records")
with open("dataset/squad_rag_qa_pairs2.json", "w", encoding="utf-8") as f:
    json.dump(qa_records, f, ensure_ascii=False, indent=2)

print("✅ JSON 파일 저장 완료:")
print("- squad_rag_corpus.json")
print("- squad_rag_qa_pairs.json")

✅ JSON 파일 저장 완료:
- squad_rag_corpus.json
- squad_rag_qa_pairs.json


## Save embedded vector

In [4]:

# 3. Load DPR model and tokenizer (use multi-qa)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
  return torch.load(checkpoint_file, map_location="cpu")


In [None]:
ctx_encoder.eval()
q_encoder.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctx_encoder.to(device)
q_encoder.to(device)


DPRQuestionEncoder(
  (question_encoder): DPREncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=76

In [None]:
# 2. 문서 임베딩 생성
ctx_embeddings = []
for doc in tqdm(corpus_df["text"], desc="Encoding contexts"):
    inputs = ctx_tokenizer(doc, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # 입력도 GPU로 이동

    with torch.no_grad():
        emb = ctx_encoder(**inputs).pooler_output[0].cpu().numpy()  # 결과만 다시 CPU로
    ctx_embeddings.append(emb)

ctx_embeddings = np.stack(ctx_embeddings)

In [10]:
# 3. 질문 임베딩 생성
q_embeddings = []
for q in tqdm(qa_pairs["question"], desc="Encoding questions"):
    inputs = q_tokenizer(q, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # ✅ 입력도 GPU로 이동

    with torch.no_grad():
        emb = q_encoder(**inputs).pooler_output[0].cpu().numpy()  # ✅ 결과만 다시 CPU로 이동
    q_embeddings.append(emb)

q_embeddings = np.stack(q_embeddings)

Encoding questions: 100%|██████████| 87599/87599 [14:09<00:00, 103.13it/s]


In [11]:
# 4. 저장
embedding_dir = "/mnt/aix7101/jeong/aix_project"
if not os.path.exists(embedding_dir):
    os.makedirs(embedding_dir)
    print(f"📁 Created directory: {embedding_dir}")

ctx_path = os.path.join(embedding_dir, "dpr_ctx_embeddings2.npy")
q_path = os.path.join(embedding_dir, "dpr_q_embeddings2.npy")

np.save(ctx_path, ctx_embeddings)
np.save(q_path, q_embeddings)

print(f"✅ Context embeddings saved to: {ctx_path}")
print(f"✅ Question embeddings saved to: {q_path}")

✅ Context embeddings saved to: /mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings2.npy
✅ Question embeddings saved to: /mnt/aix7101/jeong/aix_project/dpr_q_embeddings2.npy


In [None]:
from nltk.tokenize import sent_tokenize  # 문장 단위로 분리

ctx_sentence_embeddings = []

for doc in tqdm(corpus_df["text"], desc="Encoding multi-sentence contexts"):
    # 1. 문장 단위로 나누기
    sentences = sent_tokenize(doc)
    
    doc_embeddings = []
    for sent in sentences:
        inputs = ctx_tokenizer(
            sent,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128  # 문장 기준이라 길이 줄여도 OK
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            emb = ctx_encoder(**inputs).pooler_output[0].cpu().numpy()
        doc_embeddings.append(emb)
    
    # 문서에 속한 문장 벡터들을 하나의 array로 (num_sents_in_doc, dim)
    doc_embeddings = np.stack(doc_embeddings)
    ctx_sentence_embeddings.append(doc_embeddings)

# ⚠️ 문서마다 문장 수가 달라 padding이 필요할 수 있음
# → 3D array로 만들기 위해 패딩 (optional)
max_len = max(e.shape[0] for e in ctx_sentence_embeddings)
dim = ctx_sentence_embeddings[0].shape[1]

# zero-padding
padded_embeddings = np.zeros((len(ctx_sentence_embeddings), max_len, dim))
for i, emb in enumerate(ctx_sentence_embeddings):
    padded_embeddings[i, :emb.shape[0], :] = emb

In [None]:
# 4. 저장
embedding_dir = "/mnt/aix7101/jeong/aix_project"
if not os.path.exists(embedding_dir):
    os.makedirs(embedding_dir)
    print(f"📁 Created directory: {embedding_dir}")

sentence_ctx_path = os.path.join(embedding_dir, "dpr_m_ctx_embeddings2.npy")

np.save(sentence_ctx_path, ctx_sentence_embeddings)

print(f"✅ Context Sentence embeddings saved to: {sentence_ctx_path}")

## BM25

In [20]:
from rank_bm25 import BM25Okapi
from tqdm import tqdm
import pandas as pd
import numpy as np

def compute_bm25_recall(qa_pairs: pd.DataFrame, corpus_df: pd.DataFrame, k: int = 5) -> float:
    """
    BM25 기반 Recall@k 계산 함수
    
    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍이 포함된 데이터프레임 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        k (int): top-k 문서 중 정답이 포함되는지 평가할 k 값
        
    Returns:
        float: Recall@k
    """
    # 1. 토크나이즈된 문서 리스트 생성
    tokenized_corpus = [doc.split() for doc in corpus_df["text"]]
    
    # 2. BM25 인덱스 구성
    bm25 = BM25Okapi(tokenized_corpus)
    
    hit_count = 0

    # 3. 각 질문에 대해 BM25 top-k 문서 검색
    for _, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating BM25 Recall@K"):
        question = row["question"]
        gt_doc_id = row["doc_id"]

        tokenized_query = question.split()
        scores = bm25.get_scores(tokenized_query)

        # 상위 k개의 문서 인덱스 추출
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()

        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 BM25 Recall@{k}: {recall_at_k:.4f}")
    return recall_at_k

In [None]:
recall_bm25 = compute_bm25_recall(qa_pairs, corpus_df, k=5)

Evaluating BM25 Recall@K:  31%|███       | 27077/87599 [17:38<47:09, 21.39it/s]  

## DPR

In [None]:
def compute_dpr_recall(qa_pairs, corpus_df, ctx_emb_path, q_emb_path, k=5):
    """
    저장된 임베딩 파일을 기반으로 top-k 문서 중 정답 문서가 포함되는 비율(Recall@k)을 계산합니다.
    
    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍이 포함된 데이터프레임 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        ctx_emb_path (str): 문서 임베딩이 저장된 .npy 경로
        q_emb_path (str): 질문 임베딩이 저장된 .npy 경로
        k (int): top-k 문서 중 정답이 포함되는지 평가할 k 값
        
    Returns:
        float: Recall@k
    """
    
    # 1. 임베딩 로드
    ctx_embeddings = np.load(ctx_emb_path)
    q_embeddings = np.load(q_emb_path)

    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    hit_count = 0

    # 2. 각 질문에 대해 유사한 top-k 문서 검색
    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating Recall@K"):
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]

        # 문서들과의 유사도 (cosine 유사도 대신 dot-product 사용)
        scores = np.dot(ctx_embeddings, q_emb)

        # top-k 인덱스
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()

        # 정답 문서가 top-k에 포함되는지 확인
        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 Recall@{k}: {recall_at_k:.4f}")
    return recall_at_k

In [None]:
recall_dpr = compute_dpr_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings2.npy",
    q_emb_path="/mnt/aix7101/jeong/aix_project/dpr_q_embeddings2.npy",
    k=10
)


Evaluating Recall@K: 87599it [01:35, 914.73it/s] 

📌 Recall@10: 0.6768





## DPR-m

In [None]:
def compute_dprm_recall(
    qa_pairs: pd.DataFrame,
    corpus_df: pd.DataFrame,
    ctx_emb_path: str,
    q_emb_path: str,
    k: int = 5,
    aggregation: str = "max",  # or "mean"
) -> float:
    """
    문장 단위의 문서 임베딩을 사용하여 DPR-m 방식의 Recall@k 계산.

    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        ctx_emb_path (str): 문장 단위 문서 임베딩 저장 경로 (.npy, shape: [num_docs, num_sents, dim])
        q_emb_path (str): 질문 임베딩 저장 경로 (.npy, shape: [num_queries, dim])
        k (int): Recall@k
        aggregation (str): 'max' 또는 'mean' 방식으로 문서 유사도 집계
        
    Returns:
        float: Recall@k
    """
    
    # 1. 임베딩 로드
    ctx_embeddings = np.load(ctx_emb_path)     # shape: (num_docs, num_sents, dim)
    q_embeddings = np.load(q_emb_path)         # shape: (num_queries, dim)

    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    hit_count = 0

    # 2. 각 질문에 대해 문서들과 유사도 계산
    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating DPR-m Recall@K"):
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]                     # shape: (dim,)
        
        # 문서별 문장들과 유사도 → shape: (num_docs, num_sents)
        dot_products = np.einsum("ijk,k->ij", ctx_embeddings, q_emb)  # 효율적인 벡터 연산

        # 문서 단위 유사도 집계
        if aggregation == "max":
            scores = np.max(dot_products, axis=1)     # (num_docs,)
        elif aggregation == "mean":
            scores = np.mean(dot_products, axis=1)
        else:
            raise ValueError("aggregation은 'max' 또는 'mean'이어야 합니다.")

        # top-k 문서 인덱스 추출
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()

        # 정답 포함 여부 확인
        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 DPR-m Recall@{k} ({aggregation} aggregation): {recall_at_k:.4f}")
    return recall_at_k

In [None]:
compute_dprm_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_m_ctx_embeddings2.npy",
    q_emb_path="/mnt/aix7101/jeong/aix_project/dpr_q_embeddings2.npy",
    k=5,
    aggregation="max"
)

## hybrid (bm25 + DPR)

In [None]:
def compute_hybrid_recall(
    qa_pairs: pd.DataFrame,
    corpus_df: pd.DataFrame,
    ctx_emb_path: str,
    q_emb_path: str,
    bm25_top_n: int = 100,
    k: int = 5
) -> float:
    """
    BM25 + DPR hybrid retrieval 기반 Recall@k 계산

    Args:
        qa_pairs (pd.DataFrame): 질문-정답 쌍 (columns: ['question', 'answer', 'doc_id'])
        corpus_df (pd.DataFrame): 문서 집합 (columns: ['doc_id', 'text'])
        ctx_emb_path (str): DPR 문서 임베딩 경로 (.npy, shape: [num_docs, dim])
        q_emb_path (str): DPR 질문 임베딩 경로 (.npy, shape: [num_queries, dim])
        bm25_top_n (int): BM25로 먼저 선택할 후보 문서 개수
        k (int): 최종 DPR top-k에서 정답 포함 여부 평가

    Returns:
        float: Recall@k
    """
    # 1. 임베딩 불러오기
    ctx_embeddings = np.load(ctx_emb_path)     # shape: (num_docs, dim)
    q_embeddings = np.load(q_emb_path)         # shape: (num_queries, dim)
    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    # 2. BM25 인덱스 구성
    tokenized_corpus = [doc.split() for doc in corpus_df["text"]]
    bm25 = BM25Okapi(tokenized_corpus)

    hit_count = 0

    # 3. 각 질문에 대해 hybrid retrieval 수행
    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Evaluating Hybrid Recall@K"):
        question = row["question"]
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]  # (dim,)

        # (1) BM25 후보 추출
        tokenized_query = question.split()
        bm25_scores = bm25.get_scores(tokenized_query)
        bm25_top_indices = np.argsort(bm25_scores)[::-1][:bm25_top_n]

        # (2) DPR 유사도 계산 (bm25 후보에 한해)
        candidate_ctx_embs = ctx_embeddings[bm25_top_indices]  # (bm25_top_n, dim)
        dpr_scores = np.dot(candidate_ctx_embs, q_emb)         # (bm25_top_n,)

        # (3) DPR 기반 top-k 문서 선택
        topk_local_indices = np.argsort(dpr_scores)[::-1][:k]
        topk_doc_indices = [bm25_top_indices[i] for i in topk_local_indices]
        topk_doc_ids = corpus_df.iloc[topk_doc_indices]["doc_id"].tolist()

        # (4) 정답 포함 여부 확인
        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    recall_at_k = hit_count / len(qa_pairs)
    print(f"📌 Hybrid Recall@{k} (BM25 top-{bm25_top_n} + DPR top-{k}): {recall_at_k:.4f}")
    return recall_at_k

In [None]:
compute_hybrid_recall(
    qa_pairs=qa_pairs,
    corpus_df=corpus_df,
    ctx_emb_path="/mnt/aix7101/jeong/aix_project/dpr_ctx_embeddings2.npy",
    q_emb_path="/mnt/aix7101/jeong/aix_project/dpr_q_embeddings2.npy",
    bm25_top_n=20,
    k=5
)

## Custom Retrieval 구성 요소

In [None]:
#-- keyword extraction function
# 1. rule-based 

# 2. keyBERT

# 3. Hybrid


In [None]:
#-- retrieval

In [None]:
#-- custom checking code
