In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import json

def load_jsonl(filepath):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filepath: Path to a text file holding one JSON value per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale and
    # matches load_json. Blank lines (for example a trailing empty line) are
    # skipped rather than handed to json.loads, which would raise on them.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Dataset loading and processing
def load_dataset_from_huggingface(name="nq_open", split="train[:1000]"):
    """Load one split of a dataset through ``datasets.load_dataset``.

    Args:
        name: Dataset identifier forwarded to ``load_dataset``. Defaults to
            "nq_open", the value that used to be hard-coded.
        split: Split expression forwarded as ``split=``. Defaults to
            "train[:1000]", the value that used to be hard-coded.

    Returns:
        Whatever ``load_dataset(name, split=split)`` returns.
    """
    return load_dataset(name, split=split)

# Pickle functions
def save_pickle(obj, filename):
    """Serialize ``obj`` with pickle into the file at ``filename``.

    The file is opened in binary write mode, so existing content at that
    path is replaced.
    """
    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(obj)

def load_json(filename):
    """Return the parsed content of the UTF-8 encoded JSON file ``filename``."""
    with open(filename, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

# DPR embeddings generation
def generate_dpr_embeddings(corpus):
    """Encode ``corpus`` with the DPR context encoder.

    A fresh ``SentenceTransformer('facebook-dpr-ctx_encoder-multiset-base')``
    is constructed on every call, and ``corpus`` is passed to its ``encode``
    method in batches of 64 with a progress bar and
    ``convert_to_tensor=False``.

    Returns:
        The value returned by ``encode`` (not a torch tensor, since
        ``convert_to_tensor=False``).
    """
    encode_options = dict(batch_size=64, convert_to_tensor=False, show_progress_bar=True)
    context_model = SentenceTransformer('facebook-dpr-ctx_encoder-multiset-base')
    return context_model.encode(corpus, **encode_options)

def generate_query_embeddings(queries):
    """Encode ``queries`` with the DPR question encoder.

    A fresh ``SentenceTransformer('facebook-dpr-question_encoder-multiset-base')``
    is constructed on every call, and ``queries`` is passed to its ``encode``
    method in batches of 64 with a progress bar and
    ``convert_to_tensor=False``.

    Returns:
        The value returned by ``encode`` (not a torch tensor, since
        ``convert_to_tensor=False``).
    """
    encode_options = dict(batch_size=64, convert_to_tensor=False, show_progress_bar=True)
    question_model = SentenceTransformer('facebook-dpr-question_encoder-multiset-base')
    return question_model.encode(queries, **encode_options)

# FAISS index creation for DPR
def build_faiss_index(embeddings, metric="l2"):
    """Build a flat (exhaustive-search) FAISS index over ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).
        metric: "l2" (default, the original behaviour) builds a
            ``faiss.IndexFlatL2``; "ip" builds a ``faiss.IndexFlatIP``, which
            ranks by inner product like the ``np.dot`` scoring used by the
            retrieval functions in this notebook.

    Returns:
        The FAISS index with all vectors added.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or ``metric`` is unknown.
    """
    # FAISS works on C-contiguous float32 data; coerce up front so float64
    # arrays, array slices or nested lists do not fail inside index.add on
    # builds that do not convert for the caller.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim); got an array with {vectors.ndim} dimension(s)"
        )

    if metric == "l2":
        index = faiss.IndexFlatL2(vectors.shape[1])
    elif metric == "ip":
        index = faiss.IndexFlatIP(vectors.shape[1])
    else:
        raise ValueError(f"metric must be 'l2' or 'ip'; got {metric!r}")

    index.add(vectors)
    return index

In [2]:
import spacy
from keybert import KeyBERT
from typing import List
import re



# Load the spaCy pipeline used for rule-based extraction. It is loaded exactly
# once: the statement used to be duplicated, which loaded the model twice.
nlp = spacy.load("en_core_web_sm")

# Interrogative words that extract_keyphrases_spacy looks for at token level.
WH_WORDS = {"what", "who", "whom", "where", "when", "why", "how"}

def extract_keyphrases_spacy(question: str):
    """Extract keyphrases from a question with spaCy noun chunks.

    The question is lowercased and parsed with the module-level ``nlp``
    pipeline. Every noun chunk that contains at least one non-stop-word token
    tagged NOUN or PROPN is kept. If the first token found in ``WH_WORDS`` is
    "who", "where", "when", "why" or "how", a one-word answer-type hint
    ("person", "location", "time", "reason", "method") is added as well.

    Args:
        question: The question text.

    Returns:
        list[str]: Unique keyphrases, noun chunks in document order followed
        by the hint (if any and not already present). The order is
        deterministic; the earlier ``list(set(...))`` version returned the same
        elements in an order that could vary between interpreter runs.
    """
    doc = nlp(question.lower())

    # First interrogative token in the question, or None when there is none.
    wh_word = next((token.text for token in doc if token.text in WH_WORDS), None)

    # A dict is used as an insertion-ordered set so duplicates are dropped
    # without losing document order.
    ordered_phrases = {}
    for chunk in doc.noun_chunks:
        if any(not token.is_stop and token.pos_ in {"NOUN", "PROPN"} for token in chunk):
            ordered_phrases[chunk.text.strip()] = None

    hint_map = {
        "who": "person",
        "where": "location",
        "when": "time",
        "why": "reason",
        "how": "method",
    }
    # "what" and "whom" are recognised as wh-words but carry no hint.
    hint = hint_map.get(wh_word) if wh_word else None
    if hint:
        ordered_phrases[hint] = None

    return list(ordered_phrases)



# KeyBERT based extraction
# Module-level KeyBERT instance, constructed once with the 'all-MiniLM-L6-v2'
# model name and reused by extract_keyphrases_keybert on every call.
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

def extract_keyphrases_keybert(question: str, top_n: int = 5, diversity: bool = False) -> List[str]:
    """Extract up to ``top_n`` keyphrases from a question with KeyBERT.

    Args:
        question: The question text.
        top_n: Forwarded to ``kw_model.extract_keywords`` as ``top_n``.
        diversity: When true, the extractor is additionally called with
            ``use_mmr=True`` and ``diversity=0.7``.

    Returns:
        The phrase component of each (phrase, score) pair returned by the
        extractor, in the extractor's order.
    """
    # Light preprocessing: lowercase, then drop every character that is
    # neither a word character nor whitespace.
    normalized = re.sub(r"[^\w\s]", "", question.lower())

    # Options shared by both modes; the MMR options are added only on request.
    extraction_kwargs = dict(
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        top_n=top_n,
    )
    if diversity:
        extraction_kwargs.update(use_mmr=True, diversity=0.7)

    scored_phrases = kw_model.extract_keywords(normalized, **extraction_kwargs)
    return [text for text, _score in scored_phrases]


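# # Unified extraction interface (disabled draft: it calls spacy_extract and
# # keybert_extract, which are not defined in this notebook; the extractors
# # defined above are extract_keyphrases_spacy and extract_keyphrases_keybert)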
# def extract_keywords(text, method="spacy", topk=5):
#     if method == "spacy":
#         return spacy_extract(text, topk)
#     elif method == "keybert":
#         return keybert_extract(text, topk)
#     else:
#         raise ValueError("Unknown extraction method")


In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import torch

from rank_bm25 import BM25Okapi
from tqdm import tqdm
import pandas as pd
import numpy as np


import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score


# Simple token-based F1 scorer
def compute_simple_f1(references, predictions):
    """Mean token-overlap F1 over paired reference/prediction strings.

    Each string is lowercased and split on whitespace, and the two token
    collections are compared as sets, so a repeated token counts once. A pair
    with no shared token scores 0. Pairs are formed with ``zip``, so surplus
    items in the longer sequence are ignored.

    Args:
        references: Iterable of ground-truth strings.
        predictions: Iterable of predicted strings.

    Returns:
        float: The mean per-pair F1, or 0.0 when there are no pairs (the
        earlier version returned NaN with a NumPy RuntimeWarning in that case).
    """
    f1_scores = []
    for ref, pred in zip(references, predictions):
        ref_tokens = set(ref.lower().split())
        pred_tokens = set(pred.lower().split())
        overlap = len(ref_tokens & pred_tokens)
        if overlap == 0:
            f1_scores.append(0.0)
            continue
        # overlap > 0 guarantees both token sets are non-empty.
        precision = overlap / len(pred_tokens)
        recall = overlap / len(ref_tokens)
        f1_scores.append(2 * precision * recall / (precision + recall))

    if not f1_scores:
        return 0.0
    return float(np.mean(f1_scores))


# Baseline BM25 + generator evaluation function
def bm25_retrieval_with_generation(
    qa_pairs: pd.DataFrame,
    corpus_df: pd.DataFrame,
    generator_tokenizer,
    generator_model,
    k: int = 5,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Answer each question from its top-``k`` BM25 passages and score the answers.

    A ``BM25Okapi`` index is built over the whitespace-split ``corpus_df["text"]``
    column. For every row of ``qa_pairs`` the whitespace-split question is
    scored against the index, the ``k`` best passages are joined into a
    "question: ... context: ..." prompt (truncated to 512 tokens by the
    tokenizer), and the generator produces an answer with ``max_length=64``.

    Args:
        qa_pairs: Frame whose rows provide "question" and "answer"; the answer
            must support ``.strip()``.
        corpus_df: Frame with a "text" column.
        generator_tokenizer: Callable tokenizer that also provides ``decode``.
        generator_model: Model providing ``to``, ``eval`` and ``generate``.
        k: Number of passages placed in the prompt.
        device: Device the model and the encoded prompt are moved to.

    Returns:
        tuple: ``(acc, f1)`` where ``acc`` is ``accuracy_score`` over the
        stripped reference/prediction strings and ``f1`` comes from
        ``compute_simple_f1``. Both values are also printed.
    """
    generator_model.to(device)
    generator_model.eval()

    # Whitespace tokenisation is used for the corpus and the query alike.
    bm25 = BM25Okapi([passage.split() for passage in corpus_df["text"]])

    predictions, references = [], []

    progress = tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="BM25 + Generator Evaluation")
    for _, example in progress:
        question, gt_answer = example["question"], example["answer"]

        # Rank all passages by BM25 score, best first, and keep k of them.
        ranking = np.argsort(bm25.get_scores(question.split()))[::-1]
        passages = corpus_df.iloc[ranking[:k]]["text"].tolist()

        # Build the generator input.
        prompt = f"question: {question} context: {' '.join(passages)}"
        encoded = generator_tokenizer(
            prompt, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(device)

        with torch.no_grad():
            generated = generator_model.generate(**encoded, max_length=64)
            answer = generator_tokenizer.decode(generated[0], skip_special_tokens=True)

        predictions.append(answer.strip())
        references.append(gt_answer.strip())

    acc = accuracy_score(references, predictions)
    f1 = compute_simple_f1(references, predictions)

    print(f"📌 BM25+Generator Accuracy: {acc:.4f}")
    print(f"📌 BM25+Generator F1 Score: {f1:.4f}")

    return acc, f1


# DPR retrieval
def dpr_retrieval_with_generation(
    qa_pairs,
    corpus_df,
    q_embeddings,
    ctx_embeddings,
    generator_tokenizer,
    generator_model,
    k=5,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Answer each question from its top-``k`` dot-product passages and score the answers.

    For every row of ``qa_pairs`` the question embedding is looked up as
    ``q_embeddings[idx]``, where ``idx`` is the row's index label from
    ``iterrows()``. All context embeddings are scored against it with
    ``np.dot``; the ``k`` best passages are joined into a
    "question: ... context: ..." prompt (truncated to 512 tokens by the
    tokenizer) and the generator produces an answer with ``max_length=64``.

    Args:
        qa_pairs: Frame whose rows provide "question" and "answer"; its index
            labels are used to index ``q_embeddings``.
        corpus_df: Frame with a "text" column, positionally aligned with
            ``ctx_embeddings``.
        q_embeddings: Question embeddings indexable by ``qa_pairs`` index label.
        ctx_embeddings: Context embeddings accepted by ``np.dot(ctx, q)``.
        generator_tokenizer: Callable tokenizer that also provides ``decode``.
        generator_model: Model providing ``to``, ``eval`` and ``generate``.
        k: Number of passages placed in the prompt.
        device: Device the model and the encoded prompt are moved to.

    Returns:
        tuple: ``(acc, f1)`` from ``accuracy_score`` and ``compute_simple_f1``
        over the stripped strings. Both values are also printed.
    """
    generator_model.to(device)
    generator_model.eval()

    predictions, references = [], []

    progress = tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="DPR + Generator Evaluation")
    for idx, example in progress:
        question, gt_answer = example["question"], example["answer"]
        query_vector = q_embeddings[idx]

        # Dense retrieval: one dot product per context, best scores first.
        similarity = np.dot(ctx_embeddings, query_vector)
        best = np.argsort(similarity)[::-1][:k]
        passages = corpus_df.iloc[best]["text"].tolist()

        # Build the generator input.
        prompt = f"question: {question} context: {' '.join(passages)}"
        encoded = generator_tokenizer(
            prompt, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(device)

        with torch.no_grad():
            generated = generator_model.generate(**encoded, max_length=64)
            answer = generator_tokenizer.decode(generated[0], skip_special_tokens=True)

        predictions.append(answer.strip())
        references.append(gt_answer.strip())

    acc = accuracy_score(references, predictions)
    f1 = compute_simple_f1(references, predictions)

    print(f"📌 DPR+Generator Accuracy: {acc:.4f}")
    print(f"📌 DPR+Generator F1 Score: {f1:.4f}")

    return acc, f1


# Sentence DPR retrieval
def s_dpr_retrieval(qa_pairs, corpus_df, q_embeddings, ctx_embeddings, k=5, aggregation="max"):
    """Recall@k for sentence-level DPR retrieval.

    Each element of ``ctx_embeddings`` holds the embedding(s) of one document.
    Its dot products with the question embedding are reduced to a single
    document score with ``np.max`` or ``np.mean``; the ``k`` best documents are
    then checked for the ground-truth ``doc_id``.

    Args:
        qa_pairs: Frame whose rows provide "doc_id"; its index labels are used
            to index ``q_embeddings``.
        corpus_df: Frame with a "doc_id" column, positionally aligned with
            ``ctx_embeddings``.
        q_embeddings: Question embeddings; must have the same length as
            ``qa_pairs``.
        ctx_embeddings: Iterable of per-document embedding arrays.
        k: Number of documents that count as retrieved.
        aggregation: "max" or "mean".

    Returns:
        float: Fraction of questions whose ground-truth document is in the
        top ``k``; 0.0 when ``qa_pairs`` is empty. The value is also printed.

    Raises:
        AssertionError: If ``q_embeddings`` and ``qa_pairs`` differ in length.
        ValueError: If ``aggregation`` is neither "max" nor "mean".
    """
    assert len(q_embeddings) == len(qa_pairs), "❗ 질문 임베딩 수와 QA 쌍 수가 일치하지 않습니다."

    # Validate once, before any scoring. The check used to sit inside the
    # per-document loop, so it was repeated for every document and never ran
    # at all for an empty corpus.
    if aggregation == "max":
        aggregate = np.max
    elif aggregation == "mean":
        aggregate = np.mean
    else:
        raise ValueError("aggregation은 'max' 또는 'mean'이어야 합니다.")

    num_questions = len(qa_pairs)
    hit_count = 0
    for idx, row in tqdm(qa_pairs.iterrows(), total=num_questions, desc="Evaluating DPR-m Recall@K"):
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]

        # One aggregated score per document.
        scores = np.array([aggregate(np.dot(doc_sents, q_emb)) for doc_sents in ctx_embeddings])
        topk_indices = np.argsort(scores)[::-1][:k]
        topk_doc_ids = corpus_df.iloc[topk_indices]["doc_id"].tolist()

        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    # With no questions the ratio is undefined; report 0.0 instead of
    # raising ZeroDivisionError.
    recall_at_k = hit_count / num_questions if num_questions else 0.0
    print(f"📌 sentenceDPR Recall@{k} ({aggregation} aggregation): {recall_at_k:.4f}")
    return recall_at_k


# Hybrid retrieval (BM25 + DPR)
def hybrid_retrieval(qa_pairs, corpus_df, q_embeddings, ctx_embeddings, bm25_top_n=300, k=5):
    """Recall@k for BM25 candidate selection followed by dot-product re-ranking.

    For every question the ``bm25_top_n`` best BM25 passages (whitespace
    tokenisation on both sides) form the candidate pool. The candidates are
    re-scored with ``np.dot`` against the question embedding and the ``k``
    best are checked for the ground-truth ``doc_id``.

    Args:
        qa_pairs: Frame whose rows provide "question" and "doc_id"; its index
            labels are used to index ``q_embeddings``.
        corpus_df: Frame with "text" and "doc_id" columns, positionally aligned
            with ``ctx_embeddings``.
        q_embeddings: Question embeddings indexable by ``qa_pairs`` index label.
        ctx_embeddings: Array of context embeddings that supports indexing
            with an integer index array.
        bm25_top_n: Size of the BM25 candidate pool.
        k: Number of documents that count as retrieved.

    Returns:
        float: Fraction of questions whose ground-truth document is in the
        top ``k``; 0.0 when ``qa_pairs`` is empty. The value is also printed.
    """
    num_questions = len(qa_pairs)

    tokenized_corpus = [doc.split() for doc in corpus_df["text"]]
    bm25 = BM25Okapi(tokenized_corpus)

    hit_count = 0
    for idx, row in tqdm(qa_pairs.iterrows(), total=num_questions, desc="Evaluating Hybrid Recall@K"):
        question = row["question"]
        gt_doc_id = row["doc_id"]
        q_emb = q_embeddings[idx]

        # Stage 1: lexical candidate pool, best BM25 scores first.
        bm25_scores = bm25.get_scores(question.split())
        bm25_top_indices = np.argsort(bm25_scores)[::-1][:bm25_top_n]

        # Stage 2: dense re-ranking restricted to the candidate pool.
        dpr_scores = np.dot(ctx_embeddings[bm25_top_indices], q_emb)
        topk_local_indices = np.argsort(dpr_scores)[::-1][:k]

        # Map positions within the pool back to corpus row positions.
        topk_doc_indices = [bm25_top_indices[i] for i in topk_local_indices]
        topk_doc_ids = corpus_df.iloc[topk_doc_indices]["doc_id"].tolist()

        if gt_doc_id in topk_doc_ids:
            hit_count += 1

    # With no questions the ratio is undefined; report 0.0 instead of
    # raising ZeroDivisionError.
    recall_at_k = hit_count / num_questions if num_questions else 0.0
    print(f"📌 Hybrid Recall@{k} (BM25 top-{bm25_top_n} + DPR top-{k}): {recall_at_k:.4f}")
    return recall_at_k

# Ours retrieval
def key_retrieval_with_generation(
    ctx_tokenizer,
    ctx_encoder,
    qa_pairs,
    corpus_df,
    q_embeddings,
    ctx_embeddings,
    extract_keyphrases_fn,
    generator_tokenizer,
    generator_model,
    top_n_per_keyphrase=300,
    k=5,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Keyphrase-guided retrieval followed by answer generation and scoring.

    For every question:
      1. ``extract_keyphrases_fn(question)`` yields keyphrases, which are
         embedded with ``ctx_encoder`` (its ``pooler_output``).
      2. Each keyphrase selects its ``top_n_per_keyphrase`` best contexts by
         dot product; the union of those contexts is the candidate pool.
      3. The pool is re-ranked by dot product with the question embedding and
         the ``k`` best passages are joined into a
         "question: ... context: ..." prompt for the generator.

    Every question contributes to the metrics. When no keyphrase is extracted
    the whole corpus is used as the candidate pool, so the question is ranked
    by the question embedding alone. Such questions used to be skipped
    silently, which meant accuracy and F1 were computed over fewer questions
    than the baselines see.

    Args:
        ctx_tokenizer: Tokenizer for ``ctx_encoder``.
        ctx_encoder: Encoder whose output exposes ``pooler_output``; it is
            moved to ``device`` and put in eval mode here.
        qa_pairs: Frame whose rows provide "question" and "answer"; its index
            labels are used to index ``q_embeddings``.
        corpus_df: Frame with a "text" column, positionally aligned with
            ``ctx_embeddings``.
        q_embeddings: Question embeddings indexable by ``qa_pairs`` index label.
        ctx_embeddings: Array of context embeddings that supports indexing
            with an integer index array.
        extract_keyphrases_fn: Callable mapping a question to a list of strings.
        generator_tokenizer: Callable tokenizer that also provides ``decode``.
        generator_model: Model providing ``to``, ``eval`` and ``generate``.
        top_n_per_keyphrase: Number of contexts each keyphrase contributes.
        k: Number of passages placed in the prompt.
        device: Device used for both models and their inputs.

    Returns:
        tuple: ``(acc, f1)`` from ``accuracy_score`` and ``compute_simple_f1``
        over the stripped strings. Both values are also printed.
    """
    generator_model.to(device)
    generator_model.eval()
    # The keyphrase inputs are moved to `device` below, so the encoder has to
    # be on the same device; handle it here the same way as the generator.
    ctx_encoder.to(device)
    ctx_encoder.eval()

    predictions = []
    references = []

    for idx, row in tqdm(qa_pairs.iterrows(), total=len(qa_pairs), desc="Keyphrase + Generator Evaluation"):
        question = row["question"]
        gt_answer = row["answer"]

        # 1) Keyphrase extraction
        keyphrases = extract_keyphrases_fn(question)

        # 2) Candidate document selection
        if keyphrases:
            inputs = ctx_tokenizer(
                keyphrases,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)

            with torch.no_grad():
                phrase_embs = ctx_encoder(**inputs).pooler_output.cpu().numpy()

            per_phrase_top = [
                np.argsort(np.dot(ctx_embeddings, emb))[::-1][:top_n_per_keyphrase]
                for emb in phrase_embs
            ]
            # Sorted union of the per-keyphrase candidates.
            candidate_indices = np.unique(np.concatenate(per_phrase_top))
            candidate_embs = ctx_embeddings[candidate_indices]
        else:
            # No keyphrases: fall back to the full corpus rather than dropping
            # the question from the evaluation.
            candidate_indices = np.arange(len(ctx_embeddings))
            candidate_embs = ctx_embeddings

        # Re-rank the pool with the question embedding.
        query_emb = q_embeddings[idx]
        rerank_scores = np.dot(candidate_embs, query_emb)
        top_k_local = np.argsort(rerank_scores)[::-1][:k]
        top_k_passages = corpus_df.iloc[candidate_indices[top_k_local]]["text"].tolist()

        # 3) Feed the generator
        prompt = f"question: {question} context: {' '.join(top_k_passages)}"
        gen_inputs = generator_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = generator_model.generate(**gen_inputs, max_length=64)
            answer = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

        predictions.append(answer.strip())
        references.append(gt_answer.strip())

    acc = accuracy_score(references, predictions)
    f1 = compute_simple_f1(references, predictions)

    print(f"📌 Keyphrase+Generator Accuracy: {acc:.4f}")
    print(f"📌 Keyphrase+Generator F1 Score: {f1:.4f}")

    return acc, f1

In [4]:

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch

from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from dataset import load_jsonl, load_json, build_faiss_index, generate_dpr_embeddings, generate_query_embeddings
from retrieval import bm25_retrieval_with_generation, dpr_retrieval_with_generation, s_dpr_retrieval, hybrid_retrieval, key_retrieval_with_generation
from key_extract import extract_keyphrases_spacy, extract_keyphrases_keybert



# Answer generator shared by every *_with_generation evaluation below.
generator_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
generator_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Data paths
# NOTE(review): absolute, machine-specific directory; consider making it configurable.
dataset_dir = "/mnt/aix7101/jeong/aix_project"
corpus_path = f"{dataset_dir}/nq_rag_corpus.json"
query_path = f"{dataset_dir}/nq_rag_qa_pairs.json"
corpus_emb_path = f"{dataset_dir}/wiki_dpr_embeddings.npy"
# Defined but not used anywhere in this cell.
corpus_sentence_emb_path = f"{dataset_dir}/wiki_s_dpr_embeddings.npy"
query_emb_path = f"{dataset_dir}/nq_q_embeddings.npy"

# Load data
# NOTE(review): both paths end in ".json" but are read with load_jsonl (one
# JSON value per line) - confirm the files really are JSON Lines.
print("Loading corpus and queries...")
corpus = load_jsonl(corpus_path)
corpus_df = pd.DataFrame(corpus)
queries = load_jsonl(query_path)
queries_df = pd.DataFrame(queries)

# Load embeddings
# presumably row-aligned with corpus_df / queries_df respectively - TODO confirm
print("Loading pre-computed embeddings...")
corpus_embeddings = np.load(corpus_emb_path)
query_embeddings = np.load(query_emb_path)

# Context encoder, passed to key_retrieval_with_generation below.
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

# Question encoder is disabled; question embeddings come from query_emb_path instead.
# q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-multiset-base")
# q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")

ctx_encoder.eval()
# q_encoder.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctx_encoder.to(device)
# q_encoder.to(device)

# Keyphrase extractor handed to the keyphrase-based retrieval.
extract_keyphrases_fn = extract_keyphrases_keybert

# Baseline runs (disabled); only the keyphrase-based evaluation is executed.
# bm25_retrieval_with_generation(queries_df, corpus_df, generator_tokenizer, generator_model, k=5)
# dpr_retrieval_with_generation(queries_df, corpus_df, query_embeddings, corpus_embeddings, generator_tokenizer, generator_model, k=5)
# s_dpr_retrieval(queries_df, corpus_df, query_embeddings, corpus_embeddings, k=5)
# hybrid_retrieval(queries_df, corpus_df, query_embeddings, corpus_embeddings, k=5)
key_retrieval_with_generation(ctx_tokenizer, ctx_encoder, queries_df, corpus_df, query_embeddings, corpus_embeddings, extract_keyphrases_fn, generator_tokenizer, generator_model, k=5)


Loading corpus and queries...


KeyboardInterrupt: 