In [None]:
from datasets import load_dataset
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from sklearn.metrics import f1_score
import numpy as np
import faiss
import torch
import pandas as pd
from tqdm import tqdm

import json
import os


In [None]:

from datasets import load_dataset

# nq_open 데이터셋 로드
dataset = load_dataset("nq_open", split="train")

# 예시 확인
print(dataset[0])

{'question': 'where did they film hot tub time machine', 'answer': ['Fernie Alpine Resort']}


In [None]:
from datasets import load_dataset

# DPR Wikipedia passages 로드
corpus = load_dataset("wiki_dpr", "psgs_w100.nq.no_index", split="train[:5000000]")

# 샘플 확인
print(corpus[0])

Downloading data:   0%|          | 0/157 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/21015300 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/161 [00:00<?, ?it/s]

{'id': '1', 'text': 'Aaron Aaron ( or ; "Ahärôn") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother\'s spokesman ("prophet") to the Pharaoh. Part of the Law (Torah) that Moses received from', 'title': 'Aaron', 'embeddings': [0.013342111371457577, 0.582173764705658, -0.31309744715690613, -0.6991612911224365, -0.5583199858665466, 0.5187504887580872, 0.7152731418609619, -0.08567414432764053, -0.24895088374614716, -0.4495537281036377, -0.643000066280365, 0.11746902763843536, -0.22123917937278748, 0.30100083351135254, 0.08902842551469803, 0.018262844532728195,

In [None]:
corpus_df = pd.DataFrame({
    "doc_id": corpus["id"],
    "title": corpus["title"],
    "text": corpus["text"]
})

In [10]:
corpus_df.to_json("/mnt/aix7101/jeong/aix_project/nq_rag_corpus.json", orient="records", lines=True)

In [11]:
corpus_df.head()

Unnamed: 0,doc_id,title,text
0,1,Aaron,"Aaron Aaron ( or ; ""Ahärôn"") is a prophet, hig..."
1,2,Aaron,God at Sinai granted Aaron the priesthood for ...
2,3,Aaron,his rod turn into a snake. Then he stretched o...
3,4,Aaron,"however, Aaron and Hur remained below to look ..."
4,5,Aaron,"Aaron and his sons to the priesthood, and arra..."


In [17]:
dataset[0]

{'question': 'where did they film hot tub time machine',
 'answer': ['Fernie Alpine Resort']}

In [13]:
qa_data = []
for item in dataset:
    question = item["question"]
    answers = item["answer"]
    
    # answers가 비어있을 수도 있어서 첫 정답만
    answer = answers[0] if answers else ""
    
    qa_data.append({
        "question": question,
        "answer": answer
    })

In [14]:
qa_pairs = pd.DataFrame(qa_data)

In [15]:
qa_pairs.to_json("qa_pairs.json", orient="records", lines=True)


In [16]:
print("Corpus 예시:")
print(corpus_df.head())

print("\nQA 쌍 예시:")
print(qa_pairs.head())

Corpus 예시:
  doc_id  title                                               text
0      1  Aaron  Aaron Aaron ( or ; "Ahärôn") is a prophet, hig...
1      2  Aaron  God at Sinai granted Aaron the priesthood for ...
2      3  Aaron  his rod turn into a snake. Then he stretched o...
3      4  Aaron  however, Aaron and Hur remained below to look ...
4      5  Aaron  Aaron and his sons to the priesthood, and arra...

QA 쌍 예시:
                                            question                answer
0           where did they film hot tub time machine  Fernie Alpine Resort
1   who has the right of way in international waters        Neither vessel
2            who does annie work for attack on titan                Marley
3  when was the immigration reform and control ac...      November 6, 1986
4              when was puerto rico added to the usa                  1950


In [20]:

# 3. Load DPR model and tokenizer (use multi-qa)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [21]:
ctx_encoder.eval()
q_encoder.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctx_encoder.to(device)
q_encoder.to(device)


DPRQuestionEncoder(
  (question_encoder): DPREncoder(
    (bert_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_feature

In [27]:
batch_size = 16
ctx_embeddings = []

for i in tqdm(range(0, len(corpus_df), batch_size), desc="Encoding contexts"):
    batch_texts = corpus_df["text"].iloc[i:i+batch_size].tolist()
    batch_texts = [str(t).strip() for t in batch_texts]

    inputs = ctx_tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = ctx_encoder(**inputs)
        emb_batch = output.pooler_output.cpu().numpy()  # or output.last_hidden_state[:, 0]
        ctx_embeddings.append(emb_batch)

ctx_embeddings = np.vstack(ctx_embeddings)

Encoding contexts:   0%|          | 469/1313457 [00:19<15:30:55, 23.51it/s]


KeyboardInterrupt: 

In [None]:
batch_size = 32  # 필요에 따라 조정 가능
q_embeddings = []

questions = qa_pairs["question"].tolist()

for i in tqdm(range(0, len(questions), batch_size), desc="Encoding questions"):
    batch_questions = questions[i:i+batch_size]
    batch_questions = [str(q).strip() for q in batch_questions]

    inputs = q_tokenizer(batch_questions, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = q_encoder(**inputs)
        emb_batch = output.pooler_output.cpu().numpy()  # or output.last_hidden_state[:, 0]
        q_embeddings.append(emb_batch)

q_embeddings = np.vstack(q_embeddings)

Encoding questions: 100%|██████████| 87925/87925 [08:58<00:00, 163.40it/s]


In [None]:
# 4. 저장
embedding_dir = "/mnt/aix7101/jeong/aix_project"
if not os.path.exists(embedding_dir):
    os.makedirs(embedding_dir)
    print(f"📁 Created directory: {embedding_dir}")

ctx_path = os.path.join(embedding_dir, "nq_ctx_embeddings.npy")
# q_path = os.path.join(embedding_dir, "nq_q_embeddings.npy")

np.save(ctx_path, ctx_embeddings)
# np.save(q_path, q_embeddings)

print(f"✅ Context embeddings saved to: {ctx_path}")
# print(f"✅ Question embeddings saved to: {q_path}")

✅ Question embeddings saved to: /mnt/aix7101/jeong/aix_project/nq_q_embeddings.npy


In [None]:
from nltk.tokenize import sent_tokenize
import numpy as np
from tqdm import tqdm

batch_size = 16  # GPU 상황에 따라 조정
ctx_sentence_embeddings = []

for doc in tqdm(corpus_df["text"], desc="Encoding multi-sentence contexts"):
    # 1. 문서 내 문장 분리
    sentences = sent_tokenize(doc)
    doc_embeddings = []

    # 2. 문장들을 배치로 처리
    for i in range(0, len(sentences), batch_size):
        batch_sents = sentences[i:i+batch_size]
        inputs = ctx_tokenizer(
            batch_sents,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = ctx_encoder(**inputs)
            emb_batch = output.pooler_output.cpu().numpy()  # or last_hidden_state[:, 0]
            doc_embeddings.append(emb_batch)

    # 3. 문서 하나에 대한 (문장 수, dim) 배열 생성
    doc_embeddings = np.vstack(doc_embeddings)
    ctx_sentence_embeddings.append(doc_embeddings)
    
# 4. 문서별 문장 수가 달라 3D 배열로 만들고 싶을 경우
max_len = max(e.shape[0] for e in ctx_sentence_embeddings)
dim = ctx_sentence_embeddings[0].shape[1]

padded_embeddings = np.zeros((len(ctx_sentence_embeddings), max_len, dim))
for i, emb in enumerate(ctx_sentence_embeddings):
    padded_embeddings[i, :emb.shape[0], :] = emb

In [None]:
# 4. 저장
embedding_dir = "/mnt/aix7101/jeong/aix_project"
if not os.path.exists(embedding_dir):
    os.makedirs(embedding_dir)
    print(f"📁 Created directory: {embedding_dir}")

sentence_ctx_path = os.path.join(embedding_dir, "nq_dpr_s_ctx_embeddings_multiqa.npy")
ctx_sentence_embeddings = np.array(ctx_sentence_embeddings, dtype=object)
np.save(sentence_ctx_path, ctx_sentence_embeddings, allow_pickle=True)

print(f"✅ Context Sentence embeddings saved to: {sentence_ctx_path}")