In [13]:
# Install or upgrade the third-party packages this notebook imports, using the kernel's environment.
# NOTE(review): no versions are pinned and -U is used, so a later re-run may resolve to different releases.
%pip install -U langchain langchain-community langchain-openai pypdf faiss-cpu gradio FlagEmbedding sentence-transformers tiktoken rank_bm25


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import os
# OpenAI API key setup: the key is read from the environment, optionally populated from a .env file.
from dotenv import load_dotenv

# Copy variables from a local .env file (if one exists) into the process environment.
load_dotenv()
# Read the key for reference. No client is created here; the OpenAI objects built in
# later cells are constructed without an explicit key (presumably they read the same
# environment variable -- confirm against the langchain-openai documentation).
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    # Warn instead of raising so the Colab alternative below can still be used.
    print("[Warn] OPENAI_API_KEY is not set; embedding and chat calls will fail until it is provided.")

# Colab alternative: read the key from Colab user secrets and export it.
# from google.colab import userdata
# api_key=userdata.get('api_key')
# os.environ["OPENAI_API_KEY"] = api_key


In [None]:
# 🚀 Colab Chatbot: Ensemble Retriever (BM25 + FAISS) + Cross-Encoder Reranker
# - 1차: 앙상블 리트리버 (FAISS dense + BM25 sparse)
# - 2차: Cross-Encoder reranking (BAAI/bge-reranker-large)
# - 최종 질의응답: OpenAI Chat(기본)
# - Gradio UI 제공
#
# ⚠️ 설치 명령은 본 스크립트에서 실행하지 않습니다. Colab에서는 아래를 별도 셀로 실행하세요:
# %pip install -U langchain langchain-community langchain-openai pypdf faiss-cpu gradio FlagEmbedding sentence-transformers rank-bm25 tiktoken

# ==========================
# 1) Imports & Env Checks
# ==========================
import glob
import hashlib
import os
import textwrap
import urllib.request
from typing import List

import gradio as gr
from FlagEmbedding import FlagReranker
from langchain.docstore.document import Document
from langchain.schema import SystemMessage, HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from rank_bm25 import BM25Okapi

# ==========================
# 2) Load Documents
# ==========================

PDF_URL = "https://github.com/chatgpt-kr/openai-api-tutorial/raw/main/ch07/2020_%EA%B2%BD%EC%A0%9C%EA%B8%88%EC%9C%B5%EC%9A%94%EC%96%B4%20700%EC%84%A0_%EA%B2%8C%EC%8B%9C.pdf"
PDF_PATH = "2020_경제금융용어 700선_게시.pdf"

# Download the sample PDF once. The download goes to a temporary ".part" name and is
# renamed only on success, so an interrupted transfer never leaves a truncated file
# at PDF_PATH that later runs would mistake for a complete download.
if not os.path.exists(PDF_PATH):
    partial_path = PDF_PATH + ".part"
    try:
        urllib.request.urlretrieve(PDF_URL, partial_path)
        os.replace(partial_path, PDF_PATH)
    except Exception as e:
        # Best effort: other PDFs in the working directory may still be usable.
        print("[Warn] PDF download error:", e)
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Every PDF in the working directory is part of the corpus.
pdf_files = sorted(glob.glob("*.pdf"))
if not pdf_files:
    raise FileNotFoundError("No PDFs found – please upload or place PDFs in the working directory.")

all_docs: List[Document] = []
seen_digests = set()  # SHA-256 of each file already handled, to skip byte-identical copies
loaded_files = 0
for f in pdf_files:
    try:
        with open(f, "rb") as fh:
            digest = hashlib.sha256(fh.read()).hexdigest()
        if digest in seen_digests:
            # A duplicate file would put every page into the index twice.
            print(f"[Warn] Skipping {f}: identical to an already processed PDF")
            continue
        seen_digests.add(digest)
        loader = PyPDFLoader(f)
        all_docs.extend(loader.load())
        loaded_files += 1
    except Exception as e:
        print(f"[Warn] Failed to load {f}: {e}")

if not all_docs:
    # Without pages nothing can be indexed; stop here with a clear message.
    raise ValueError("No pages could be loaded from the PDFs in the working directory.")

print(f"Loaded pages: {len(all_docs)} from {loaded_files} PDF(s)")

# ==========================
# 3) Chunking
# ==========================
# Split the loaded pages into overlapping 500-character chunks. The separators are
# listed from coarse to fine: blank line, newline, space, then the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=100,
    chunk_size=500,  # kept small to avoid hitting token limits
)
chunked_docs = text_splitter.split_documents(all_docs)
print("Chunks:", len(chunked_docs))

# ==========================
# 4) Build FAISS index (Dense Retriever)
# ==========================
# Fail fast: with no chunks the loop below never runs, `vectordb` stays None, and the
# problem would only surface as an AttributeError on the first query.
if not chunked_docs:
    raise ValueError("No text chunks to index; check that the PDFs contain extractable text.")

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Process documents in batches to avoid exceeding token limits
batch_size = 500 # Adjust batch size based on your document length and token limit
vectordb = None

for i in range(0, len(chunked_docs), batch_size):
    batch = chunked_docs[i : i + batch_size]
    if vectordb is None:
        # The first batch creates the index; later batches are appended to it.
        vectordb = FAISS.from_documents(batch, embeddings)
    else:
        vectordb.add_documents(batch)

# ==========================
# 5) Build BM25 index (Sparse Retriever)
# ==========================
# Whitespace-tokenize every chunk; queries must be tokenized the same way at search time.
tokenized_corpus = []
for doc in chunked_docs:
    tokenized_corpus.append(doc.page_content.split())
bm25 = BM25Okapi(tokenized_corpus)

# ==========================
# 6) Ensemble Retriever (Dense + Sparse)
# ==========================
def make_doc_key(doc: Document) -> str:
    """Build a lookup key for a document from its source, page and leading text.

    The key has the form "<source>-<page>-<first 50 characters of page_content>".
    Missing metadata fields and empty or None content contribute empty strings.
    Two documents that share all three parts get the same key.
    """
    meta = doc.metadata
    text = doc.page_content if doc.page_content else ""
    return f"{meta.get('source', '')}-{str(meta.get('page', ''))}-{text[:50]}"

def ensemble_retrieve(query: str, k: int = 20, alpha: float = 0.5) -> List[Document]:
    """Hybrid first-stage retrieval: FAISS (dense) fused with BM25 (sparse).

    The two result lists are merged with weighted reciprocal rank fusion: a
    document at 1-based rank r in a list contributes weight / (60 + r), where
    the weight is `alpha` for the dense list and `1 - alpha` for the sparse
    list. Only ranks are used, never raw scores, because the two score types
    are not comparable: the value returned with each dense hit is treated
    here as a distance (lower is better), while BM25 scores are unbounded
    with higher meaning better.

    Args:
        query: User question; it is whitespace-tokenized for BM25.
        k: Candidates taken from each retriever and maximum number returned.
        alpha: Weight of the dense retriever, in the range 0 to 1.

    Returns:
        Up to `k` documents ordered by fused score, best first. Documents
        whose fused score is not positive are omitted (for example dense-only
        hits when alpha is 0).
    """
    if k <= 0:
        return []

    rrf_offset = 60  # conventional smoothing constant for reciprocal rank fusion
    fused = {}
    doc_by_key = {}

    def _accumulate(ranked_docs, weight):
        # Add weight / (offset + rank) for each distinct document in a best-first list.
        seen = set()
        rank = 0
        for doc in ranked_docs:
            key = make_doc_key(doc)
            if key in seen:
                continue
            seen.add(key)
            rank += 1
            fused[key] = fused.get(key, 0.0) + weight / (rrf_offset + rank)
            doc_by_key.setdefault(key, doc)

    # Dense: the order returned by the vector store is used as the ranking
    # (assumed best-first); the score values themselves are ignored.
    dense_hits = vectordb.similarity_search_with_score(query, k=k)
    _accumulate((doc for doc, _ in dense_hits), alpha)

    # Sparse: rank every chunk by BM25 score and keep the top k that matched at all.
    bm25_scores = bm25.get_scores(query.split())
    sparse_hits = sorted(zip(chunked_docs, bm25_scores), key=lambda x: x[1], reverse=True)[:k]
    _accumulate((doc for doc, score in sparse_hits if score > 0), 1 - alpha)

    # The sort is stable, so ties keep insertion order (dense hits first).
    ranked = sorted(
        ((key, score) for key, score in fused.items() if score > 0),
        key=lambda x: x[1],
        reverse=True,
    )[:k]
    return [doc_by_key[key] for key, _ in ranked]


# ==========================
# 7) Cross-Encoder Reranker (Stage 2)
# ==========================
# Second-stage reranker: load the "BAAI/bge-reranker-large" model once, with fp16 requested.
reranker = FlagReranker("BAAI/bge-reranker-large", use_fp16=True)

def rerank_with_bge(query: str, docs: List[Document], top_k: int = 5) -> List[Document]:
    """Re-order candidate documents by reranker relevance to the query.

    Args:
        query: User question.
        docs: Candidate documents from first-stage retrieval; may be empty.
        top_k: Maximum number of documents to return.

    Returns:
        The `top_k` highest-scoring documents, best first. An empty candidate
        list yields an empty list without calling the model.
    """
    if not docs:
        return []

    pairs = [[query, d.page_content] for d in docs]
    scores = reranker.compute_score(pairs, normalize=True)
    # Some FlagEmbedding releases return a bare number instead of a one-element
    # list when given a single pair; wrap it so zip() below still works.
    if not hasattr(scores, "__len__"):
        scores = [scores]

    ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True)
    return [d for d, _ in ranked[:top_k]]

# ==========================
# 8) LLM & Prompt
# ==========================
# Chat model used for the final answer: temperature 0, replies capped at 512 tokens.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=512)

# System message sent with every question. The text is Korean and is part of the
# program's runtime behaviour, so it is left exactly as written.
SYSTEM_PROMPT = (
    "당신은 신뢰할 수 있는 한국어 전문가 비서입니다. 다음 문서를 바탕으로 질문에 정확하고 간결하게 답하세요.\n"
    "- 주어진 문맥만 사용하고, 모르면 모른다고 말하세요.\n"
    "- 관련 조항/용어/정의가 있으면 인용 형태로 짧게 표시하세요.\n"
)

# User message template; filled via str.format with the placeholders
# {question} and {context}.
USER_PROMPT_TEMPLATE = (
    "질문: {question}\n\n"
    "참고 문맥:\n{context}\n\n"
    "지침:\n- 한국어로 답변\n- 핵심 bullet 3~5개\n- 필요한 경우 한 줄 인용 포함\n"
)

# ==========================
# 9) Helpers
# ==========================
def build_context(docs: List[Document], max_chars: int = 1500) -> str:
    """Render documents as numbered context sections for the prompt.

    Each document's text is whitespace-collapsed and shortened to at most 500
    characters. Sections are added in order until adding the next one would
    push the running total of text lengths above `max_chars`; that document
    and all later ones are dropped. Section headers are not counted towards
    the budget. The page number appears in a header only when the metadata
    "page" value is an int.

    Returns:
        The sections joined by blank lines, or "" when nothing fits.
    """
    sections = []
    used = 0
    for position, doc in enumerate(docs, start=1):
        page = doc.metadata.get("page")
        loc = f"p{page}" if isinstance(page, int) else ""
        body = textwrap.shorten(doc.page_content, width=500, placeholder=" …")
        used += len(body)
        if used > max_chars:
            # Budget exhausted: stop rather than skip, so the order stays contiguous.
            break
        sections.append(f"[문서 {position}{' '+loc if loc else ''}]\n" + body)
    return "\n\n".join(sections)

# ==========================
# 10) QA function (Ensemble + Reranker)
# ==========================
def answer_question(query: str, k1: int = 20, k2: int = 5, alpha: float = 0.5):
    """Answer a question with retrieve -> rerank -> generate.

    Args:
        query: User question.
        k1: Number of candidates requested from the ensemble retriever.
        k2: Number of documents kept after reranking.
        alpha: Dense-retriever weight passed to the ensemble retriever.

    Returns:
        A tuple of (answer text, reranked documents). The context given to the
        model is built from those reranked documents.
    """
    candidates = ensemble_retrieve(query, k=k1, alpha=alpha)
    top_docs = rerank_with_bge(query, candidates, top_k=k2)

    prompt_text = USER_PROMPT_TEMPLATE.format(
        question=query,
        context=build_context(top_docs),
    )
    system_msg = SystemMessage(content=SYSTEM_PROMPT)
    human_msg = HumanMessage(content=prompt_text)

    reply = llm.invoke([system_msg, human_msg])
    return reply.content, top_docs

# ==========================
# 11) Gradio Chat UI
# ==========================
# Build the Gradio page. Components are created in display order inside the
# Blocks context, so the statement order below also defines the layout.
with gr.Blocks(title="Ensemble + BGE Rerank QA") as demo:
    gr.Markdown(
        """
        # 📚 Ensemble (FAISS + BM25) + BGE Cross‑Encoder Rerank QA
        1차: 앙상블 리트리버(FAISS + BM25) → 2차: **BAAI/bge-reranker-large**로 정밀 재정렬 → LLM 최종 답변
        """
    )

    # Question input.
    with gr.Row():
        inp = gr.Textbox(label="질문 입력", placeholder="예: 금융통화위원회의 역할은?")
    # Retrieval controls; their values are passed to answer_question as k1, k2 and alpha.
    with gr.Row():
        k1_slider = gr.Slider(5, 50, value=20, step=1, label="k1: 1차 검색 개수")
        k2_slider = gr.Slider(1, 10, value=5, step=1, label="k2: 2차 컨텍스트 개수")
        alpha_slider = gr.Slider(0, 1, value=0.5, step=0.1, label="alpha: Dense/Sparse 가중치")
    with gr.Row():
        btn = gr.Button("질문하기", variant="primary")
    # NOTE(review): the saved cell output echoes this line twice, which looks like
    # warnings being raised for it (possibly about `bubble_full_width`) -- confirm
    # against the installed Gradio version.
    out = gr.Chatbot(label="답변", bubble_full_width=False)
    with gr.Accordion("📎 사용된 근거 문맥 (Top-k)", open=False):
        ctx = gr.JSON(label="Reranked Context Metadata")

    def _run(q, k1, k2, alpha, history):
        """Handle one question from the UI.

        Args:
            q: Text from the question box.
            k1, k2, alpha: Slider values, cast to int/int/float before use.
            history: Current chatbot value; None is treated as an empty list.

        Returns:
            A tuple of (updated history, metadata list or None). A blank
            question returns the history unchanged. Any exception is reported
            as a chat reply instead of propagating.
        """
        if not q or not q.strip():
            return history, None
        try:
            ans, used_docs = answer_question(q, int(k1), int(k2), float(alpha))
            # Only source, page and text length are shown for each context document.
            meta_view = [
                {
                    "source": d.metadata.get("source"),
                    "page": d.metadata.get("page"),
                    "chars": len(d.page_content),
                }
                for d in used_docs
            ]
            # History entries are (question, answer) tuples.
            history = (history or []) + [(q, ans)]
            return history, meta_view
        except Exception as e:
            history = (history or []) + [(q, f"오류: {e}")]
            return history, None

    # Both the button and pressing Enter in the textbox run the same handler.
    btn.click(_run, [inp, k1_slider, k2_slider, alpha_slider, out], [out, ctx])
    inp.submit(_run, [inp, k1_slider, k2_slider, alpha_slider, out], [out, ctx])

# debug=True: per the saved output, in Colab this keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)

Loaded pages: 742 from 2 PDF(s)
Chunks: 2306


  out = gr.Chatbot(label="답변", bubble_full_width=False)
  out = gr.Chatbot(label="답변", bubble_full_width=False)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://0907ce7358b6b15ef9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
