# Import libiaries

In [237]:
%%time
from IPython.display import clear_output
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Standard library imports
import os          # File system operations
import glob        # File path pattern matching
import textwrap    # Text formatting utilities
import time        # Time-related functions
import pathlib
from typing import Tuple

# LangChain imports
import langchain
### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader   # For loading PDF files and directories of docs
### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter    # For splitting documents into smaller chunks
### prompts
from langchain import PromptTemplate, LLMChain                        # For defining prompts and chaining LLM calls
### vector stores
from langchain.vectorstores import FAISS                              # FAISS vector store for similarity search
### models
from langchain.llms import HuggingFacePipeline                        # Use HuggingFace models as LLMs
from langchain.embeddings import HuggingFaceEmbeddings        # Generate embeddings with HuggingFace models
### retrievers
from langchain.chains import RetrievalQA                              # Retrieval-based QA chain

# PyTorch and Transformers imports
import torch
import transformers
from transformers import (
    AutoTokenizer,            # Tokenizer for preprocessing text
    AutoModelForCausalLM,     # Causal language model (decoder-only LM)
    BitsAndBytesConfig,       # Quantization config for efficient inference
    pipeline                  # High-level HuggingFace pipeline API
)

# Clear previous output (likely for notebook use)
# clear_output()


CPU times: user 482 μs, sys: 0 ns, total: 482 μs
Wall time: 510 μs


In [239]:
import sys
print("python:", sys.version)
print("torch:", torch.version.cuda)
print("torch available:", torch.cuda.is_available())

print('langchain:', langchain.__version__)   
print('torch:', torch.__version__)          
print('transformers:', transformers.__version__) 

python: 3.11.13 (main, Jun  5 2025, 13:12:00) [GCC 11.2.0]
torch: 12.8
torch available: True
langchain: 0.3.27
torch: 2.8.0+cu128
transformers: 4.55.4


In [243]:
import glob

In [244]:
# glob.glob() returns all matching file paths
# sorted() ensures the files are in a consistent order (e.g., HP1, HP2, HP3...)
d = sorted(glob.glob('input/harry-potter-books-in-pdf-1-7/*'))
print(d)

["input/harry-potter-books-in-pdf-1-7/01 Harry Potter and the Philosopher's Stone - J.K. Rowling.pdf", 'input/harry-potter-books-in-pdf-1-7/02 Harry Potter and the Chamber of Secrets - J.K. Rowling.pdf', 'input/harry-potter-books-in-pdf-1-7/03 Harry Potter and the Prisoner of Azkaban - J.K. Rowling.pdf', 'input/harry-potter-books-in-pdf-1-7/04 Harry Potter and the Goblet of Fire - J.K. Rowling.pdf', 'input/harry-potter-books-in-pdf-1-7/05 Harry Potter and the Order of the Phoeni - J.K. Rowling.pdf', 'input/harry-potter-books-in-pdf-1-7/06 Harry Potter and the Half-Blood Prince - J.K. Rowling.pdf', 'input/harry-potter-books-in-pdf-1-7/07 Harry Potter and the Deathly Hallows - J.K. Rowling.pdf']


# Config

In [118]:
class CFG:
    # ==========================
    # Model Configuration
    # ==========================
    # Name of the LLM model to use (from Hugging Face model repo or local)
    model_name = 'llama2-13b-chat'  
    # Options: wizardlm, llama2-7b-chat, llama2-13b-chat, mistral-7B, etc.

    # Controls randomness of generation: 0 = deterministic, higher = more random
    temperature = 0.1

    # Nucleus sampling probability threshold (top-p sampling)
    top_p = 0.95  

    # Penalize repetition of tokens to encourage diversity
    repetition_penalty = 1.15  

    # ==========================
    # Document Splitting
    # ==========================
    # Maximum size of text chunks to split input documents
    split_chunk_size = 800  

    # Overlap between consecutive text chunks (for context continuity)
    split_overlap = 0  

    # ==========================
    # Embeddings & Retrieval
    # ==========================
    # Embedding model to use for vectorization (from Hugging Face)
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'  

    # Number of nearest neighbors to retrieve during similarity search
    k = 6  

    # ==========================
    # Paths
    # ==========================
    # Path to input PDF documents (Harry Potter books here as an example)
    PDFs_path = './input/harry-potter-books-in-pdf-1-7/'  

    # Path where the FAISS embedding index will be stored or loaded from
    Embeddings_path = './input/faiss-hp-sentence-transformers'  

    # Folder where the final vector database output will be saved
    Output_folder = './harry-potter-vectordb'

    # 所有模型会保存在这个根目录下的子目录里（按仓库名）
    model_store_dir = './models'   # 改成你想放的磁盘路径，如 '/data/llm_models'
    # 若本地已存在同名目录，则强制只用本地文件加载（不访问网络）
    force_local = False


# Define model

In [245]:
MODEL_ALIASES = {
    "wizardlm": "TheBloke/wizardLM-7B-HF",
    "llama2-7b-chat": "daryl149/llama-2-7b-chat-hf",
    "llama2-13b-chat": "daryl149/llama-2-13b-chat-hf",
    "mistral-7B": "mistralai/Mistral-7B-v0.1",
}

def make_bnb_config() -> BitsAndBytesConfig:
    # Common 4-bit quantization configuration (bnb/nf4, fp16 compute)
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

def repo_to_local_dir(model_repo: str, root: str) -> str:
    """
    Convert HF repo name to a safe local dir path under `root`.
    e.g., 'daryl149/llama-2-13b-chat-hf' -> './models/daryl149__llama-2-13b-chat-hf'
    """
    safe = model_repo.replace('/', '__')
    return os.path.join(root, safe)

def ensure_dir(path: str):
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

In [246]:
def get_model(model: str) -> Tuple["AutoTokenizer", "AutoModelForCausalLM", int]:
    """
    Download (if needed), load and configure a specific LLM from Hugging Face.
    Also saves tokenizer & model locally for future offline use.

    Returns:
        tokenizer, model, max_len
    """
    if model not in MODEL_ALIASES:
        raise ValueError(f"Unknown model key: {model}. "
                         f"Valid: {list(MODEL_ALIASES.keys())}")

    model_repo = MODEL_ALIASES[model]
    local_dir = repo_to_local_dir(model_repo, CFG.model_store_dir)
    ensure_dir(CFG.model_store_dir)

    use_local_only = CFG.force_local and os.path.isdir(local_dir) and any(os.scandir(local_dir))
    print(f"\nTarget model: {model} -> {model_repo}")
    print(f"Local store:  {local_dir}")
    print(f"Load policy:  {'local ONLY' if use_local_only else 'download if missing'}\n")

    bnb_config = make_bnb_config()

    # --------------------------
    # Load tokenizer
    # --------------------------
    if use_local_only:
        tokenizer = AutoTokenizer.from_pretrained(local_dir, use_fast=True, local_files_only=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)

    # --------------------------
    # Load model (4-bit)
    # --------------------------
    try:
        if use_local_only:
            model_obj = AutoModelForCausalLM.from_pretrained(
                local_dir,
                quantization_config=bnb_config,
                device_map="auto",
                low_cpu_mem_usage=True,
                local_files_only=True,
                trust_remote_code=True
            )
        else:
            model_obj = AutoModelForCausalLM.from_pretrained(
                model_repo,
                quantization_config=bnb_config,
                device_map="auto",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
    except Exception as e:
        # 常见：bitsandbytes 未安装或 CUDA 版本不匹配
        raise RuntimeError(
            "Failed to load model (4-bit). "
            "If this is a bitsandbytes / CUDA issue, try:\n"
            "  pip install --upgrade pip\n"
            "  pip install bitsandbytes\n"
            "并确保 CUDA 与 PyTorch 匹配。原始错误：\n" + str(e)
        )

    # --------------------------
    # Decide max_len by family
    # --------------------------
    if model == "wizardlm":
        max_len = 1024
    elif model == "llama2-7b-chat":
        max_len = 2048
    elif model == "llama2-13b-chat":
        max_len = 2048
    elif model == "mistral-7B":
        max_len = 1024
    else:
        max_len = 2048

    # --------------------------
    # Auto-save to local_dir (if just downloaded)
    # --------------------------
    if not use_local_only:
        ensure_dir(local_dir)
        try:
            # 保存量化后的权重配置与 tokenizer。下次即可离线从 local_dir 加载。
            tokenizer.save_pretrained(local_dir)
            model_obj.save_pretrained(local_dir)
            print(f"[Saved] tokenizer & model to: {local_dir}")
        except Exception as e:
            print(f"[Warn] Failed to save to {local_dir}: {e}")

    return tokenizer, model_obj, max_len


In [247]:
%%time
print("Loading tokenizer + model in 4-bit…")
tokenizer, model, max_len = get_model(CFG.model_name)
print(f"Loaded model: {CFG.model_name} | recommended max_len={max_len}")

Loading tokenizer + model in 4-bit…

Target model: llama2-13b-chat -> daryl149/llama-2-13b-chat-hf
Local store:  ./models/daryl149__llama-2-13b-chat-hf
Load policy:  download if missing



Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00,  6.42s/it]


[Saved] tokenizer & model to: ./models/daryl149__llama-2-13b-chat-hf
Loaded model: llama2-13b-chat | recommended max_len=2048


In [248]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
      )
    )
    (norm): Lla

In [249]:
model.hf_device_map

{'': 0}

# 🤗 Hugging Face Pipeline

In [252]:
# Create a Hugging Face text-generation pipeline
pipe = pipeline(
    task="text-generation",       # Define the task type (here: text generation)
    model=model,                  # The model to be used
    tokenizer=tokenizer,          # Tokenizer for input/output processing
    pad_token_id=tokenizer.eos_token_id,  # Padding token (usually EOS token)
    
    # do_sample=True,             # (Optional) Enable sampling instead of greedy decoding
    
    max_length=max_len,           # Maximum length of generated sequence
    temperature=CFG.temperature,  # Controls randomness: higher = more random output
    top_p=CFG.top_p,              # Nucleus sampling threshold
    repetition_penalty=CFG.repetition_penalty,  # Penalize repeated tokens
    device_map="auto"   # 让 pipeline 和模型对齐，自动分配多卡
)

# Wrap the Hugging Face pipeline in LangChain for integration
llm = HuggingFacePipeline(pipeline=pipe)

# The `llm` object can now be used for text generation within LangChain
llm

Device set to use cuda:0


HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7a6726bbe8d0>)

In [253]:
%%time  

# Define the query string to send to the language model
query = "Give me 5 examples of cool potions and explain what they do"

# Invoke the HuggingFacePipeline (llm) with the query
llm.invoke(query)

CPU times: user 14.1 s, sys: 0 ns, total: 14.1 s
Wall time: 14 s


"Give me 5 examples of cool potions and explain what they do.\n\nSure, here are five examples of cool potions that you might find in a fantasy story:\n\n1. The Elixir of Eternal Youth: This potion grants the drinker eternal youth, keeping them looking and feeling like they did in their prime for as long as they live. However, it also comes with a catch - the drinker must consume the elixir every year on their birthday, or else they will begin to age rapidly and lose all of the benefits of the potion.\n2. The Potion of Healing: This potion can cure any wound or illness, no matter how severe. It works by drawing out the poison or injury from the drinker's body and replacing it with pure, healing energy. However, the potion can only be used once per day, and it leaves the drinker feeling drained and weak afterward.\n3. The Draught of Dreams: This potion allows the drinker to enter into a deep, lucid dream state, where they can explore their subconscious mind and unlock hidden secrets abou

# Langchain

## 1. Load multiple PDFs

In [254]:
# Initialize a DirectoryLoader to load all PDF files from a specified directory
loader = DirectoryLoader(
    CFG.PDFs_path,             # Path to the directory containing PDF files
    glob="./*.pdf",            # File pattern to match (here: all PDF files)
    loader_cls=PyPDFLoader,    # Loader class used for PDF processing
    show_progress=True,        # Show a progress bar while loading
    use_multithreading=True    # Enable multi-threading for faster loading
)

# Load the documents into memory
documents = loader.load()

100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [02:25<00:00, 20.83s/it]


In [255]:
print(f'{len(documents)} pages in total')
documents[6].page_content

3625 pages in total


'6\nTWELVE\nThe Mirror of Erised\nTHIRTEEN\nNicolas Flamel\nFOURTEEN\nNorbert the Norwegian Ridgeback\nFIFTEEN\nThe Forbidden Forest\nSIXTEEN\nThrough the Trapdoor\nSEVENTEEN\nThe Man with Two Faces'

## 2. Text splitting

In [256]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create a text splitter to break long documents into smaller chunks
# This helps when creating embeddings since models usually have input size limits
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CFG.split_chunk_size,   # Maximum size of each text chunk
    chunk_overlap=CFG.split_overlap    # Overlap size between consecutive chunks
)

# Split the original documents into smaller chunks
texts = text_splitter.split_documents(documents)

# Print how many chunks were created from the number of original documents
print(f"We have created {len(texts)} chunks from {len(documents)} pages")

We have created 10191 chunks from 3625 pages


In [257]:
documents[8].page_content

'8\nDursley gossiped away happily as she wrestled a screaming\nDudley into his high chair.\nNone of them noticed a large tawny owl ﬂutter past the\nwindow.\nAt half past eight, Mr Dursley picked up his briefcase,\npecked Mrs Dursley on the cheek and tried to kiss Dudley\ngoodbye but missed, because Dudley was now having a\ntantrum and throwing his cereal at the walls. ‘Little tyke,’\nchortled Mr Dursley as he left the house. He got into his car\nand backed out of number four’s drive.\nIt was on the corner of the street that he noticed the ﬁrst sign\nof something peculiar – a cat reading a map. For a second, Mr\nDursley didn’t realise what he had seen – then he jerked his\nhead around to look again. There was a tabby cat standing on\nthe corner of Privet Drive, but there wasn’t a map in sight.\nWhat could he have been thinking of? It must have been a\ntrick of the light. Mr Dursley blinked and stared at the cat. It\nstared back. As Mr Dursley drove around the corner and up the\nroad, he

## 3. Create Embeddings

- [FAISS](https://python.langchain.com/docs/integrations/vectorstores/faiss/) 是一个开源库（Facebook AI Similarity Search），专门用于 高效的相似性搜索（similarity search） 和 密集向量聚类（clustering of dense vectors），适用于从几百万到数十亿的高维向量数据集。

In [258]:
# import os
# from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Check if FAISS index file already exists
if not os.path.exists(CFG.Embeddings_path + "/index.faiss"):

    # Step 1: Load the HuggingFace embedding model
    # This model converts text into numerical embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name=CFG.embeddings_model_repo,     # The HuggingFace model repo
        model_kwargs={"device": "cuda"}           # Run embeddings on GPU (CUDA)
    )

    # Step 2: Create vector database (FAISS) from documents
    vectordb = FAISS.from_documents(
        documents=texts,                          # The text chunks created earlier
        embedding=embeddings                      # Embedding model
    )

    # Step 3: Save the FAISS vector database to disk for later use
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_hp")

## 4. Load vector database

In [259]:
# Step 1: Initialize the embedding model
embeddings = HuggingFaceEmbeddings(
    model_name=CFG.embeddings_model_repo,  # 你的模型仓库路径，例如 "hkunlp/instructor-large"
    model_kwargs={"device": "cuda"},       # 指定在 GPU 上运行，若不支持 GPU 请设为 "cpu"
)

# Step 2: Load an existing FAISS vector database from disk
# 从本地加载之前保存的 FAISS 索引（包括向量数据、文档存储和映射）
vectordb = FAISS.load_local(
    # CFG.Embeddings_path,
    CFG.Output_folder + '/faiss_index_hp',  # 存储向量库的路径，比如文件夹路径
    embeddings,             # 加载查询时使用的嵌入模型，必须与创建索引时一致
    allow_dangerous_deserialization=True
)

# Optional: clear the notebook output
clear_output()

In [260]:
vectordb.similarity_search('magic creatures')

[Document(id='66319384-bfef-4708-be62-1dd0afe64a35', metadata={'producer': 'calibre 8.9.0', 'creator': 'calibre 8.9.0', 'creationdate': '2025-08-26T10:02:38+00:00', 'author': 'J.K. Rowling', 'moddate': '2025-08-26T10:02:38+00:00', 'title': 'Harry Potter and the Goblet of Fire', 'source': 'input/harry-potter-books-in-pdf-1-7/04 Harry Potter and the Goblet of Fire - J.K. Rowling.pdf', 'total_pages': 622, 'page': 356, 'page_label': '357'}, page_content='356\n‘I was attacked by a Hippogriff, and my friend Vincent\nCrabbe got a bad bite off a Flobberworm,’ says Draco\nMalfoy, a fourth-year student. ‘We all hate Hagrid, but\nwe’re just too scared to say anything.’\nHagrid has no intention of ceasing his campaign of\nintimidation, however. In conversation with a Daily\nProphet reporter last month, he admitted breeding\ncreatures he has dubbed ‘Blast-Ended Skrewts’, highly\ndangerous crosses between manticores and ﬁre crabs. The\ncreation of new breeds of magical creature is, of course,\nan ac

# Prompt template

效果不好

In [264]:
# Define a prompt template for interacting with the language model
prompt_template = """
Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:
"""

# Create a PromptTemplate object
PROMPT = PromptTemplate(
    template=prompt_template,               # The template defined above
    input_variables=["context", "question"] # Variables that will be replaced in the template
)

In [265]:
# Example of how to use the prompt with an LLM chain (commented out here)
# llm_chain = LLMChain(prompt=PROMPT, llm=llm)

# Retriever chain

文本检索 + 答案生成

## Performance not good

### 🔍 1. 调整检索参数
- 增加 k：
现在你可能只取了 k=3\~6 个片段，容易漏掉。可以尝试调大，比如 k=10~15，让模型有更多 context 参考。
- 换 search_type：
默认 similarity 有时会抓到相关性强但不完整的段落，可以尝试 mmr（Maximal Marginal Relevance），能取到 覆盖范围更广 的片段。

## Example

In [285]:
# Create a retriever from the vector database
# - "k": number of top results to retrieve
# - "search_type": similarity search (can be changed to "mmr" for Max Marginal Relevance)
retriever = vectordb.as_retriever(
    search_kwargs={"k": CFG.k, "search_type": "similarity"}
)

# Configure the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,                         # Large Language Model (e.g., OpenAI, LLaMA, etc.)
    chain_type="stuff",              # Method to combine retrieved docs: "stuff", "map_reduce", "map_rerank", "refine"
    retriever=retriever,             # The retriever defined above
    chain_type_kwargs={"prompt": PROMPT},  # Custom prompt to control LLM behavior
    return_source_documents=True,    # Return source documents along with the answer
    verbose=False                    # Whether to log intermediate steps
)

In [286]:
### testing MMR search
question = "Which are Hagrid's favorite animals?"
vectordb.max_marginal_relevance_search(question, k = CFG.k)

[Document(id='aa7eb8d9-02ec-4596-a4b3-960f3306b42c', metadata={'producer': 'calibre 8.9.0', 'creator': 'calibre 8.9.0', 'creationdate': '2025-08-26T10:02:38+00:00', 'author': 'J.K. Rowling', 'moddate': '2025-08-26T10:02:38+00:00', 'title': 'Harry Potter and the Goblet of Fire', 'source': 'input/harry-potter-books-in-pdf-1-7/04 Harry Potter and the Goblet of Fire - J.K. Rowling.pdf', 'total_pages': 622, 'page': 164, 'page_label': '165'}, page_content='smile from behind his bushy beard. Hagrid would have liked\nnothing better than a pet dragon, as Harry, Ron and Hermione\nknew only too well – he had owned one for a brief period\nduring their ﬁrst year, a vicious Norwegian Ridgeback by the'),
 Document(id='9d2fd821-e1e2-4f17-b299-e1fbccd1e1d2', metadata={'producer': 'calibre 8.9.0', 'creator': 'calibre 8.9.0', 'creationdate': '2025-08-26T10:03:43+00:00', 'author': 'J.K. Rowling', 'moddate': '2025-08-26T10:03:43+00:00', 'title': 'Harry Potter and the Order of the Phoenix', 'source': 'input

In [287]:
### testing similarity search
question = "Which are Hagrid's favorite animals?"
vectordb.similarity_search(question, k = CFG.k)

[Document(id='aa7eb8d9-02ec-4596-a4b3-960f3306b42c', metadata={'producer': 'calibre 8.9.0', 'creator': 'calibre 8.9.0', 'creationdate': '2025-08-26T10:02:38+00:00', 'author': 'J.K. Rowling', 'moddate': '2025-08-26T10:02:38+00:00', 'title': 'Harry Potter and the Goblet of Fire', 'source': 'input/harry-potter-books-in-pdf-1-7/04 Harry Potter and the Goblet of Fire - J.K. Rowling.pdf', 'total_pages': 622, 'page': 164, 'page_label': '165'}, page_content='smile from behind his bushy beard. Hagrid would have liked\nnothing better than a pet dragon, as Harry, Ron and Hermione\nknew only too well – he had owned one for a brief period\nduring their ﬁrst year, a vicious Norwegian Ridgeback by the'),
 Document(id='9d9f061d-1212-42bc-a1ab-933ef31a3f43', metadata={'producer': 'calibre 8.9.0', 'creator': 'calibre 8.9.0', 'creationdate': '2025-08-26T10:04:09+00:00', 'author': 'J.K. Rowling', 'moddate': '2025-08-26T10:04:09+00:00', 'title': 'Harry Potter and the Prisoner of Azkaban', 'source': 'input/

## Hyperparameter tuning

In [288]:
# def compare_retrieval(question, retriever, vectordb, k=5):
#     print("="*30)
#     print(f"Query: {question}")
#     print("="*30)

#     # 方法1: similarity search
#     print("\n🔹 Similarity Search Results:")
#     sim_docs = retriever.get_relevant_documents(question)
#     for i, doc in enumerate(sim_docs[:k]):
#         print(f"\n--- Doc {i+1} ---")
#         print(doc.page_content[:200])  # 只显示前200字，避免太长
#         print(f"Metadata: {doc.metadata}")

#     # 方法2: Max Marginal Relevance (MMR)
#     print("\n🔹 MMR Search Results:")
#     mmr_docs = vectordb.max_marginal_relevance_search(
#         question,
#         k=k,         # 最终返回数量
#         fetch_k=50,  # 候选数量，越大多样性越强
#         lambda_mult=0.5
#     )
#     for i, doc in enumerate(mmr_docs[:k]):
#         print(f"\n--- Doc {i+1} ---")
#         print(doc.page_content[:200])  # 只显示前200字
#         print(f"Metadata: {doc.metadata}")

#     return sim_docs, mmr_docs

In [289]:
# question = "Which are Hagrid's favorite animals?"
# sim_results, mmr_results = compare_retrieval(question, retriever, vectordb, k=5)

In [291]:
# # ====== FAISS 对比与可视化 ======
# import numpy as np
# import pandas as pd
# # import matplotlib.pyplot as plt

# def _cos_sim(a, b):
#     a = np.asarray(a); b = np.asarray(b)
#     return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

# def _diversity_score(emb_list):
#     """1 - 平均两两余弦相似度；越大越多样。"""
#     if not emb_list or len(emb_list) < 2:
#         return 0.0
#     E = np.vstack(emb_list)
#     sims = (E @ E.T) / (np.linalg.norm(E, axis=1, keepdims=True) @ np.linalg.norm(E, axis=1, keepdims=True).T + 1e-12)
#     upper = sims[np.triu_indices(len(E), k=1)]
#     return float(1 - upper.mean())

# def evaluate_faiss_with_viz(
#     query: str,
#     faiss_store,            # LangChain FAISS VectorStore 实例
#     embed_model,            # LangChain Embeddings 实例（如 OpenAIEmbeddings / HuggingFaceEmbeddings）
#     k: int = 5,
#     fetch_k: int = 20,
#     lambda_mult: float = 0.5,
#     show_plot: bool = True
# ):
#     """
#     - Similarity：使用 FAISS 的 similarity_search_with_score，分数直接来自向量库（更准确/高效）
#     - MMR：通过 retriever(search_type="mmr") 获取文档，再用 query-embedding 计算分数（部分 VectorStore 暂无“带分数的 MMR”接口）
#     """
#     # ===== Similarity（直接取库内分数）=====
#     sim_pairs = faiss_store.similarity_search_with_score(query, k=k)  # [(Document, score), ...]
#     sim_docs, sim_scores = zip(*sim_pairs) if sim_pairs else ([], [])
#     q_vec = embed_model.embed_query(query)
#     sim_doc_embs = [embed_model.embed_query(d.page_content) for d in sim_docs] if sim_docs else []
#     sim_div = _diversity_score(sim_doc_embs)
#     sim_cov = len({d.metadata.get("source", idx) for idx, d in enumerate(sim_docs)}) / max(k, 1)

#     # 注意：FAISS 的 score 可能是距离(L2)或相似度，LangChain 文档示例以“SIM=”展示，可直接用于相对比较。:contentReference[oaicite:1]{index=1}

#     # ===== MMR（多样性检索）=====
#     retriever_mmr = faiss_store.as_retriever(
#         search_type="mmr",
#         search_kwargs={"k": k, "fetch_k": fetch_k, "lambda_mult": lambda_mult},
#     )
#     mmr_docs = retriever_mmr.invoke(query) or []
#     # 目前多数实现没有“MMR+分数”接口，这里用 q_vec 与文档向量做余弦作为分数近似。:contentReference[oaicite:2]{index=2}
#     mmr_doc_embs = [embed_model.embed_query(d.page_content) for d in mmr_docs]
#     mmr_scores = [_cos_sim(q_vec, e) for e in mmr_doc_embs] if mmr_doc_embs else []
#     mmr_div = _diversity_score(mmr_doc_embs)
#     mmr_cov = len({d.metadata.get("source", idx) for idx, d in enumerate(mmr_docs)}) / max(k, 1)

#     # ===== 汇总指标 =====
#     metrics = pd.DataFrame(
#         {
#             "similarity": {
#                 "Mean Similarity": float(np.mean(sim_scores)) if sim_scores else np.nan,
#                 "Diversity": sim_div,
#                 "Coverage": sim_cov,
#             },
#             "mmr": {
#                 "Mean Similarity": float(np.mean(mmr_scores)) if mmr_scores else np.nan,
#                 "Diversity": mmr_div,
#                 "Coverage": mmr_cov,
#             },
#         }
#     ).T

#     print("=== Query ===")
#     print(query)
#     print("\n=== Metrics ===")
#     print(metrics)

#     # ===== 可视化（单图、三组指标并列柱状）=====
#     # if show_plot:
#     #     ax = metrics.plot(kind="bar")
#     #     ax.set_title("FAISS Retrieval: Similarity vs MMR")
#     #     ax.set_xlabel("Method")
#     #     ax.set_ylabel("Score")
#     #     ax.legend(loc="best")
#     #     plt.tight_layout()
#     #     plt.show()

#     return {
#         "metrics": metrics,
#         "similarity": {"docs": sim_docs, "scores": sim_scores},
#         "mmr": {"docs": mmr_docs, "scores": mmr_scores},
#     }

# # ===== 使用示例 =====
# # result = evaluate_faiss_with_viz(
# #     query="Which are Hagrid's favorite animals?",
# #     faiss_store=vector_store,      # 你的 FAISS VectorStore
# #     embed_model=embeddings_model,  # 你的 Embeddings
# #     k=5, fetch_k=20, lambda_mult=0.5,
# #     show_plot=True
# # )


In [292]:
# 假设你的函数已经定义好并在当前作用域
# from your_module import evaluate_faiss_with_viz

# result = evaluate_faiss_with_viz(
#     query="Which are Hagrid's favorite animals?",
#     faiss_store=vectordb,      # 你的 FAISS VectorStore
#     embed_model=embeddings,  # 你的 Embeddings
#     k=5, fetch_k=20, lambda_mult=0.5,
#     show_plot=True
# )

# # result 包含：
# # - result["metrics"]：指标 DataFrame
# # - result["similarity"]["docs"], result["similarity"]["scores"]
# # - result["mmr"]["docs"], result["mmr"]["scores"]

# # 打印具体结果看看
# print("\nSimilarity Search Results:")
# for doc, score in zip(result["similarity"]["docs"], result["similarity"]["scores"]):
#     print(f"* [Score={score:.4f}] {doc.page_content} — {doc.metadata}")

# print("\nMMR Search Results:")
# for doc, score in zip(result["mmr"]["docs"], result["mmr"]["scores"]):
#     print(f"* [Score={score:.4f}] {doc.page_content} — {doc.metadata}")


In [294]:
# # Natural-language query from the user
# question = "Which are Hagrid's favorite animals?"

# docs = retriever.get_relevant_documents(question)
# print(docs)

# # Run Max Marginal Relevance (MMR) search:
# # - k: final number of diverse-yet-relevant chunks to return
# # - fetch_k (optional): retrieve more candidates first, then down-select by diversity
# # - lambda_mult (optional): trade-off between relevance (1.0) and diversity (0.0). Default ~0.5
# docs = vectordb.max_marginal_relevance_search(
#     question,
#     k=CFG.k,              # how many chunks you ultimately want
#     fetch_k=20,           # pull more to allow better diversity selection
#     lambda_mult=0.5       # 0.5 is a balanced relevance/diversity trade-off
# )

# # 'docs' is a list of Document objects you can inspect or feed to an LLM.
# print(docs)

In [293]:
# # Natural-language query from the user
# question = "Which challenges does Harry face during the Triwizard Tournament?"

# # Run Max Marginal Relevance (MMR) search:
# # - k: final number of diverse-yet-relevant chunks to return
# # - fetch_k (optional): retrieve more candidates first, then down-select by diversity
# # - lambda_mult (optional): trade-off between relevance (1.0) and diversity (0.0). Default ~0.5
# docs = vectordb.max_marginal_relevance_search(
#     question,
#     k=CFG.k,              # how many chunks you ultimately want
#     fetch_k=20,           # pull more to allow better diversity selection
#     lambda_mult=0.5       # 0.5 is a balanced relevance/diversity trade-off
# )

# # 'docs' is a list of Document objects you can inspect or feed to an LLM.
# print(docs)

In [225]:
# retriever = vectordb.as_retriever(
#     search_type="mmr",
#     search_kwargs={"k": CFG.k, "fetch_k": 20, "lambda_mult": 0.5}
# )

# # Configure the RetrievalQA chain
# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,                         # Large Language Model (e.g., OpenAI, LLaMA, etc.)
#     chain_type="stuff",              # Method to combine retrieved docs: "stuff", "map_reduce", "map_rerank", "refine"
#     retriever=retriever,             # The retriever defined above
#     chain_type_kwargs={"prompt": PROMPT},  # Custom prompt to control LLM behavior
#     return_source_documents=True,    # Return source documents along with the answer
#     verbose=False                    # Whether to log intermediate steps
# )

## NMR

In [295]:
# # 1) 用 MMR，提高多样性覆盖
# retriever = vectordb.as_retriever(
#     search_type="mmr",
#     search_kwargs={
#         "k": 5,           # 返回给 LLM 的文档数
#         "fetch_k": 20,     # 先抓候选，再在其中做 MMR 复排
#         "lambda_mult": 0.5 # 0=最大多样性, 1=最小多样性；0.3~0.7 常用
#     }
# )

# # 2) 选择更适合长文档的链路（例如 map_reduce 或 refine）
# from langchain.chains import RetrievalQA

# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,                         # Large Language Model (e.g., OpenAI, LLaMA, etc.)
#     chain_type="stuff",              # Method to combine retrieved docs: "stuff", "map_reduce", "map_rerank", "refine"
#     retriever=retriever,             # The retriever defined above
#     chain_type_kwargs={"prompt": PROMPT},  # Custom prompt to control LLM behavior
#     return_source_documents=True,    # Return source documents along with the answer
#     verbose=False                    # Whether to log intermediate steps
# )

# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     retriever=retriever,
#     chain_type="map_reduce",   # 可选: "stuff" | "map_reduce" | "refine" | "map_rerank"
#     chain_type_kwargs={
#         "question_prompt": PROMPT,  # 你的自定义问题提示
#         # 对于 map_reduce，你也可以分别提供 map_prompt / combine_prompt
#         # "map_prompt": MAP_PROMPT,
#         # "combine_prompt": COMBINE_PROMPT,
#     },
#     return_source_documents=True,
#     verbose=False
# )

# Post-process outputs

In [296]:
import textwrap
from typing import Any, Dict, Iterable, Optional

def wrap_text_preserve_newlines(text: str, width: int = 80) -> str:
    """
    Soft-wrap a block of text while preserving existing newline breaks.

    Each existing line is individually wrapped to the specified width,
    then the wrapped lines are joined back together with '\n'.

    Args:
        text: The raw text to wrap (possibly multi-line).
        width: Maximum line width for wrapping.

    Returns:
        A new string with the same newline structure, but wrapped lines.
    """
    # Split on existing line breaks so we don’t destroy paragraph structure
    lines = text.split("\n")

    # Wrap each line independently (empty lines stay empty)
    wrapped_lines = [textwrap.fill(line, width=width) if line else "" for line in lines]

    # Stitch the wrapped lines back together with newline separators
    return "\n".join(wrapped_lines)


def _pretty_source_name(path: str) -> str:
    """
    Extract a human-friendly name from a file path.
    Example: '/foo/bar/baz.pdf' -> 'baz'
    """
    # Handle both POSIX (/) and Windows (\) separators
    name = path.split("/")[-1].split("\\")[-1]
    # Strip common extensions like .pdf to keep it tidy
    if name.lower().endswith(".pdf"):
        name = name[:-4]
    return name or "Unknown"


def process_llm_response(llm_response: Dict[str, Any], width: int = 80) -> str:
    """
    Format the LLM response and list the sources used (filename + page).

    Expected llm_response structure (typical LangChain-style):
      {
        "result": "<model answer>",
        "source_documents": [
           {
             "metadata": {
               "source": "/path/to/file.pdf",
               "page": 12,
               # sometimes page may live elsewhere (e.g., loc.page)
             },
             ...
           },
           ...
        ]
      }

    Args:
        llm_response: The dictionary returned by your LLM chain/tool.
        width: Wrap width for the final answer text.

    Returns:
        A single formatted string containing:
          - The wrapped answer text
          - A “Sources” section listing each source (deduped) with page numbers when available
    """
    # print('llm_response\n', llm_response['result'])
    # 1) Wrap the main answer text to make it more readable
    answer_text = wrap_text_preserve_newlines(llm_response.get("result", ""), width=width)

    # 2) Collect source names + pages (defensive to handle different metadata shapes)
    seen = set()
    lines = []
    for doc in llm_response.get("source_documents", []) or []:
        meta: Dict[str, Any] = getattr(doc, "metadata", {}) or {}

        # Try the most common metadata key first; fall back gracefully
        source_path: Optional[str] = meta.get("source") or meta.get("file_path") or meta.get("filename")
        source_name = _pretty_source_name(source_path) if source_path else "Unknown"

        # Page might be at different places depending on loader
        page = (
            meta.get("page")
            or (meta.get("loc") or {}).get("page")
            or (meta.get("page_number"))
        )

        # Format "name - page: X" when page is known
        line = f"{source_name} - page: {page}" if page is not None else source_name

        # Deduplicate while preserving order
        if line not in seen:
            seen.add(line)
            lines.append(line)

    # 3) Stitch everything together
    if lines:
        sources_block = "\n".join(lines)
        return f"{answer_text}\n\n---\nSources:\n{sources_block}"
    else:
        return answer_text

In [297]:
import time


def llm_ans(query: str) -> str:
    """
    Process a user query with the QA chain, return the formatted model answer,
    and measure how long the entire process took.

    Args:
        query: User's question (string).

    Returns:
        A string containing:
          - The formatted answer from the LLM
          - A trailing line with elapsed time in seconds
    """
    # Record start time (to measure processing duration)
    start = time.time()

    # Run the query through the QA chain to get the raw response
    llm_response = qa_chain.invoke(query)

    # Process the LLM response:
    # - Wrap the answer text for readability
    # - Extract and format the source documents
    ans = process_llm_response(llm_response)

    # Record end time
    end = time.time()

    # Calculate elapsed time in seconds (rounded to nearest integer)
    time_elapsed = int(round(end - start, 0))

    # Create a formatted elapsed-time string
    time_elapsed_str = f"\n\nTime elapsed: {time_elapsed} s"

    # Append elapsed time info to the answer and return
    return ans + time_elapsed_str


# Test cases

In [304]:
CFG.model_name

'llama2-13b-chat'

## Case 1

In [305]:
# Define the user query about Harry Potter
query = "Which challenges does Harry face during the Triwizard Tournament?"
# Call the predefined function `llm_ans` to process the query
# and print the formatted answer
print(llm_ans(query))


Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

around the maze. Bagman now pointed his wand at his throat,
muttered ‘Sonorus’, and his magically magniﬁed voice echoed
into the stands.
‘Ladies and gentlemen, the third and ﬁnal task of the
Triwizard Tournament is about to begin! Let me remind you
how the points currently stand! Tied in ﬁrst place, on eighty-
ﬁve points each – Mr Cedric Diggory and Mr Harry Potter,
both of Hogwarts School!’ The cheers and applause sent birds
from the Forbidden Forest ﬂuttering into the darkening sky. ‘In
second place, on eighty points – Mr Viktor Krum, of
Durmstrang Institute!’ More applause. ‘And in third place –
Miss Fleur Delacour, of Beauxbatons Academy!’
Harry could just make out Mrs Weasley, Bill, Ron and
Hermione applauding Fleur politely, halfway up the stands. He

156
and took a long d

## Case 2

In [311]:
query = "Is Malfoy an ally of Voldemort?"
print(llm_ans(query))


Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

gazing at the place where Malfoy had vanished …
‘Where have you been?’ demanded Ginny, as Harry sprinted
into the changing room. The whole team was changed and
ready; Coote and Peakes, the Beaters, were both hitting their
clubs nervously against their legs.
‘I met Malfoy,’ Harry told her quietly, as he pulled his
scarlet robes over his head.
‘So?’
‘So I wanted to know how come he’s up at the castle with a
couple of girlfriends while everyone else is down here …’

Malfoy’s thanks. ‘I did know your grandfather, after all …’
‘He always spoke very highly of you, sir,’ said Malfoy
quickly. ‘Said you were the best potion-maker he’d ever
known …’
Harry stared at Malfoy. It was not the sucking up that
intrigued him; he had watched Malfoy do that to Snape for a
long time. It was the fact

## Case 3

In [312]:
query = "What are horcrux?"
print(llm_ans(query))


Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

watching the skin on his hands regrow, ‘we got the Horcrux.
On the down side –’
‘– no sword,’ said Harry through gritted teeth, as he dripped
dittany through the singed hole in his jeans on to the angry
burn beneath.
‘No sword,’ repeated Ron. ‘That double-crossing little scab
…’
Harry pulled the Horcrux from the pocket of the wet jacket
he had just taken off and set it down on the grass in front of
them. Glinting in the sun, it drew their eyes as they swigged
their bottles of juice.

406
that you understand the term. A Horcrux is the word used for
an object in which a person has concealed part of their soul.’
‘I don’t quite understand how that works, though, sir,’ said
Riddle.
His voice was carefully controlled, but Harry could sense
his excitement.
‘Well, you split your soul, y

## Case 4

In [327]:
query = "Give me 5 examples of cool potions and explain what they do"
print(llm_ans(query))


Don't try to make up an answer; if you don't know, just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

231
His eyes rested on Harry and his lip curled. Harry glared
back, feeling a grim pleasure at the idea that he would be able
to give up Potions after ﬁfth year.
‘But we have another year to go before that happy moment
of farewell,’ said Snape softly, ‘so, whether or not you are
intending to attempt N.E.W.T., I advise all of you to
concentrate your efforts upon maintaining the high pass level I
have come to expect from my O.W.L. students.
‘Today we will be mixing a potion that often comes up at
Ordinary Wizarding Level: the Draught of Peace, a potion to
calm anxiety and soothe agitation. Be warned: if you are too
heavy-handed with the ingredients you will put the drinker into
a heavy and sometimes irreversible sleep, so you will need to

that potion! You’ll need all the luck you

# Gradio Chat UI

In [333]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [334]:
import gradio as gr
print(gr.__version__)

5.43.1


In [335]:
import socket

# 1) 先关掉所有已有的 gradio 服务（关键）
try:
    gr.close_all()
except Exception:
    pass

# 2) 想要的固定端口
PORT = 7860

# 3) 如果端口被占，用 socket 判断并提示（可选）
def is_port_in_use(port):
    with socket.socket() as s:
        return s.connect_ex(("127.0.0.1", port)) == 0

if is_port_in_use(PORT):
    print(f"Port {PORT} is in use. Kill the old process or change PORT.")
    # 也可以改成自动找空闲端口，见方案 B

In [336]:
def predict(message, history):
    # output = message # debug mode

    output = str(llm_ans(message)).replace("\n", "<br/>")
    return output

In [337]:
demo = gr.ChatInterface(predict, title="Meimei's HP QA Demo")
demo.queue()
demo.launch(server_name="127.0.0.1", server_port=PORT, inline=True, show_error=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


