In [5]:
import os
import time
import torch
from pathlib import Path
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma
from langchain_huggingface.llms import HuggingFacePipeline
from langchain import hub
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda, RunnableMap


def get_device():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    return device

In [2]:
def load_embeddings():
    return HuggingFaceEmbeddings(model_name="bkai-foundation-models/vietnamese-bi-encoder")

def load_llm(model_name):
    token_path = Path("token.txt")
    if not token_path.exists():
        raise FileNotFoundError("Missing HuggingFace token.txt")

    with token_path.open("r") as f:
        hf_token = f.read().strip()

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        low_cpu_mem_usage=True,
        device_map="auto",
        token=hf_token
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    model_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
        device_map="auto"
    )

    return HuggingFacePipeline(pipeline=model_pipeline)

In [3]:
# MODEL_NAME= "google/gemma-2b-it"

# llm = load_llm(MODEL_NAME)
# print("LLM model loaded.")

In [None]:
def load_documents(folder_path):
    folder = Path(folder_path.strip().strip('"\''))

    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder}")

    pdf_files = list(folder.glob("*.pdf"))
    if not pdf_files:
        raise ValueError(f"No PDF files in folder: {folder}")

    all_docs, filenames = [], []
    for pdf_file in pdf_files:
        try:
            loader = PyPDFLoader(str(pdf_file))
            docs = loader.load()
            all_docs.extend(docs)
            filenames.append(pdf_file.name)
            print(f"✅ Loaded {pdf_file.name} ({len(docs)} pages)")
        except Exception as e:
            print(f"❌ Failed loading {pdf_file.name}: {e}")
    return all_docs, filenames

In [4]:
def build_rag_chain(docs, embeddings, llm):
    chunker = SemanticChunker(
        embeddings=embeddings,
        buffer_size=1,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=95,
        min_chunk_size=500,
        add_start_index=True
    )

    chunks = chunker.split_documents(docs)
    vector_db = Chroma.from_documents(chunks, embedding=embeddings)
    retriever = vector_db.as_retriever()
    prompt = hub.pull("rlm/rag-prompt")

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # rag_chain = (
    #     #? { } -> Syntax for RunnableMap.
    #     #? retriever is the retrieved information. format_docs is function to format the retrieved information.
    #     RunnableParallel({'context': retriever | format_docs, 'question': RunnablePassthrough()})
    #     | prompt
    #     | llm
    #     | StrOutputParser()
    # )

    # return_json_result = RunnableLambda(lambda result: {
    #     "context": result["context"],
    #     "question": result["question"],
    #     "answer": result
    # })

    rag_chain = (
        RunnableMap({
            'context': retriever | format_docs,
            'question': RunnablePassthrough(),
        })
        | prompt
        | llm
        | JsonOutputParser()
    )

    return rag_chain, len(chunks)

In [None]:
def main():
    get_device()
    embeddings = load_embeddings()
    #? PhoGPT-5.5B
    #? Phi-2 (2.7B)
    #? lmsys/vicuna-7b-v1.5
    MODEL_NAME= "google/gemma-2b-it"
    llm = load_llm(MODEL_NAME)

    folder_path = "pdf_folder"  # Replace with your path
    start = time.time()
    docs, filenames = load_documents(folder_path)
    rag_chain, num_chunks = build_rag_chain(docs, embeddings, llm)

    print(f"\nReady: {len(filenames)} files, {num_chunks} chunks")
    print(f"⏱️ Loading Time: {time.time() - start:.2f}s")

    while True:
        user_input = input("\nYour question (type 'exit' to quit): ")
        if user_input.lower() == "exit":
            break
        print("\nGenerating answer...")
        start = time.time()
        response = rag_chain.invoke(user_input)
        # print(f"\nContext: {response['context']}")
        # print(f"Questionf: {response['question']}")
        print(f'\nJSON OUTPUT: {response}')
        print(f"⏱️ Time taken: {time.time() - start:.2f}s")


if __name__ == "__main__":
    main()


In [1]:
from langchain_core.messages import AIMessage
from langchain_core.output_parsers import JsonOutputParser

message = AIMessage(content='\`\`\`\n{"foo": "bar"}\n\`\`\`')
output_parser = JsonOutputParser()
output_parser.invoke(message)

OutputParserException: Invalid json output: \`\`\`
{"foo": "bar"}
\`\`\`
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 