# Local LLM

Local Question-Answering System using Chroma + Qwen LLM

This script retrieves relevant context from a local Chroma vector database and answers user questions using a locally loaded Qwen model.

> NOTE: Running on a CPU-only machine will usually be very slow; change `GEN_MODEL` to a smaller model or use a GPU machine instead.

### Install prerequisite modules

In [None]:
!pip install langchain langchain-core langchain-chroma langchain-huggingface transformers chromadb

### Import modules

In [None]:
from transformers import pipeline
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings
from langchain_chroma import Chroma

### Configuration of the runtime environment

In [None]:
# --- Vector store -----------------------------------------------------------
# Path to persisted Chroma DB
CHROMA_DIR = "./sample_data/chroma_db"
# Name of the collection inside the persisted database
COLLECTION_NAME = "covid_webpages"
# Number of documents requested from the retriever per query
RETRIEVAL_K = 5

# --- Models -----------------------------------------------------------------
# Embedding model identifier
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Qwen model on Hugging Face
GEN_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

# --- Generation -------------------------------------------------------------
# Task name handed to the Hugging Face pipeline factory
PIPELINE_TASK = "text-generation"
# Cap on newly generated tokens per answer
MAX_GEN_TOKENS = 512

### Load embedding model and database

In [None]:
# Embed queries with the dedicated sentence-embedding model rather than the
# generative LLM: GEN_MODEL is a causal language model, not an embedding model.
# NOTE(review): this should be the same embedding model that was used when the
# persisted collection was built - confirm against the ingestion step.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Open the persisted Chroma collection; queries are embedded with the model above.
vectorstore = Chroma(
    persist_directory=CHROMA_DIR,
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
)

# Retriever that requests the RETRIEVAL_K best-matching documents per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVAL_K})

### Load LLM model

In [None]:
def create_llm_pipeline(model_name: str, task: str, max_tokens: int):
    """Build a LangChain LLM backed by a local Hugging Face pipeline.

    Args:
        model_name: Hugging Face model identifier, used for both the model
            and its tokenizer.
        task: Pipeline task name passed to ``transformers.pipeline``.
        max_tokens: Upper bound on newly generated tokens per call.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured pipeline.
    """
    pipeline_kwargs = {
        "model": model_name,
        "tokenizer": model_name,
        "max_new_tokens": max_tokens,
        "temperature": 0.7,
        "do_sample": True,
    }
    if task == "text-generation":
        # By default the text-generation pipeline returns the prompt followed
        # by the completion; request the completion only so the answer does
        # not echo the system prompt and retrieved context. The option is
        # specific to this task, hence the guard.
        pipeline_kwargs["return_full_text"] = False

    gen_pipe = pipeline(task, **pipeline_kwargs)
    return HuggingFacePipeline(pipeline=gen_pipe)


# Instantiate the generator LLM from the notebook configuration.
llm = create_llm_pipeline(
    model_name=GEN_MODEL,
    task=PIPELINE_TASK,
    max_tokens=MAX_GEN_TOKENS,
)

### Build QA chain

In [None]:
def create_qa_prompt():
    """Return the chat prompt used by the QA chain.

    The system message carries the answering instructions followed by the
    ``{context}`` placeholder; the human message is the ``{input}`` placeholder.
    """
    instructions = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        "{context}"
    )
    messages = [
        ("system", instructions),
        ("human", "{input}"),
    ]
    return ChatPromptTemplate.from_messages(messages)
    

def build_qa_chain(llm, retriever):
    """Wire the retriever and the LLM together into a retrieval QA chain.

    Args:
        llm: Language model passed to the document-combining chain.
        retriever: Retriever passed to the retrieval chain.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    # Document-combining step: LLM plus the QA prompt.
    combine_docs_chain = create_stuff_documents_chain(llm, create_qa_prompt())

    # Full pipeline: retrieve first, then hand off to the combining step.
    return create_retrieval_chain(retriever, combine_docs_chain)


# Assemble the retrieval QA chain from the LLM and retriever created above.
qa_chain = build_qa_chain(llm=llm, retriever=retriever)

In [None]:
# Example query
query = "What is the impact of the pandemic on the economy?"

# The chain is invoked with the question under the "input" key.
chain_input = {"input": query}
result = qa_chain.invoke(chain_input)

### Show results

In [None]:
def display_result(query: str, result: dict, preview_chars: int = 200):
    """Pretty-print the question, answer, and sources.

    Args:
        query: The question that was asked.
        result: Chain output; must contain ``"answer"`` and ``"context"``,
            where ``"context"`` is an iterable of documents exposing
            ``metadata`` (a mapping) and ``page_content`` (a string).
        preview_chars: Maximum number of characters of each document shown
            in the source preview.
    """
    print(f"🔹 Question: {query}")
    print(f"🔹 Answer: {result['answer']}\n")
    print("🔹 Sources:")
    for doc in result["context"]:
        metadata = doc.metadata
        # Prefer "filename"; fall back to "source" before giving up, so
        # documents tagged only with a source still get a useful label.
        label = (
            metadata.get("filename")
            or metadata.get("source")
            or "Unknown source"
        )
        content = doc.page_content
        preview = content[:preview_chars].replace("\n", " ")
        # Only signal truncation when text was actually cut off.
        ellipsis = "..." if len(content) > preview_chars else ""
        print(f"- {label} → {preview}{ellipsis}\n")


# Print the example question, the generated answer, and the retrieved sources.
display_result(query=query, result=result)