In [None]:
%pip install llama_stack 
%pip install llama-stack-client==0.2.10

In [1]:
from llama_stack_client import RAGDocument, LlamaStackClient
# Cell 2: extract text from website via SmolDocling (with BeautifulSoup fallback), save to `raw_text`
import time
from bs4 import BeautifulSoup 

In [3]:
deployment_endpoint= "http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321"

In [4]:
client = LlamaStackClient(base_url=deployment_endpoint)
models= client.models.list()
print(client.models.list())

model_id = next(m.identifier for m in models if m.model_type == "llm")
embedding_model = next(m for m in models if m.model_type == "embedding")
embedding_model_id = embedding_model.identifier
embedding_dimension = embedding_model.metadata["embedding_dimension"]

vector_db_id = "my_milvus_db"
provider_id  = "milvus"

# ### Do this step only once 


_ = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id=provider_id,
)
print(f"Registered vector DB: {vector_db_id}")

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/vector-dbs "HTTP/1.1 200 OK"


[Model(identifier='granite', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='granite', model_type='llm'), Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]
Registered vector DB: my_milvus_db


In [8]:
# ✅ Simple Docling PDF reader
# Reads all PDFs in "data/" folder using docling and saves extracted text

import os
from pathlib import Path
from docling.document_converter import DocumentConverter

# Folder containing PDFs
data_dir = Path("data")

# Initialize docling converter
converter = DocumentConverter()

# Dictionary to store all extracted texts
pdf_texts = {}

# Loop through each PDF
for pdf_file in data_dir.glob("*.pdf"):
    print(f"Reading: {pdf_file.name} ...")
    result = converter.convert(str(pdf_file))
    text_content = result.document.export_to_text()
    
    # Store in dictionary
    pdf_texts[pdf_file.name] = text_content

    # (Optional) Save each PDF’s text as a separate .txt file
    txt_path = pdf_file.with_suffix(".txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text_content)

print("\n✅ Extraction complete!")
print(f"Extracted text for {len(pdf_texts)} PDFs.")

# Example: access content in-memory
for name, text in pdf_texts.items():
    print(f"\n----- {name} -----\n")
    print(text[:500], "...\n")  # show first 500 chars only


INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash f9730ffaa6e7f8d4fb0c98c8df3f18cb
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.models.auto_ocr_model:rapidocr cannot be used because onnxruntime is not installed.
INFO:docling.models.auto_ocr_model:easyocr cannot be used because it is not installed.


Reading: RAG_demo_doc.pdf ...


INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
[32m[INFO] 2025-11-06 08:30:23,413 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-06 08:30:23,415 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-11-06 08:30:24,772 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2025-11-06 08:30:25,156 [RapidOCR] download_file.py:95: Successfully saved to: /opt/app-root/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-11-06 08:30:25,159 [RapidOCR] torch.py:54: Using /opt/app-root/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-11-06 08:30:25,339 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-06 08:30:25,340 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidA


✅ Extraction complete!
Extracted text for 1 PDFs.

----- RAG_demo_doc.pdf -----

## State Bank of India

Central Recruitment &amp; Promotion Department Corporate Centre, Mumbai

Phone: 022-22820427; e-mail: crpd@sbi.co.in

SBI HONOURED AS OVERALL WINNER UNDER 'TOP PERFORMING BANK' CATEGORY AT EASE 7.0 CITATION CEREMONY

## State Bank of India

CENTRAL RECRUITMENT &amp; PROMOTION DEPARTMENT CORPORATE CENTRE, MUMBAI (Phone: 022-2282 0427; E-mail: crpd@sbi.co.in)

## RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT &amp; SALES)

(Advertisement No. CRPD/CR/2025-26/06)

## ONLI ...



### Checking Extraction quality


In [11]:
# Assuming `pdf_texts` is your dictionary: {"file1.pdf": "text...", "file2.pdf": "text..."}
output_file = "combined_extracted_text.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for pdf_name, text in pdf_texts.items():
        f.write(f"\n\n===== {pdf_name} =====\n")
        f.write(text.strip())
        f.write("\n" + "=" * 80 + "\n")

print(f"✅ Saved all extracted text to '{output_file}'")


✅ Saved all extracted text to 'combined_extracted_text.txt'


### Saving the Text into chunks and saving them with the help of embeddings 

In [24]:
rag_documents = []

for name, text in pdf_texts.items():
    document = RAGDocument(
        document_id=name.replace(".pdf", ""),     # remove extension for id
        content=text,                             # ✅ pass plain string text
        mime_type="text/plain",
        metadata={"source": name}
    )
    rag_documents.append(document)


In [25]:
rag_documents

[{'document_id': 'RAG_demo_doc',
  'content': '## State Bank of India\n\nCentral Recruitment &amp; Promotion Department Corporate Centre, Mumbai\n\nPhone: 022-22820427; e-mail: crpd@sbi.co.in\n\nSBI HONOURED AS OVERALL WINNER UNDER \'TOP PERFORMING BANK\' CATEGORY AT EASE 7.0 CITATION CEREMONY\n\n## State Bank of India\n\nCENTRAL RECRUITMENT &amp; PROMOTION DEPARTMENT CORPORATE CENTRE, MUMBAI (Phone: 022-2282 0427; E-mail: crpd@sbi.co.in)\n\n## RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT &amp; SALES)\n\n(Advertisement No. CRPD/CR/2025-26/06)\n\n## ONLINE REGISTRATION OF APPLICATION AND PAYMENT OF FEES: 06.08.2025 TO 26.08.2025\n\nApplications are invited from eligible Indian Citizens for appointment as Junior Associate (Customer Support &amp; Sales) in clerical cadre in State Bank of India. Candidates can apply for vacancies in one State/UT only. Candidates can appear for the test only once under this recruitment project. The candidates applying for vacancies of a particular Sta

### Inserting this data in Vector DB

In [26]:
# Now you can insert all at once
client.tool_runtime.rag_tool.insert(
    documents=rag_documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=500
)
print("✅ All PDFs ingested successfully.")

INFO:llama_stack_client._base_client:Retrying request to /v1/tool-runtime/rag-tool/insert in 0.463135 seconds
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


✅ All PDFs ingested successfully.


### Asking User Query 

In [53]:
# Example RAG query for one-off lookups
query = "What are guidelines for uploading photographs for SBI RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT & SALES 2025 ?"


ground_truth =""" Photograph Image: (4.5 cm x 3.5 cm)
• Photograph must be a recent passport style colour picture.
• Make sure that the picture is in colour, taken against a light-coloured, preferably white,
background.
• Look straight at the camera with a relaxed face
• If picture is taken on a sunny day, have the sun behind you, or place yourself in shade,
so that you are not squinting and there are no harsh shadows
• If you have to use flash, ensure there's no "red-eye"
• If you wear glasses make sure that there are no reflections and your eyes can be
clearly seen.
• Caps, hats and dark glasses are not acceptable. Religious headwear is allowed but it
must not cover your face.
• Dimensions 200 x 230 pixels (preferred)
• Size of file should be between 20 kb–50 kb
• Ensure that size of the scanned image is not more than 50kb. If the size of the file is
more than 50 kb, then adjust the settings of the scanner such as the DPI resolution,
no. of colours etc., during the process of scanning.
• Photo uploaded should be of appropriate size and clearly visible.
• It is advisable that candidate retains about 8 copies of the same photograph
which is uploaded at the time of online application as these would be needed for
further processes of this selection process."""

### Answer With RAG

In [62]:
def qa_with_rag(client, query, vector_db_id, model_id):
    """
    Perform a RAG-based Q&A using the provided LlamaStack client.
    Retrieves relevant context from the vector database, 
    then queries the model with strict factual constraints.
    """

    # Query relevant chunks from vector DB
    rag_result = client.tool_runtime.rag_tool.query(
        vector_db_ids=[vector_db_id],
        content=query,
    )

    # System / context instruction for factual responses
    context_msg = """
    You are an intelligent assistant that answers user queries based strictly on the provided context documents.

    Instructions:
    - Use only the information found in the retrieved context.
    - Do NOT make up or assume any facts beyond what is given.
    - If the answer cannot be found in the provided context, clearly respond with:
      "The information you asked for is not available in the provided documents."
    - Maintain factual accuracy and logical consistency at all times.
    - If there are multiple relevant pieces of information, summarize them precisely and cite their context where applicable.
    - Be concise, structured, and neutral — avoid speculation or creative elaboration.
    - When numerical or factual answers are expected, extract them exactly as stated in the context.
    - At the end of the answer, indicate a confidence score (0–100%) along with the document resources.
    """

    # Build the textual context from retrieved chunks
    context = "\n\n".join([
        item.text for item in rag_result.content if hasattr(item, "text")
    ])

    # Construct chat messages for inference
    messages = [
        {"role": "system", "content": context_msg},
        {
            "role": "user",
            "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}"
        }
    ]

    # Call the model for final answer generation
    response = client.inference.chat_completion(
        messages=messages,
        model_id=model_id
    ).completion_message.content

    

    return response , context


### RAG Based Q&A

In [63]:
rag_answer , context = qa_with_rag(client= client, query=query, vector_db_id= vector_db_id, model_id= model_id)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


In [65]:
print(rag_answer)

The photograph for the SBI RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT & SALES 2025) should be a recent, color passport-style picture taken against a light-colored background. The candidate should look straight at the camera with a relaxed face. The photo dimensions should be 4.5 cm x 3.5 cm, and the digital file size should be between 20 kb-50 kb. The photo should be clearly visible with no harsh shadows, red-eye, or reflections if glasses are worn. Caps, hats, and dark glasses are not allowed, but religious headwear is permitted as long as it does not cover the face. Candidates need to upload one photo during the online application process and retain about 8 copies of the same for further processes. They can also capture a live photograph using a webcam or mobile phone through the application form.

Confidence: 100%

Sources: RAG_demo_doc.pdf


In [66]:
print(context)

knowledge_search tool found 5 chunks:
BEGIN of knowledge_search tool results.


Result 1
Content:  should be of appropriate size and clearly visible.
- It  is  advisable  that  candidate  retains  about  8  copies  of  the  same  photograph which is uploaded at the time of online application as these would be needed for further processes of this selection process .

## Photograph Capture

- In addition to the above photograph, candidates will also be required to capture and upload their live photograph either by using webcam or mobile phone.
- On selecting 'Capture Photo' option, candidates
Metadata: {'source': 'RAG_demo_doc.pdf', 'document_id': 'RAG_demo_doc'}


Result 2
Content:  as and when required. The signature, photograph and left thumb impression is of mine'.
- b. Left Thumb Impression: If a candidate is not having left thumb, he/she may use his /her right thumb for applying).

## (i) Photograph Image: (4.5 cm x 3.5 cm)

- Photograph must be a recent passport style colour pictu

### Without RAG ,  Q&A

In [56]:
def ask_base_llm(client, model_id: str, query: str) -> str:
    """
    Ask the base LLM (without RAG context) using the LlamaStack client.
    Includes strong anti-hallucination guardrails and concise answer formatting.
    """
    
    # Context message — to minimize hallucinations
    system_prompt = """
    You are a factual and precise assistant.
    Answer ONLY using verified, commonly accepted knowledge.
    If you are uncertain or the information is not explicitly known,
    respond with: "I'm not sure about that based on my current knowledge."
    Avoid speculation, assumptions, or made-up information.
    Be concise and direct.
    """

    # Construct message list for chat completion
    messages = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user", "content": f"QUESTION:\n{query}\n\nAnswer concisely:"}
    ]
    
    # Call the LlamaStack inference API
    try:
        resp = client.inference.chat_completion(messages=messages, model_id=model_id)
    except Exception as e:
        print("❌ Error during inference:", e)
        return f"Error: {e}"

    # Extract the answer safely
    answer = None
    if hasattr(resp, "completion_message") and resp.completion_message:
        answer = resp.completion_message.content
    elif hasattr(resp, "choices") and resp.choices:
        answer = resp.choices[0].message.content
    else:
        answer = str(resp)
    
    return answer

In [76]:
without_rag_answer= ask_base_llm(client=client, model_id= model_id , query= query)
print(without_rag_answer)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


1. File Format: The photograph should be in JPEG/JPG format.
2. File Size: The size should not exceed 200 KB.
3. Dimensions: The photo should be of size 2.5" x 3.5" (centimeters: 6.35 x 8.89).
4. Background: The photo should be a recent passport size color photograph with a light background.
5. Quality: The photo should be clear and well-focused, showing the full face of the candidate against a light, plain background.
6. Age: The photograph should be taken within three months of the date of submission of Online Application Form.
7. No Mask: The candidate should not wear a mask in the photograph.
8. No Glasses: The candidate should not wear spectacles or sunglasses in the photograph.
9. No Cap: The candidate should not wear a cap/hat/headgear in the photograph.
10. No Headphones: The candidate should not wear any headphones, earphones, or any other electronic gadgets in the photograph.

Please refer to the official SBI website for the most accurate and updated information.


### LLM Evaluation

In [74]:
def llm_as_judge(client, model_id, query, answer, ground_truth=None):
    """
    Simple LLM-based evaluator that scores an answer from 0 to 100
    based on factual correctness, completeness, and clarity.
    If ground_truth is provided, it is used as reference.
    """

    # Build the prompt
    system_msg = (
        "You are an impartial evaluator. "
        "Judge how correct, complete, and relevant the answer is "
        "based on the given question and ground truth (if available). "
        "Give only a JSON response with keys: score (0-100) and reasoning."
    )

    if ground_truth:
        user_prompt = f"""
QUESTION: {query}

GROUND TRUTH: {ground_truth}

MODEL ANSWER: {answer}

Evaluate how close the model answer is to the ground truth.
Score 0 if completely wrong or hallucinated, 100 if perfectly accurate.
"""
    else:
        user_prompt = f"""
QUESTION: {query}

MODEL ANSWER: {answer}

No ground truth available — score based on how factual and relevant the answer seems.
"""

    # Ask the LLM to judge
    resp = client.inference.chat_completion(
        model_id=model_id,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_prompt}
        ]
    )

    # Extract text
    text = (
        resp.completion_message.content
        if hasattr(resp, "completion_message")
        else str(resp)
    )

    # Try to parse score
    import re, json
    score = None
    reasoning = text.strip()

    # Look for JSON or number in response
    try:
        parsed = json.loads(text)
        score = parsed.get("score")
        reasoning = parsed.get("reasoning", text)
    except Exception:
        m = re.search(r"(\d{1,3})", text)
        if m:
            score = int(m.group(1))
    
    # Fallback
    if score is None:
        score = 50  # default neutral score

    verdict = (
        "Excellent" if score >= 85 else
        "Good" if score >= 70 else
        "Needs Improvement" if score >= 50 else
        "Incorrect"
    )

    return {"score": score, "verdict": verdict, "reasoning": reasoning[:300]}


In [75]:
print(llm_as_judge(client=client, model_id=model_id, query=query, answer=rag_answer, ground_truth=ground_truth))

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


{'score': 95, 'verdict': 'Excellent', 'reasoning': 'The model answer accurately captures the essential guidelines for uploading photographs for SBI RECRUITMENT OF JUNIOR ASSOCIATES (CUSTOMER SUPPORT & SALES 2025). It correctly mentions the requirements for color, background, face expression, lighting, headwear, dimensions, file size, and the need for'}


In [77]:
print(llm_as_judge(client=client, model_id=model_id, query=query, answer=without_rag_answer, ground_truth=ground_truth))

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service.lsd-llama.svc.cluster.local:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


{'score': 60, 'verdict': 'Needs Improvement', 'reasoning': '{\n  "score": 60,\n  "reasoning": "The model answer provides some correct guidelines such as file format, size, dimensions, background, and quality. However, it deviates significantly from the ground truth in several key areas:\n\n1. It incorrectly states that the candidate should not wear glasses, caps'}
