# Installation

In [1]:
%%writefile requirements.txt
datasets
transformers
sentence-transformers
chromadb
langchain
langchain-community
langchain-chroma
langchain-huggingface
fastapi
uvicorn
python-dotenv
bitsandbytes
accelerate
nest-asyncio
torch


Writing requirements.txt


In [2]:
!pip install -r requirements.txt

Collecting chromadb (from -r requirements.txt (line 4))
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-community (from -r requirements.txt (line 6))
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-chroma (from -r requirements.txt (line 7))
  Downloading langchain_chroma-1.1.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-huggingface (from -r requirements.txt (line 8))
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting bitsandbytes (from -r requirements.txt (line 12))
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pybase64>=1.4.1 (from chromadb->-r requirements.txt (line 4))
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb->-r

# Modules

## Embedding Model

In [3]:
%%writefile embedding_model.py
import os
from langchain_huggingface import HuggingFaceEmbeddings

hf_token = os.getenv("HF_TOKEN")

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={'device': 'cuda'} 
)

print(f"[INFO] Embedding model '{MODEL_NAME}' loaded")

Writing embedding_model.py


In [4]:
from kaggle_secrets import UserSecretsClient
import os

user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")

!python embedding_model.py

2025-12-27 00:14:55.765224: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766794495.955741     135 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766794496.012874     135 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766794496.461165     135 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766794496.461216     135 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766794496.461220     135 computation_placer.cc:177] computation placer alr

## LLM Model

In [5]:
%%writefile llm_model.py
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

hf_token = os.getenv("HF_TOKEN")

LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f"[INFO] Loading tokenizer for Mistral...")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, token=hf_token)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"[INFO] Loading Mistral-7B-v0.2 in 4-bit...")
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token
)

print(f"[INFO] LLM model loaded successfully on {model.device}")

Writing llm_model.py


In [6]:
import os
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")
!python llm_model.py

[INFO] Loading tokenizer for Mistral...
tokenizer_config.json: 2.10kB [00:00, 9.26MB/s]
tokenizer.model: 100%|███████████████████████| 493k/493k [00:00<00:00, 1.72MB/s]
tokenizer.json: 1.80MB [00:00, 40.3MB/s]
special_tokens_map.json: 100%|█████████████████| 414/414 [00:00<00:00, 3.85MB/s]
[INFO] Loading Mistral-7B-v0.2 in 4-bit...
config.json: 100%|█████████████████████████████| 596/596 [00:00<00:00, 5.23MB/s]
2025-12-27 00:15:25.551240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766794525.572580     164 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766794525.579411     164 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766794525

## Preprocessing

In [7]:
%%writefile preprocess.py
import os
import json
from datasets import load_dataset
try:
    from embedding_model import embedder
    from llm_model import tokenizer
    print("[SUCCESS] Imported embedder and tokenizer")
except ImportError:
    print("[ERROR] Could not find embedding_model.py or llm_model.py. Make sure to run those cells first!")

print("Tokenizer loaded successfully")


def load_triviaqa_subset(n_samples=700):
    print(f"[load_triviaqa_subset] Loading {n_samples} samples from TriviaQA...")
    dataset = load_dataset("mandarjoshi/trivia_qa", "unfiltered", split=f"train[:{n_samples}]")
    docs = []

    for item in dataset:
        sr = item.get("search_results", {})
        contexts = sr.get("search_context", [])

        for ctx in contexts:
            if ctx and ctx.strip():
                docs.append({
                    "doc_id": item["question_id"],
                    "text": ctx.strip()
                })

    print(f"[load_triviaqa_subset] Docs extracted: {len(docs)}")
    # if docs:
    #     print("[load_triviaqa_subset] Example doc:", docs[0])
    return docs

import re

def clean_text(text, min_length=15):
    if not text:
        return None
    cleaned = text.strip()
    cleaned = re.sub(r'\s+', ' ', cleaned)
    if len(cleaned) < min_length:
        return None

    return cleaned

def chunk_text(text, min_tokens=50, max_tokens=256, overlap=50): 
    token_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(token_ids) < max_tokens:
       return [tokenizer.decode(token_ids)]
    
    chunks = []
    start = 0
    while start < len(token_ids):
        end = start + max_tokens
        chunk_ids = token_ids[start:end]
        chunks.append(tokenizer.decode(chunk_ids))
        start += max_tokens - overlap 
    return chunks

def preprocess_triviaqa(n_samples=700, min_tokens_chunk=50, max_tokens_chunk=256, overlap_chunk=50):
    print(f"[preprocess_triviaqa] Starting preprocessing for {n_samples} samples...")
    docs = load_triviaqa_subset(n_samples)
    final_chunks = []

    for idx, d in enumerate(docs):
        cleaned = clean_text(d["text"])
        if not cleaned:
            print(f"[clean_text] Doc {d['doc_id']} skipped (too short)")
            continue

        chunks = chunk_text(cleaned, min_tokens=min_tokens_chunk, max_tokens=max_tokens_chunk, overlap=overlap_chunk)
        if not chunks:
            print(f"[chunk_text] Doc {d['doc_id']} produced 0 chunks")
            continue

        for i, ch in enumerate(chunks):
            final_chunks.append({
                "doc_id": d["doc_id"],
                "chunk_id": i,
                "text": ch
            })
        if idx < 3: 
            print(f"[preprocess_triviaqa] Doc {d['doc_id']} → {len(chunks)} chunks")

    print(f"[preprocess_triviaqa] Total chunks generated: {len(final_chunks)}")
    if final_chunks:
        print("[preprocess_triviaqa] Example chunk:", final_chunks[0])
    return final_chunks

def store_chunks_metadata(final_chunks, output_file="chunks_metadata.json"):
    metadata_list = []

    for chunk in final_chunks:
        text = chunk.get("text", "")
        meta = {
            "doc_id": chunk.get("doc_id"),
            "chunk_id": chunk.get("chunk_id"),
            "text": text,  
            "text_length": len(text),
            "num_tokens": len(tokenizer(text, add_special_tokens=False)["input_ids"])
        }
        metadata_list.append(meta)
    
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(metadata_list, f, ensure_ascii=False, indent=2)
    
    print(f"[store_chunks_metadata] Stored metadata for {len(metadata_list)} chunks in {output_file}")
    return metadata_list




#TEST
if __name__ == "__main__":
    print("[main] Running full preprocessing test...")
    final_chunks = preprocess_triviaqa(n_samples=700, min_tokens_chunk=50, max_tokens_chunk=256, overlap_chunk=50)

    print(f"[main] Test complete. Total chunks generated: {len(final_chunks)}")
    if final_chunks:
        print("[main] Example chunk:", final_chunks[0])
    chunks_metadata = store_chunks_metadata(final_chunks)
    print(chunks_metadata[:2])  

Writing preprocess.py


In [8]:
import os
import sys
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")
sys.path.append('/kaggle/working')

!python preprocess.py

2025-12-27 00:16:54.560258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766794614.581264     255 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766794614.588053     255 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766794614.604829     255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766794614.604858     255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766794614.604865     255 computation_placer.cc:177] computation placer alr

### Analyzes document length before and after chunking

In [9]:
import json
import numpy as np
from preprocess import load_triviaqa_subset


# Load documents
raw_docs = load_triviaqa_subset(n_samples=700)

original_lengths = [
    len(doc["text"].split())
    for doc in raw_docs
]
orig_p50 = np.percentile(original_lengths, 50)
orig_p90 = np.percentile(original_lengths, 90)
orig_max = np.max(original_lengths)

print("\n---Original Paragraph Analysis (Before Chunking) ---")
print(f"Total Paragraphs: {len(original_lengths)}")
print(f"Median Length (P50): {orig_p50:.2f} words")
print(f"90th Percentile (P90): {orig_p90:.2f} words")
print(f"Max Paragraph Length: {orig_max} words")


# Load CHUNKED documents
with open("chunks_metadata.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

chunk_word_counts = [
    len(chunk["text"].split())
    for chunk in chunks
]

chunk_avg = np.mean(chunk_word_counts)
chunk_p90 = np.percentile(chunk_word_counts, 90)
chunk_max = np.max(chunk_word_counts)

print("\n--- Chunk Statistics (After Chunking) ---")
print(f"Total Chunks: {len(chunks)}")
print(f"Average Words per Chunk: {chunk_avg:.2f}")
print(f"90th Percentile (P90): {chunk_p90:.2f} words")
print(f"Max Words in a Chunk: {chunk_max} words")


# Comparison Summary
print("\n--- Chunking Effect Summary ---")
print(f"Original P90 → {orig_p90:.2f} words")
print(f"Chunked  P90 → {chunk_p90:.2f} words")
print(f"Reduction Factor ≈ {(orig_p90 / chunk_p90):.2f}x")


2025-12-27 00:36:17.150329: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766795777.176099      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766795777.183487      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766795777.212030      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766795777.212051      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766795777.212054      55 computation_placer.cc:177] computation placer alr

[INFO] Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded
[INFO] Loading tokenizer for Mistral...
[INFO] Loading Mistral-7B-v0.2 in 4-bit...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[INFO] LLM model loaded successfully on cuda:0
[SUCCESS] Imported embedder and tokenizer
Tokenizer loaded successfully
[load_triviaqa_subset] Loading 700 samples from TriviaQA...


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

[load_triviaqa_subset] Docs extracted: 5541

---Original Paragraph Analysis (Before Chunking) ---
Total Paragraphs: 5541
Median Length (P50): 1038.00 words
90th Percentile (P90): 6098.00 words
Max Paragraph Length: 89615 words

--- Chunk Statistics (After Chunking) ---
Total Chunks: 128151
Average Words per Chunk: 158.20
90th Percentile (P90): 193.00 words
Max Words in a Chunk: 234 words

--- Chunking Effect Summary ---
Original P90 → 6098.00 words
Chunked  P90 → 193.00 words
Reduction Factor ≈ 31.60x


## Vector DB

In [11]:
# #Delete chroma_db
# import shutil
# import os
# db_path = "/kaggle/working/chroma_db"

# if os.path.exists(db_path):
#     shutil.rmtree(db_path)
#     print(f"Deleted old database at {db_path}")
# else:
#     print("No old database found, starting fresh!")

In [10]:
%%writefile vector_store.py
import json
import uuid
import os
import shutil
from langchain_community.vectorstores import Chroma
from embedding_model import embedder 

persist_dir = "/kaggle/working/chroma_db"
if os.path.exists(persist_dir):
    shutil.rmtree(persist_dir)
    print(f"[INFO] Deleted old database at {persist_dir}")

if not os.path.exists("chunks_metadata.json"):
    print("[ERROR] chunks_metadata.json not found! Please run preprocess.py first.")
else:
    with open("chunks_metadata.json", "r", encoding="utf-8") as f:
        chunks = json.load(f)
    print(f"[INFO] Loaded {len(chunks)} chunks from JSON")

    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedder
    )

    texts = [chunk["text"] for chunk in chunks]
    ids = [f'{chunk["doc_id"]}_{chunk["chunk_id"]}_{uuid.uuid4().hex}' for chunk in chunks]

    print(f"[INFO] Generating embeddings for {len(texts)} texts using GPU...")
    all_embeddings = embedder.embed_documents(texts)

    batch_size = 2000
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        batch_ids = ids[start:end]
        batch_texts = texts[start:end]
        batch_embeddings = all_embeddings[start:end]

        vectorstore._collection.add(
            ids=batch_ids,
            documents=batch_texts,
            embeddings=batch_embeddings
        )
        print(f"[INFO] Stored batch {start}–{min(end, len(texts))} items")

    vectorstore.persist()
    print(f"[INFO] Stored total {len(texts)} embeddings in ChromaDB at '{persist_dir}'")

Writing vector_store.py


In [11]:
import os
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")

!python vector_store.py

2025-12-27 00:37:02.695568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766795822.717804     471 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766795822.724588     471 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766795822.744476     471 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766795822.744509     471 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766795822.744514     471 computation_placer.cc:177] computation placer alr

### Test chroma_db Retrival

In [19]:
from langchain_community.vectorstores import Chroma
from embedding_model import embedder
import os

persist_dir = "/kaggle/working/chroma_db"

if os.path.exists(persist_dir):
    db = Chroma(persist_directory=persist_dir, embedding_function=embedder)
    
    query = "Who wrote The Railway Children?"
    results = db.similarity_search(query, k=5) 

    print(f"🔍 Searching for: '{query}'\n" + "="*50)
    for i, res in enumerate(results):
        print(f"📄 Result {i+1}:")
        print(f"{res.page_content[:400]}...")
        print("-" * 30)
else:
    print("❌ Database not found! Please run 'python vector_store.py' first.")

🔍 Searching for: 'Who wrote The Railway Children?'
📄 Result 1:
constructed film by John Schlesinger which explores the relationship between three people and the break-up of two love affairs. Peter Finch plays a homosexual doctor in his 40s and Glenda Jackson an employment counsellor in her 30s. Both are in love with Murray Head's boyish sculptor; he divides his attentions between both of them without showing a preference. Great performances all round. 66. The...
------------------------------
📄 Result 2:
One of six controversial, stylish Mitford sisters, she spied on her siblings for MI5 because of their Nazi sympathies. Iris Murdoch Novelist & philosopher, 1919-1999 Particularly admired in the Sixties and Seventies, the philosopher and novelist is considered one of the greatest post-war writers. She won the Booker Prize in 1978 for The Sea, with several of her other works adapted for the screen. ...
------------------------------
📄 Result 3:
lo-American popular culture. Whether Lord P

## Pipeline

In [13]:
%%writefile pipeline.py
import time
import re
from transformers import pipeline
from llm_model import model, tokenizer
from embedding_model import embedder
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate


# LLM pipeline 
raw_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=64,         
    temperature=0.0,            
    do_sample=False,
    return_full_text=False
)

llm_pipeline = HuggingFacePipeline(pipeline=raw_pipeline)


#vector DB
persist_dir = "/kaggle/working/chroma_db"
vector_db = Chroma(
    persist_directory=persist_dir,
    embedding_function=embedder
)

retriever = vector_db.as_retriever(search_kwargs={"k":5})


# prompt 
prompt_template = """<s>[INST]
You are a STRICT answer-only bot.

Rules:
- Answer ONLY using the context
- Answer must be very short (1–3 words)
- If the answer is not explicitly present, reply EXACTLY with:
Not found in context
- No explanations
- No full sentences

Context:
{context}

Question:
{question}
[/INST]
Answer:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)


# QA CHAIN  
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_pipeline,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)


#  RAG Function
def run_rag(question: str):
    start_time = time.time()
    if question:
        question = question.strip()  
        question = re.sub(r'\s+', ' ', question)  

    # Handle empty or None queries 
    if not question or not question.strip():
        latency_ms = int((time.time() - start_time) * 1000)
        print(f"[WARNING] Empty or None query received. Latency: {latency_ms}ms")
        return {
            "question": question,
            "answer": "Empty query received",
            "retrieved_context": "",
            "latency_ms": latency_ms
        }

    try:
        # Retrieve context
        docs = retriever.invoke(question)
        context_string = "\n".join(doc.page_content.strip() for doc in docs)

        # Generate answer
        response = qa_chain.invoke({"query": question})
        raw_answer = response["result"].strip()
        answer = raw_answer.split("\n")[0].strip()

        # Clean answer 
        if (
            not answer
            or len(answer.split()) > 3
            or "not found" in answer.lower()
            or "context" in answer.lower()
            or "does not" in answer.lower()
        ):
            final_answer = "Not found in context"
        else:
            final_answer = answer.rstrip(".")

        latency_ms = int((time.time() - start_time) * 1000)
        return {
            "question": question,
            "answer": final_answer,
            "retrieved_context": context_string,
            "latency_ms": latency_ms
        }

    except Exception as e:
        latency_ms = int((time.time() - start_time) * 1000)
        print(f"[ERROR] Exception during processing: {str(e)}. Latency: {latency_ms}ms")
        return {
            "question": question,
            "answer": "Not found in context",
            "retrieved_context": "",
            "latency_ms": latency_ms
        }


# TEST 
if __name__ == "__main__":
    query = "Miami Beach in Florida borders which ocean?"
    result = run_rag(query)
    
    print("--- RAG Result ---")
    print(f"Question: {result['question']}")
    print(f"Answer: {result['answer']}")
    print(f"Latency: {result['latency_ms']} ms")
    print("\n--- Context Found (Snippets) ---")
    print(result["retrieved_context"][:500])


Writing pipeline.py


In [14]:
import os
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")

!python pipeline.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2025-12-27 00:45:52.804101: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766796352.826913     534 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766796352.833504     534 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766796352.851375     534 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766796352.851403     534 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766796352.851407     534 computation_placer.cc:177] computation placer alr

# APP

In [15]:
%%writefile app.py
import time
import json
import requests
import nest_asyncio
import uvicorn
from threading import Thread
from fastapi import FastAPI
from pydantic import BaseModel
from pipeline import run_rag   

app = FastAPI()

class QueryRequest(BaseModel):
    question: str

class QueryResponse(BaseModel):
    question: str
    answer: str
    retrieved_context: str 
    latency_ms: int

@app.post("/query", response_model=QueryResponse)
def query_endpoint(req: QueryRequest):
    result = run_rag(req.question)
    return QueryResponse(
        question=result.get("question", req.question),
        answer=result.get("answer", "No answer generated"),
        retrieved_context=result.get("retrieved_context", ""),
        latency_ms=result.get("latency_ms", 0)
    )

nest_asyncio.apply()

def run_api():
    uvicorn.run(app, host="0.0.0.0", port=8050, log_level="error")

thread = Thread(target=run_api, daemon=True)
thread.start()

print("[INFO] Waiting for server to stabilize...")
time.sleep(15) 

url = "http://127.0.0.1:8050/query"
payload = {"question": "Which country left the Commonwealthin 1972 and rejoined in 1989?"}

try:
    resp = requests.post(url, json=payload)
    print("POST status:", resp.status_code)
    if resp.status_code == 200:
        print("POST response:")
        print(json.dumps(resp.json(), indent=4))
    else:
        print("Error Response:", resp.text)
except Exception as e:
    print(f"[ERROR] Connection failed: {e}")

Writing app.py


In [16]:
!python app.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2025-12-27 00:46:40.140243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766796400.161364     599 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766796400.168217     599 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766796400.184573     599 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766796400.184603     599 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766796400.184607     599 computation_placer.cc:177] computation placer alr

# Evaluation

In [20]:
import pandas as pd
import time
import re
from pipeline import run_rag

evaluation_set = [
    {"question": "Which number Beethoven symphony is known as 'The Pastoral'?", "ground_truth": "Sixth"},
    {"question": "Miami Beach in Florida borders which ocean?", "ground_truth": "Atlantic"},
    {"question": "What is the name of the perfume launched by British boyband JLS in January 2013?", "ground_truth": "Love"},
    {"question": "Caroline of Brunswick was the queen consort of which British King?", "ground_truth": "George IV"},
    {"question": "What is the official march of the Royal Navy?", "ground_truth": "Heart of Oak"},
    {"question": "Technically a shoal of fish becomes a school of fish when it is?", "ground_truth": "Swimming in the same direction"},
    {"question": "On which island was the famous photograph taken showing US Marines raising the US flag over Mt Suribachi in February 1945?", "ground_truth": "Iwo Jima"},
    {"question": "What was the first name of the character played by John Travolta in Saturday Night Fever?", "ground_truth": "Tony (Manero)"},
    {"question": "Jonas Salk developed a vaccine against what?", "ground_truth": "Polio"},
    {"question": "Who is said to have cut the Gordian Knot?", "ground_truth": "Alexander the Great"},
    {"question": "The Italian cheese called dolcelatte translates into English as what?", "ground_truth": "Sweet milk"},
    {"question": "What is the title of the last Harry Potter novel, published in 2007?", "ground_truth": "Harry Potter and the Deathly Hallows"},
    {"question": "Who was the first professional cricketer to captain England?", "ground_truth": "Len Hutton"},
    {"question": "Which country left the Commonwealth in 1972 and rejoined in 1989?", "ground_truth": "Pakistan"},
    {"question": "Wisent is an alternative name for which animal?", "ground_truth": "(European) Bison"},
    {"question": "The site of Carthage is now in a suburb of which modem capital city?", "ground_truth": "Tunis"},
    {"question": "In which country is the annual International Alphorn Festival held?", "ground_truth": "SWITZERLAND"},
    {"question": "David Balfour and Alan Breck are characters in books by which author?", "ground_truth": "ROBERT LOUIS STEVENSON"},
    {"question": "High Willhays is the highest point of what National Park?", "ground_truth": "DARTMOOR"},
    {"question": "In 1973 the Paris Peace Accords were held in an attempt to end which war?", "ground_truth": "Vietnam"}
]

def normalize(text):
    """Remove punctuation, convert to lowercase, and strip spaces"""
    return re.sub(r'[^\w\s]', '', text).lower().strip() if text else ''

def evaluate_system(test_set):
    results = []
    latencies = []

    print(f"Starting Evaluation on {len(test_set)} questions...\n")

    for item in test_set:
        q = item["question"]
        gt = item["ground_truth"]

        res = run_rag(q)
        generated_ans = res.get("answer", "").strip()
        context = res.get("retrieved_context", "")
        latency = res.get("latency_ms", 0)

        latencies.append(latency)

        # Normalize for comparison
        gen_norm = normalize(generated_ans)
        gt_norm = normalize(gt)
        context_norm = normalize(context)

        relevance = "Yes" if gt_norm in context_norm else "No"
        context_ok = "Yes" if gt_norm in context_norm else "No"

        # Evaluate correctness
        if generated_ans == "Not found in context":
            status = "Incorrect"
        elif not context_ok:
            status = "Incorrect"
        elif gen_norm == gt_norm:
            status = "Correct"
        elif gt_norm in gen_norm or gen_norm in gt_norm:
            status = "Partially Correct"
        else:
            status = "Incorrect"

        results.append({
            "Question": q,
            "Ground Truth": gt,
            "RAG Answer": generated_ans,
            "Context Correct?": context_ok,
            "Evaluation Status": status,
            "Latency (ms)": latency,
            "Relevance": relevance
        })

    df = pd.DataFrame(results)
    accuracy = df["Evaluation Status"].isin(["Correct", "Partially Correct"]).sum() / len(df) * 100
    avg_latency = round(sum(latencies) / len(latencies), 2)
    relevance_pct = (df["Relevance"] == "Yes").sum() / len(df) * 100

    print("\n📊 Evaluation Summary")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Average Latency: {avg_latency} ms")
    print(f"Relevance: {relevance_pct:.2f}%")

    return df

# Run evaluation
df_final_results = evaluate_system(evaluation_set)
df_final_results


Starting Evaluation on 20 questions...


📊 Evaluation Summary
Accuracy: 55.00%
Average Latency: 4142.1 ms
Relevance: 50.00%


Unnamed: 0,Question,Ground Truth,RAG Answer,Context Correct?,Evaluation Status,Latency (ms),Relevance
0,Which number Beethoven symphony is known as 'T...,Sixth,Sixth,Yes,Correct,2019,Yes
1,Miami Beach in Florida borders which ocean?,Atlantic,Atlantic Ocean,Yes,Partially Correct,2488,Yes
2,What is the name of the perfume launched by Br...,Love,Not found in context,No,Incorrect,6056,No
3,Caroline of Brunswick was the queen consort of...,George IV,George IV,Yes,Correct,2438,Yes
4,What is the official march of the Royal Navy?,Heart of Oak,Not found in context,No,Incorrect,4867,No
5,Technically a shoal of fish becomes a school o...,Swimming in the same direction,Not found in context,No,Incorrect,8709,No
6,On which island was the famous photograph take...,Iwo Jima,Iwo Jima,Yes,Correct,2544,Yes
7,What was the first name of the character playe...,Tony (Manero),Tony Manero,Yes,Correct,4648,Yes
8,Jonas Salk developed a vaccine against what?,Polio,Polio,Yes,Correct,6224,Yes
9,Who is said to have cut the Gordian Knot?,Alexander the Great,Alexander the Great,Yes,Correct,2382,Yes
