In [4]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

# Load CSV
df = pd.read_csv(r"C:\Users\MISTY ROY\OneDrive\Desktop\Pythonfiles_vscode\rag_proj\scheme_data.csv")
df.fillna("", inplace=True)
df = df.drop_duplicates(subset=["slug"])

# Combine all relevant info into pre-formatted content per scheme
def format_content(row):
    return f"""
Scheme Name: {row['scheme_name']}
Level: {row['level']}
Category: {row['schemeCategory']}
Tags: {row['tags']}

Details:
{row['details']}

Benefits:
{row['benefits']}

Eligibility:
{row['eligibility']}

Application Process:
{row['application']}

Documents Required:
{row['documents']}
"""

df["content"] = df.apply(format_content, axis=1)

# Create metadata dictionary
df["metadata"] = df.apply(lambda row: {
    "scheme_name": row["scheme_name"],
    "slug": row["slug"],
    "level": row["level"],
    "category": row["schemeCategory"],
    "tags": row["tags"]
}, axis=1)

# Split content into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # adjusted for GPT-Neo-125M context
    chunk_overlap=50
)

documents = []
print("Splitting schemes into text chunks...")
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Splitting"):
    chunks = text_splitter.split_text(row["content"])
    for i, chunk in enumerate(chunks):
        if chunk.strip():
            documents.append({
                "id": f"{row['slug']}_chunk{i+1}",
                "text": chunk,
                "metadata": row["metadata"]
            })

print(f"✅ Split {len(df)} schemes into {len(documents)} text chunks")


  df.fillna("", inplace=True)


Splitting schemes into text chunks...


Splitting: 100%|██████████| 3397/3397 [00:01<00:00, 2055.12it/s]

✅ Split 3397 schemes into 45243 text chunks





In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma
from tqdm import tqdm
from langchain_huggingface import HuggingFacePipeline


# Initialize embeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Convert to LangChain Document objects
docs_with_metadata = [
    Document(page_content=doc["text"], metadata=doc["metadata"])
    for doc in documents
]

# Initialize empty Chroma store
vectordb = Chroma(
    collection_name="schemes_db",
    persist_directory="./chroma_store3",
    embedding_function=embeddings
)

# Batch size for embedding + adding
batch_size = 500
print("Embedding and adding documents in batches...")

for i in tqdm(range(0, len(docs_with_metadata), batch_size), desc="Embedding batches"):
    batch = docs_with_metadata[i : i + batch_size]
    vectordb.add_documents(batch)

# Persist DB
vectordb.persist()
print(f"✅ Chroma vector store created with {len(documents)} chunks")

Embedding and adding documents in batches...


Embedding batches: 100%|██████████| 91/91 [08:01<00:00,  5.29s/it]

✅ Chroma vector store created with 45243 chunks



  vectordb.persist()


In [7]:
from transformers import pipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from tqdm import tqdm
from langchain_huggingface import HuggingFacePipeline


# 1. Reload embeddings (must match stored DB)
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# 2. Load Chroma vector DB
vectordb = Chroma(
    collection_name="schemes_db",
    persist_directory="./chroma_store3",
    embedding_function=embeddings
)
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# 3. Prompt
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template = """
You are a government schemes assistant.
Answer ONLY using the provided context.
If information is missing, write "Not available".

Context:
{context}

User Question:
{question}

Answer in this exact JSON format:
{{
  "Scheme Name": "...",
  "Eligibility": "...",
  "Benefits": "...",
  "Application Process": "...",
  "Required Documents": "...",
  "Validity / Duration": "...",
  "Level": "...",
  "Category": "...",
  "Tags": "..."
}}
"""
)


# 4. Load local GPT-Neo model
print("Loading GPT-Neo-125M pipeline...")
pipe = pipeline(
    "text-generation",
    model="EleutherAI/gpt-neo-1.3B",
    device=-1,           # -1 for CPU, 0 for GPU
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7
)

llm = HuggingFacePipeline(pipeline=pipe)

# 5. RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# 6. Query example
query = "Tell me all information about Prime Minister’s Fellowship for Doctoral Research"
print(f"Running query: {query}")

result = qa_chain.invoke({"query": query})

print("\nAnswer:\n", result['result'])
print("\nSources:")
for doc in result['source_documents']:
    print("-", doc.metadata['scheme_name'], ":", doc.page_content[:200], "...")


Loading GPT-Neo-125M pipeline...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Running query: Tell me all information about Prime Minister’s Fellowship for Doctoral Research

Answer:
 
You are a government schemes assistant.
Answer ONLY using the provided context.
If information is missing, write "Not available".

Context:
on industrially relevant subject areas. Key Features of the Fellowship: In addition to the attractive scholarship, the Prime Minister’s Fellowship emphasizes providing a unique and invigorating experience to selected fellows. It ensures the best national and international exposure for them and provides mentoring through industry and academic experts through the mechanism of annual review meetings. In addition, periodic mentorship sessions are also organized with the help of expert external

Prime Minister’s Fellowship for Doctoral Research scheme is a prestigious initiative of the Science and Engineering Research Board (SERB), Department of Science & Technology, Government of India towards the advancement of university research engagements in l