## Basic

In [None]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# --------------------------
# Load your CSV
# --------------------------
df = pd.read_csv(r"C:\Users\MISTY ROY\OneDrive\Desktop\Pythonfiles_vscode\rag_proj\scheme_data.csv")
df.fillna("", inplace=True)
df = df.drop_duplicates(subset=["slug"])

# Combine relevant columns into one text field
df["content"] = (
    df["details"] + "\n\n" +
    "Benefits: " + df["benefits"] + "\n\n" +
    "Eligibility: " + df["eligibility"] + "\n\n" +
    "Application: " + df["application"] + "\n\n" +
    "Documents Required: " + df["documents"]
)

# Add metadata
df["metadata"] = df.apply(lambda row: {
    "scheme_name": row["scheme_name"],
    "slug": row["slug"],
    "level": row["level"],
    "category": row["schemeCategory"],
    "tags": row["tags"]
}, axis=1)

print(f"Loaded CSV with {len(df)} rows")

Loaded CSV with 3397 rows


  df.fillna("", inplace=True)


In [14]:
# --------------------------
# Split long text into smaller chunks
# --------------------------
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # max characters per chunk
    chunk_overlap=50   # overlap between chunks
)

documents = []
for idx, row in df.iterrows():
    chunks = text_splitter.split_text(row["content"])
    for i, chunk in enumerate(chunks):
        if chunk.strip():  # skip empty chunks
            documents.append({
                "id": f"{row['slug']}_chunk{i+1}",
                "text": chunk,
                "metadata": row["metadata"]
            })

print(f"✅ Split into {len(documents)} text chunks")


✅ Split into 33404 text chunks


In [16]:
# --------------------------
# Initialize local embeddings
# --------------------------
embeddings = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"  # small, fast, high-quality local model
)

# # Optional test
# test_vector = embeddings.embed_documents(["Hello world"])
# print(f"Local embeddings working, vector length: {len(test_vector[0])}")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Local embeddings working, vector length: 384


In [18]:
# --------------------------
# Create Chroma vector store
# --------------------------
from tqdm import tqdm
from langchain.schema import Document

# Prepare empty list for Document objects
docs_with_vectors = []

print("Embedding documents with tqdm...")
for doc in tqdm(documents, desc="Embedding"):
    vector = embeddings.embed_documents([doc["text"]])[0]  # embed single chunk
    docs_with_vectors.append(
        Document(page_content=doc["text"], metadata=doc["metadata"])
    )

# Create Chroma vector store using precomputed embeddings
vectordb = Chroma.from_documents(
    documents=docs_with_vectors,
    embedding=embeddings,
    ids=[doc["id"] for doc in documents],
    collection_name="schemes_db",
    persist_directory="./chroma_store"
)

vectordb.persist()
print(f"Chroma vector store created with {len(documents)} documents")



Embedding documents with tqdm...


Embedding: 100%|██████████| 33404/33404 [10:23<00:00, 53.59it/s]


✅ Chroma vector store created with 33404 documents


  vectordb.persist()


In [None]:
'''better code'''

# from tqdm import tqdm
# from langchain.schema import Document

# # Precompute embeddings
# docs_with_vectors = []
# vectors = []

# print("Embedding documents with tqdm...")
# for doc in tqdm(documents, desc="Embedding"):
#     vec = embeddings.embed_documents([doc["text"]])[0]
#     vectors.append(vec)
#     docs_with_vectors.append(
#         Document(page_content=doc["text"], metadata=doc["metadata"])
#     )

# # Pass precomputed embeddings to Chroma
# vectordb = Chroma.from_documents(
#     documents=docs_with_vectors,
#     embedding=None,  # already embedded
#     ids=[doc["id"] for doc in documents],
#     collection_name="schemes_db",
#     persist_directory="./chroma_store"
# )

# vectordb._collection.add(
#     documents=[doc.page_content for doc in docs_with_vectors],
#     metadatas=[doc.metadata for doc in docs_with_vectors],
#     ids=[doc["id"] for doc in documents],
#     embeddings=vectors
# )

# vectordb.persist()


In [19]:
from langchain.vectorstores import Chroma

# Load persisted Chroma vector store
vectordb = Chroma(
    persist_directory="./chroma_store",
    embedding_function=embeddings,
    collection_name="schemes_db"
)

# Create a retriever for fetching relevant chunks
retriever = vectordb.as_retriever(search_kwargs={"k": 3})  # fetch top 3 relevant chunks


  vectordb = Chroma(


In [20]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a friendly and knowledgeable assistant specialized in Indian government schemes.

1. Greet the user first with: "Hello! How may I assist you with government schemes today?"

2. Behavior:
   - If the user's question is about one or more schemes listed in the provided context:
       - Provide a **structured answer for each relevant scheme**.
       - Include the following metadata for every scheme if available:
           - **Scheme Name**  
           - **Eligibility**  
           - **Benefits**  
           - **Application Process**  
           - **Required Documents**  
           - **Validity / Duration**  
           - **Level (Central/State)**  
           - **Scheme Category**  
           - **Tags**  
       - Present each scheme clearly, like a **numbered mini-report**.
       - If a field is missing, mention "Not available".

   - If the user's question is **not related** to any scheme in the context:
       - Respond politely: "Certainly, I don't know the answer to that. Please enter a valid question related to government schemes."

3. Always base your answer **only on the provided context**.  
   - Do not make up information.
   - If the context does not have enough details, say: "Not enough information."

Here is the information you can use:

{context}

User Question: {question}

Answer in structured, multi-scheme format:
"""
)


In [21]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.chains import RetrievalQA

# Initialize local transformer LLM
pipe = pipeline(
    "text-generation",
    model="google/flan-t5-small",  # CPU-friendly local model
    max_length=512,
    do_sample=True,
    temperature=0.7
)

llm = HuggingFacePipeline(pipeline=pipe)

# Create RetrievalQA chain with prompt template
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff",  # concatenates retrieved chunks
    chain_type_kwargs={"prompt": prompt}
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGe

In [22]:
query = "Tell me all information about Prime Minister’s Fellowship for Doctoral Research"

result = qa_chain({"query": query})

print("Answer:\n", result['result'])

print("\nSources:")
for doc in result['source_documents']:
    print("-", doc.metadata['scheme_name'], ":", doc.page_content[:200], "...")


  result = qa_chain({"query": query})
Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors


Answer:
 
You are a friendly and knowledgeable assistant specialized in Indian government schemes.

1. Greet the user first with: "Hello! How may I assist you with government schemes today?"

2. Behavior:
   - If the user's question is about one or more schemes listed in the provided context:
       - Provide a **structured answer for each relevant scheme**.
       - Include the following metadata for every scheme if available:
           - **Scheme Name**  
           - **Eligibility**  
           - **Benefits**  
           - **Application Process**  
           - **Required Documents**  
           - **Validity / Duration**  
           - **Level (Central/State)**  
           - **Scheme Category**  
           - **Tags**  
       - Present each scheme clearly, like a **numbered mini-report**.
       - If a field is missing, mention "Not available".

   - If the user's question is **not related** to any scheme in the context:
       - Respond politely: "Certainly, I don't know the 

## Basic1

In [None]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load CSV
df = pd.read_csv(r"C:\Users\MISTY ROY\OneDrive\Desktop\Pythonfiles_vscode\rag_proj\scheme_data.csv")
df.fillna("", inplace=True)
df = df.drop_duplicates(subset=["slug"])

# Combine all relevant info into pre-formatted content per scheme
def format_content(row):
    return f"""
Scheme Name: {row['scheme_name']}
Level: {row['level']}
Category: {row['schemeCategory']}
Tags: {row['tags']}

Details:
{row['details']}

Benefits:
{row['benefits']}

Eligibility:
{row['eligibility']}

Application Process:
{row['application']}

Documents Required:
{row['documents']}
"""

df["content"] = df.apply(format_content, axis=1)

# Create metadata dictionary
df["metadata"] = df.apply(lambda row: {
    "scheme_name": row["scheme_name"],
    "slug": row["slug"],
    "level": row["level"],
    "category": row["schemeCategory"],
    "tags": row["tags"]
}, axis=1)

# Split content into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50
)

documents = []
for idx, row in df.iterrows():
    chunks = text_splitter.split_text(row["content"])
    for i, chunk in enumerate(chunks):
        if chunk.strip():
            documents.append({
                "id": f"{row['slug']}_chunk{i+1}",
                "text": chunk,
                "metadata": row["metadata"]
            })

print(f"✅ Split {len(df)} schemes into {len(documents)} text chunks")

  df.fillna("", inplace=True)


✅ Split 3397 schemes into 54772 text chunks


In [5]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Initialize local embeddings
embeddings = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"  
)


  embeddings = SentenceTransformerEmbeddings(


In [6]:
from tqdm import tqdm
from langchain.schema import Document
from langchain.vectorstores import Chroma

# --------------------------
# Step 2: Precompute embeddings in batches
# --------------------------
docs_with_vectors = []
vectors = []

batch_size = 1000  # adjust based on memory/speed
print("Embedding chunks with tqdm...")

for i in tqdm(range(0, len(documents), batch_size), desc="Embedding batches"):
    batch_docs = documents[i:i+batch_size]
    texts = [doc["text"] for doc in batch_docs]
    
    # Embed the batch
    batch_vectors = embeddings.embed_documents(texts)
    vectors.extend(batch_vectors)
    
    # Create Document objects
    for doc in batch_docs:
        docs_with_vectors.append(Document(page_content=doc["text"], metadata=doc["metadata"]))

# --------------------------
# Step 3: Create Chroma collection with precomputed embeddings in batches
# --------------------------
vectordb = Chroma(
    collection_name="schemes_db",       
    persist_directory="./chroma_store2",
    embedding_function=None              
)

add_batch_size = 5000 
print("Adding documents to Chroma in batches...")

for i in tqdm(range(0, len(docs_with_vectors), add_batch_size), desc="Adding batches"):
    batch_docs = docs_with_vectors[i:i+add_batch_size]
    vectordb._collection.add(
        documents=[doc.page_content for doc in batch_docs],
        metadatas=[doc.metadata for doc in batch_docs],
        ids=[doc.metadata["slug"] + f"_chunk{j+1}" for j, doc in enumerate(batch_docs)],  # ensure unique IDs
        embeddings=vectors[i:i+add_batch_size]
    )

# Persist the vector store
vectordb.persist()
print(f"✅ Chroma vector store created with {len(documents)} chunks")


Embedding chunks with tqdm...


Embedding batches: 100%|██████████| 55/55 [05:35<00:00,  6.10s/it]
  vectordb = Chroma(


Adding documents to Chroma in batches...


Adding batches: 100%|██████████| 11/11 [00:31<00:00,  2.87s/it]

✅ Chroma vector store created with 54772 chunks



  vectordb.persist()


In [10]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.prompts import PromptTemplate

# --------------------------
# 1️⃣ Load embeddings (same as used for precomputing)
# --------------------------
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# --------------------------
# 2️⃣ Load existing Chroma collection
# --------------------------
vectordb = Chroma(
    collection_name="schemes_db",          # your collection name
    persist_directory="./chroma_store2",   # directory where you saved vectors
    embedding_function=embeddings           # needed for query embedding
)

retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# --------------------------
# 3️⃣ Prepare your Prompt Template
# --------------------------
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a friendly and knowledgeable assistant specialized in Indian government schemes.

1. Greet the user first with: "Hello! How may I assist you with government schemes today?"

2. Behavior:
   - If the user's question is about one or more schemes listed in the provided context:
       - Provide a **structured answer for each relevant scheme**.
       - Include the following metadata for every scheme if available:
           - **Scheme Name**  
           - **Eligibility**  
           - **Benefits**  
           - **Application Process**  
           - **Required Documents**  
           - **Validity / Duration**  
           - **Level (Central/State)**  
           - **Scheme Category**  
           - **Tags**  
       - Present each scheme clearly, like a **numbered mini-report**.
       - If a field is missing, mention "Not available".

   - If the user's question is **not related** to any scheme in the context:
       - Respond politely: "Certainly, I don't know the answer to that. Please enter a valid question related to government schemes."

3. Always base your answer **only on the provided context**.  
   - Do not make up information.
   - If the context does not have enough details, say: "Not enough information."

Here is the information you can use:

{context}

User Question: {question}

Answer in structured, multi-scheme format:
"""
)

# --------------------------
# 4️⃣ Initialize local LLM
# --------------------------
pipe = pipeline(
    "text-generation",
    model="google/flan-t5-small",  # CPU-friendly local model
    max_length=512,
    do_sample=True,
    temperature=0.7
)
llm = HuggingFacePipeline(pipeline=pipe)

# --------------------------
# 5️⃣ Create RetrievalQA chain
# --------------------------
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# --------------------------
# 6️⃣ Query example
# --------------------------
query = "Tell me all information about Prime Minister’s Fellowship for Doctoral Research"
result = qa_chain.invoke({"query": query})  # use invoke() instead of deprecated __call__

print("Answer:\n", result['result'])
print("\nSources:")
for doc in result['source_documents']:
    print("-", doc.metadata['scheme_name'], ":", doc.page_content[:200], "...")


Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaFor

Answer:
 
You are a friendly and knowledgeable assistant specialized in Indian government schemes.

1. Greet the user first with: "Hello! How may I assist you with government schemes today?"

2. Behavior:
   - If the user's question is about one or more schemes listed in the provided context:
       - Provide a **structured answer for each relevant scheme**.
       - Include the following metadata for every scheme if available:
           - **Scheme Name**  
           - **Eligibility**  
           - **Benefits**  
           - **Application Process**  
           - **Required Documents**  
           - **Validity / Duration**  
           - **Level (Central/State)**  
           - **Scheme Category**  
           - **Tags**  
       - Present each scheme clearly, like a **numbered mini-report**.
       - If a field is missing, mention "Not available".

   - If the user's question is **not related** to any scheme in the context:
       - Respond politely: "Certainly, I don't know the 

In [1]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.prompts import PromptTemplate

# 1. Load embeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# 2. Load Chroma vector store
vectordb = Chroma(
    collection_name="schemes_db",
    persist_directory="./chroma_store2",
    embedding_function=embeddings
)

retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# 3. Prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""Hello! How may I assist you with government schemes today?

Answer ONLY using the provided context. 
If context is insufficient, reply: "Not enough information."

Context:
{context}

User Question: {question}

Answer in structured format:
- Scheme Name
- Eligibility
- Benefits
- Application Process
- Required Documents
- Validity / Duration
- Level (Central/State)
- Category
- Tags
"""
)

# 4. Initialize local model (seq2seq style)
pipe = pipeline(
    "text2text-generation",       # FIXED here
    model="google/flan-t5-base",  # use base for longer context
    max_length=512,
    temperature=0.7
)
llm = HuggingFacePipeline(pipeline=pipe)

# 5. RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# 6. Query
query = "Tell me all information about Prime Minister’s Fellowship for Doctoral Research"
result = qa_chain.invoke({"query": query})

print("Answer:\n", result['result'])
print("\nSources:")
for doc in result['source_documents']:
    print("-", doc.metadata['scheme_name'], ":", doc.page_content[:200], "...")


  from .autonotebook import tqdm as notebook_tqdm
  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  vectordb = Chroma(
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOS

Answer:
 Scheme Name - Eligibility - Benefits - Application Process - Required Documents - Validity / Duration - Level (Central/State) - Category - Tags

Sources:
- Prime Minister’s Fellowship for Doctoral Research : Features of the Fellowship: In addition to the attractive scholarship, the Prime Minister’s Fellowship emphasizes providing a unique and invigorating experience to selected fellows. It ensures the bes ...
- Prime Minister’s Fellowship for Doctoral Research : Prime Minister’s Fellowship for Doctoral Research scheme is a prestigious initiative of the Science and Engineering Research Board (SERB), Department of Science & Technology, Government of India towar ...
- Prime Minister’s Fellowship for Doctoral Research : Scheme Name: Prime Minister’s Fellowship for Doctoral Research
Level: Central
Category: Education & Learning, Science, IT & Communications
Tags: Fellowship, Doctoral, Research, SERB, PhD, Fellow ...
- KSCSTE Post-Doctoral Fellowship Programme : the quality of the p