In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the CSV at this relative path into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="./datasets/final_gemini_extract.csv")

### Tag Scheme Type

In [9]:
def classify_scheme(row):
    """Label a scheme row as "Central" or "State".

    A row is "Central" when its State is exactly "All India", or when its
    Ministry holds any non-whitespace text. Every other row is "State".

    Args:
        row: Mapping-like row (for example a pandas Series) exposing the
            "State" and "Ministry" keys.

    Returns:
        str: "Central" or "State".
    """
    if row["State"] == "All India":
        return "Central"

    ministry = row["Ministry"]
    # This function is applied before the notebook fills missing values, so a
    # missing Ministry can still be NaN/None here; calling .strip() on that
    # would raise AttributeError. Treat missing as empty text instead.
    if not isinstance(ministry, str):
        ministry = "" if pd.isna(ministry) else str(ministry)

    return "Central" if ministry.strip() != "" else "State"

In [15]:
# Run classify_scheme once per row and keep the label in a new column.
df["Scheme_Level"] = df.apply(classify_scheme, axis="columns")

### Prepare Documents for RAG

#### Columns considered:
"Scheme_Name", "Eligibility", "Benefits", "Details", "Application_Process", "Documents_Required", "Ministry", "State", "URL", plus the derived "Scheme_Level" column added above

### Build Document Objects 

In [16]:
from langchain_core.documents import Document

In [17]:
# Replace missing values with empty strings, in place, so later cells see "" rather than NaN.
df.fillna(value="", inplace=True)

In [18]:
# Fields to include in the document content
def build_document(row):
    """Turn one scheme row into a LangChain Document.

    The page content is a block of "Label: value" lines, one per field,
    and the metadata carries the identifying fields alongside it.

    Args:
        row: Mapping-like row (for example a pandas Series) exposing the
            columns listed in ``labelled_columns`` below.

    Returns:
        Document: page_content holds the labelled text; metadata holds
        scheme_name, state, ministry, scheme_level and url.
    """
    # (label shown in the text, column read from the row), in output order.
    labelled_columns = [
        ("Scheme Name", "Scheme_Name"),
        ("State", "State"),
        ("Ministry", "Ministry"),
        ("Scheme Level", "Scheme_Level"),
        ("Eligibility", "Eligibility"),
        ("Benefits", "Benefits"),
        ("Details", "Details"),
        ("Application Process", "Application_Process"),
        ("Documents Required", "Documents_Required"),
        ("URL", "URL"),
    ]
    # Join the lines at column 0. An indented triple-quoted template would
    # carry the function's source indentation into every line after the first.
    content = "\n".join(
        f"{label}: {row[column]}" for label, column in labelled_columns
    )
    metadata = {
        "scheme_name": row["Scheme_Name"],
        "state": row["State"],
        "ministry": row["Ministry"],
        "scheme_level": row["Scheme_Level"],
        "url": row["URL"],
    }
    return Document(page_content=content.strip(), metadata=metadata)

# Build one LangChain Document per DataFrame row, in row order.
documents = list(map(build_document, (record for _label, record in df.iterrows())))

### Output:
`documents` is a list of `langchain_core.documents.Document` objects, one per row of `df`.

### Embed your documents using Gemini and save FAISS index

In [19]:
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS

In [20]:
# Load variables via python-dotenv, then read the Gemini key (None when it is not set).
load_dotenv()
google_api_key = os.environ.get("GEMINI_API_KEY")

In [21]:
# Gemini embedding client, configured with the key read from the environment.
embedding_model = GoogleGenerativeAIEmbeddings(
    google_api_key=google_api_key,
    model="models/embedding-001",
)


In [22]:
# Embed the documents with the Gemini model and collect them in a FAISS store.
vectorstore = FAISS.from_documents(documents=documents, embedding=embedding_model)

# Persist the store to this folder, relative to the current working directory.
vectorstore.save_local(folder_path="scheme_vector_index_gemini")