In [1]:
import os
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# --- 1. SETUP ---

# Let python-dotenv populate the process environment, then read the Groq
# credential from it. `api_key` is None when GROQ_API_KEY is not defined.
load_dotenv()
api_key = os.environ.get("GROQ_API_KEY")

# Path of the JSON dataset that feeds the RAG pipeline
# (relative to the notebook's working directory).
file_path = "IndicLegalQA Dataset.json"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 2. Load the dataset ---
# Reads the whole JSON file at `file_path` into `data`.
#
# Failures are re-raised instead of calling exit(): inside a notebook, exit()
# asks the kernel to shut down, which throws away all state and hides the
# actual error. Raising stops this cell (and a "Run All") while keeping the
# kernel alive and the original traceback chained for debugging.
#
# Raises:
#   FileNotFoundError: `file_path` does not exist.
#   ValueError: the file exists but does not contain valid JSON.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"The file '{file_path}' was not found. "
        "Please update the 'file_path' variable with the correct location of your dataset."
    ) from err
except json.JSONDecodeError as err:
    # JSONDecodeError is itself a ValueError subclass; re-raise with guidance.
    raise ValueError(
        f"Could not decode JSON from the file '{file_path}'. "
        "Please ensure it is a valid JSON file."
    ) from err

In [3]:
# --- 3. Prepare documents ---
# Every record in `data` becomes one LangChain Document whose text is four
# labelled lines (case name, judgement date, question, answer), so the
# structured fields travel together through the RAG pipeline.
documents = [
    Document(
        page_content="\n".join(
            (
                f"Case: {record['case_name']}",
                f"Date: {record['judgement_date']}",
                f"Question: {record['question']}",
                f"Answer: {record['answer']}",
            )
        )
    )
    for record in data
]

In [4]:
# --- 4. Split documents into chunks ---
# Long documents are cut into smaller pieces for the model to process.
# Sizes are measured in characters (length_function=len): a target of 1000
# per chunk with an overlap setting of 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

In [12]:
# --- 5. Create embeddings and FAISS vector store ---
# Converts the text chunks into numerical vectors (embeddings) and stores them
# in a FAISS index for efficient similarity searching. The index is cached on
# disk so later runs can skip the expensive embedding step.
#
# NOTE: the cache is not tied to the dataset or the chunking settings. Delete
# the index folder to force a rebuild after changing either of them.
FAISS_INDEX_DIR = "faiss_index"

# FAISS.save_local() writes these two files (default index_name="index").
# Requiring both avoids treating an empty or partially written folder as a
# usable cache, which would make load_local() fail.
_faiss_index_files = [
    os.path.join(FAISS_INDEX_DIR, file_name)
    for file_name in ("index.faiss", "index.pkl")
]

print("Creating embeddings...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

if all(os.path.isfile(path) for path in _faiss_index_files):
    # SECURITY: load_local() unpickles index.pkl, and unpickling can execute
    # arbitrary code. This is acceptable only because the folder is produced
    # by this notebook; never point FAISS_INDEX_DIR at an untrusted index.
    vector_store = FAISS.load_local(
        FAISS_INDEX_DIR,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    print("Loaded existing FAISS index.")
else:
    print("Creating FAISS vector store... This might take a few minutes.")
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(FAISS_INDEX_DIR)
    print("Vector store created and saved successfully.")

Creating embeddings...
Creating FAISS vector store... This might take a few minutes.
Creating FAISS vector store... This might take a few minutes.
Vector store created and saved successfully.
Vector store created and saved successfully.


In [13]:
# --- 6. Setup the RAG chain with Groq ---
# Builds `qa_chain`: a RetrievalQA chain that fetches relevant chunks from the
# vector store and passes them, together with the question, to a Groq-hosted
# chat model.

# Fail early with an actionable message when the key is missing; otherwise the
# problem only shows up as a less obvious error from the Groq client.
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your environment or to a .env file "
        "and re-run the setup cell before building the RAG chain."
    )

# Initialize the Groq LLM with your API key and the desired model.
groq_llm = ChatGroq(groq_api_key=api_key, model_name="openai/gpt-oss-120b")

# Create a retriever from our vector store to fetch relevant documents
# (library-default search settings).
retriever = vector_store.as_retriever()

# Define the prompt template to guide the LLM's responses. {context} receives
# the retrieved chunks and {question} the user's query.
prompt_template = """
You are a legal expert on Indian law. Use the following context to answer the question.
If you don't know the answer, just say you don't know.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Create the RetrievalQA chain, which combines the retriever and the LLM.
# chain_type="stuff" places all retrieved chunks into a single prompt;
# return_source_documents=True keeps the retrieved chunks in the result so
# they can be shown next to the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=groq_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [14]:
# --- 7. Query the RAG system ---
# The question to put to the chain; edit this string to ask something else.
question = (
    "What was the final decision of the Armed Forces Tribunal "
    "in the case Union of India vs. Maj. Gen. Manomoy Ganguly?"
)

# Echo the query, then run the chain. `result` is read by the next cell
# through its "result" and "source_documents" keys.
print("\nQuerying the RAG system with: '" + question + "'")
result = qa_chain.invoke({"query": question})


Querying the RAG system with: 'What was the final decision of the Armed Forces Tribunal in the case Union of India vs. Maj. Gen. Manomoy Ganguly?'


In [15]:
# --- 8. Print the results ---
# Shows the question, the model's answer, and every retrieved source chunk
# that was supplied to the model as context.
print("\n--- Question ---", question, sep="\n")

print("\n--- Answer ---", result["result"], sep="\n")

print("\n--- Source Documents Used ---")
# Sources are numbered from 1 for display.
for idx, doc in enumerate(result["source_documents"], start=1):
    print(f"Source {idx}:\n{doc.page_content}")
    print("---------------------------------")


--- Question ---
What was the final decision of the Armed Forces Tribunal in the case Union of India vs. Maj. Gen. Manomoy Ganguly?

--- Answer ---
The Armed Forces Tribunal (AFT) directed that Maj. Gen. Manomoy Ganguly be posted as Director General Medical Services (Army) **as expeditiously as possible, and within one month of the date of its judgment**.

--- Source Documents Used ---
Source 1:
Case: Union of India vs. Maj. Gen. Manomoy Ganguly
Date: 1st August 2018
Question: What decision did the Armed Forces Tribunal (AFT) make regarding Maj. Gen. Manomoy Ganguly's promotion?
Answer: The AFT directed the appellants to post Maj. Gen. Manomoy Ganguly as DGMS (Army) as expeditiously as possible and within one month from the date of the judgement.
---------------------------------
Source 2:
Case: Union of India vs. Maj. Gen. Manomoy Ganguly
Date: 1st August 2018
Question: What was the main issue in the case Union of India vs. Maj. Gen. Manomoy Ganguly?
Answer: The main issue was Maj. G