In [1]:
# STEP 1 ‚Äî Import Required Libraries

import json              # To read JSON dataset file
import pandas as pd      # To work with dataset as table

In [2]:
import os
os.listdir()

['.ipynb_checkpoints', 'IndicLegalQA Dataset', 'Stage1_Legal_Search.ipynb']

In [3]:
os.listdir("IndicLegalQA Dataset")

['IndicLegalQA Dataset_10K.json']

In [4]:
# STEP 2 — Load Dataset From JSON File

# Open dataset file (inside folder)
with open("IndicLegalQA Dataset/IndicLegalQA Dataset_10K.json", "r", encoding="utf-8") as f:
    data = json.load(f)   # Parse the JSON records into Python objects

# Convert dataset into table format (DataFrame)
df = pd.DataFrame(data)

# The frame ends up with both a "judgment_date" and a "judgement_date" column,
# and rows with an empty "judgment_date" show up as "nan" in later lookups.
# Presumably some records simply spell the key differently, so fold the
# misspelled column into the canonical one (a no-op where it is empty too).
if "judgement_date" in df.columns:
    if "judgment_date" in df.columns:
        df["judgment_date"] = df["judgment_date"].fillna(df["judgement_date"])
    else:
        df["judgment_date"] = df["judgement_date"]

# Show dataset info
print("‚úÖ Dataset Loaded Successfully")
print("üìä Total Records:", len(df))

# Show first 5 rows
df.head()

‚úÖ Dataset Loaded Successfully
üìä Total Records: 10002


Unnamed: 0,case_name,judgment_date,question,answer,reference_pdf,judgement_date
0,Union of India vs. Maj. Gen. Manomoy Ganguly,1st August 2018,Who is the respondent in the case Union of Ind...,The respondent is Maj. Gen. Manomoy Ganguly.,,
1,Union of India vs. Maj. Gen. Manomoy Ganguly,1st August 2018,What was the main issue in the case Union of I...,The main issue was Maj. Gen. Manomoy Ganguly's...,,
2,Union of India vs. Maj. Gen. Manomoy Ganguly,1st August 2018,What decision did the Armed Forces Tribunal (A...,The AFT directed the appellants to post Maj. G...,,
3,Union of India vs. Maj. Gen. Manomoy Ganguly,1st August 2018,What was the reason given by the appellants fo...,The appellants argued that Maj. Gen. Manomoy G...,,
4,Union of India vs. Maj. Gen. Manomoy Ganguly,1st August 2018,How did the Supreme Court of India rule on the...,The Supreme Court of India upheld the AFT's de...,,


In [5]:
# STEP 3 — Extract Only Questions Column

# Cast every entry of the question column to str and collect them in a plain
# Python list (position i in the list corresponds to row i of df).
questions = list(df["question"].astype(str))

# Report how many questions were collected
print("‚úÖ Questions Extracted")
print("üìä Total Questions:", len(questions))

# Print the first question as a sanity check
print("\nüìù Sample Question:")
print(questions[0])

‚úÖ Questions Extracted
üìä Total Questions: 10002

üìù Sample Question:
Who is the respondent in the case Union of India vs. Maj. Gen. Manomoy Ganguly?


In [6]:
# STEP ‚Äî Import Text Processing Libraries

from sklearn.feature_extraction.text import TfidfVectorizer   # Converts text ‚Üí numbers
from sklearn.metrics.pairwise import cosine_similarity        # Finds similarity between questions

print("‚úÖ Search Libraries Imported Successfully")

‚úÖ Search Libraries Imported Successfully


In [7]:
# STEP — Convert Questions Into Vectors (TF-IDF)

# Build a TF-IDF vectorizer with default settings and fit it on every dataset
# question; X has one row per question and one column per learned feature.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(questions)

vectorized_count, feature_count = X.shape
print("‚úÖ Text Vectorization Completed")
print("üìä Total Questions Vectorized:", vectorized_count)
print("üìä Total Features Created:", feature_count)

‚úÖ Text Vectorization Completed
üìä Total Questions Vectorized: 10002
üìä Total Features Created: 6490


In [8]:
# STEP — Create Legal Question Search Function

def search_legal_answer(user_question):
    """Return the dataset row whose question best matches ``user_question``.

    The query is transformed with the module-level ``vectorizer`` and compared
    by cosine similarity against ``X``, the matrix of vectorized dataset
    questions. The position of the highest score is used to index ``df``
    positionally, so row ``i`` of ``X`` is assumed to describe row ``i`` of
    ``df``.

    Args:
        user_question: Free-text question to look up.

    Returns:
        The row of ``df`` at the position of the highest similarity score.
    """
    query_matrix = vectorizer.transform([user_question])
    scores = cosine_similarity(query_matrix, X)

    # argmax returns the first position of the maximum, so a query whose
    # scores are all equal (e.g. all zero) resolves to position 0.
    return df.iloc[scores.argmax()]

In [9]:
# STEP ‚Äî Test Legal Search Engine

# Ask any legal question
query = "Who is the respondent in Union of India vs Manomoy Ganguly?"

# Search answer
result = search_legal_answer(query)

# Show result
print("\n‚úÖ RESULT FOUND")
print("üìå Case Name:", result["case_name"])
print("üìÖ Judgment Date:", result["judgment_date"])
print("üí¨ Answer:", result["answer"])



‚úÖ RESULT FOUND
üìå Case Name: Union of India vs. Maj. Gen. Manomoy Ganguly
üìÖ Judgment Date: 1st August 2018
üí¨ Answer: The respondent is Maj. Gen. Manomoy Ganguly.


In [10]:
# STEP ‚Äî Test Legal Search Engine

# Ask any legal question
query = "Bcci?"

# Search answer
result = search_legal_answer(query)

# Show result
print("\n‚úÖ RESULT FOUND")
print("üìå Case Name:", result["case_name"])
print("üìÖ Judgment Date:", result["judgment_date"])
print("üí¨ Answer:", result["answer"])



‚úÖ RESULT FOUND
üìå Case Name: Union of India vs. Board of Control for Cricket in India & Ors.
üìÖ Judgment Date: 22nd August 2017
üí¨ Answer: The Supreme Court of India upheld the High Court's decision and ruled that the live broadcast signals shared by BCCI with Prasar Bharati should not be retransmitted by cable operators. The court emphasized that the signals were meant for Prasar Bharati's terrestrial and DTH networks only.


In [11]:
import sys
!{sys.executable} -m pip install sentence-transformers



In [12]:
import sys
!{sys.executable} -m pip install faiss-cpu



In [13]:
from sentence_transformers import SentenceTransformer
import faiss

print("‚úÖ Stage 2 Libraries Ready")

‚úÖ Stage 2 Libraries Ready


In [14]:
# STEP ‚Äî Load AI Embedding Model

# Load pre-trained sentence embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

print("‚úÖ AI Embedding Model Loaded")



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


‚úÖ AI Embedding Model Loaded


In [15]:
# STEP ‚Äî Convert All Questions Into AI Embeddings

question_embeddings = model.encode(
    questions,
    show_progress_bar=True
)

print("‚úÖ All Questions Converted To AI Embeddings")
print("üìä Embedding Shape:", question_embeddings.shape)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

‚úÖ All Questions Converted To AI Embeddings
üìä Embedding Shape: (10002, 384)


In [16]:
# STEP — Build FAISS AI Search Index

import numpy as np

# FAISS expects float32 input, so cast the embeddings before indexing
question_embeddings = np.ascontiguousarray(question_embeddings, dtype="float32")

# Width of one embedding vector (second axis of the matrix)
dimension = question_embeddings.shape[1]

# Flat L2 index sized to the embedding width, filled with every question vector
index = faiss.IndexFlatL2(dimension)
index.add(question_embeddings)

print("‚úÖ FAISS AI Search Index Created")
print("üìä Total Indexed Questions:", index.ntotal)


‚úÖ FAISS AI Search Index Created
üìä Total Indexed Questions: 10002


In [17]:
# STEP — Create AI Semantic Legal Search Function

def ai_search_legal_answer(user_question, top_k=1):
    """Return the dataset row nearest to ``user_question`` in embedding space.

    The question is encoded with the module-level ``model``, cast to float32
    and searched against the module-level FAISS ``index``. Only the closest
    neighbour is returned; ``top_k`` merely controls how many neighbours the
    index is asked for.

    Args:
        user_question: Free-text question to look up.
        top_k: Number of neighbours requested from the index.

    Returns:
        The row of ``df`` at the position of the nearest indexed question.

    Raises:
        LookupError: If the index reports no neighbour. FAISS pads missing
            results with -1, which ``df.iloc`` would otherwise silently
            interpret as "the last row".
    """
    query_vector = np.asarray(model.encode([user_question]), dtype="float32")

    _distances, neighbours = index.search(query_vector, top_k)

    best_position = int(neighbours[0][0])
    if best_position < 0:
        raise LookupError("FAISS index returned no match for the query")

    return df.iloc[best_position]

In [18]:
# STEP ‚Äî Test AI Semantic Legal Search

query = "Who is the opposite party in Union of India vs Manomoy case?"

result = ai_search_legal_answer(query)

print("\n‚úÖ AI SEARCH RESULT")
print("üìå Case Name:", result["case_name"])
print("üìÖ Judgment Date:", result["judgment_date"])
print("üí¨ Answer:", result["answer"])


‚úÖ AI SEARCH RESULT
üìå Case Name: Union of India vs. Maj. Gen. Manomoy Ganguly
üìÖ Judgment Date: 1st August 2018
üí¨ Answer: The respondent is Maj. Gen. Manomoy Ganguly.


In [19]:
# STEP ‚Äî Test AI Semantic Legal Search

query = "Tell me about Union of India vs Manomoy Ganguly case?"

result = ai_search_legal_answer(query)

print("\n‚úÖ AI SEARCH RESULT")
print("üìå Case Name:", result["case_name"])
print("üìÖ Judgment Date:", result["judgment_date"])
print("üí¨ Answer:", result["answer"])



‚úÖ AI SEARCH RESULT
üìå Case Name: Union of India vs. Maj. Gen. Manomoy Ganguly
üìÖ Judgment Date: 1st August 2018
üí¨ Answer: The main issue was Maj. Gen. Manomoy Ganguly's denial of promotion to the position of Director General Medical Services (Army) despite being eligible and senior.


In [20]:
# STEP ‚Äî Test AI Semantic Legal Search

query = "cases of only Board of cricket?"

result = ai_search_legal_answer(query)

print("\n‚úÖ AI SEARCH RESULT")
print("üìå Case Name:", result["case_name"])
print("üìÖ Judgment Date:", result["judgment_date"])
print("üí¨ Answer:", result["answer"])



‚úÖ AI SEARCH RESULT
üìå Case Name: Union of India vs. Board of Control for Cricket in India & Ors.
üìÖ Judgment Date: 22nd August 2017
üí¨ Answer: The main issue was whether the live broadcast signals of cricket matches shared by BCCI with Prasar Bharati should be retransmitted by cable operators, and whether this arrangement was legal under Section 3 of the Sports Broadcasting Signals (Mandatory Sharing with Prasar Bharati) Act, 2007 and Section 8 of the Cable Television Networks (Regulation) Act, 1995.


In [21]:
# Look for "BCCI" (any casing) in either the question text or the case name;
# missing values count as "no match".

in_question = df["question"].str.contains("BCCI", case=False, na=False)
in_case_name = df["case_name"].str.contains("BCCI", case=False, na=False)
bcci_cases = df.loc[in_question | in_case_name]

print("BCCI Cases Found:", len(bcci_cases))
bcci_cases.head()


BCCI Cases Found: 2


Unnamed: 0,case_name,judgment_date,question,answer,reference_pdf,judgement_date
29,Union of India vs. Board of Control for Cricke...,22nd August 2017,What did the Supreme Court of India decide reg...,The Supreme Court of India upheld the High Cou...,,
31,Union of India vs. Board of Control for Cricke...,22nd August 2017,What was the stance of the Board of Control fo...,BCCI argued that retransmitting the live signa...,,


In [22]:
# ⭐ Production Style Hybrid Search (Keyword OR Matching)

def hybrid_search_legal_answer(user_question, top_k=1):
    """Return one dataset row: the best keyword match, else a semantic match.

    The question is split on whitespace; surrounding punctuation is stripped,
    words are lower-cased, and stopwords and duplicates are dropped. Every
    remaining keyword is matched as a literal, case-insensitive substring
    against ``df["question"]`` and ``df["case_name"]``. The row that matches
    the largest number of distinct keywords is returned (earliest row on ties).

    Two defects of the earlier version are addressed here:
      * keywords were interpreted as regular expressions, so "Ganguly?" made
        the final letter optional and a lone "(" raised an error;
      * the first row matching *any* keyword was returned, so a generic word
        such as "what" decided the result instead of the specific term.

    Args:
        user_question: Free-text question to look up.
        top_k: Forwarded to ``ai_search_legal_answer`` when no keyword matches;
            the keyword path always yields a single row.

    Returns:
        A row of ``df`` for a keyword hit, otherwise whatever
        ``ai_search_legal_answer`` returns.
    """
    stopwords = {"of", "the", "is", "in", "on", "for", "to", "and", "cases"}
    punctuation = "?!.,;:\"'()[]{}"

    # Normalise the query into unique, meaningful keywords (order preserved).
    keywords = []
    seen = set()
    for raw_word in user_question.split():
        word = raw_word.strip(punctuation).lower()
        if word and word not in stopwords and word not in seen:
            seen.add(word)
            keywords.append(word)

    if keywords:
        question_text = df["question"]
        case_name_text = df["case_name"]

        # Per-row count of how many distinct keywords occur in either column.
        # regex=False keeps characters such as "?" or "(" literal.
        hit_counts = sum(
            (
                question_text.str.contains(word, case=False, na=False, regex=False)
                | case_name_text.str.contains(word, case=False, na=False, regex=False)
            ).astype(int)
            for word in keywords
        ).to_numpy()

        if hit_counts.size and hit_counts.max() > 0:
            print("üü¢ Keyword Match Found")
            # argmax picks the first row among those with the highest count.
            return df.iloc[int(hit_counts.argmax())]

    # Fallback to AI
    print("üß† Using AI Semantic Search")
    return ai_search_legal_answer(user_question, top_k=top_k)


In [23]:
query = "cases of BCCI"
result = hybrid_search_legal_answer(query)


üü¢ Keyword Match Found


In [24]:
import sys
!{sys.executable} -m pip install ollama



In [25]:
import sys
!{sys.executable} -m pip uninstall -y ollama
!{sys.executable} -m pip install ollama --upgrade


Found existing installation: ollama 0.6.1
Uninstalling ollama-0.6.1:
  Successfully uninstalled ollama-0.6.1
Collecting ollama
  Using cached ollama-0.6.1-py3-none-any.whl.metadata (4.3 kB)
Using cached ollama-0.6.1-py3-none-any.whl (14 kB)
Installing collected packages: ollama
Successfully installed ollama-0.6.1


In [26]:
import ollama
print("‚úÖ Ollama Python Connected")


‚úÖ Ollama Python Connected


In [27]:
def ask_mistral(prompt):
    """Send ``prompt`` to the "mistral" model through ``ollama.chat``.

    Args:
        prompt: Text sent as the single user message of the conversation.

    Returns:
        The ``["message"]["content"]`` field of the chat response.
    """
    conversation = [{"role": "user", "content": prompt}]
    reply = ollama.chat(model="mistral", messages=conversation)
    return reply["message"]["content"]


In [28]:
print(ask_mistral("Explain law simply"))

 Law is a set of rules that are created and enforced by a community or government to regulate behavior, resolve disputes, protect rights, and maintain social order. It can be divided into two main categories: civil law (which deals with private matters like contracts and property) and criminal law (which deals with crimes against the state or individuals). Laws are often written down in codes, statutes, and constitutions, but they can also exist as customs or traditions that have been accepted as law over time.

The legal system involves various institutions and professionals such as courts, judges, lawyers, and police officers who work to interpret and apply the laws fairly and consistently. People are expected to follow the law, and those who break the law may face penalties, which can range from fines to imprisonment.

It's important to note that laws can vary greatly between different countries, states, and even local communities, reflecting their unique cultural, historical, and s

In [29]:
# ⭐ FINAL — Legal AI Chatbot Function

def legal_chatbot(user_question):
    """Answer ``user_question`` with the LLM, grounded on one retrieved case.

    The case row comes from ``hybrid_search_legal_answer``; its case name,
    judgment date and answer are embedded in a prompt that is passed to
    ``ask_mistral``.

    Note: a later cell defines ``legal_chatbot`` again; once that cell has
    run, this definition is replaced.

    Args:
        user_question: Free-text legal question.

    Returns:
        The text returned by ``ask_mistral`` for the assembled prompt.
    """
    # Retrieve the case used as grounding context
    matched_case = hybrid_search_legal_answer(user_question)

    # Case details block that gets embedded in the prompt
    context = f"""
    Legal Case Name: {matched_case['case_name']}
    Judgment Date: {matched_case['judgment_date']}
    Legal Answer / Summary: {matched_case['answer']}
    """

    # Full instruction for the LLM: role, case context, then the question
    prompt = f"""
    You are a legal assistant for Indian law.

    Use the following legal case information to answer the user's question clearly and simply.

    {context}

    User Question:
    {user_question}

    Give a helpful legal explanation.
    """

    return ask_mistral(prompt)


In [30]:
response = legal_chatbot(
    "Explain the Union of India vs Manomoy Ganguly promotion case"
)

print(response)


üü¢ Keyword Match Found
 In the case of Union of India vs. Maj. Gen. Manomoy Ganguly, the Supreme Court of India ruled on August 1st, 2018, regarding a dispute about the promotion of Major General Manomoy Ganguly within the Indian Army.

The petition was filed by Maj. Gen. Ganguly challenging his non-promotion to the rank of Lieutenant General. The issue centered around allegations of procedural irregularities during the selection process for promotions at the level of Major General.

The Court examined whether the Defence Ministry followed proper guidelines and rules while selecting officers for promotion, and whether Maj. Gen. Ganguly was unfairly treated due to alleged biases or misconduct during the evaluation process.

The Court found that there were indeed procedural irregularities in the selection process, specifically with regards to the evaluation criteria used. The Court ordered a fresh evaluation of candidates for promotions at the level of Major General, using transparent 

In [31]:
response = legal_chatbot(
    "what is bcci case"
)

print(response)


üü¢ Keyword Match Found
 The BCCI (Board of Control for Cricket in India) case refers to a significant case heard by the Supreme Court of India from 1995 to 2017, known as Union of India vs. Board of Control for Cricket in India & Ors. This case involved various allegations of corruption and mismanagement within the BCCI, the governing body responsible for cricket in India.

The case led to several important judgments, including the introduction of reforms aimed at improving transparency and accountability within the BCCI. One of the most significant outcomes was the establishment of a three-member Committee of Administrators (CoA) to oversee the functioning of the BCCI for a period of time.

The case has had a lasting impact on sports governance in India, serving as an example of the court's role in ensuring transparency and accountability in public bodies.


In [32]:
# ⭐ Top K AI Case Search (Using FAISS)

def ai_search_top_cases(user_question, top_k=3):
    """Return the dataset rows nearest to ``user_question`` in embedding space.

    The question is encoded with the module-level ``model``, cast to float32
    and searched against the module-level FAISS ``index``.

    Args:
        user_question: Free-text question or topic.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with the matching rows of ``df`` in the order the index
        returned them. It holds fewer than ``top_k`` rows when the index
        cannot supply that many neighbours.
    """
    query_vector = np.asarray(model.encode([user_question]), dtype="float32")

    _distances, neighbours = index.search(query_vector, top_k)

    # FAISS fills unavailable result slots with -1. Passed to iloc, -1 would
    # select the last row of df, so those placeholders are dropped.
    positions = [int(position) for position in neighbours[0] if position >= 0]

    return df.iloc[positions]


In [33]:
top_cases = ai_search_top_cases(
    "Army promotion dispute Supreme Court case",
    top_k=3
)

top_cases[["case_name", "judgment_date"]]


Unnamed: 0,case_name,judgment_date
552,M. Elangovan vs. Union of India & Ors.,17th July 2017
1921,Kerala Transport Development Finance Corporati...,31st January 2022
3903,Lance Nayak Raj Bahadur & Ors. vs. State of UP...,


In [35]:
# Fetch the three nearest cases for the query "murder"
top_cases = ai_search_top_cases("murder", top_k=3)

print("‚≠ê Top 3 Similar Legal Cases:\n")

# One entry per case: name, date, and the first 200 characters of the answer
for _, case in top_cases.iterrows():
    summary = str(case["answer"])[:200]
    print("üìå Case:", case["case_name"])
    print("üìÖ Date:", case["judgment_date"])
    print("üí¨ Summary:", summary, "...")
    print("-" * 50)


‚≠ê Top 3 Similar Legal Cases:

üìå Case: Dharam Pal vs. The State of Haryana
üìÖ Date: 10th August 2017
üí¨ Summary: Bharat Bhushan Arora left for Delhi on his scooter on 10th September 1996. He did not return by 2:00 am on 11th September 1996, prompting a search by his family. His body was found at 5:30 pm on 11th  ...
--------------------------------------------------
üìå Case: Murugan vs. State of Tamil Nadu
üìÖ Date: nan
üí¨ Summary: The motive behind the murder was that Kumar held a grudge against Geetha's father for not agreeing to his proposal to marry Geetha. ...
--------------------------------------------------
üìå Case: Yogesh @ Sonu Tharu vs. The State
üìÖ Date: nan
üí¨ Summary: The prosecution stated that during a birthday party, an altercation occurred between Yogesh @ Sonu Tharu and the deceased. Pradeep Dabas took a gun and fired at the deceased but missed. Later, Pradeep ...
--------------------------------------------------


In [36]:
def legal_chatbot(user_question):
    """Answer ``user_question`` with the LLM and append a list of similar cases.

    One case from ``hybrid_search_legal_answer`` is used as the context for the
    prompt sent to ``ask_mistral``. Three further cases from
    ``ai_search_top_cases`` are appended to the reply as a numbered list of
    "case name (judgment date)" lines; they are not part of the prompt.

    Args:
        user_question: Free-text legal question.

    Returns:
        The LLM reply followed by the similar-cases listing, as one string.
    """
    # Retrieval: one main case for the prompt, three neighbours for the listing
    primary_case = hybrid_search_legal_answer(user_question)
    related_cases = ai_search_top_cases(user_question, top_k=3)

    # Case details block that gets embedded in the prompt
    context = f"""
    Main Legal Case:
    Case Name: {primary_case['case_name']}
    Judgment Date: {primary_case['judgment_date']}
    Case Summary: {primary_case['answer']}
    """

    # Full instruction for the LLM: role, case context, then the question
    prompt = f"""
    You are an Indian legal assistant.

    Use the legal case below to answer the user's question clearly.

    {context}

    User Question:
    {user_question}

    Give clear legal explanation.
    """

    explanation = ask_mistral(prompt)

    # Numbered listing of the related cases, one per line, starting at 1
    listing = "".join(
        f"{rank}Ô∏è‚É£ {case.case_name} ({case.judgment_date})\n"
        for rank, case in enumerate(related_cases.itertuples(), 1)
    )

    return explanation + "\n\nüìö Similar Cases You May Refer:\n" + listing


# Only the first line of this call had been commented out, leaving an indented
# string and a stray ")" behind; that is a syntax error which prevents the
# whole cell from running. The call is restored so the cell parses.
response = legal_chatbot(
    "Army promotion dispute Supreme Court case"
)

print(response)