In [None]:
# Update system packages and install Tesseract for OCR and Poppler for PDF handling
!sudo apt-get update
!sudo apt-get install -y tesseract-ocr poppler-utils

# Install the required Python libraries for extraction, cleaning, and chunking
!pip install PyPDF2 pytesseract pdf2image langchain sentence-transformers

In [None]:
# 🧠 Step 1.2: Imports and Function Definitions
# This cell contains all the code for our processing logic.
# Running this cell just defines the functions; it does not start the main task yet.

import os
import re
import json
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
import nltk
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- CONFIGURATION ---
# Root folder of the input dataset. The processing cell treats every
# sub-directory found here as one batch (it calls them "years").
BASE_DIRECTORY = "/kaggle/input/legal-dataset-sc-judgments-india-19502024/supreme_court_judgments/"
# Folder that receives one JSON file of chunks per processed sub-directory.
BATCH_OUTPUT_DIR = "/kaggle/working/chunked_batches/"
# Single JSON file produced by merging every batch file.
FINAL_OUTPUT_PATH = "/kaggle/working/chunked_legal_data.json"

# Make sure the NLTK "punkt" tokenizer data is available: look it up locally
# and download it (quietly) only when the lookup fails.
# NOTE(review): no code visible in this notebook calls an NLTK tokenizer -
# confirm this download is still required.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

# --- FUNCTION DEFINITIONS ---

def extract_text_from_pdf(pdf_path, min_text_length=200):
    """Extract raw text from a PDF, falling back to OCR for scanned files.

    The embedded text layer is read with PyPDF2 first. When that yields fewer
    than ``min_text_length`` non-blank characters, every page is rasterised
    with pdf2image and run through Tesseract, and the longer of the two
    results is returned.

    Args:
        pdf_path: Path of the PDF file to read.
        min_text_length: Minimum number of stripped characters the text layer
            must provide for the OCR pass to be skipped. Defaults to 200.

    Returns:
        The extracted text, with page texts separated by a newline. An empty
        string is returned for encrypted PDFs and when nothing could be read.
        Errors are never raised to the caller; extraction is best-effort.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            if pdf_reader.is_encrypted:
                return ""
            for page in pdf_reader.pages:
                page_texts.append(page.extract_text() or "")
    except Exception:
        # Unreadable or malformed file: drop any partial result and let the
        # OCR pass below have a go at it.
        page_texts = []

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next one.
    raw_text = "\n".join(page_texts)

    if len(raw_text.strip()) >= min_text_length:
        return raw_text

    try:
        images = convert_from_path(pdf_path)
        ocr_text = "\n".join(pytesseract.image_to_string(img) for img in images)
    except Exception:
        # OCR is only a fallback: if it fails, keep whatever the text layer
        # produced instead of throwing it away.
        return raw_text

    # Prefer OCR only when it actually recovered more text than the text layer.
    if len(ocr_text.strip()) > len(raw_text.strip()):
        return ocr_text
    return raw_text

def clean_text(text):
    """Normalise raw judgment text before it is split into chunks.

    Steps, in order:
      1. lower-case the text;
      2. remove the "indian kanoon - http://indiankanoon.org/doc/<id>/"
         page watermark;
      3. remove any remaining URL (``http`` followed by non-space characters);
      4. delete every character that is not a-z, 0-9, whitespace, '.' or ',';
      5. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw text to clean. ``None`` or an empty string yields "".

    Returns:
        The cleaned, lower-cased, single-spaced string.
    """
    if not text:
        return ""
    text = text.lower()
    # The watermark has to be removed BEFORE the generic URL pass: once the URL
    # part is gone this pattern can no longer match and the words
    # "indian kanoon" would be left behind in every page footer.
    # The trailing \S* also swallows anything glued to the URL, exactly as the
    # generic URL pattern would have done.
    text = re.sub(r'indian kanoon - http://indiankanoon\.org/doc/\d+/\S*', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-z0-9\s.,]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Initialize the text splitter globally so every file is chunked with the same
# settings. Because length_function is len, chunk_size and chunk_overlap are
# measured in characters: a chunk size of 1000 with 200 characters of overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# Confirmation that this cell ran to the end.
print(" Functions are defined and ready to use.")

In [None]:
# 🚀 Step 1.3: Main Processing - Process Data Year by Year
# This is the main workload. It processes one year at a time and saves the results.
# If you stop and restart the notebook, it will automatically skip the completed years.

print("Starting batch processing of all years...")
os.makedirs(BATCH_OUTPUT_DIR, exist_ok=True)

year_directories = sorted(os.listdir(BASE_DIRECTORY))

for year_dir in year_directories:
    year_path = os.path.join(BASE_DIRECTORY, year_dir)
    output_file_path = os.path.join(BATCH_OUTPUT_DIR, f"chunked_{year_dir}.json")

    # Skip if this year has already been processed
    if os.path.exists(output_file_path):
        print(f"--- Year {year_dir} already processed. Skipping. ---")
        continue

    # Ignore stray files sitting next to the year folders
    if not os.path.isdir(year_path):
        continue

    print(f"--- Processing Year: {year_dir} ---")
    yearly_data = {}

    # Sorted so the processing order (and the key order of the saved JSON)
    # is the same on every run.
    for filename in sorted(os.listdir(year_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        full_path = os.path.join(year_path, filename)
        # Key each document by its path relative to the dataset root
        relative_path = os.path.relpath(full_path, BASE_DIRECTORY)

        print(f"-> Processing file: {filename}")
        raw_text = extract_text_from_pdf(full_path)

        if not raw_text or len(raw_text.strip()) <= 100:
            print(f"  [✗] No usable text found in {filename}.")
            continue

        cleaned_text = clean_text(raw_text)
        chunks = text_splitter.split_text(cleaned_text)
        if chunks:
            yearly_data[relative_path] = chunks
            print(f"  [✓] Processed and split into {len(chunks)} chunks.")
        else:
            print(f"  [✗] Cleaning left no text to chunk in {filename}.")

    # Save the result for the current year.
    # Write to a temporary name first and rename afterwards: if the notebook is
    # interrupted mid-write, no half-written "chunked_<year>.json" is left
    # behind to be mistaken for a completed year on the next run.
    temp_file_path = output_file_path + ".tmp"
    with open(temp_file_path, 'w', encoding='utf-8') as f:
        json.dump(yearly_data, f, indent=4)
    os.replace(temp_file_path, output_file_path)
    print(f"✅ Saved results for year {year_dir} to {output_file_path}\n")

print("\n🎉 All years have been processed!")

In [None]:
# 📦 Step 1.4: Merge All Batch Files
# This cell reads all the individual JSON files created above
# and combines them into one final file for the next stage of the project.

print(f"Merging all batch files from '{BATCH_OUTPUT_DIR}'...")
all_chunked_data = {}

for filename in sorted(os.listdir(BATCH_OUTPUT_DIR)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(BATCH_OUTPUT_DIR, filename)
    print(f"-> Merging {filename}")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            yearly_data = json.load(f)
    except json.JSONDecodeError as exc:
        # Fail loudly and say which file is broken: silently skipping it would
        # produce an incomplete merged dataset.
        raise RuntimeError(
            f"Batch file '{file_path}' is not valid JSON (possibly left over from an "
            "interrupted run). Delete it and re-run Step 1.3 to regenerate it."
        ) from exc

    # Keys are paths relative to the dataset root, one entry per document
    all_chunked_data.update(yearly_data)

# Save the final merged dictionary
with open(FINAL_OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(all_chunked_data, f, indent=4)

print(f"\n✅ All data merged successfully into '{FINAL_OUTPUT_PATH}'")
print(f"Total documents processed: {len(all_chunked_data)}")

In [None]:
# 🧠 Step 2 (Corrected for a Single Notebook Workflow)

import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import os # Import os to check if the file exists

# --- CONFIGURATION ---
# IMPORTANT: Since you are in the same notebook, we point directly to the
# output file created by the Step 1 cells in the /kaggle/working/ directory.
CHUNKED_DATA_PATH = "/kaggle/working/chunked_legal_data.json"

# Output paths for the files we will create
INDEX_PATH = "/kaggle/working/judgments.index"
MAPPING_PATH = "/kaggle/working/index_to_chunk_map.json"

# The pre-trained model for creating embeddings
MODEL_NAME = 'all-MiniLM-L6-v2'

# --- 1. VERIFY AND LOAD DATA ---
print("-> Checking for the chunked data file...")
if not os.path.exists(CHUNKED_DATA_PATH):
    print(f"[❌] ERROR: File not found at '{CHUNKED_DATA_PATH}'.")
    print("Please make sure you have successfully run the Step 1 script in a cell above this one.")
else:
    print("-> File found! Loading chunked data...")
    # Same explicit encoding that Step 1 used when writing the file
    with open(CHUNKED_DATA_PATH, 'r', encoding='utf-8') as f:
        chunked_data = json.load(f)

    # --- 2. PREPARE AND FLATTEN DATA ---
    # Position i in all_chunks_text and in index_to_chunk_map describe the same
    # chunk, so a vector id returned by the index can be used directly as a
    # list index into the mapping.
    all_chunks_text = []
    index_to_chunk_map = []
    print("-> Preparing and flattening data for indexing...")
    for doc_name, chunks in chunked_data.items():
        for chunk_text in chunks:
            all_chunks_text.append(chunk_text)
            index_to_chunk_map.append({
                "doc_name": doc_name,
                "chunk_text": chunk_text
            })
    print(f"-> Total chunks to be indexed: {len(all_chunks_text)}")

    if not all_chunks_text:
        # With nothing to embed there is no embedding dimension to read, so
        # stop here with a clear message instead of failing further down.
        print("[❌] ERROR: The chunked data file contains no chunks; nothing to index.")
    else:
        # --- 3. CREATE EMBEDDINGS ---
        print(f"-> Loading embedding model: {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        print("-> Creating embeddings for all chunks... (This will take a significant amount of time)")
        chunk_embeddings = model.encode(all_chunks_text, show_progress_bar=True, convert_to_numpy=True)
        # FAISS works on C-contiguous float32 matrices; enforce that explicitly
        chunk_embeddings = np.ascontiguousarray(chunk_embeddings, dtype=np.float32)
        embedding_dim = chunk_embeddings.shape[1]
        print(f"-> Embeddings created with dimension: {embedding_dim}")

        # --- 4. BUILD FAISS INDEX ---
        # Vectors are L2-normalised in place before being added, so L2
        # distances in this index rank results the same way cosine similarity
        # would. Query vectors must be normalised the same way at search time.
        print("-> Building FAISS index...")
        index = faiss.IndexFlatL2(embedding_dim)
        faiss.normalize_L2(chunk_embeddings)
        index.add(chunk_embeddings)
        print(f"-> FAISS index built. Total vectors in index: {index.ntotal}")

        # --- 5. SAVE THE INDEX AND MAPPING ---
        print(f"-> Saving FAISS index to {INDEX_PATH}")
        faiss.write_index(index, INDEX_PATH)
        print(f"-> Saving the chunk mapping file to {MAPPING_PATH}")
        with open(MAPPING_PATH, 'w', encoding='utf-8') as f:
            json.dump(index_to_chunk_map, f, indent=4)
        print("\n[✅] Step 2 Complete! Your vector index is ready in the '/kaggle/working/' directory.")

In [None]:
# ‚öôÔ∏è Step 3.1: Install App Libraries
# Streamlit is for building the web interface.
# Pyngrok is a tool to create a public URL for our app running inside Kaggle.

!pip install streamlit pyngrok

In [None]:
import os
import zipfile

print("-> Zipping just the large index_to_chunk_map.json file...")

# The mapping file to compress
json_path = "/kaggle/working/index_to_chunk_map.json"

# The archive that will be created
zip_path = "/kaggle/working/mapping_file_only.zip"

if os.path.exists(json_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        print(f"    - Compressing {os.path.basename(json_path)}... (This may take a moment)")
        # Store the file under its bare name so the archive does not
        # reproduce the full /kaggle/working/ path.
        zipf.write(json_path, os.path.basename(json_path))

    print("\n[✅] Compression complete!")
    print(f"You can now download the smaller '{os.path.basename(zip_path)}' file.")

else:
    print(f"[❌] ERROR: File not found at '{json_path}'.")
    print("Please make sure you have successfully run the 'Step 2: Create and Save FAISS Index' cell first.")