<a href="https://colab.research.google.com/github/JannsenRamos/Automated-Legal-Knowledge-Base/blob/main/Selective_Indexing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install pydantic langchain_openai pymupdf



In [15]:
import os
import re
import json
import fitz  # PyMuPDF
from typing import List, Optional
from datetime import datetime
from pydantic import BaseModel, Field

In [16]:
# 1. UNIFIED MODELS (Fixes Pylance "Undefined" Errors)
class LegalMetadata(BaseModel):
    source_file: str
    jurisdiction: str    # "PH" or "HK"
    corpus_category: str # "wages", "contracts", etc.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())

class LegalOrdinanceChunk(BaseModel):
    jurisdiction: str
    section_id: str
    title: str
    content: str
    is_repealed: bool = False
    metadata: LegalMetadata

In [17]:
# 2. SELECTIVE INDEXING RULES
ROUTING_RULES = {
    "PH": {
        "wages": ["wage", "salary", "pay", "overtime", "payroll", "deduction"],
        "contracts": ["contract", "dismissal", "termination", "probationary", "resignation"],
        "benefits": ["maternity", "paternity", "retirement", "holiday", "service incentive"],
    },
    "HK": {
        "wages": ["wage", "payment", "deduction", "overtime", "end of year payment"],
        "contracts": ["notice", "termination", "probation", "summary dismissal", "damages"],
        "benefits": ["maternity", "paternity", "leave", "medical certificate", "rest day"],
    }
}

In [18]:
# 3. AI JURISDICTION DETECTOR
def identify_jurisdiction(sample_text, api_key):
    # This acts as the gatekeeper to choose the right regex pattern
    prompt = f"Identify the jurisdiction: 'PH_LABOR' or 'HK_ORDINANCE'. Text: {sample_text[:800]}"
    try:
        # Assuming llm_router is defined in your previous cells
        response = llm_router.invoke(prompt)
        return response.content.strip()
    except:
        return "HK_ORDINANCE" if "Cap. 57" in sample_text else "PH_LABOR"

In [20]:
drive.mount('/content/drive')

# --- STEP 2: DEFINE DIRECTORIES ---
BASE_DIR = "/content/drive/My Drive/Labor_Law_System"
JSON_DIR = os.path.join(BASE_DIR, "structured_json")
os.makedirs(JSON_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# --- STEP 4: THE UNIFIED SYSTEM ---
def unified_production_extractor(pdf_path, api_key):
    doc = fitz.open(pdf_path)
    file_name = os.path.basename(pdf_path)
    full_text = "".join([page.get_text() for page in doc]).replace('\xa0', ' ')

    # A. Determine Jurisdiction once for the whole file
    jurisdiction = "HK" if "Cap. 57" in full_text[:3000] or "Hong Kong" in full_text[:3000] else "PH"

    # B. Set Regex based on Jurisdiction
    pattern = re.compile(r"(?m)^(\d+[A-Z]*)\.\s+(.*)") if jurisdiction == "HK" else re.compile(r"(?i)ART\.?\s*(\d+)")
    matches = list(pattern.finditer(full_text))
    chunks = []

    # C. Process and Validate
    for i in range(len(matches)):
        start_idx = matches[i].end()
        end_idx = matches[i+1].start() if i+1 < len(matches) else len(full_text)
        content = full_text[start_idx:end_idx].strip()

        # --- DATA QUALITY GATEKEEPER ---
        # Skip ToC: If content is too short or contains only dots/page numbers, it's not a law.
        if len(content) < 60 or "........" in content or re.search(r"^\d+-\d+$", content):
            continue

        # D. Apply Selective Indexing
        category = "general"
        rules = ROUTING_RULES.get(jurisdiction, {})
        for cat, keywords in rules.items():
            if any(k in content.lower() for k in keywords):
                category = cat
                break

        sec_id = matches[i].group(1)
        title = matches[i].group(2).strip() if jurisdiction == "HK" else content.split('\n')[0][:100]

        chunk = LegalOrdinanceChunk(
            jurisdiction=jurisdiction,
            section_id=sec_id,
            title=title,
            content=content,
            is_repealed="(Repealed)" in title or "repealed" in content.lower(),
            metadata=LegalMetadata(source_file=file_name, jurisdiction=jurisdiction, corpus_category=category)
        )

        save_dir = os.path.join(JSON_DIR, jurisdiction, category)
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, f"sec_{sec_id}.json"), "w") as f:
            f.write(chunk.model_dump_json(indent=2))

        chunks.append(chunk)

    print(f"Update Complete! {len(chunks)} laws extracted and sorted by jurisdiction and category.")
    return chunks

In [28]:
pdf_to_process = "/content/Cap 57 Consolidated version for the Whole Chapter (24-08-2025) (English).pdf"

from google.colab import userdata
api_key = userdata.get('OPENROUTER_API_KEY')

processed_chunks = unified_production_extractor(pdf_to_process, api_key)

if processed_chunks:
    print(f"\n--- VERIFICATION SAMPLE ---")
    print(f"ID: {processed_chunks[0].section_id}")
    print(f"Title: {processed_chunks[0].title}")
    print(f"Content Preview: {processed_chunks[0].content[:200]}...")

Update Complete! 306 laws extracted and sorted by jurisdiction and category.

--- VERIFICATION SAMPLE ---
ID: 8A
Title: Damages for wrongful termination of contract
Content Preview: 2-14
Last updated date
24.8.2025
Employment Ordinance
T-2
Cap. 57...


In [29]:
pdf_to_process = "/content/Labor-Code-of-the-Philippines-DOLE.pdf"

from google.colab import userdata
api_key = userdata.get('OPENROUTER_API_KEY')

processed_chunks = unified_production_extractor(pdf_to_process, api_key)

if processed_chunks:
    print(f"\n--- VERIFICATION SAMPLE ---")
    print(f"ID: {processed_chunks[0].section_id}")
    print(f"Title: {processed_chunks[0].title}")
    print(f"Content Preview: {processed_chunks[0].content[:200]}...")

Update Complete! 307 laws extracted and sorted by jurisdiction and category.

--- VERIFICATION SAMPLE ---
ID: 1
Title: . Name of Decree. This Decree shall be known as the "Labor Code of the Philippines".
Content Preview: . Name of Decree. This Decree shall be known as the "Labor Code of the Philippines"....
