<a href="https://colab.research.google.com/github/JannsenRamos/Automated-Legal-Knowledge-Base/blob/main/Automated_PDF_Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydantic langchain_openai pymupdf

Collecting langchain_openai
  Downloading langchain_openai-1.1.7-py3-none-any.whl.metadata (2.6 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading langchain_openai-1.1.7-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.8/84.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, langchain_openai
Successfully installed langchain_openai-1.1.7 pymupdf-1.26.7


In [2]:
import json
import os
import re
import time
from bisect import bisect_right
from datetime import datetime
from typing import List, Literal, Optional, Union

from google.colab import drive, userdata

In [3]:
# 1. SETUP & IMPORTS
from pydantic import BaseModel, Field, ConfigDict
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
import fitz  # PyMuPDF

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Project root on the mounted Drive; extracted JSON is written under structured_json/.
BASE_DIR = "/content/drive/My Drive/Labor_Law_System"
JSON_DIR = os.path.join(BASE_DIR, "structured_json")
# Create the output folder (and any missing parents) up front; no error if it already exists.
os.makedirs(JSON_DIR, exist_ok=True)

Mounted at /content/drive


In [4]:
# 2. Pydantic templates (schemas) describing the structured output
def _iso_timestamp_now() -> str:
    # Current local time (naive, no timezone info) rendered as an ISO-8601 string.
    return datetime.now().isoformat()


# Provenance record attached to every extracted chunk.
# (Described with comments rather than a class docstring: pydantic copies a
# model's docstring into the JSON schema it generates.)
class DocumentMetadata(BaseModel):
    # Name of the file the chunk was extracted from.
    source_file: str
    # Free-form document kind label, e.g. "legal_code".
    file_type: str
    # Page number recorded for the chunk.
    page_number: int
    # Routing category, e.g. "wages", "contracts".
    corpus_category: str
    # Filled in automatically at object creation when the caller does not supply one.
    timestamp: str = Field(default_factory=_iso_timestamp_now)

# One article of the labor code, stored as a single knowledge-base node.
# (Described with comments rather than a class docstring: pydantic copies a
# model's docstring into the JSON schema it generates.)
class LaborArticleChunk(BaseModel):
    # Fixed discriminator; the only accepted value is "labor_article".
    node_type: Literal["labor_article"] = Field(default="labor_article")
    # Current article number.
    article_number: int
    # Previous (pre-renumbering) article number, when the source gives one.
    old_article_number: Optional[int] = Field(default=None)
    # Short heading for the article.
    title: str
    # Full text of the article.
    content: str
    # True when the article is flagged as repealed.
    is_repealed: bool = Field(default=False)
    # Provenance of the chunk (source file, category, timestamp, ...).
    metadata: DocumentMetadata

class DocumentManifest(BaseModel):
    """The Root structure for structured output."""
    # Human-readable name of the knowledge base this manifest belongs to.
    project_name: str = Field(default="Labor Law Knowledge Base")
    # Every article chunk contained in the manifest.
    chunks: List[LaborArticleChunk]

In [7]:
# 3. Router LLM used to decide which template fits a PDF file.
llm_router = ChatOpenAI(
    # Endpoint and credentials: requests go to OpenRouter, authenticated with the
    # OPENROUTER_API_KEY entry read from Colab's userdata store.
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get('OPENROUTER_API_KEY'),
    # Model selection and sampling settings.
    model="openai/gpt-oss-120b:free",
    temperature=0,
)

def identify_document_pattern(sample_text):
    """Classify a legal text sample as 'LABOR_CODE' or 'GENERIC' via the router LLM.

    Only the first 500 characters of ``sample_text`` are sent to the model.

    Args:
        sample_text: Text taken from the document. ``None`` or an empty string
            means there is nothing to classify, so no LLM call is made.

    Returns:
        str: Exactly ``"LABOR_CODE"`` or ``"GENERIC"``. ``"LABOR_CODE"`` is the
        fallback when there is no text, when the LLM call fails, or when the
        reply does not name exactly one of the two labels.
    """
    valid_labels = ("LABOR_CODE", "GENERIC")
    fallback = "LABOR_CODE"  # Default fallback

    if not sample_text:
        return fallback

    prompt = f"Identify the type of this legal text. Return 'LABOR_CODE' or 'GENERIC'. Text: {sample_text[:500]}"
    try:
        response = llm_router.invoke(prompt)
        # Upper-case so replies such as "generic" or "'Labor_Code'" still match.
        answer = str(response.content).upper()
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate,
        # and the failure is reported instead of being silently hidden.
        print(f"identify_document_pattern: LLM call failed ({exc!r}); defaulting to {fallback}")
        return fallback

    # Accept the reply only when it names exactly one known label; anything
    # ambiguous or unrecognised falls back to the default.
    mentioned = [label for label in valid_labels if label in answer]
    if len(mentioned) == 1:
        return mentioned[0]
    return fallback

In [9]:
# Selective indexing: each chunk is routed to a category by the keywords below.
# Keys are category names; values are the lowercase keywords that select them.
ROUTING_RULES = {
    "wages": [
        "wage",
        "salary",
        "pay",
        "overtime",
        "payroll",
        "deduction",
        "night shift",
    ],
    "contracts": [
        "contract",
        "dismissal",
        "tenure",
        "termination",
        "probationary",
        "resignation",
    ],
}

def _categorize_article(content_lower):
    """Return the first ROUTING_RULES category with a keyword in the text, else "general"."""
    for cat, keywords in ROUTING_RULES.items():
        if any(k in content_lower for k in keywords):
            return cat
    return "general"


def local_production_extractor(pdf_path):
    """Split a labor-code PDF into one JSON file per article.

    The text of every page is concatenated and cut at each heading of the form
    ``ART. 130`` or ``ART. 130 [132]`` (the bracketed number is the old article
    number). Each article is assigned a category via ``ROUTING_RULES`` and
    written to ``JSON_DIR/<category>/art_<number>.json``. A one-line summary is
    printed at the end.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None.

    Notes:
        * Keyword routing is a plain substring test, so "pay" also matches
          words such as "payment".
        * Any "ART. <n>" occurrence is treated as a heading. A later article
          with the same number and category overwrites the earlier file, while
          the printed count still includes both.
    """
    file_name = os.path.basename(pdf_path)

    # Regex: Finds "ART. 130 [132]"
    pattern = r"ART\.\s+(\d+)(?:\s+\[(\d+)\])?"

    # Pull the text out page by page, then release the PDF handle immediately.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    full_text = "".join(page_texts)

    # page_starts[i] is the offset in full_text at which page i begins; it lets
    # a heading's position be mapped back to the page it sits on.
    page_starts = []
    offset = 0
    for text in page_texts:
        page_starts.append(offset)
        offset += len(text)

    headings = list(re.finditer(pattern, full_text))

    processed_count = 0
    for idx, heading in enumerate(headings):
        art_num, old_num = heading.groups()

        # An article's body runs from its heading to the next heading (or EOF).
        body_end = headings[idx + 1].start() if idx + 1 < len(headings) else len(full_text)
        content = full_text[heading.end():body_end].strip()
        content_lower = content.lower()  # computed once, reused by every keyword test

        # Keyword-based Routing
        category = _categorize_article(content_lower)

        # Build Metadata
        meta = DocumentMetadata(
            source_file=file_name,
            file_type="legal_code",
            # 1-based page on which the article heading starts.
            page_number=bisect_right(page_starts, heading.start()),
            corpus_category=category,
        )

        # Create Pydantic Object
        chunk = LaborArticleChunk(
            article_number=int(art_num),
            old_article_number=int(old_num) if old_num else None,
            title=content.split('\n')[0][:100],  # First line as title
            content=content,
            is_repealed="repealed" in content_lower or "ra 10151" in content_lower,
            metadata=meta,
        )

        # Save to Category Folder
        save_path = os.path.join(JSON_DIR, category)
        os.makedirs(save_path, exist_ok=True)
        # Explicit UTF-8 so non-ASCII characters in the legal text are written
        # the same way regardless of the platform's default encoding.
        with open(os.path.join(save_path, f"art_{art_num}.json"), "w", encoding="utf-8") as f:
            f.write(chunk.model_dump_json(indent=2))

        processed_count += 1

    print(f"Finished! Processed {processed_count} articles into {JSON_DIR}")

# --- 5. EXECUTION ---
# Input PDF; the path is under /content, outside the mounted Drive folder.
pdf_to_process = "/content/2022.Labor_.Code_.DOLE-edition.pdf"
# Split the PDF into per-article JSON files; prints a one-line summary when done.
local_production_extractor(pdf_to_process)

Finished! Processed 319 articles into /content/drive/My Drive/Labor_Law_System/structured_json
