In [46]:
import fitz  # PyMuPDF
import re
import os
import glob
import json
import pandas as pd

# ---------- Extract Text from PDF with Error Handling ----------
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"❌ Failed to extract text from {pdf_path}: {e}")
        return None

# ---------- Extract Case ID (INDEX NO.) from text ----------
def extract_case_id(text):
    match = re.search(r'INDEX\s+NO\.?:?\s*([A-Z0-9/-]+)', text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return None

# ---------- Clean up raw legal text ----------
def clean_legal_text(text):
    text = re.sub(r'\n[A-Z\s]{4,}\n', '\n', text)
    text = text.replace('“', '"').replace('”', '"').replace("’", "'")
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# ---------- Load Case Labels from CSV ----------
def load_case_labels(csv_path):
    df = pd.read_csv(csv_path, usecols=["case_number", "case_type"])
    df = df[df['case_number'].notna()]
    df = df[~df['case_number'].str.lower().str.contains("not assigned")]
    df['case_number'] = df['case_number'].str.strip()
    return dict(zip(df['case_number'], df['case_type']))

# ---------- Main Function to Build Both Labeled and Unlabeled Datasets ----------
def build_dataset(pdf_dir, csv_label_path, output_labeled="labeled_data.json", output_unlabeled="unlabeled_data.json", skip_unlabeled=True):
    label_map = load_case_labels(csv_label_path)
    labeled = []
    unlabeled = []

    for pdf_file in glob.glob(os.path.join(pdf_dir, "*.pdf")):
        raw_text = extract_text_from_pdf(pdf_file)
        if raw_text is None:
            continue

        case_id = extract_case_id(raw_text)
        clean_text = clean_legal_text(raw_text)

        if case_id and case_id in label_map:
            labeled.append({
                "case_id": case_id,
                "text": clean_text,
                "label": label_map[case_id]
            })
            print(f"✅ Labeled: {case_id}")
        else:
            print(f"🔍 Unlabeled: {case_id if case_id else 'No INDEX NO.'} ({os.path.basename(pdf_file)})")
            if not skip_unlabeled:
                unlabeled.append({
                    "case_id": case_id,
                    "text": clean_text,
                    "label": None  # Can be replaced later
                })

    # Save labeled and unlabeled data
    with open(output_labeled, "w") as f:
        json.dump(labeled, f, indent=2)
    print(f"\n📦 Saved {len(labeled)} labeled cases to {output_labeled}")

    if not skip_unlabeled:
        with open(output_unlabeled, "w") as f:
            json.dump(unlabeled, f, indent=2)
        print(f"📦 Saved {len(unlabeled)} unlabeled cases to {output_unlabeled}")


In [48]:
build_dataset(
    pdf_dir="nyscef complaints/nyscef/",
    csv_label_path="nyscef complaints/cases.csv",
    output_labeled="labeled_data.json",
    output_unlabeled="unlabeled_data.json",
    skip_unlabeled=False
)

🔍 Unlabeled: Date (f73449b0-9972-49e3-8b73-203e3871842b.pdf)
🔍 Unlabeled: Date (ba5f58ce-c935-488a-a859-589594c25aa4.pdf)
🔍 Unlabeled: -against- (b32f312c-ea20-4ec9-a628-af32d7ad0d73.pdf)
✅ Labeled: E2025005155
🔍 Unlabeled: UNASSIGNED (23954fba-1a05-42df-8fad-79edfb0c3f8d.pdf)
🔍 Unlabeled: 654650/2022 (b3cd2ca2-72d3-4281-8307-230d1cb4ded6.pdf)
✅ Labeled: 804596/2025E
🔍 Unlabeled: EFCA2025-000563 (66d77df2-a2cf-403a-9888-b1ed2c6a37bf.pdf)
✅ Labeled: CV25-0102
🔍 Unlabeled: 815971/2022E (9c80b546-3e1f-4819-9b74-1d0f60d03d46.pdf)
🔍 Unlabeled: AUDREY (34705704-7fa3-4a5f-8c30-eeb67d2a2765.pdf)
✅ Labeled: 137669-2023
🔍 Unlabeled: Plaintiff (a134a266-93c2-4006-ab15-88ef6ef22f67.pdf)
✅ Labeled: 57924/2025
✅ Labeled: 603981/2025
🔍 Unlabeled: Purchased (0b4b3d59-f30c-45db-987e-56aa1405ef0f.pdf)
🔍 Unlabeled: BANKERS (7c069b15-b170-4ae0-8c3d-c712cb35cc66.pdf)
🔍 Unlabeled: SARAH (1a7d92d5-5e15-4b11-819d-6b051731103b.pdf)
✅ Labeled: 507150/2025
✅ Labeled: EFCV-25-167547
🔍 Unlabeled: 602077/2025 (2fb4

In [50]:
import json

with open("labeled_data.json") as f:
    data = json.load(f)

formatted = []
for item in data:
    prompt = (
        "Classify the following legal complaint:\n\n"
        f"{item['text']}\n\n"
        "Explain your reasoning and conclude with the case type."
    )
    response = f"The complaint is categorized as:\n{item['label']}"
    formatted.append({"prompt": prompt, "response": response})

with open("legal_instruct_dataset.json", "w") as f:
    json.dump(formatted, f, indent=2)
