In [1]:
import os
# Switch the kernel's working directory to the project folder. The path is
# absolute and specific to one Windows machine; the relative paths used by
# later cells ("input", "output", "main.py") resolve against this directory.
os.chdir(r"C:\Users\niksh\Downloads\pdf_outline_submission_ready\pdf_outline_extractor_dual")
# List the project folder's contents.
print("Files:", os.listdir())


Files: ['.ipynb_checkpoints', 'Dockerfile', 'expected', 'input', 'main.py', 'output', 'pdf_json_outputs.zip', 'README.md', 'requirements.txt', 'Untitled.ipynb']


In [2]:
# Run main.py in a separate Python process through the IPython "!" shell escape.
# NOTE(review): main.py is not shown in this notebook; presumably it performs
# the PDF-to-JSON extraction - confirm.
!python main.py


In [3]:
import os
# Show the entries of the relative "input" folder (resolved against the current working directory).
print("📁 input/:", os.listdir("input"))


📁 input/: ['file01.pdf', 'file02.pdf', 'file03.pdf', 'file04.pdf', 'file05.pdf']


In [4]:


# Execute main.py through the IPython %run magic (as opposed to the
# `!python main.py` shell call used in other cells).
%run main.py




In [5]:
# Only prints one line per entry of "input"; no extraction is performed in this cell.
# Relies on `os` having been imported by an earlier cell.
for file in os.listdir("input"):
    print(f"=> Extracting: {file}")  

=> Extracting: file01.pdf
=> Extracting: file02.pdf
=> Extracting: file03.pdf
=> Extracting: file04.pdf
=> Extracting: file05.pdf


In [6]:
import os
import json
import fitz  # PyMuPDF
from sklearn.cluster import KMeans
import pandas as pd

# Both folders are relative to the current working directory.
input_dir = "input"  # PDFs are read from here by the loop at the end of this cell
output_dir = "output"  # one JSON file per PDF is written here
# Create the folders when missing; exist_ok=True avoids an error if they already exist.
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

def extract_outline_from_pdf(pdf_path):
    """Build a title and a font-size-based outline for a single PDF.

    Every text span of the document is collected with PyMuPDF. Spans whose
    stripped text is 3 characters or shorter are discarded, and the remaining
    spans are clustered by (rounded) font size with KMeans, using at most 4
    clusters. Clusters are ranked by mean font size, largest first, and
    labelled "H1", "H2", ... Every surviving span receives one of these
    levels; there is no separate "body text" class.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        dict with two keys:
            "title": text of the largest-font span on page 1 (ties resolved
                by the smallest y coordinate), or "Untitled Document" when
                page 1 has no usable span.
            "outline": list of ``{"level", "text", "page"}`` dicts in
                extraction order, with exact (text, level, page) duplicates
                removed. Empty when the PDF has no usable text.
    """
    spans = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" key carry no text spans and are skipped.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        spans.append({
                            "text": span["text"].strip(),
                            "size": round(span["size"], 2),
                            "font": span["font"],
                            "flags": span["flags"],
                            "page": page_num,
                            "y": span["origin"][1]
                        })
    finally:
        # Always release the document, even if reading a page fails.
        doc.close()

    # A PDF without any text span would otherwise yield a DataFrame with no
    # "text" column and fail with a KeyError below.
    if not spans:
        return {"title": "Untitled Document", "outline": []}

    blocks_df = pd.DataFrame(spans)
    blocks_df = blocks_df[blocks_df["text"].str.len() > 3].reset_index(drop=True)

    # If only very short spans existed, nothing is left to cluster
    # (KMeans cannot be fitted with zero samples / zero clusters).
    if blocks_df.empty:
        return {"title": "Untitled Document", "outline": []}

    font_sizes = blocks_df[["size"]].values
    # Never ask for more clusters than there are distinct font sizes.
    n_clusters = min(blocks_df["size"].nunique(), 4)
    # n_init is pinned so the clustering does not depend on the scikit-learn
    # version's default (and the related FutureWarning is not emitted).
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(font_sizes)
    blocks_df["cluster"] = kmeans.labels_

    # Rank clusters by mean font size: the largest becomes H1, the next H2, ...
    cluster_sizes = blocks_df.groupby("cluster")["size"].mean().sort_values(ascending=False)
    level_map = {cid: f"H{rank}" for rank, cid in enumerate(cluster_sizes.index, start=1)}
    blocks_df["level"] = blocks_df["cluster"].map(level_map)

    # Title: largest font on page 1, smallest y first among equal sizes.
    page1_blocks = blocks_df[blocks_df["page"] == 1]
    if page1_blocks.empty:
        title = "Untitled Document"
    else:
        top_blocks = page1_blocks.sort_values(by=["size", "y"], ascending=[False, True])
        title = top_blocks.iloc[0]["text"]

    outline = []
    seen = set()
    for text, level, page in zip(blocks_df["text"], blocks_df["level"], blocks_df["page"]):
        key = (text, level, page)
        if key in seen:
            continue
        seen.add(key)
        outline.append({
            "level": level,
            "text": text,
            # Plain int so the result is always JSON-serialisable.
            "page": int(page)
        })

    return {
        "title": title,
        "outline": outline
    }

# Run the extractor on every PDF in input_dir and write one JSON file per PDF
# into output_dir. sorted() makes the processing order deterministic, since
# os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(input_dir)):
    stem, ext = os.path.splitext(file)
    # Compare the extension case-insensitively so "REPORT.PDF" is processed too.
    if ext.lower() != ".pdf":
        continue
    pdf_path = os.path.join(input_dir, file)
    # Swap only the final extension; str.replace(".pdf", ".json") would also
    # rewrite a ".pdf" that occurs earlier in the file name.
    json_path = os.path.join(output_dir, stem + ".json")
    result = extract_outline_from_pdf(pdf_path)
    with open(json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII text as UTF-8 characters rather
        # than \uXXXX escapes; the parsed JSON content is the same.
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved: {json_path}")


✅ Saved: output\file01.json




✅ Saved: output\file02.json




✅ Saved: output\file03.json




✅ Saved: output\file04.json
✅ Saved: output\file05.json


In [7]:
# Relative output folder; same value that the extraction cell assigned to output_dir.
output_dir = "output"


In [8]:
import os

# Rebinds output_dir to an absolute, machine-specific path (it previously held
# the relative "output"); later cells that read output_dir see this value
# until it is reassigned.
output_dir = r"C:\Users\niksh\Downloads\pdf_outline_submission_ready\pdf_outline_extractor_dual\output"
# List whatever is in the folder (the listing itself is not filtered by extension).
print("📁 JSON files found:", os.listdir(output_dir))


📁 JSON files found: ['file01.json', 'file02.json', 'file03.json', 'file04.json', 'file05.json']


In [9]:
import zipfile
import os

# Bundle every .json file from output_dir into a single ZIP archive.
# NOTE: both paths are absolute and specific to one Windows machine.
output_dir = r"C:\Users\niksh\Downloads\pdf_outline_submission_ready\pdf_outline_extractor_dual\output"
zip_path = r"C:\Users\niksh\Downloads\pdf_json_outputs.zip"

try:
    # ZIP_DEFLATED compresses the members; the default (ZIP_STORED) only
    # stores them uncompressed.
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        # sorted() gives the archive a deterministic member order.
        for file in sorted(os.listdir(output_dir)):
            if file.endswith(".json"):
                file_path = os.path.join(output_dir, file)
                # arcname=file stores the member at the archive root,
                # without the directory part of file_path.
                zipf.write(file_path, arcname=file)

    print("✅ ZIP created successfully at:", zip_path)
except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print("❌ Error:", e)


✅ ZIP created successfully at: C:\Users\niksh\Downloads\pdf_json_outputs.zip


In [10]:
import os
# Sanity check: show the kernel's working directory and what it contains.
print("📂 Current working directory:", os.getcwd())
print("📁 All files and folders:", os.listdir())


📂 Current working directory: C:\Users\niksh\Downloads\pdf_outline_submission_ready\pdf_outline_extractor_dual
📁 All files and folders: ['.ipynb_checkpoints', 'Dockerfile', 'expected', 'input', 'main.py', 'output', 'pdf_json_outputs.zip', 'README.md', 'requirements.txt', 'Untitled.ipynb']


In [11]:
# Run main.py again in a separate Python process through the IPython "!" shell escape.
# NOTE(review): main.py is not shown in this notebook; presumably it
# regenerates the JSON files in output/ - confirm.
!python main.py


In [12]:
import os
# Show the entries of the relative "output" folder (resolved against the current working directory).
print("📁 output/:", os.listdir("output"))


📁 output/: ['file01.json', 'file02.json', 'file03.json', 'file04.json', 'file05.json']


In [13]:
import zipfile
import os

# Bundle every .json file from output_dir into a ZIP archive; both paths are
# relative to the current working directory.
output_dir = "output"  # folder containing JSON files
zip_path = "pdf_json_outputs.zip"

# ZIP_DEFLATED compresses the members; the default (ZIP_STORED) only stores
# them uncompressed.
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
    # sorted() gives the archive a deterministic member order.
    for file in sorted(os.listdir(output_dir)):
        if file.endswith(".json"):
            # arcname=file stores the member at the archive root.
            zipf.write(os.path.join(output_dir, file), arcname=file)

print("✅ Zipped at:", zip_path)


✅ Zipped at: pdf_json_outputs.zip
