# Decide the ranges for color printouts

Scan every page of the given PDF and check whether it contains very small text or visually intense, colorful images. The classifier returns two lists:
1. Page numbers recommended for color printing
2. The remaining page numbers (suitable for monochrome printing)

In [34]:
import fitz  # PyMuPDF
import numpy as np
from PIL import Image

def pages_to_ranges(pages):
    """Convert a collection of page numbers into a compact range string.

    Consecutive numbers are collapsed into ``start-end`` runs and the runs
    are joined with ``", "``; e.g. ``[1, 2, 3, 5]`` becomes ``"1-3, 5"``.
    The input does not need to be sorted, and repeated page numbers are
    ignored so that they cannot split a run.

    Args:
        pages: Iterable of integer page numbers (may be empty or ``None``).

    Returns:
        The compact range string, or ``""`` when there are no pages.
    """
    if not pages:
        return ""

    # De-duplicate before grouping: a repeated number is not `prev + 1`, so
    # it would otherwise end the current run and emit the page twice.
    ordered = sorted(set(pages))

    def fmt(first, last):
        return str(first) if first == last else f"{first}-{last}"

    ranges = []
    start = prev = ordered[0]
    for page in ordered[1:]:
        if page != prev + 1:
            # Gap found: close the current run and start a new one.
            ranges.append(fmt(start, prev))
            start = page
        prev = page
    # Close the final run.
    ranges.append(fmt(start, prev))

    return ", ".join(ranges)


def _page_colorfulness(page, zoom=2):
    """Return the mean per-channel pixel standard deviation of a rendered page."""
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    arr = np.array(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    # NOTE(review): this measures pixel variation per channel, so a
    # high-contrast greyscale image also scores high -- confirm this is the
    # intended notion of "colorful".
    return arr.std(axis=(0, 1)).mean()


def _small_text_ratio(page, small_text_size=7):
    """Return the fraction of text spans on the page smaller than `small_text_size`."""
    small_count = 0
    total_count = 0
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:
            # Only blocks of type 0 are inspected for "lines"/"spans".
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                total_count += 1
                if span["size"] < small_text_size:
                    small_count += 1
    return small_count / total_count if total_count else 0


def classify_pdf_pages(
    file_path,
    *,
    color_floor=35,
    color_factor=1.3,
    small_text_size=7,
    small_text_threshold=0.5,
):
    """Classify PDF pages into color and monochrome lists.

    Each page is rendered at 2x zoom and scored by the mean per-channel
    standard deviation of its pixels. A page is put in the color list when
    either of the following holds:

    - its score exceeds ``max(color_floor, median_score * color_factor)``,
      where the median is taken over all pages of the document;
    - more than ``small_text_threshold`` of its text spans have a font size
      below ``small_text_size``.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.
        color_floor: Lower bound for the adaptive colorfulness threshold.
        color_factor: Multiplier applied to the median colorfulness score.
        small_text_size: Spans with a size below this count as small text.
        small_text_threshold: Fraction of small spans above which a page is
            flagged.

    Returns:
        A ``(color_pages, mono_pages)`` tuple of 1-based page-number lists.
    """
    colorfulness_scores = []
    text_ratios = []

    doc = fitz.open(file_path)
    try:
        # First pass: collect per-page statistics.
        for page in doc:
            colorfulness_scores.append(_page_colorfulness(page))
            text_ratios.append(_small_text_ratio(page, small_text_size))
    finally:
        # Release the document even if rendering or text extraction fails.
        doc.close()

    if not colorfulness_scores:
        # No pages: skip the median, which is NaN (with a warning) on empty input.
        return [], []

    # Adaptive threshold, never lower than the fixed floor.
    median_colorfulness = np.median(colorfulness_scores)
    color_threshold = max(color_floor, median_colorfulness * color_factor)

    color_pages, mono_pages = [], []
    for page_number, (c_score, t_ratio) in enumerate(
        zip(colorfulness_scores, text_ratios), start=1
    ):
        if c_score > color_threshold or t_ratio > small_text_threshold:
            color_pages.append(page_number)
        else:
            mono_pages.append(page_number)

    return color_pages, mono_pages

In [27]:
# Classify the pages of the target PDF (path is specific to the author's machine).
pdf_path = '/Users/jawahar/Downloads/Advanced Deep Learning.pdf'
color, mono = classify_pdf_pages(pdf_path)

In [32]:
# Summary counts first; each label says it is a count so it cannot be
# confused with the page-range lines printed below.
print(f"Color pages count: {len(color)}")
print(f"Monochrome pages count: {len(mono)}")
print(' _______________ ')
# Compact page ranges, ready to paste into a print dialog.
print("Color pages:", pages_to_ranges(color))
print("Monochrome pages:", pages_to_ranges(mono))

Color pages count:  87
Monochrome pages: 123
 _______________ 
Color pages: 2, 5-7, 11-12, 14-17, 19, 24, 34, 42-44, 49, 59, 61, 65-68, 70-71, 73-77, 80, 82-84, 87-89, 91, 100, 104-114, 118, 125, 127-128, 130, 134-135, 139, 143-144, 146, 148-153, 159, 161-163, 165, 171, 175-176, 179, 181-185, 203-208
Monochrome pages: 1, 3-4, 8-10, 13, 18, 20-23, 25-33, 35-41, 45-48, 50-58, 60, 62-64, 69, 72, 78-79, 81, 85-86, 90, 92-99, 101-103, 115-117, 119-124, 126, 129, 131-133, 136-138, 140-142, 145, 147, 154-158, 160, 164, 166-170, 172-174, 177-178, 180, 186-202, 209-210
