In [3]:
# ============================================================
# PRINT MEDIA SIZE ESTIMATION – FINAL STABLE VERSION
# ============================================================

# -------------------------------
# 1. INSTALL DEPENDENCIES
# -------------------------------
# Python packages used by this cell: PyMuPDF (imported below as `fitz`),
# pytesseract, Pillow (`PIL`), pandas, and openpyxl (no direct import below;
# presumably the engine pandas uses to write the .xlsx report).
!pip install pymupdf pytesseract pillow pandas openpyxl
# System-level Tesseract OCR package.
# NOTE(review): pytesseract is imported but never called in the code visible
# here (the training phase is labelled "NO OCR") - confirm the OCR
# dependencies are still needed.
!apt-get install -y tesseract-ocr

# -------------------------------
# 2. IMPORT LIBRARIES
# -------------------------------
import io
import os
import re
import shutil
import zipfile
from collections import defaultdict

import fitz
import pandas as pd
import pytesseract
from google.colab import drive, files
from PIL import Image

# -------------------------------
# 3. MOUNT DRIVE
# -------------------------------
# Mount Google Drive at /content/drive; the project paths defined below live
# under /content/drive/MyDrive.
drive.mount("/content/drive")

# -------------------------------
# 4. STRICT PROJECT PATHS
# -------------------------------
# Single Drive folder that holds both the reference PDF and the report.
PROJECT_ROOT = "/content/drive/MyDrive/Print_Media_Automation"

# Reference PDF read by the training phase.
PDF_PATH = PROJECT_ROOT + "/Image Sizes.pdf"

# Excel workbook that receives the final per-image results.
OUTPUT_EXCEL = PROJECT_ROOT + "/Output_Excel/Print_Analysis_Result.xlsx"

# Create the report's parent folder up front; a no-op when it already exists.
output_dir, _ = os.path.split(OUTPUT_EXCEL)
os.makedirs(output_dir, exist_ok=True)

# ============================================================
# 5. TRAINING PHASE (PDF = TV, NO OCR)
# ============================================================
# Known physical size, in centimetres, of each reference sample. Entries are
# matched to PDF pages by position: entry i describes page i of PDF_PATH.
REFERENCE_SIZES = [
    {"cm_w": 8,  "cm_h": 12},
    {"cm_w": 12, "cm_h": 12},
    {"cm_w": 4,  "cm_h": 4},
    # add more if your PDF has more samples
]

training_refs = []

# Open the PDF as a context manager so the file handle is released even if
# rendering a page raises.
with fitz.open(PDF_PATH) as pdf:
    # zip() stops at the shorter sequence, so extra PDF pages without a
    # reference size (or extra sizes without a page) are ignored.
    for ref, page in zip(REFERENCE_SIZES, pdf):
        # Render the page at 200 dpi; only the pixel dimensions are used.
        pix = page.get_pixmap(dpi=200)

        training_refs.append({
            "px_w": pix.width,
            "px_h": pix.height,
            "cm_w": ref["cm_w"],
            "cm_h": ref["cm_h"],
            # Aspect ratio is the key used later to pick the closest reference.
            "ratio": pix.width / pix.height,
        })

# One row per reference sample: pixel size, physical size and aspect ratio.
training_df = pd.DataFrame(training_refs)

if training_df.empty:
    # Either REFERENCE_SIZES is empty or the PDF contains no pages.
    raise ValueError(
        "Training failed: no reference samples were built "
        "(reference size list is empty or the PDF has no pages)."
    )

print(f"✅ Training completed using {len(training_df)} reference samples")

# ============================================================
# 6. ASK USER TO UPLOAD ZIP (REAL WORK)
# ============================================================
print("📦 Upload ZIP containing print clips")
# files.upload() returns a dict mapping each uploaded file name to its bytes.
uploaded = files.upload()

if not uploaded:
    # The upload dialog was cancelled or no file was selected.
    raise ValueError("No file was uploaded; expected a ZIP containing print clips.")

# Only the first uploaded file is used.
zip_name = next(iter(uploaded))

# Runtime-only extraction (NOT Drive)
TEMP_DIR = "/content/runtime_images"
# Start from an empty folder so images extracted from an earlier upload are
# not processed again together with the new ones.
shutil.rmtree(TEMP_DIR, ignore_errors=True)
os.makedirs(TEMP_DIR, exist_ok=True)

# Read the archive from the uploaded bytes instead of reopening it by name:
# Colab may save a re-uploaded file under a different name (e.g. "name (1).zip"),
# so the dict key is not guaranteed to point at the file that was just sent.
with zipfile.ZipFile(io.BytesIO(uploaded[zip_name])) as z:
    z.extractall(TEMP_DIR)

# ============================================================
# 7. PROCESS IMAGES & ESTIMATE SIZE
# ============================================================
results = []                # one dict per successfully processed image
skipped = defaultdict(int)  # exception class name -> number of clips skipped

# File types treated as print clips (compared case-insensitively).
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# File-name patterns, compiled once instead of on every file.
# The publication name is whatever precedes the first digit or page marker.
PUBLICATION_END_RE = re.compile(r'\d|pg|page', flags=re.I)
DATE_RE = re.compile(r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')
# Accepts both markers that PUBLICATION_END_RE recognises ("pg" and "page").
PAGE_RE = re.compile(r'(?:page|pg)[\s\-]*\d+', flags=re.I)

# Aspect ratios of the reference samples; looked up per image without
# modifying training_df.
reference_ratios = training_df["ratio"]

for root, _, files_in_dir in os.walk(TEMP_DIR):
    # Every root yielded by os.walk starts with TEMP_DIR, so slicing removes
    # exactly that prefix (str.replace would also strip later occurrences).
    folder_path = root[len(TEMP_DIR):]
    # The immediate parent folder name is reported as the city.
    # NOTE(review): for images at the archive's top level this is the name
    # of TEMP_DIR itself - confirm that is acceptable.
    city = os.path.basename(root)

    for file in files_in_dir:
        if not file.lower().endswith(IMAGE_EXTENSIONS):
            continue

        try:
            # Only the dimensions are needed; the context manager closes the
            # file handle straight away.
            with Image.open(os.path.join(root, file)) as img:
                px_w, px_h = img.size
            ratio = px_w / px_h

            # Find closest visual reference (smallest aspect-ratio difference).
            nearest = (reference_ratios - ratio).abs().idxmin()
            ref = training_df.loc[nearest]

            # Scale the reference's physical size by the pixel-size ratio.
            scale_w = px_w / ref["px_w"]
            scale_h = px_h / ref["px_h"]

            width_cm = round(ref["cm_w"] * scale_w, 2)
            length_cm = round(ref["cm_h"] * scale_h, 2)

            # Metadata extraction. Parse the name without its extension:
            # otherwise the "pg" inside ".jpg" is taken for a page marker and
            # truncates names that contain no digits ("Hindu.jpg" -> "Hindu.j").
            stem = os.path.splitext(file)[0]

            publication = PUBLICATION_END_RE.split(stem)[0]
            publication = re.sub(r'[_\-]', ' ', publication).strip()

            date_match = DATE_RE.search(stem)
            date = date_match.group(0) if date_match else ""

            page_match = PAGE_RE.search(stem)
            page = page_match.group(0) if page_match else ""

            results.append({
                "Folder Path": folder_path,
                "File Name": file,
                "Publication Name": publication,
                "City": city,
                "Date": date,
                "Page Number": page,
                "Length (cm)": length_cm,
                "Width (cm)": width_cm,
            })

        except Exception as e:
            # Best effort: one unreadable or malformed clip must not abort the
            # run. Failures are tallied by exception type for the summary.
            skipped[type(e).__name__] += 1

# ============================================================
# 8. WRITE EXCEL (ALWAYS OVERWRITE)
# ============================================================
# Fixed column order for the report. Passing it explicitly keeps the header
# row in the workbook even when no image was processed.
RESULT_COLUMNS = [
    "Folder Path",
    "File Name",
    "Publication Name",
    "City",
    "Date",
    "Page Number",
    "Length (cm)",
    "Width (cm)",
]

output_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
# Writes to OUTPUT_EXCEL, replacing any workbook left by a previous run.
output_df.to_excel(OUTPUT_EXCEL, index=False)

print("🎉 PROCESS COMPLETED")
print(f"📊 Images processed: {len(results)}")

# Summarise skipped clips, grouped by the exception type that caused the skip.
for err, cnt in skipped.items():
    print(f"{cnt} clips skipped due to {err}")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Training completed using 3 reference samples
📦 Upload ZIP containing print clips


Saving CSB Live Del and Guwahati Print Clips.zip to CSB Live Del and Guwahati Print Clips (1).zip
🎉 PROCESS COMPLETED
📊 Images processed: 84
