In [4]:
import os
import re
import glob
import math
import argparse
from pathlib import Path

import numpy as np
import pandas as pd
import cv2

from tqdm import tqdm
from skimage import io, exposure, filters, morphology, measure, util

# =========================
# CONFIG (edite aqui)
# =========================
MIAS_DIR = r"C:\Users\jhter\OneDrive - cefet-rj.br\Processamento-Imagem\Trabalho-1\MIAS"
CSV_PATH = r"C:\Users\jhter\OneDrive - cefet-rj.br\Processamento-Imagem\Trabalho-1\mias_derived_info.csv"
OUTPUT_DIR = r"C:\Users\jhter\OneDrive - cefet-rj.br\Processamento-Imagem\Trabalho-1\outputs_preproc"

# Padrões (pode alterar via CLI)
TARGET_SIZE = 1024         # 0 para não redimensionar; ou ex.: 1024
NORM_LOW_PCT = 0.5         # percentil baixo
NORM_HIGH_PCT = 99.5       # percentil alto


# =========================
# Utilidades
# =========================
def ensure_dir(p):
    Path(p).mkdir(parents=True, exist_ok=True)

def img_to_float01(img):
    img = img.astype(np.float32)
    if img.max() > 1.0:
        img /= 255.0
    return np.clip(img, 0.0, 1.0)

def save_uint8(image_f32, out_path):
    u8 = np.clip(image_f32, 0, 1)
    u8 = (u8 * 255.0 + 0.5).astype(np.uint8)
    cv2.imwrite(out_path, u8)

def robust_normalize(img, low_pct=0.5, high_pct=99.5):
    lo, hi = np.percentile(img, (low_pct, high_pct))
    if hi <= lo:
        return np.clip(img, 0, 1)
    return np.clip((img - lo) / (hi - lo), 0, 1)

def read_pgm(path):
    img = io.imread(path)  # skimage suporta .pgm
    return img_to_float01(img)

def breast_silhouette(img):
    """
    Gera silhueta da mama conforme descrito no artigo:
    equalização global -> limiarização Otsu -> maior componente -> morfologia/fechamento + fill holes.
    """
    eq = exposure.equalize_hist(img)
    thr = filters.threshold_otsu(eq)
    mask = (eq > thr).astype(np.uint8)

    labels = measure.label(mask, connectivity=2)
    if labels.max() == 0:
        return mask

    # maior componente conexo
    binc = np.bincount(labels.ravel())
    binc[0] = 0  # ignora fundo
    largest = binc.argmax()
    mask = (labels == largest).astype(np.uint8)

    # morfologia
    mask = morphology.binary_closing(mask, morphology.disk(5)).astype(np.uint8)
    mask = morphology.binary_fill_holes(mask).astype(np.uint8)
    return mask

def crop_to_breast(img, mask):
    ys, xs = np.where(mask > 0)
    if len(xs) == 0:
        return img, mask, (0, img.shape[0], 0, img.shape[1])

    y0, y1 = ys.min(), ys.max()
    x0, x1 = xs.min(), xs.max()

    cropped_img = img[y0:y1+1, x0:x1+1] * mask[y0:y1+1, x0:x1+1]
    cropped_mask = mask[y0:y1+1, x0:x1+1]
    return cropped_img, cropped_mask, (y0, y1+1, x0, x1+1)

def resize_if_needed(img, mask, target_size):
    if not target_size or target_size <= 0:
        return img, mask
    return (cv2.resize(img, (target_size, target_size), interpolation=cv2.INTER_AREA),
            cv2.resize(mask, (target_size, target_size), interpolation=cv2.INTER_NEAREST))

def extract_refnum(filename):
    """
    Extrai REFNUM do nome do arquivo (ex.: mdb001, mdb123).
    """
    base = os.path.basename(filename).lower()
    m = re.search(r"(mdb\d{3})", base)
    return m.group(1) if m else os.path.splitext(base)[0]


# =========================
# Pipeline principal
# =========================
def preprocess_one(img_path, out_img_dir, out_mask_dir, target_size, low_pct, high_pct):
    img = read_pgm(img_path)

    # 1) silhueta
    mask = breast_silhouette(img)

    # 2) crop
    img_c, mask_c, (y0, y1, x0, x1) = crop_to_breast(img, mask)

    # 3) normalização robusta
    img_n = robust_normalize(img_c, low_pct, high_pct)

    # 4) resize (opcional)
    img_r, mask_r = resize_if_needed(img_n, mask_c, target_size)

    # salvar
    basename = os.path.splitext(os.path.basename(img_path))[0]
    out_img = os.path.join(out_img_dir, f"{basename}_preproc.png")
    out_msk = os.path.join(out_mask_dir, f"{basename}_mask.png")
    save_uint8(img_r, out_img)
    save_uint8(mask_r.astype(np.float32), out_msk)

    H, W = img_r.shape
    return {
        "input_path": img_path,
        "output_img": out_img,
        "output_mask": out_msk,
        "crop_y0": y0, "crop_y1": y1, "crop_x0": x0, "crop_x1": x1,
        "height": H, "width": W
    }

def load_metadata(csv_path):
    if csv_path and os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        # padroniza REFNUM em minúsculo para casar melhor
        df["REFNUM"] = df["REFNUM"].astype(str).str.lower()
        return df
    return pd.DataFrame()

def run_preprocess(mias_dir, csv_path, out_dir, target_size, low_pct, high_pct):
    assert os.path.isdir(mias_dir), f"Diretório MIAS não encontrado: {mias_dir}"
    ensure_dir(out_dir)
    out_img_dir = os.path.join(out_dir, "images")
    out_mask_dir = os.path.join(out_dir, "masks")
    ensure_dir(out_img_dir)
    ensure_dir(out_mask_dir)

    # carrega metadados (se disponível)
    meta = load_metadata(csv_path)
    has_meta = not meta.empty
    meta = meta.set_index("REFNUM") if has_meta else meta

    # encontra arquivos .pgm
    pgms = glob.glob(os.path.join(mias_dir, "*.pgm")) + glob.glob(os.path.join(mias_dir, "*.PGM"))
    assert len(pgms) > 0, f"Nenhuma imagem .pgm encontrada em: {mias_dir}"

    logs = []
    for p in tqdm(sorted(pgms), desc="Pré-processando"):
        info = preprocess_one(p, out_img_dir, out_mask_dir, target_size, low_pct, high_pct)
        # agrega metadados via REFNUM
        ref = extract_refnum(p)
        if has_meta and ref in meta.index:
            row = meta.loc[ref]
            info.update({
                "REFNUM": ref,
                "BG": row.get("BG", ""),
                "CLASS": row.get("CLASS", ""),
                "SEVERITY": row.get("SEVERITY", ""),
                "X": row.get("X", np.nan),
                "Y": row.get("Y", np.nan),
                "RADIUS": row.get("RADIUS", np.nan),
                "DENSITY": row.get("DENSITY", ""),
                "BI-RADS": row.get("BI-RADS", ""),
                "CLASS_FULL": row.get("CLASS_FULL", ""),
                "CLASS_GROUP": row.get("CLASS_GROUP", ""),
            })
        else:
            info.update({"REFNUM": ref})
        logs.append(info)

    df_log = pd.DataFrame(logs)
    out_csv = os.path.join(out_dir, "preprocess_manifest.csv")
    df_log.to_csv(out_csv, index=False)

    # resumo de balanceamento (se houver metadados)
    if has_meta and "SEVERITY" in df_log.columns:
        print("\nBalanceamento por SEVERITY:")
        print(df_log["SEVERITY"].value_counts(dropna=False))

    print(f"\nPré-processamento concluído!\n- Imagens: {len(pgms)}"
          f"\n- Saída: {out_dir}\n- Manifest: {out_csv}")


# =========================
# CLI
# =========================
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pré-processamento das imagens MIAS (silhueta -> crop -> normalização -> (opcional) resize).")
    parser.add_argument("--mias_dir", type=str, default=MIAS_DIR, help="Pasta com as imagens MIAS (.pgm)")
    parser.add_argument("--csv_path", type=str, default=CSV_PATH, help="CSV de metadados (opcional)")
    parser.add_argument("--out_dir", type=str, default=OUTPUT_DIR, help="Pasta de saída")
    parser.add_argument("--target_size", type=int, default=TARGET_SIZE, help="Tamanho alvo (ex.: 1024). Use 0 para não redimensionar.")
    parser.add_argument("--low_pct", type=float, default=NORM_LOW_PCT, help="Percentil inferior para normalização (ex.: 0.5)")
    parser.add_argument("--high_pct", type=float, default=NORM_HIGH_PCT, help="Percentil superior para normalização (ex.: 99.5)")
    args = parser.parse_args()

    run_preprocess(
        mias_dir=args.mias_dir,
        csv_path=args.csv_path,
        out_dir=args.out_dir,
        target_size=args.target_size,
        low_pct=args.low_pct,
        high_pct=args.high_pct
    )


usage: ipykernel_launcher.py [-h] [--mias_dir MIAS_DIR] [--csv_path CSV_PATH]
                             [--out_dir OUT_DIR] [--target_size TARGET_SIZE]
                             [--low_pct LOW_PCT] [--high_pct HIGH_PCT]
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\jhter\AppData\Roaming\jupyter\runtime\kernel-v3f8069bc2ade04c83a8ba1488cb2f1acfd677ac8b.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
