In [0]:
# # Paths adapted to DBFS-style locations (this cell is commented out; it would copy src_dir to dst_dir unchanged)
# src_dir = "/Volumes/users/kevin_romero/kaggle/x-ray-kaggle/chest-xray-pneumonia/chest_xray"
# dst_dir = "/Volumes/users/kevin_romero/kaggle/x-ray-kaggle/chest-xray-pneumonia/chest_xray_aug"

# import shutil
# shutil.copytree(src_dir, dst_dir, dirs_exist_ok=True)

In [0]:
# Notebook parameters, read from the Databricks widgets of the same name.
CATALOG, SCHEMA, VOLUME, TABLE = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOG", "SCHEMA", "VOLUME", "TABLE")
)

# Three-part table name (catalog.schema.table) built from the widget values.
table_path = f"{CATALOG}.{SCHEMA}.{TABLE}"

In [0]:
from PIL import Image, ImageEnhance
import io, os, random

def augment_image_file(in_path: str, out_path: str, *, rng=None) -> None:
    """Write a randomly augmented grayscale JPEG copy of one image.

    The image at ``in_path`` is converted to grayscale (Pillow mode ``"L"``),
    then rotated, centre-zoomed and brightness/contrast-jittered by small
    random amounts, and finally saved to ``out_path`` as JPEG. The output
    keeps the pixel dimensions of the input.

    Args:
        in_path: Path of the image to read.
        out_path: Path of the JPEG to write. Missing parent directories are
            created.
        rng: Optional object exposing ``uniform(a, b)`` (e.g. a seeded
            ``random.Random``). When omitted, the module-level ``random``
            generator is used, so ``random.seed(...)`` still controls it.

    Raises:
        OSError: If the input cannot be opened/decoded or the output cannot
            be written.
    """
    draw = (random if rng is None else rng).uniform

    # Open under a context manager so the file handle is released even when
    # decoding fails; convert() hands back a separate in-memory image.
    with Image.open(in_path) as src:
        img = src.convert("L")

    # 1) random rotation in [-10, 10] degrees; expand=False keeps the canvas size
    angle = draw(-10, 10)
    img = img.rotate(angle, resample=Image.BILINEAR, expand=False)

    # 2) random 5-10% centre zoom: crop a smaller central box, then scale it
    #    back up to the original size
    zoom = 1.0 + draw(0.05, 0.10)
    w, h = img.size
    # keep at least one pixel per side so the crop box is never empty
    new_w, new_h = max(1, int(w / zoom)), max(1, int(h / zoom))
    left = (w - new_w) // 2
    top = (h - new_h) // 2
    img = img.crop((left, top, left + new_w, top + new_h)).resize((w, h), Image.BILINEAR)

    # 3) brightness and contrast jitter (a factor of 1.0 leaves the image unchanged)
    img = ImageEnhance.Brightness(img).enhance(draw(0.9, 1.1))
    img = ImageEnhance.Contrast(img).enhance(draw(0.9, 1.1))

    # Create the output directory if needed. A bare filename has an empty
    # directory part, and os.makedirs("") raises, so skip it in that case.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    img.save(out_path, format="JPEG")

In [0]:
# Source image tree and the parallel tree that receives the augmented copies.
# NOTE(review): these paths look like /Volumes/<CATALOG>/<SCHEMA>/<VOLUME>/...,
# and VOLUME is not used in the visible cells; consider deriving them from the
# widgets. Confirm before changing.
src_root = "/Volumes/users/kevin_romero/kaggle/x-ray-kaggle/chest-xray-pneumonia/chest_xray"
dst_root = "/Volumes/users/kevin_romero/kaggle/x-ray-kaggle/chest-xray-pneumonia/chest_xray_aug"

# Seed the module-level generator so a re-run regenerates the same augmented set.
AUGMENT_SEED = 42
random.seed(AUGMENT_SEED)

cleaned_df = spark.read.table(table_path)

# Sorted so the (seeded) random draws are paired with the same files on every run;
# collect() alone gives no ordering guarantee.
paths = sorted(r.path for r in cleaned_df.select("path").collect())

for p in paths:
    # Pillow and os.* need a plain filesystem path; drop a "dbfs:" scheme in
    # case the table stores scheme-qualified paths.
    if p.startswith("dbfs:"):
        p = p[len("dbfs:"):]

    rel = os.path.relpath(p, src_root)

    # relpath() returns a ".."-prefixed result for anything outside src_root.
    # Because dst_root is a sibling of src_root, joining such a result onto
    # dst_root resolves back to the input file itself, which would then be
    # overwritten by its augmented copy. Refuse instead.
    if rel == os.pardir or rel.startswith(os.pardir + os.sep):
        raise ValueError(f"{p!r} is not located under src_root {src_root!r}")

    out_path = os.path.join(dst_root, rel)
    augment_image_file(p, out_path)