In [1]:
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import sys
import os

# Make the modules in the sibling ``../skin_scan`` directory importable by bare name.
# os.path.abspath resolves the relative path against the current working
# directory, so this only works when the kernel runs from the notebook's folder.
sys.path.append(os.path.abspath("../skin_scan"))

# Project modules, presumably found through the path entry added above.
# NOTE(review): `data` and `preprocessing` are generic names, and the path is
# appended (not prepended), so a same-named module earlier on sys.path would win.
import preprocessing
from data import get_metadata_from_bq

In [3]:
def augment_images_balanced(
    processed_metadata: pd.DataFrame,
    width=96,
    height=96,
    bucket_name="skin_scan_mohnatz",
    prefix="train_all_images/",
    n_aug_per_img=2,
    seed=None,
):
    """Load images from a GCS bucket and oversample the smaller ``dx`` classes.

    Every metadata row whose ``image_id`` matches a ``.jpg`` blob under
    ``prefix`` is downloaded, converted to RGB, resized to ``(width, height)``
    and scaled by 1/255. For every class that is smaller than the largest
    class, ``n_aug_per_img`` randomly transformed copies of each image are
    appended as well.

    Note that this is a fixed per-image oversampling factor: minority classes
    grow by ``1 + n_aug_per_img`` times, they are not topped up to the size of
    the largest class.

    Args:
        processed_metadata: Frame with at least the columns ``image_id`` and
            ``dx`` (class label).
        width: Target image width in pixels.
        height: Target image height in pixels.
        bucket_name: Name of the GCS bucket holding the images.
        prefix: Blob-name prefix under which the ``.jpg`` files are listed.
        n_aug_per_img: Number of augmented copies generated per image of a
            non-largest class.
        seed: Optional integer. When given, each augmentation gets a seed
            derived from it so repeated runs produce the same images. Keras
            applies that seed to NumPy's global RNG as a side effect.

    Returns:
        ``(X_images, metadata_df)`` where ``X_images`` is a stacked array of
        shape ``(n_images, height, width, 3)`` and ``metadata_df`` holds one
        row per image in the same order. Rows of augmented images are copies
        of their source row with ``image_id`` suffixed by ``_aug<i>``.

    Raises:
        ValueError: If ``processed_metadata`` is empty or none of its rows
            could be matched to a readable image.
    """
    # Fail fast, before paying for the TensorFlow import and a GCS client.
    if processed_metadata.empty:
        raise ValueError("processed_metadata is empty: there are no images to load")

    from google.cloud import storage
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    from PIL import Image
    import numpy as np
    import pandas as pd
    import os
    import io

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    print("📦 Fetching image list from bucket...")
    # Map "file name up to the first dot" -> full blob name.
    image_lookup = {
        os.path.basename(blob.name).split('.')[0]: blob.name
        for blob in bucket.list_blobs(prefix=prefix)
        if blob.name.lower().endswith('.jpg')
    }

    augmenter = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    # One independent seed is drawn per augmentation; reusing `seed` directly
    # would apply the identical transform to every image.
    rng = np.random.default_rng(seed) if seed is not None else None

    image_list = []
    metadata_rows = []
    n_missing = 0      # rows whose image_id has no blob in the bucket
    n_unreadable = 0   # blobs that could not be decoded as an image

    def _report_progress():
        # len(image_list) is the running total of original + augmented images.
        if len(image_list) % 1000 == 0:
            print(f"✅ {len(image_list)} images added so far...")

    grouped = processed_metadata.groupby("dx")
    max_class_size = grouped.size().max()

    print("🔁 Starting image processing and augmentation...")
    for _, group in grouped:
        # Only classes smaller than the largest one are augmented.
        n_aug = n_aug_per_img if len(group) < max_class_size else 0

        for _, row in group.iterrows():
            image_id = row["image_id"]
            if image_id not in image_lookup:
                n_missing += 1
                continue

            image_bytes = bucket.blob(image_lookup[image_id]).download_as_bytes()

            try:
                # convert() forces the actual decode, so corrupt files fail here.
                with Image.open(io.BytesIO(image_bytes)) as raw_img:
                    img = raw_img.convert("RGB")
            except Exception as exc:
                # `Exception`, not a bare except: Ctrl-C must still stop the loop.
                print(f"⚠️ Error opening image: {image_id} ({exc})")
                n_unreadable += 1
                continue

            img = img.resize((width, height))
            img_np = np.array(img) / 255.0

            image_list.append(img_np)
            metadata_rows.append(row.copy())
            _report_progress()

            for i in range(n_aug):
                aug_seed = None if rng is None else int(rng.integers(0, 2**32))
                image_list.append(augmenter.random_transform(img_np, seed=aug_seed))

                row_copy = row.copy()
                row_copy["image_id"] = f"{image_id}_aug{i}"
                metadata_rows.append(row_copy)
                _report_progress()

    if n_missing or n_unreadable:
        print(
            f"⚠️ Skipped {n_missing} rows without a matching image "
            f"and {n_unreadable} unreadable images"
        )

    if not image_list:
        # np.stack([]) would raise an opaque "need at least one array" error.
        raise ValueError(
            f"No readable images for processed_metadata were found under "
            f"gs://{bucket_name}/{prefix}"
        )

    print(f"✅ Done: {len(image_list)} total images created")

    X_images = np.stack(image_list)
    metadata_df = pd.DataFrame(metadata_rows).reset_index(drop=True)

    return X_images, metadata_df

In [4]:
# Load the raw metadata table (presumably from BigQuery, going by the helper's name).
metadata = get_metadata_from_bq()
# NOTE(review): `processed_metadata` is reassigned from `metadata_cleaned` in a
# later cell and is not read in between in the cells shown here. This call
# looks redundant unless `preprocess_metadata` has side effects — confirm.
processed_metadata = preprocessing.preprocess_metadata(metadata)

In [5]:
# Cap the 'nv' class at 4000 randomly picked rows; every other class is kept whole.
nv_rows = metadata[metadata['dx'].eq('nv')]
non_nv_rows = metadata[metadata['dx'].ne('nv')]

# Fixed random_state so the same 'nv' subset is selected on every run.
shuffled_nv_rows = nv_rows.sample(frac=1, random_state=42)
shuffled_nv_rows = shuffled_nv_rows.reset_index(drop=True)
nv_rows_small = shuffled_nv_rows.head(4000)

# Capped 'nv' rows first, then all remaining rows, with a fresh 0..n-1 index.
metadata_cleaned = pd.concat((nv_rows_small, non_nv_rows), ignore_index=True)

In [None]:
# Preprocess the metadata in which the 'nv' class has already been down-sampled.
processed_metadata = preprocessing.preprocess_metadata(metadata_cleaned)

# For a quick smoke test, pass a small slice like this one to the call below instead:
#processed_metadata_test = processed_metadata.iloc[:10]

# Slow step: downloads one blob per metadata row, then resizes and augments it.
X_images, balanced_metadata = augment_images_balanced(processed_metadata)

# Build the tabular model inputs from the metadata rows returned alongside X_images.
# NOTE(review): presumably X_metadata / y stay row-aligned with X_images —
# confirm that `prepare_data_for_model` neither reorders nor drops rows.
X_metadata, y, preprocessor, class_names = preprocessing.prepare_data_for_model(balanced_metadata)


2025-06-09 16:11:41.555973: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-09 16:11:41.571154: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-09 16:11:41.703875: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-09 16:11:41.828677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749481901.930042   93748 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749481901.96

📦 Fetching image list from bucket...
🔁 Starting image processing and augmentation...
✅ 1000 images added so far...
✅ 2000 images added so far...
✅ 3000 images added so far...
✅ 4000 images added so far...
✅ 5000 images added so far...
✅ 6000 images added so far...
✅ 7000 images added so far...
✅ 8000 images added so far...
✅ 9000 images added so far...
✅ 10000 images added so far...
✅ 11000 images added so far...
✅ 12000 images added so far...
✅ 13000 images added so far...
✅ 14000 images added so far...
✅ Done: 14671 total images created
