# HAM10000 Skin Cancer Classifier – End-to-End Training

This Colab notebook builds a production-ready skin cancer classifier using the HAM10000 dataset. It covers environment setup, data ingestion and cleaning, imbalance-aware training with EfficientNetB0, rich evaluation (accuracy, confusion matrix, classification report, ROC curves), and artifact export (`ham10000_effnetb0.h5`, `label_map.json`, and a reusable preprocessing script, `preprocessing.py`).

## 1. Setup & Dependencies
- Run the cell below once per runtime to install all required packages.
- Provide your Kaggle API token (`kaggle.json`) when prompted so that the notebook can download HAM10000 automatically.
- Make sure you have at least 20GB of free disk space.

In [None]:
!pip install -q kaggle pandas scikit-learn seaborn matplotlib tensorflow==2.13.0 tensorflow-addons tensorflow-io pillow albumentations --upgrade

In [None]:
import os
import json
import random
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (classification_report, confusion_matrix,
                             ConfusionMatrixDisplay, roc_curve, auc, recall_score)
import matplotlib.pyplot as plt
import seaborn as sns

# Seed every RNG used by the notebook (Python, NumPy, TensorFlow) for reproducible runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Tunable configuration shared by all cells below.
IMAGE_SIZE = (224, 224)   # passed as `target_size` to the generators and as the model input shape
BATCH_SIZE = 32
DATASET_DIR = Path('/content/HAM10000')
EXPORT_DIR = Path('/content/drive/MyDrive/skin-cancer-artifacts')

# EXPORT_DIR lives on Google Drive, so Drive must be mounted BEFORE the directory is created.
# Creating it first would put a plain local folder under /content/drive, and
# `drive.mount` refuses to mount onto a mountpoint that already contains files.
# Mounting again later in the notebook is harmless (Colab reports "already mounted").
from google.colab import drive
drive.mount('/content/drive')
EXPORT_DIR.mkdir(parents=True, exist_ok=True)


## 2. Connect Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive so artifacts written there persist beyond the runtime.
drive.mount(mountpoint='/content/drive')

## 3. Download HAM10000 from Kaggle
Upload your `kaggle.json` (Account → Create API Token) using the file picker. The cell configures the Kaggle credentials, then downloads and extracts the dataset (the Kaggle CLI itself is installed by the setup cell in Section 1).

In [None]:
from google.colab import files

if not Path('kaggle.json').exists():
    print('Upload kaggle.json (download from https://www.kaggle.com/ -> Account)')
    uploaded = files.upload()
    if 'kaggle.json' not in uploaded:
        raise RuntimeError('kaggle.json is required!')

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

if not DATASET_DIR.exists():
    DATASET_DIR.mkdir(parents=True, exist_ok=True)
    !kaggle datasets download -d kmader/skin-cancer-mnist-ham10000 -p /content/HAM10000
    !unzip -oq /content/HAM10000/skin-cancer-mnist-ham10000.zip -d /content/HAM10000

metadata_path = DATASET_DIR / 'HAM10000_metadata.csv'
assert metadata_path.exists(), 'Metadata CSV missing – check Kaggle download.'

## 4. Data Loading, Cleaning, and Label Encoding

In [None]:
metadata = pd.read_csv(metadata_path)
raw_row_count = len(metadata)
print(f"Raw metadata rows: {raw_row_count:,}")

# Handle missing values: median for the numeric column, an explicit 'unknown' bucket for categoricals
default_age = metadata['age'].median()
metadata['age'] = metadata['age'].fillna(default_age)
metadata['sex'] = metadata['sex'].fillna('unknown')
metadata['localization'] = metadata['localization'].fillna('unknown')
metadata['dx_type'] = metadata['dx_type'].fillna('unknown')

# Index every JPEG in the two image folders by its stem (the image_id used in the CSV)
image_dirs = [DATASET_DIR / 'HAM10000_images_part_1', DATASET_DIR / 'HAM10000_images_part_2']
image_records = [
    {'image_id': img_path.stem, 'file_path': img_path.as_posix()}
    for img_dir in image_dirs
    for img_path in sorted(img_dir.glob('*.jpg'))
]
if not image_records:
    # Without this guard the merge below fails with an opaque KeyError on 'image_id'.
    raise FileNotFoundError(
        f'No .jpg images found under {DATASET_DIR} – check the Kaggle download/extraction.'
    )
paths_df = pd.DataFrame(image_records).drop_duplicates('image_id')

# Inner join keeps only metadata rows whose image file is actually on disk
metadata = metadata.merge(paths_df, how='inner', on='image_id')
print(f"Metadata rows after merge: {len(metadata):,}")

dropped_rows = raw_row_count - len(metadata)
if dropped_rows > 0:
    print(f"WARNING: {dropped_rows:,} metadata rows had no matching image file and were dropped.")

metadata.head()

In [None]:
# Map each diagnosis code in `dx` to an integer id (LabelEncoder orders classes lexicographically).
label_encoder = LabelEncoder().fit(metadata['dx'])
metadata['label_idx'] = label_encoder.transform(metadata['dx'])

class_names = list(label_encoder.classes_)
num_classes = len(class_names)

# index -> diagnosis code lookup, later exported as JSON
label_map = dict(enumerate(class_names))
print('Classes:', label_map)


In [None]:
# HAM10000 contains several images of the same lesion. Splitting at image level would put
# near-duplicate pictures of one lesion into train AND val/test and inflate the metrics,
# so the split is done on lesions and then expanded back to image rows.
# Falls back to an image-level split if the CSV has no `lesion_id` column.
group_col = 'lesion_id' if 'lesion_id' in metadata.columns else 'image_id'

# One row per group; assumes every image of a lesion carries the same diagnosis (first one wins).
groups = metadata[[group_col, 'label_idx']].drop_duplicates(group_col)

train_groups, temp_groups = train_test_split(
    groups,
    test_size=0.3,
    stratify=groups['label_idx'],
    random_state=SEED
)
val_groups, test_groups = train_test_split(
    temp_groups,
    test_size=0.5,
    stratify=temp_groups['label_idx'],
    random_state=SEED
)


def _rows_for(group_subset):
    """Return the image rows of `metadata` that belong to the given groups."""
    return metadata[metadata[group_col].isin(group_subset[group_col])].copy()


train_df = _rows_for(train_groups)
val_df = _rows_for(val_groups)
test_df = _rows_for(test_groups)

# Every image must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(metadata), 'Split lost or duplicated rows.'

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

## 5. Data Pipeline, Augmentation, and Class Weights

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training images get geometric + brightness augmentation; all splits are scaled to [0, 1].
train_aug = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.3,
    horizontal_flip=True,
    fill_mode='nearest',
    brightness_range=[0.7, 1.3]
)
val_aug = ImageDataGenerator(rescale=1./255)
test_aug = ImageDataGenerator(rescale=1./255)


def _make_generator(augmenter, frame, shuffle, seed=None):
    """Build a batched image iterator over `frame`.

    `classes=class_names` pins the label -> index mapping so that every split
    (and the exported label map) uses the same indices instead of each generator
    inferring its own mapping from whichever labels happen to be in its dataframe.
    """
    return augmenter.flow_from_dataframe(
        dataframe=frame,
        x_col='file_path',
        y_col='dx',
        target_size=IMAGE_SIZE,
        classes=class_names,
        class_mode='categorical',
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        seed=seed
    )


train_gen = _make_generator(train_aug, train_df, shuffle=True, seed=SEED)
# Validation/test order must stay fixed so predictions line up with `.classes`.
val_gen = _make_generator(val_aug, val_df, shuffle=False)
test_gen = _make_generator(test_aug, test_df, shuffle=False)

# 'balanced' weights are inversely proportional to class frequency in the training split.
train_labels = np.unique(train_df['dx'])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_labels,
    y=train_df['dx']
)
# Keras expects class weights keyed by the integer class index used by the generator.
label_to_index = train_gen.class_indices
weights_by_index = {label_to_index[label]: weight for label, weight in zip(train_labels, class_weights)}
weights_by_index

## 6. Model Architecture – EfficientNetB0 Transfer Learning

In [None]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0

# ImageNet-pretrained backbone without its classification head; frozen for the first training stage.
base_model = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=(*IMAGE_SIZE, 3)
)
base_model.trainable = False

inputs = layers.Input(shape=(*IMAGE_SIZE, 3), name='input_image')
# The data generators and the exported preprocessing helper feed images scaled to [0, 1],
# but Keras EfficientNet ships its own normalisation and expects raw [0, 255] pixels.
# Dividing by 255 again here would squash the input to ~[0, 0.004] (and the backbone
# would divide once more), so scale back UP to [0, 255] instead.
x = layers.Rescaling(255.0)(inputs)
# training=False keeps the backbone's BatchNorm layers in inference mode.
x = base_model(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(num_classes, activation='softmax', name='predictions')(x)

model = models.Model(inputs, outputs, name='ham10000_efficientnetb0')
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

## 7. Training with EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [None]:
# Best-so-far weights (lowest validation loss) are persisted next to the other artifacts.
checkpoint_path = EXPORT_DIR / 'model-best.h5'

# Stop after 6 stagnant epochs and roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True
)
# Shrink the learning rate by 0.3x after 3 stagnant epochs, never below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=3,
    min_lr=1e-6
)
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path.as_posix(),
    monitor='val_loss',
    save_best_only=True
)
callbacks = [early_stopping, reduce_lr, best_checkpoint]

epochs = 30
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=epochs,
    callbacks=callbacks,
    class_weight=weights_by_index
)

### Optional Fine-Tuning
Unfreeze the last 50 layers of EfficientNetB0 for a short fine-tuning stage once the top classifier converges.

In [None]:
# `base_model.trainable` was set to False when the model was built. While the parent is frozen,
# Keras reports no trainable weights for it regardless of the flags on its sub-layers, so
# flipping only the sub-layers would fine-tune nothing. Re-enable the parent first, then
# freeze everything except the last 50 layers.
base_model.trainable = True
for layer in base_model.layers[:-50]:
    layer.trainable = False
for layer in base_model.layers[-50:]:
    # Keep BatchNormalization frozen: retraining its scale/offset on a small dataset
    # tends to destroy the pretrained features.
    layer.trainable = not isinstance(layer, tf.keras.layers.BatchNormalization)

# Changing `trainable` only takes effect after re-compiling; use a 10x smaller learning rate.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
print(f"Trainable weight tensors: {len(model.trainable_weights)}")

fine_tune_history = model.fit(
    train_gen,
    epochs=10,
    validation_data=val_gen,
    class_weight=weights_by_index,
    callbacks=callbacks
)

## 8. Evaluation – Accuracy, Classification Report, Per-Class Recall

In [None]:
test_loss, test_acc = model.evaluate(test_gen, verbose=1)
print(f"Test Accuracy: {test_acc:.4f}")

# Collect predictions (test_gen is unshuffled, so rows line up with test_gen.classes)
probs = model.predict(test_gen)
y_pred = np.argmax(probs, axis=1)
# The dataframe iterator exposes `.classes` as a plain Python list; convert it so that
# element-wise comparisons and sklearn metrics behave as expected.
y_true = np.asarray(test_gen.classes)

# Passing explicit labels keeps the report aligned with class_names even if a class
# happens to be absent from both y_true and y_pred.
label_ids = list(range(num_classes))
report = classification_report(
    y_true, y_pred,
    labels=label_ids,
    target_names=class_names,
    output_dict=True,
    zero_division=0
)
print(json.dumps(report, indent=2))

# One-vs-rest recall for every class: TP / (TP + FN)
recalls = recall_score(y_true, y_pred, labels=label_ids, average=None, zero_division=0)
per_class_recall = {name: float(value) for name, value in zip(class_names, recalls)}
print('Per-class recall:', per_class_recall)

In [None]:
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(ax=ax, cmap='Blues', colorbar=False)

ax.set_title('HAM10000 Confusion Matrix')
# Tilt the class names on the x axis so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## 9. ROC Curves

In [None]:
from sklearn.preprocessing import label_binarize

# One-hot encode the true labels so each class can be scored one-vs-rest.
y_true_binarized = label_binarize(y_true, classes=list(range(num_classes)))

fig, ax = plt.subplots(figsize=(8, 6))
# Walk the per-class columns of the truth matrix and the predicted probabilities in lockstep.
for class_name, truth_column, score_column in zip(class_names, y_true_binarized.T, probs.T):
    fpr, tpr, _ = roc_curve(truth_column, score_column)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{class_name} (AUC={roc_auc:.3f})")

# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Chance')
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Per-Class ROC Curves',
)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## 10. Export Artifacts (`ham10000_effnetb0.h5`, `label_map.json`, `preprocessing.py`)
The exported files live in Google Drive so they can be downloaded, versioned, or synced into your deployment repository.

In [None]:
# Destination files inside the export directory.
model_path = EXPORT_DIR / 'ham10000_effnetb0.h5'
label_map_path = EXPORT_DIR / 'label_map.json'
preprocess_path = EXPORT_DIR / 'preprocessing.py'

# 1) Trained model
model.save(model_path)
print('Saved model to', model_path)

# 2) Label lookup in both directions (JSON turns the integer keys into strings)
label_payload = {
    "index_to_label": label_map,
    "label_to_index": {name: idx for idx, name in label_map.items()},
}
label_map_path.write_text(json.dumps(label_payload, indent=2))
print('Saved label map to', label_map_path)

# 3) Stand-alone preprocessing helper; IMAGE_SIZE is baked in from this notebook's config
preprocess_script = f'''from pathlib import Path
import numpy as np
from PIL import Image

IMAGE_SIZE = {IMAGE_SIZE}


def load_image(image_path_or_bytes):
    """Load an image from disk or a BytesIO stream and convert to RGB."""
    if hasattr(image_path_or_bytes, 'read'):
        image = Image.open(image_path_or_bytes)
    else:
        image = Image.open(Path(image_path_or_bytes))
    return image.convert('RGB')


def preprocess_image(image):
    """Resize to 224x224 and normalize to [0, 1]."""
    image = image.resize(IMAGE_SIZE)
    arr = np.asarray(image).astype('float32') / 255.0
    return np.expand_dims(arr, axis=0)
'''
preprocess_path.write_text(preprocess_script)
print('Saved preprocessing helper to', preprocess_path)


✅ **Next steps**: download the three exported files and copy them into the repository's `model/` directory (or mount from cloud storage) so the Flask API can load them for inference.