# Stage 1 Ferning Classification – Simple Notebook Training

This notebook is a simplified, self-contained version of the Stage 1 training pipeline.

It:
- Loads the existing master index and fold splits CSVs
- Builds a small EfficientNetB3-based model
- Trains on each cross-validation fold in turn (`N_FOLDS`, 5 by default; lower it to 1 to train a single fold) using a NumPy data generator
- Computes basic medical metrics (sensitivity, specificity, balanced accuracy, AUC)

Run the cells from top to bottom to train and evaluate the model on your local data.

In [None]:
import os

# TensorFlow's native logger reads TF_CPP_MIN_LOG_LEVEL when it first logs,
# which may already happen during `import tensorflow`. Set it BEFORE the
# import so the setting reliably takes effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB3

from sklearn.metrics import confusion_matrix, roc_auc_score

# Basic TF / GPU setup
print(f"TensorFlow version: {tf.__version__}")

physical_gpus = tf.config.list_physical_devices("GPU")
if physical_gpus:
    for gpu in physical_gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except (RuntimeError, ValueError) as e:
            # Best effort: report the problem and keep going with default
            # memory behaviour (e.g. when the runtime is already initialized).
            print(f"Could not set memory growth on GPU: {e}")
    print(f"Using GPU(s): {[g.name for g in physical_gpus]}")
else:
    print("Using CPU (training will be slower)")

In [None]:
# Configuration – adjust these as needed

# The current working directory is treated as the root of this simple repo
# (the notebook is expected to be run from simple_notebook_repo/).
REPO_ROOT = Path.cwd()
print(f"Repo root: {REPO_ROOT}")

# Input CSVs are the ones used by the original project; they sit in a
# "local/data" folder next to the repo root.
_DATA_DIR = REPO_ROOT.parent / "local" / "data"
MASTER_INDEX_PATH = _DATA_DIR / "master_patch_index.csv"
FOLD_SPLITS_PATH = _DATA_DIR / "fold_splits.csv"

# Everything the notebook writes goes below this folder (created on demand).
OUTPUT_DIR = REPO_ROOT / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Training hyperparameters ---
EPOCHS = 10
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
INPUT_SHAPE = (64, 64, 3)
NUM_CLASSES = 2

# --- Cross-validation (set to 1 for quick experiments) ---
N_FOLDS = 5

# --- Class / evaluation settings ---
CLASS_NAMES = ["No Ferning", "Ferning"]
POSITIVE_CLASS_INDEX = 1
THRESHOLD = 0.5

print(f"Master index: {MASTER_INDEX_PATH}")
print(f"Fold splits : {FOLD_SPLITS_PATH}")
print(f"Output dir  : {OUTPUT_DIR}")

In [None]:
# Data loading utilities

# Fail fast with a real exception if an input CSV is missing; `assert` would
# be stripped when Python runs with -O.
if not MASTER_INDEX_PATH.exists():
    raise FileNotFoundError(f"Master index not found: {MASTER_INDEX_PATH}")
if not FOLD_SPLITS_PATH.exists():
    raise FileNotFoundError(f"Fold splits not found: {FOLD_SPLITS_PATH}")

master_index = pd.read_csv(MASTER_INDEX_PATH)
fold_splits = pd.read_csv(FOLD_SPLITS_PATH)

print(f"Loaded master index with {len(master_index)} patches from {master_index['sample_id'].nunique()} samples")
print(f"Loaded fold splits with {len(fold_splits)} rows")


def remap_paths(df: pd.DataFrame) -> pd.DataFrame:
    """Remap old absolute paths in the CSV to the local repo dataset folder."""
    df = df.copy()
    if "path" in df.columns:
        old_prefix = "C:/Users/Jaron/Downloads/dataset/dataset"
        new_prefix = str((REPO_ROOT.parent / "local" / "data" / "dataset" / "dataset").resolve())
        df["path"] = (
            df["path"]
            .astype(str)
            .str.replace(old_prefix, new_prefix, regex=False)
            .str.replace("\\", "/", regex=False)
        )
    return df


master_index = remap_paths(master_index)

# Ensure label_stage1 exists
if "label_stage1" not in master_index.columns:
    if "class" not in master_index.columns:
        raise KeyError(
            "master index needs either a 'label_stage1' or a 'class' column"
        )
    # Vectorized mapping: classes "PF" and "CF" become 1, everything else 0.
    master_index["label_stage1"] = (
        master_index["class"].isin(["PF", "CF"]).astype("int64")
    )


def load_fold_data_simple(fold_num: int):
    """Select the training and validation patches belonging to one fold.

    Reads the module-level ``fold_splits`` table to find the sample ids
    marked "train" and "val" for *fold_num*, then picks the matching rows
    from the module-level ``master_index``.

    Returns:
        ``(train_df, val_df)`` – independent copies of the selected rows.
    """
    this_fold = fold_splits.loc[fold_splits["fold"] == fold_num]

    def _patches_for(split_name):
        # Sample ids assigned to this split within the fold ...
        sample_ids = this_fold.loc[this_fold["split"] == split_name, "sample_id"].tolist()
        # ... and every patch that belongs to one of those samples.
        belongs = master_index["sample_id"].isin(sample_ids)
        return master_index.loc[belongs].copy()

    train_df = _patches_for("train")
    val_df = _patches_for("val")

    print(f"Fold {fold_num}: train={len(train_df)} patches, val={len(val_df)} patches")
    return train_df, val_df


def preprocess_npy(npy_path: str | Path) -> np.ndarray:
    """Load and preprocess a .npy image file into (64, 64, 3) float32."""
    img_array = np.load(npy_path)

    # Detect if already preprocessed
    is_preprocessed = (img_array.min() < 0) or (img_array.max() > 255)

    if is_preprocessed:
        img_array = img_array.astype(np.float32)
        if img_array.ndim == 2:
            img_array = np.stack([img_array] * 3, axis=-1)
        elif img_array.shape[-1] == 1:
            img_array = np.repeat(img_array, 3, axis=-1)
        return img_array

    # Ensure 3D (H, W, C)
    if img_array.ndim == 2:
        img_array = np.stack([img_array] * 3, axis=-1)

    # Resize to (64, 64)
    if img_array.shape[:2] != (64, 64):
        img_array = cv2.resize(img_array, (64, 64))

    # Ensure RGB
    if img_array.ndim == 2:
        img_array = np.stack([img_array] * 3, axis=-1)
    elif img_array.shape[-1] == 1:
        img_array = np.repeat(img_array, 3, axis=-1)

    # Normalize to [0, 1]
    if img_array.max() > 1.0:
        img_array = img_array / 255.0

    img_array = img_array.astype(np.float32)

    # ImageNet normalization
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    img_array = (img_array - mean) / std

    return img_array


class NumpyDataGenerator(keras.utils.Sequence):
    """Minimal .npy-based generator with on-the-fly loading.

    Each batch is built by running ``preprocess_npy`` on the "path" column
    and one-hot encoding the "label_stage1" column of the given frame.

    Args:
        dataframe: Frame with "path" and "label_stage1" columns.
        batch_size: Number of rows per batch (the last batch may be smaller).
        shuffle: Reshuffle the row order at construction and after each epoch.
        augment: Kept for API compatibility, not used here.
        **kwargs: Forwarded to the ``keras.utils.Sequence`` constructor.
    """

    def __init__(self, dataframe: pd.DataFrame, batch_size: int = 32,
                 shuffle: bool = True, augment: bool = False, **kwargs):
        # Let the Keras base class initialize its own state.
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.df = dataframe.reset_index(drop=True)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augment = augment  # kept for API compatibility, not used here
        self.n = len(self.df)
        self.on_epoch_end()

    def __len__(self) -> int:
        """Return the number of batches per epoch."""
        return int(np.ceil(self.n / self.batch_size))

    def __getitem__(self, index: int):
        """Return batch *index* as ``(X, y)``; raise IndexError when out of range."""
        n_batches = len(self)
        if index < 0:
            index += n_batches
        if not 0 <= index < n_batches:
            # Without this check an out-of-range index silently produced an
            # empty batch.
            raise IndexError(f"batch index out of range (have {n_batches} batches)")

        start_idx = index * self.batch_size
        end_idx = min(start_idx + self.batch_size, self.n)
        batch_df = self.df.iloc[self.indices[start_idx:end_idx]]

        # np.stack fails loudly if the images in a batch differ in shape.
        X = np.stack([preprocess_npy(path) for path in batch_df["path"]])

        y = keras.utils.to_categorical(
            batch_df["label_stage1"].values,
            num_classes=NUM_CLASSES,
        )
        return X, y

    def on_epoch_end(self):
        """Rebuild the row order, shuffling it when ``shuffle`` is set."""
        self.indices = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(self.indices)

    def reset(self):
        """Rebuild the row order (same effect as ``on_epoch_end``)."""
        self.on_epoch_end()

In [None]:
# Model definition – EfficientNetB3 backbone with simple head


def build_model(input_shape=INPUT_SHAPE, num_classes=NUM_CLASSES):
    """Create and compile the frozen-backbone classifier.

    An EfficientNetB3 feature extractor (global-average pooled, no top) feeds
    Dropout(0.3) -> Dense(128, relu) -> Dropout(0.2) -> Dense(softmax).
    ImageNet weights are tried first; if loading them raises, the backbone
    falls back to random initialization. The model is compiled with Adam at
    ``LEARNING_RATE``, categorical cross-entropy and an accuracy metric.

    NOTE(review): the Keras docs describe EfficientNet application models as
    including their own input preprocessing and expecting pixels in
    [0, 255] – confirm that feeding mean/std-normalized patches is intended.
    """
    backbone_kwargs = dict(include_top=False, input_shape=input_shape, pooling="avg")
    try:
        backbone = EfficientNetB3(weights="imagenet", **backbone_kwargs)
    except Exception as err:
        print(f"Could not load ImageNet weights, using random init: {err}")
        backbone = EfficientNetB3(weights=None, **backbone_kwargs)

    # Simple transfer learning: the backbone's weights stay fixed.
    backbone.trainable = False

    image_in = layers.Input(shape=input_shape)
    # training=False keeps the backbone in inference mode inside this model.
    features = backbone(image_in, training=False)
    hidden = layers.Dropout(0.3)(features)
    hidden = layers.Dense(128, activation="relu")(hidden)
    hidden = layers.Dropout(0.2)(hidden)
    class_probs = layers.Dense(num_classes, activation="softmax")(hidden)

    net = keras.Model(image_in, class_probs, name="EfficientNetB3_simple")
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return net


# Build one model up front just to print its architecture.
model = build_model()
model.summary()

In [None]:
# Evaluation helpers – confusion matrix + medical-style metrics


def evaluate_predictions(y_true, y_pred_proba, threshold=THRESHOLD, class_names=CLASS_NAMES):
    """Print a binary evaluation report and return the metrics as a dict.

    Args:
        y_true: Ground-truth labels, 0 (negative) or 1 (positive).
        y_pred_proba: Either a 1-D array of positive-class probabilities, an
            (N, 1) array of the same, or an (N, C) array of per-class
            probabilities from which column ``POSITIVE_CLASS_INDEX`` is used.
        threshold: A sample is predicted positive when its positive-class
            probability is >= this value.
        class_names: Names shown in the report header.

    Returns:
        Dict with sensitivity, specificity, balanced_accuracy, accuracy,
        precision, f1_score, auc (NaN when undefined) and the tp/tn/fp/fn
        counts.

    Raises:
        ValueError: If *y_true* contains labels other than 0 and 1.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    # Positive-class probability, whichever layout was passed in.
    if y_pred_proba.ndim > 1 and y_pred_proba.shape[1] == 1:
        y_pred_pos = y_pred_proba[:, 0]
    elif y_pred_proba.ndim > 1:
        y_pred_pos = y_pred_proba[:, POSITIVE_CLASS_INDEX]
    else:
        y_pred_pos = y_pred_proba

    # Apply the threshold for every input layout; taking the argmax of 2-D
    # probabilities would silently ignore `threshold`.
    y_pred_cls = (y_pred_pos >= threshold).astype(int)

    unexpected = np.setdiff1d(np.unique(y_true), [0, 1])
    if unexpected.size:
        raise ValueError(
            f"Expected binary labels in {{0, 1}}, got extra values {unexpected.tolist()}"
        )

    # Fixing the label set keeps the matrix 2x2 even when a fold holds (or the
    # model predicts) a single class.
    cm = confusion_matrix(y_true, y_pred_cls, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    total = tp + tn + fp + fn

    sensitivity = float(tp / (tp + fn)) if (tp + fn) > 0 else 0.0
    specificity = float(tn / (tn + fp)) if (tn + fp) > 0 else 0.0
    precision = float(tp / (tp + fp)) if (tp + fp) > 0 else 0.0
    accuracy = float((tp + tn) / total) if total > 0 else 0.0
    balanced_acc = 0.5 * (sensitivity + specificity)
    f1 = (
        2 * (precision * sensitivity) / (precision + sensitivity)
        if (precision + sensitivity) > 0
        else 0.0
    )

    try:
        auc = float(roc_auc_score(y_true, y_pred_pos))
    except ValueError:
        # AUC is undefined, e.g. when only one class is present in y_true.
        auc = np.nan

    print("\n" + "=" * 70)
    print("EVALUATION REPORT")
    print("=" * 70)
    print(f"Classes: {class_names}")
    print("\nConfusion matrix:")
    print(cm)
    print("\nPrimary metrics (medical):")
    print(f"Sensitivity (recall / TPR): {sensitivity:.4f}")
    print(f"Specificity (TNR)        : {specificity:.4f}")
    print(f"Balanced accuracy        : {balanced_acc:.4f}")
    print("\nAdditional metrics:")
    print(f"Accuracy                 : {accuracy:.4f}")
    print(f"Precision (PPV)          : {precision:.4f}")
    print(f"F1-score                 : {f1:.4f}")
    print(f"AUC-ROC                  : {auc:.4f}" if not np.isnan(auc) else "AUC-ROC: N/A")

    return {
        "sensitivity": sensitivity,
        "specificity": specificity,
        "balanced_accuracy": balanced_acc,
        "accuracy": accuracy,
        "precision": precision,
        "f1_score": f1,
        "auc": auc,
        "tp": int(tp),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
    }

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Main training loop over folds
#
# For every fold 1..N_FOLDS: load the fold's train/val patches, derive class
# weights, train a freshly built model with checkpointing / early stopping /
# CSV logging, evaluate on the validation patches, and append the metrics to
# `all_results` (also written to OUTPUT_DIR/all_results.csv after each fold).

all_results = []

for fold_num in range(1, N_FOLDS + 1):
    print("\n" + "=" * 70)
    print(f"FOLD {fold_num}/{N_FOLDS}")
    print("=" * 70)

    # Per-fold artefacts (checkpoint, history) go to OUTPUT_DIR/fold<k>/
    fold_output_dir = OUTPUT_DIR / f"fold{fold_num}"
    fold_output_dir.mkdir(parents=True, exist_ok=True)

    # Prepare data
    train_df, val_df = load_fold_data_simple(fold_num)

    # Class weights for imbalance ("balanced" weighting over the labels that
    # actually occur in this fold's training patches)
    y_train = train_df["label_stage1"].values
    classes = np.unique(y_train)
    class_weights_arr = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    # Plain int -> float dict, as passed to model.fit(class_weight=...)
    class_weights = {int(c): float(w) for c, w in zip(classes, class_weights_arr)}
    print("Class weights:", class_weights)

    # Validation data is NOT shuffled so that predictions come back in the
    # same row order as val_df (relied on when building y_true below).
    train_gen = NumpyDataGenerator(train_df, batch_size=BATCH_SIZE, shuffle=True)
    val_gen = NumpyDataGenerator(val_df, batch_size=BATCH_SIZE, shuffle=False)

    # Build a fresh model for this fold
    tf.keras.backend.clear_session()
    model = build_model()

    callbacks = [
        # Keep only the model with the highest validation accuracy on disk
        keras.callbacks.ModelCheckpoint(
            filepath=str(fold_output_dir / "best_model.h5"),
            monitor="val_accuracy",
            mode="max",
            save_best_only=True,
            verbose=1,
        ),
        # Stop after 5 epochs without val_accuracy improvement.
        # NOTE(review): whether the best weights are also restored when
        # training runs through all EPOCHS without stopping early appears to
        # depend on the Keras version – confirm for the version in use.
        keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=5,
            restore_best_weights=True,
            verbose=1,
        ),
        # Per-epoch metrics log for this fold
        keras.callbacks.CSVLogger(str(fold_output_dir / "history.csv")),
    ]

    start_time = datetime.now()
    history = model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=EPOCHS,
        class_weight=class_weights,
        callbacks=callbacks,
        verbose=1,
    )
    elapsed = (datetime.now() - start_time).total_seconds()
    print(f"Training time: {elapsed:.0f}s")

    # Evaluation
    # reset() rebuilds the generator's row order; with shuffle=False this is
    # the original val_df order.
    val_gen.reset()
    y_pred_proba = model.predict(val_gen, verbose=0)
    y_true = val_df["label_stage1"].values

    # Prints the evaluation report and returns the metrics as a dict
    metrics_dict = evaluate_predictions(
        y_true=y_true,
        y_pred_proba=y_pred_proba,
        threshold=THRESHOLD,
        class_names=CLASS_NAMES,
    )

    # Bookkeeping columns for the summary table
    metrics_dict["fold"] = fold_num
    metrics_dict["train_samples"] = int(len(train_df))
    metrics_dict["val_samples"] = int(len(val_df))

    all_results.append(metrics_dict)

    # Save incremental results (the CSV is rewritten with all folds so far,
    # so finished folds survive an interrupted run)
    pd.DataFrame(all_results).to_csv(OUTPUT_DIR / "all_results.csv", index=False)

# Summary across folds (one row per fold; shown as the cell output)
results_df = pd.DataFrame(all_results)
results_df