### _Setup_

In [None]:
# Reset memory
# IPython magic: clear the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [None]:
# Install correct package versions
# TensorFlow is installed first with its CUDA extras; NumPy and pandas are then
# removed and reinstalled so that NumPy stays below 2.0.
# NOTE(review): presumably the NumPy pin is for compatibility with the installed
# TensorFlow build — confirm.
!pip install "tensorflow[and-cuda]"
!pip uninstall numpy pandas -y
!pip install "numpy<2.0" pandas --upgrade --no-cache-dir

In [None]:
# Packages
from typing import Union, List, Tuple, Dict, Any
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, train_test_split, StratifiedShuffleSplit
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, initializers, optimizers, callbacks
import optuna
import matplotlib.pyplot as plt

In [None]:
# GPU check
# Print the physical GPU devices TensorFlow can see (an empty list means none were found).
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

In [None]:
# Data
# Load the full dataset. The functions below read the columns "Variety",
# "Scan Date Year" and "Brix (Position)", and treat every column from the first
# float-parsable column name onward as spectral features.
df = pd.read_csv('data.csv')

### _Functions_

In [None]:
def find_col_index_of_spectra(
    df: pd.DataFrame
) -> int:
    """
    Locate the position of the first spectral column.

    A column counts as spectral when its label can be parsed by float()
    (e.g., "730.5", "731.0").

    Parameters:
        df : Input DataFrame

    Returns:
        Zero-based position of the first float-parsable column label,
        or -1 when no label qualifies.
    """
    def _parses_as_float(label) -> bool:
        # Non-numeric text raises ValueError; unsupported label types raise TypeError
        try:
            float(label)
        except (ValueError, TypeError):
            return False
        return True

    # Scan the labels left to right and stop at the first hit
    return next(
        (
            position
            for position, label in enumerate(df.columns)
            if _parses_as_float(label)
        ),
        -1
    )

def split_train_test(
    df: pd.DataFrame,
    test_variety: str,
    test_season: int,
    variety_test_year: int = 2024
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into one training set and two test sets:

    - Variety test set: Variety == test_variety AND Year == variety_test_year
    - Season test set : Year == test_season

    The training set drops every row of the test variety (in any year) and
    every row of the test season, so it never overlaps with either test set.
    The season test set only includes varieties that are present in the training set.

    Parameters:
        df                : Full pandas DataFrame with "Variety" and "Scan Date Year" columns
        test_variety      : Variety used for the variety test set
        test_season       : Year used for the season test set
        variety_test_year : Year from which the variety test rows are taken
                            (defaults to 2024, the value that used to be hard-coded)

    Returns:
        df_train        : Training set
        df_test_variety : Test set for the specified variety in variety_test_year
        df_test_season  : Test set for the specified season (filtered by train varieties)
    """

    # Select test set for the specified variety in the requested year
    df_test_variety = df[
        (df["Variety"] == test_variety) &
        (df["Scan Date Year"] == variety_test_year)
    ]

    # Select test set for the specified season (regardless of variety)
    df_test_season = df[
        df["Scan Date Year"] == test_season
    ]

    # Select training set (exclude test variety and test season)
    df_train = df[
        (df["Variety"] != test_variety) &
        (df["Scan Date Year"] != test_season)
    ]

    # Keep only season-test varieties the model can have seen during training;
    # this also removes the held-out test variety from the season test set
    train_varieties = df_train["Variety"].unique()
    df_test_season = df_test_season[
        df_test_season["Variety"].isin(train_varieties)
    ]

    return df_train, df_test_variety, df_test_season

def take_subset(
    df: pd.DataFrame, 
    n_subset: int,
    random_state: int
) -> pd.DataFrame:
    """
    Draw a Brix-stratified sample of n_subset rows from the DataFrame.

    Stratification uses up to 10 quantile bins of 'Brix (Position)'
    (duplicate bin edges are dropped). When n_subset is at least the number
    of rows, a copy of the whole DataFrame is returned instead.

    Parameters:
        df           : Input DataFrame with 'Brix (Position)' column
        n_subset     : Desired subset size
        random_state : Random seed for reproducibility

    Returns:
        Stratified subset with a fresh 0..n-1 index, or an unchanged copy of df
        (original index kept) when no subsampling is needed
    """
    # Nothing to sample: hand back an independent copy of everything
    wants_everything = n_subset >= len(df)
    if wants_everything:
        return df.copy()

    # Quantile bin labels of the target serve as the stratification classes
    brix_bins = pd.qcut(
        df["Brix (Position)"],
        q=10,
        labels=False,
        duplicates='drop'
    )

    # A single stratified shuffle whose "train" side is the requested subset
    sampler = StratifiedShuffleSplit(
        n_splits=1,
        train_size=n_subset,
        random_state=random_state
    )
    kept_positions, _discarded = next(sampler.split(df, brix_bins))

    # Positional selection, then renumber the rows
    subset = df.iloc[kept_positions]
    return subset.reset_index(drop=True)

def create_train_val_split(
    df: pd.DataFrame,
    validation_size: float,
    random_state: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Produce a stratified train/validation partition of a DataFrame.

    Rows are stratified by up to 10 quantile bins of the 'Brix (Position)'
    column (duplicate bin edges are dropped).

    Parameters:
        df              : Input DataFrame
        validation_size : Proportion of validation samples (0 < float < 1)
        random_state    : Seed for reproducibility

    Returns:
        df_train, df_val : Stratified training and validation DataFrames,
                           each with a fresh 0..n-1 index
    """
    # Quantile bin labels of the target act as the stratification classes
    brix_bins = pd.qcut(
        df["Brix (Position)"],
        q=10,
        labels=False,
        duplicates="drop"
    )

    # train_test_split returns the pair [train, validation]
    partitions = train_test_split(
        df,
        test_size=validation_size,
        random_state=random_state,
        stratify=brix_bins
    )

    # Renumber the rows of both partitions
    train_part, val_part = (
        part.reset_index(drop=True) for part in partitions
    )
    return train_part, val_part

def split_x_y(
    df: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a DataFrame into X (spectral features) and y (target) arrays.

    Relies on find_col_index_of_spectra() to locate the first spectral column;
    every column from that position to the end of the DataFrame is used as a
    feature, so metadata columns must precede the spectra.

    Parameters:
        df : Input DataFrame containing both metadata and spectral data.

    Returns:
        x : NumPy array of shape (n_samples, n_spectral_features)
        y : NumPy array of shape (n_samples, 1) containing Brix values

    Raises:
        ValueError : If no column name can be interpreted as a wavelength.
    """
    # Locate the first spectral column (name castable to float, e.g. a wavelength)
    first_spectral_idx = find_col_index_of_spectra(df)

    # The helper signals "not found" with -1; slicing with -1 would silently
    # select the last column as the feature matrix, so fail loudly instead
    if first_spectral_idx < 0:
        raise ValueError(
            "No spectral columns found: no column name can be converted to float."
        )

    spectra_cols = list(df.columns[first_spectral_idx:])

    # Define the target column (kept as a list so y stays two-dimensional)
    target_cols = ['Brix (Position)']

    # Extract feature and target arrays
    x = df[spectra_cols].values
    y = df[target_cols].values

    return x, y

def rmse_loss(
    y_true, 
    y_pred
):
    """
    Root Mean Squared Error (RMSE) used as the training loss.

    The mean is taken over all elements of the inputs.

    Parameters:
        y_true : Tensor of true target values
        y_pred : Tensor of predicted values

    Returns:
        RMSE as a scalar Tensor
    """
    residual = y_pred - y_true
    mean_squared_residual = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared_residual)

def rmse_metric(
    y_true, 
    y_pred
):
    """
    Root Mean Squared Error (RMSE) reported as a performance metric.

    Uses the same formula as the RMSE loss: the square root of the mean
    squared difference over all elements.

    Parameters:
        y_true : Tensor of true target values
        y_pred : Tensor of predicted values

    Returns:
        RMSE as a scalar Tensor
    """
    squared_error = tf.square(y_pred - y_true)
    return tf.sqrt(tf.reduce_mean(squared_error))

def cnn_model(
    input_shape: int,
    kernel_size: int,
    dropout_rate: float,
    l2_strength: float,
    learning_rate: float,
    random_state: int
) -> tf.keras.Model:
    """
    Assemble and compile the 1D CNN regressor.

    Layer stack: Input -> Reshape -> Conv1D (1 filter, ELU) -> Dropout ->
    Flatten -> Dense(36) -> Dropout -> Dense(18) -> Dropout -> Dense(12) ->
    Dense(1, linear).

    Parameters:
        input_shape    : Number of input features (spectral length).
        kernel_size    : Size of the 1D convolutional kernel.
        dropout_rate   : Dropout rate applied after the convolution and after
                         the first two dense layers.
        l2_strength    : L2 regularization strength for kernel weights.
        learning_rate  : Learning rate for the Adam optimizer.
        random_state   : Seed given to the HeNormal weight initializer.

    Returns:
        model : Compiled Keras model ready for training.
    """
    # A single regularizer and a single seeded initializer are shared by all weighted layers
    weight_penalty = regularizers.l2(l2_strength)
    weight_init = initializers.HeNormal(seed=random_state)
    shared_kernel_args = dict(
        kernel_initializer=weight_init,
        kernel_regularizer=weight_penalty
    )

    def _dense(units: int, activation: str = "elu"):
        # Fully connected layer carrying the shared initializer and regularizer
        return layers.Dense(units, activation=activation, **shared_kernel_args)

    # Layers are listed in forward order
    stack = [
        tf.keras.Input(shape=(input_shape,)),
        layers.Reshape((input_shape, 1)),      # (features,) -> (timesteps, 1) for Conv1D
        layers.Conv1D(
            filters=1,
            kernel_size=kernel_size,
            padding="same",
            activation="elu",
            **shared_kernel_args
        ),
        layers.Dropout(dropout_rate),
        layers.Flatten(),
        _dense(36),
        layers.Dropout(dropout_rate),
        _dense(18),
        layers.Dropout(dropout_rate),
        _dense(12),
        _dense(1, activation="linear"),        # Regression output
    ]
    network = models.Sequential(stack)

    # RMSE serves both as the optimized loss and as the reported metric
    network.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss=rmse_loss,
        metrics=[rmse_metric]
    )

    return network

def perform_optuna_hyperparameter_optimization(
    x_train_data: np.ndarray,
    y_train_data: np.ndarray,
    x_val_data: np.ndarray,
    y_val_data: np.ndarray,
    batch_size_range: List[int],
    l2_strength_range: Tuple[float, float],
    kernel_size_range: Tuple[int, int],
    dropout_rate_range: Tuple[float, float],
    random_state: int,
    patience_callback_reduce_lr: int,
    patience_callback_early_stopping: int,
    epochs: int,
    min_lr: float,
    timeout: int
) -> Tuple[optuna.study.Study, float, Dict[str, Any]]:
    """
    Optimize CNN hyperparameters using a fixed validation set with Optuna.

    Each trial builds a fresh model, trains it with learning-rate reduction and
    early stopping on the validation loss, and is scored by the validation RMSE
    of the restored best weights. The learning rate is not searched; it is
    derived from the batch size as 0.01 * (batch_size / 256).

    Parameters:
        x_train_data, y_train_data : Arrays of training features and targets
        x_val_data, y_val_data     : Arrays of validation features and targets
        batch_size_range           : List of batch sizes to try
        l2_strength_range          : Tuple defining (min, max) for L2 regularization
                                     (sampled uniformly on a linear scale)
        kernel_size_range          : Tuple defining (min, max) for Conv1D kernel size
                                     (sampled in steps of 2 starting at min)
        dropout_rate_range         : Tuple defining (min, max) for dropout rate
        random_state               : Random seed for the weight initializer and
                                     for the Optuna sampler
        patience_callback_*        : Patience for LR scheduler and early stopping
        epochs                     : Max number of training epochs
        min_lr                     : Minimum learning rate for LR scheduler
        timeout                    : Max Optuna runtime (seconds)

    Returns:
        study       : Optuna Study object
        best_value  : Best validation RMSE found
        best_params : Best-performing hyperparameter set
    """

    def objective(trial):
        # Drop Keras state left behind by earlier trials so memory does not
        # pile up over a long-running study
        tf.keras.backend.clear_session()

        # Suggest hyperparameters
        batch_size   = trial.suggest_categorical("batch_size", batch_size_range)
        l2_strength  = trial.suggest_float("l2_strength", *l2_strength_range)
        kernel_size  = trial.suggest_int("kernel_size", *kernel_size_range, step=2)
        dropout_rate = trial.suggest_float("dropout_rate", *dropout_rate_range)

        # Adjust learning rate proportionally to batch size
        learning_rate = 0.01 * (batch_size / 256)

        # Print trial configuration
        print(f"\n[Trial {trial.number}] Testing hyperparameters:")
        print(f"  batch_size   = {batch_size}")
        print(f"  l2_strength  = {l2_strength:.2e}")
        print(f"  kernel_size  = {kernel_size}")
        print(f"  dropout_rate = {dropout_rate:.2f}")

        # Build CNN model with trial parameters
        model = cnn_model(
            input_shape=x_train_data.shape[1],
            kernel_size=kernel_size,
            dropout_rate=dropout_rate,
            l2_strength=l2_strength,
            learning_rate=learning_rate,
            random_state=random_state
        )

        # Define training callbacks (both watch the validation loss)
        cb = [
            callbacks.ReduceLROnPlateau(
                monitor="val_loss",
                factor=0.5,
                patience=patience_callback_reduce_lr,
                min_lr=min_lr,
                verbose=0
            ),
            callbacks.EarlyStopping(
                monitor="val_loss",
                patience=patience_callback_early_stopping,
                restore_best_weights=True,
                verbose=0
            )
        ]

        # Train the model
        model.fit(
            x_train_data, y_train_data,
            validation_data=(x_val_data, y_val_data),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=cb,
            verbose=1
        )

        # Evaluate model on validation set (weights restored to the best epoch)
        preds = model.predict(x_val_data, batch_size=batch_size, verbose=0)
        rmse = np.sqrt(mean_squared_error(y_val_data, preds))

        print(f"  → Val RMSE: {rmse:.4f}")
        return float(rmse)

    # Seed the sampler so the sequence of suggested hyperparameters is
    # repeatable for a given random_state
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=random_state)
    )

    # Run garbage collection after every trial to release finished models
    study.optimize(objective, timeout=timeout, gc_after_trial=True)

    return study, float(study.best_value), study.best_params

def train_cnn_model(
    x_train_data: np.ndarray,
    y_train_data: np.ndarray,
    best_params: Dict[str, Any],
    random_state: int,
    patience_callback_reduce_lr: int,
    patience_callback_early_stopping: int,
    epochs: int,
    min_lr: float
) -> tf.keras.Model:
    """
    Fit the final CNN on the supplied training data with tuned hyperparameters.

    No validation data is used here, so both callbacks watch the training loss.

    Parameters:
        x_train_data : Training features
        y_train_data : Training targets
        best_params  : Dictionary with the keys "batch_size", "l2_strength",
                       "kernel_size" and "dropout_rate"
        random_state : Seed for reproducibility
        patience_callback_reduce_lr     : Patience for learning rate reduction
        patience_callback_early_stopping: Patience for early stopping
        epochs       : Maximum number of training epochs
        min_lr       : Minimum learning rate allowed by the scheduler

    Returns:
        model : Trained Keras CNN model
    """

    # === Pull the tuned values out of the dictionary ===
    batch_size, l2_strength, kernel_size, dropout_rate = (
        best_params[key]
        for key in ("batch_size", "l2_strength", "kernel_size", "dropout_rate")
    )

    # === Build the network; learning rate follows the batch size (same formula used in Optuna tuning) ===
    network = cnn_model(
        input_shape=x_train_data.shape[1],
        kernel_size=kernel_size,
        dropout_rate=dropout_rate,
        l2_strength=l2_strength,
        learning_rate=0.01 * (batch_size / 256),
        random_state=random_state
    )

    # === Callbacks driven by the training loss ===
    # Halve the learning rate when the loss plateaus, down to min_lr
    lr_scheduler = callbacks.ReduceLROnPlateau(
        monitor="loss",
        factor=0.5,
        patience=patience_callback_reduce_lr,
        min_lr=min_lr,
        verbose=0
    )
    # Stop once the loss no longer improves and roll back to the best weights
    early_stopper = callbacks.EarlyStopping(
        monitor="loss",
        patience=patience_callback_early_stopping,
        restore_best_weights=True,
        verbose=0
    )

    # === Fit ===
    network.fit(
        x_train_data,
        y_train_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[lr_scheduler, early_stopper],
        verbose=1
    )

    return network

def test_cnn_model(
    model: tf.keras.Model,
    x_test_data: np.ndarray,
    y_test_data: np.ndarray,
    batch_size: int
) -> Tuple[float, float, float, pd.DataFrame]:
    """
    Score a trained CNN on a hold-out test set, print the metrics and draw a parity plot.

    Parameters:
        model       : Trained Keras model
        x_test_data : Test feature matrix
        y_test_data : Test target vector
        batch_size  : Batch size for prediction

    Returns:
        test_rmsep              : Root mean squared error of prediction
        test_r2                 : Coefficient of determination (R² score)
        test_practical_accuracy : Percentage of predictions within ±20% of actual value
        df_results              : DataFrame with columns ['predicted', 'observed']
    """

    # === Predict and flatten both sides to 1-D ===
    predicted = model.predict(x_test_data, batch_size=batch_size, verbose=0).flatten()
    observed = y_test_data.flatten()

    # === Metrics ===
    test_rmsep = float(np.sqrt(mean_squared_error(observed, predicted)))
    test_r2 = float(r2_score(observed, predicted))
    # Relative absolute error per sample; a sample "passes" when it is at most 20%
    relative_error = np.abs(predicted - observed) / np.abs(observed)
    within_tolerance = relative_error <= 0.2
    test_practical_accuracy = float(within_tolerance.mean() * 100.0)

    # === Report ===
    print(f"Test RMSEP: {test_rmsep:.4f}")
    print(f"Test R²: {test_r2:.4f}")
    print(f"Practical accuracy (±20%): {test_practical_accuracy:.1f}%")

    # === Parity plot: points on the dashed diagonal are perfect predictions ===
    diagonal = [observed.min(), observed.max()]
    plt.figure(figsize=(8, 6))
    plt.scatter(observed, predicted, alpha=0.7, label="Test Data")
    plt.plot(diagonal, diagonal, "k--", lw=2, label="Ideal")
    plt.xlabel("Observed")
    plt.ylabel("Predicted")
    plt.title("Observed vs. Predicted on Test Set (CNN)")
    plt.legend()
    plt.grid(True)
    plt.show()

    # === Per-sample results table ===
    df_results = pd.DataFrame({
        "predicted": predicted,
        "observed": observed
    })

    return test_rmsep, test_r2, test_practical_accuracy, df_results


### _Parameters_

In [None]:
# Alias of the DataFrame loaded in the Data cell (the Run cell passes `df` directly)
DF              = df
# Held-out variety and held-out season handed to split_train_test()
TEST_VARIETY    = "TestVariety"
TEST_SEASON     = 2025

# Seed passed to the subsetting, the train/validation split, the model builders
# and the inference-time sampling
RANDOM_STATE    = 27
# Number of training rows kept for hyperparameter tuning; take_subset() returns
# all rows when fewer are available
N_SUBSET        = 22892
# Fraction of the subset held out as the validation set
VALIDATION_SIZE = 0.1

# Hyperparameter search space
# Batch sizes are categorical choices; the learning rate is derived from the
# chosen batch size as 0.01 * (batch_size / 256)
BATCH_SIZE_RANGE              = [32, 64, 128, 256, 512, 1024]
# (min, max) bounds for the L2 strength
L2_STRENGTH_RANGE             = (1e-6, 1e-2)
# (min, max) bounds for the Conv1D kernel size; sampled in steps of 2 from the minimum
KERNEL_SIZE_RANGE             = (3, 1025)
# (min, max) bounds for the dropout rate
DROPOUT_RATE_RANGE            = (0.01, 0.4)
# Epochs without improvement before the learning rate is halved / training is stopped
PATIENCE_CALLBACK_REDUCE_LR   = 25
PATIENCE_CALLBACK_EARLY_STOP  = 50

# Maximum epochs per Optuna trial
TRAIN_EPOCHS                  = 500
# Maximum epochs for the final model fit (the model that is evaluated on the test sets)
TEST_EPOCHS                   = 1000
# Lower bound for the learning rate scheduler
MIN_LR                        = 1e-6
# Optuna time budget in seconds (72 hours)
TIMEOUT                       = 60 * 60 * 72

### _Run_

In [None]:
# === Split into train and test sets ===
df_train_all, df_test_variety, df_test_season = split_train_test(
    df,
    test_variety=TEST_VARIETY,
    test_season=TEST_SEASON,
)

# === Take subset ===
# The (possibly smaller) stratified subset is only used for hyperparameter tuning
df_subset = take_subset(
    df_train_all, 
    n_subset=N_SUBSET, 
    random_state=RANDOM_STATE
)

# === Make train/validation split ===
df_train, df_val = create_train_val_split(
    df=df_subset,
    validation_size=VALIDATION_SIZE,
    random_state=RANDOM_STATE
)

# === Convert to x and y arrays ===
# x_train_all / y_train_all hold every training row (not just the subset) and
# are used for the final model fit below
x_train_all, y_train_all = split_x_y(
    df_train_all,
)
x_train, y_train = split_x_y(
    df_train,
)
x_val, y_val = split_x_y(
    df_val,
)
x_test_variety, y_test_variety = split_x_y(
    df_test_variety,
)
x_test_season, y_test_season = split_x_y(
    df_test_season,
)

print("begin optuna hyperparameter optimization")
# === CNN hyperparameter tuning ===
study, best_rmse, best_params = perform_optuna_hyperparameter_optimization(
    x_train_data=x_train,
    y_train_data=y_train,
    x_val_data=x_val,
    y_val_data=y_val,
    batch_size_range=BATCH_SIZE_RANGE,
    l2_strength_range=L2_STRENGTH_RANGE,
    kernel_size_range=KERNEL_SIZE_RANGE,
    dropout_rate_range=DROPOUT_RATE_RANGE,
    random_state=RANDOM_STATE,
    patience_callback_reduce_lr=PATIENCE_CALLBACK_REDUCE_LR,
    patience_callback_early_stopping=PATIENCE_CALLBACK_EARLY_STOP,
    epochs=TRAIN_EPOCHS,
    min_lr=MIN_LR,
    timeout=TIMEOUT
)

# NOTE(review): the label says RMSECV, but best_rmse is the RMSE on the single
# fixed validation split (no cross-validation is performed).
print(f"Best CNN RMSECV: {best_rmse:.4f}")
print("Best hyperparameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

# === Model training on optimal parameters ===
# NOTE(review): this assignment rebinds the name `cnn_model` from the model-builder
# function to the trained model. The tuning and training functions call
# `cnn_model(...)` as a builder, so re-running them in the same session after this
# line will fail until the Functions cell is executed again. Renaming the variable
# also requires updating the inference-time cell, which reads `cnn_model`.
cnn_model = train_cnn_model(
    x_train_data=x_train_all,
    y_train_data=y_train_all,
    best_params=best_params,
    random_state=RANDOM_STATE,
    patience_callback_reduce_lr=PATIENCE_CALLBACK_REDUCE_LR,
    patience_callback_early_stopping=PATIENCE_CALLBACK_EARLY_STOP,
    epochs=TEST_EPOCHS,
    min_lr=MIN_LR
)

# === Model evaluation ===
# Held-out variety test set
rmsep_variety, r2_variety, acc_variety, results_variety = test_cnn_model(
    model=cnn_model,
    x_test_data=x_test_variety,
    y_test_data=y_test_variety,
    batch_size=best_params["batch_size"]
)

# Held-out season test set
rmsep_season, r2_season, acc_season, results_season = test_cnn_model(
    model=cnn_model,
    x_test_data=x_test_season,
    y_test_data=y_test_season,
    batch_size=best_params["batch_size"]
)

# One row per test set with the three reported metrics
summary = pd.DataFrame({
    "Test Set": ["Variety", "Season"],
    "RMSE": [rmsep_variety, rmsep_season],
    "R²": [r2_variety, r2_season],
    "% Within 20%": [acc_variety, acc_season]
})

print(summary)

### _Sensitivity Analysis_

In [None]:
def get_inference_sample_set(
    df_variety: pd.DataFrame,
    df_season: pd.DataFrame,
    random_state: int,
    sample_size: int = 1000
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Combine two test sets (variety and season), sample rows randomly, and return X and y arrays.

    Sampling is done without replacement. When the combined test sets contain
    fewer than sample_size rows, all available rows are used instead of raising.

    Parameters:
        df_variety   : DataFrame for variety-based test set
        df_season    : DataFrame for season-based test set
        random_state : Random seed for reproducibility
        sample_size  : Maximum number of rows to sample from the combined test set

    Returns:
        x_sample : NumPy array of shape (n, n_features) with spectral features
        y_sample : NumPy array of shape (n, 1) with corresponding Brix values,
                   where n = min(sample_size, number of combined rows)
    """
    # Combine the two test sets
    df_combined = pd.concat([df_variety, df_season], axis=0)

    # DataFrame.sample cannot draw more rows than exist without replacement,
    # so cap the request at the number of available rows
    n_rows = min(sample_size, len(df_combined))

    # Randomly sample rows from the combined test set
    df_sample = df_combined.sample(
        n=n_rows,
        random_state=random_state
    )

    # Split into X and y arrays
    x_sample, y_sample = split_x_y(df_sample)

    return x_sample, y_sample

def test_cnn_inference_time(
    model: tf.keras.Model,
    x_test: np.ndarray
) -> float:
    """
    Measure average one-by-one inference time of a CNN model in milliseconds.

    Each sample is passed through the model as a batch of size one and the
    result is converted to NumPy before the clock is stopped. One untimed
    warm-up call is made first so that one-off setup cost of the first call
    does not distort the average.

    Parameters:
        model   : Trained Keras model
        x_test  : Test feature matrix

    Returns:
        avg_inference_time_ms : Average inference time per sample in milliseconds
                                (NaN when x_test is empty)
    """
    times = []

    # Untimed warm-up on the first sample (skipped when there is no data)
    if len(x_test) > 0:
        _ = model(x_test[:1], training=False).numpy()

    for x in x_test:
        x_input = np.expand_dims(x, axis=0)  # shape: (1, n_features)
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
        start = time.perf_counter()
        _ = model(x_input, training=False).numpy().flatten()[0]
        end = time.perf_counter()
        times.append(end - start)

    # Avoid NumPy's empty-mean warning; report NaN when nothing was measured
    avg_inference_time_ms = float(np.mean(times) * 1000) if times else float("nan")
    print(f"Average inference time: {avg_inference_time_ms:.3f} ms/sample")

    return avg_inference_time_ms

In [None]:
# === Create sample set for inference time measurement ===
# Uses the default sample_size of get_inference_sample_set(); only the feature
# array is used below, y_inference_time is not read again in this cell
x_inference_time, y_inference_time = get_inference_sample_set(
    df_test_variety,
    df_test_season,
    random_state=RANDOM_STATE
)

# === Compute the average inference time ===
# `cnn_model` here is the trained model assigned in the Run cell, not the builder function
cnn_inference_time_ms = test_cnn_inference_time(
    cnn_model, 
    x_inference_time
)