### _Setup_

In [None]:
# Reset memory: IPython magic that clears the interactive namespace so the notebook
# starts from a clean state (-f skips the confirmation prompt)
%reset -f

In [None]:
# Packages
from typing  import Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from typing import Union, List
import time
from kneed import KneeLocator
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load Data
# Reads 'data.csv' relative to the current working directory. Later cells expect the
# columns "Variety", "Scan Date Year" and "Brix (Position)", followed by spectral
# columns whose names can be parsed as floats.
df = pd.read_csv('data.csv')

### _Functions_

In [None]:
def find_col_index_of_spectra(
    df: pd.DataFrame
) -> int:
    """
    Locate the first column whose label parses as a number.

    Spectral columns are expected to be named by wavelength (e.g., "730.5", "731.0"),
    so the first label accepted by float() marks where the spectra begin.

    Parameters:
        df : DataFrame whose column labels are scanned from left to right

    Returns:
        Zero-based position of the first numeric-looking column label,
        or -1 when no label can be converted to float.
    """
    position = 0
    for label in df.columns:
        try:
            float(label)
        except (ValueError, TypeError):
            # Not a wavelength-style label: move on to the next column
            position += 1
        else:
            return position
    return -1

def split_train_test(
    df: pd.DataFrame,
    test_variety: str,
    test_season: int,
    variety_test_year: int = 2024
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into one training set and two test sets:

    - Variety test set: Variety == test_variety AND Year == variety_test_year
    - Season test set : Year == test_season, restricted to varieties seen in training

    The training set drops every row of the test variety (in any year) and every
    row scanned in the test season, so it never overlaps with either test set.

    Parameters:
        df                : Full pandas DataFrame with "Variety" and "Scan Date Year" columns
        test_variety      : Variety held out for the variety test set
        test_season       : Year held out for the season test set
        variety_test_year : Year from which the variety test rows are taken (default: 2024)

    Returns:
        df_train        : Training set
        df_test_variety : Rows of test_variety scanned in variety_test_year
        df_test_season  : Rows scanned in test_season whose variety is present in df_train
    """
    is_test_variety = df["Variety"] == test_variety
    is_test_season = df["Scan Date Year"] == test_season

    # Held-out variety, limited to the requested scan year
    df_test_variety = df[
        is_test_variety &
        (df["Scan Date Year"] == variety_test_year)
    ]

    # Training rows: neither the held-out variety nor the held-out season
    df_train = df[~is_test_variety & ~is_test_season]

    # Held-out season, keeping only varieties that appear in the training set
    # (this also removes the test variety from the season test set)
    train_varieties = df_train["Variety"].unique()
    df_test_season = df[
        is_test_season &
        df["Variety"].isin(train_varieties)
    ]

    return df_train, df_test_variety, df_test_season

def split_x_y(
    df: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a DataFrame into x (spectra features) and y (target) arrays.
    Assumes find_col_index_of_spectra() is defined globally.

    Every column from the first spectral column to the last column is treated
    as a feature; the target is the 'Brix (Position)' column.

    Parameters:
        df : DataFrame containing the target column and the spectral columns.

    Returns:
        x : NumPy array of spectral features, shape (n_rows, n_spectral_columns).
        y : NumPy array of targets, shape (n_rows, 1).

    Raises:
        ValueError: If no column name can be interpreted as a wavelength.
    """
    first_spectral_idx = find_col_index_of_spectra(df)
    if first_spectral_idx < 0:
        # Without this guard the -1 sentinel would be used as a slice start and
        # silently select the last column as the only "spectral" feature.
        raise ValueError(
            "No spectral columns found: no column name could be converted to float."
        )

    spectra_cols = list(df.columns[first_spectral_idx:])
    target_cols = ['Brix (Position)']

    x = df[spectra_cols].values
    y = df[target_cols].values

    return x, y

def stratified_cv_splits(
    x_train,
    y_train,
    n_splits,
    random_state,
    n_bins=10
):
    """
    Yield stratified K-Fold index pairs for a regression target.

    The continuous target is discretised into quantile bins, and those bin labels
    are used as the stratification classes for StratifiedKFold.

    Parameters:
        x_train      : Feature matrix (NumPy array or DataFrame)
        y_train      : Target values (NumPy array or Series); flattened before binning
        n_splits     : Number of folds for StratifiedKFold
        random_state : Random seed for the shuffled fold assignment
        n_bins       : Number of quantile bins requested (default: 10); duplicate
                       bin edges are dropped

    Returns:
        Generator of (train_idx, val_idx) tuples
    """
    # Quantile-bin the flattened target to obtain pseudo-class labels
    flat_targets = np.ravel(y_train)
    bin_labels = pd.qcut(flat_targets, q=n_bins, labels=False, duplicates='drop')

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    yield from splitter.split(x_train, bin_labels)

def train_cross_validation(
    x_train_data: np.ndarray,
    y_train_data: np.ndarray,
    max_components: int,
    n_splits: int,
    random_state: int,
) -> Tuple[list[float], float, int]:
    """
    Perform stratified K-fold cross-validation for PLS regression using different numbers
    of components, and return the RMSECV list, the RMSECV at the selected component
    count, and that component count.

    The fold indices are generated once and reused for every component count, so all
    candidates are scored on exactly the same splits.

    Side effects: prints the RMSECV for each component count and shows a plot of
    RMSECV against the number of components.

    Parameters:
        x_train_data  : Feature array for training (NumPy array)
        y_train_data  : Target array for training (NumPy array)
        max_components: Maximum number of PLS components to evaluate
        n_splits      : Number of stratified folds in cross-validation
        random_state  : Seed for reproducibility

    Returns:
        cv_rmsecv     : List of RMSECV scores per component count (index 0 = 1 component)
        cv_opt_rmsecv : RMSECV at the selected number of components
        cv_opt_A      : Selected number of components as a plain int (knee point of the
                        RMSECV curve, or the minimum-RMSECV count if no knee is found)
    """
    component_counts = list(range(1, max_components + 1))

    # The splits do not depend on the number of components: compute them once
    folds = list(
        stratified_cv_splits(x_train_data, y_train_data, n_splits, random_state)
    )

    cv_rmsecv = []
    for n_comp in component_counts:
        mse_folds = []

        for train_idx, val_idx in folds:
            # Train PLS on the training part of the fold
            pls = PLSRegression(n_components=n_comp)
            pls.fit(x_train_data[train_idx], y_train_data[train_idx])

            # Score on the held-out part of the fold
            y_pred_val = pls.predict(x_train_data[val_idx])
            mse_folds.append(mean_squared_error(y_train_data[val_idx], y_pred_val))

        # RMSECV = square root of the mean fold MSE
        rmsecv = float(np.sqrt(np.mean(mse_folds)))
        cv_rmsecv.append(rmsecv)
        print(f"Components: {n_comp}, RMSECV: {rmsecv:.4f}")

    # === Plot RMSECV curve ===
    plt.figure(figsize=(8, 6))
    plt.plot(component_counts, cv_rmsecv, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('RMSECV')
    plt.title('RMSECV vs Number of Components')
    plt.grid(True)
    plt.show()

    # === Find optimal number of components using knee/elbow method ===
    knee_locator = KneeLocator(
        component_counts, cv_rmsecv, curve='convex', direction='decreasing'
    )
    cv_opt_A = knee_locator.knee

    # Fallback: use component with lowest RMSECV if knee not found
    if cv_opt_A is None:
        print("Knee not found automatically. Using minimum RMSECV as fallback.")
        cv_opt_A = np.argmin(cv_rmsecv) + 1

    # Normalise NumPy integer types to the plain int promised by the signature
    cv_opt_A = int(cv_opt_A)

    # RMSECV value at the selected number of components (list is 0-indexed)
    cv_opt_rmsecv = cv_rmsecv[cv_opt_A - 1]

    return (
        cv_rmsecv,
        cv_opt_rmsecv,
        cv_opt_A
    )

def test_pls_model(
    x_train_data: np.ndarray,
    y_train_data: np.ndarray,
    x_test_data: np.ndarray,
    y_test_data: np.ndarray,
    opt_A: int,
    tolerance: float = 0.2,
) -> Tuple[
    PLSRegression,      # trained model
    pd.DataFrame,       # test_pls_results
    float,              # test_rmsep
    float,              # test_r2
    float               # test_practical_accuracy
]:
    """
    Train a final PLS regression model on the training set using the specified number
    of latent variables, then evaluate its performance on the test set.

    Side effects: prints the test metrics and shows an observed-vs-predicted scatter plot.

    Parameters:
        x_train_data : NumPy array of training features.
        y_train_data : NumPy array of training target values.
        x_test_data  : NumPy array of test features.
        y_test_data  : NumPy array of test target values.
        opt_A        : Number of latent variables (PLS components) to use.
        tolerance    : Relative error counted as "practically accurate" (default: 0.2 = ±20%).

    Returns:
        pls_model                 : The trained PLS model.
        test_pls_results          : DataFrame with columns ["observed", "predicted"].
        test_rmsep                : Root Mean Squared Error of Prediction on the test set.
        test_r2                   : R² score on the test set.
        test_practical_accuracy   : % of predictions within `tolerance` (relative) of actual.
    """

    # === Train final PLS model with selected number of components ===
    pls_model = PLSRegression(n_components=opt_A)
    pls_model.fit(x_train_data, y_train_data)

    # === Predict on test set; work with 1-D observed / predicted vectors ===
    y_true = np.ravel(y_test_data)
    y_pred = np.ravel(pls_model.predict(x_test_data))

    # === Compute test metrics ===
    test_mse = mean_squared_error(y_true, y_pred)
    test_rmsep = np.sqrt(test_mse)
    test_r2 = r2_score(y_true, y_pred)

    # === Compute practical accuracy ===
    # |error| <= tolerance * |actual| is the division-free form of
    # |error| / |actual| <= tolerance, so a zero target cannot trigger a
    # divide-by-zero; such a sample counts as a hit only if predicted exactly.
    within_tolerance = np.abs(y_pred - y_true) <= tolerance * np.abs(y_true)
    test_practical_accuracy = np.mean(within_tolerance) * 100.0

    # === Compile results into DataFrame ===
    test_pls_results = pd.DataFrame({
        "observed": y_true,
        "predicted": y_pred
    })

    # === Print test summary ===
    print(f"Test RMSEP: {test_rmsep:.4f}")
    print(f"Test R²: {test_r2:.4f}")
    print(f"Practical accuracy (±{tolerance * 100:g}%): {test_practical_accuracy:.1f}%")

    # === Plot observed vs. predicted values ===
    lower, upper = y_true.min(), y_true.max()
    plt.figure(figsize=(8, 6))
    plt.scatter(y_true, y_pred, alpha=0.7, label="Test Data")
    # Dashed diagonal marks perfect agreement between observed and predicted
    plt.plot(
        [lower, upper],
        [lower, upper],
        'k--', lw=2, label="Ideal"
    )
    plt.xlabel("Observed")
    plt.ylabel("Predicted")
    plt.title("Observed vs. Predicted on Test Set")
    plt.legend()
    plt.grid(True)
    plt.show()

    # === Return model, predictions, and evaluation metrics ===
    return (
        pls_model,
        test_pls_results,
        test_rmsep,
        test_r2,
        test_practical_accuracy
    )


### _Variables_

In [None]:
# Alias of the loaded DataFrame (the run cells visible here reference `df` directly)
DF = df
# Seed passed to the stratified CV splitter and to the inference-time sampling
RANDOM_STATE = 27
# Variety withheld from training and used for the variety test set
TEST_VARIETY = "TestVariety"
# Scan year withheld from training and used for the season test set
TEST_SEASON = 2025
# Upper bound on the number of PLS components evaluated during cross-validation
MAX_COMPONENTS = 20
# Number of stratified folds used in cross-validation
N_SPLITS = 3

### _Run_

In [None]:
# === Partition the data: training rows plus the two held-out test sets ===
df_train, df_test_variety, df_test_season = split_train_test(
    df, test_variety=TEST_VARIETY, test_season=TEST_SEASON
)

# === Extract spectra (x) and Brix targets (y) for each partition ===
# NOTE: despite the "train" in their names, y_train_variety and y_train_season
# hold the targets of the two TEST sets.
x_train, y_train = split_x_y(df_train)
x_test_variety, y_train_variety = split_x_y(df_test_variety)
x_test_season, y_train_season = split_x_y(df_test_season)

# === Choose the number of PLS components by stratified k-fold CV ===
# NOTE: cv_min_rmsecv is the RMSECV at the selected component count, which is
# the knee of the curve and not necessarily its minimum.
cv_rmsecv, cv_min_rmsecv, cv_opt_A = train_cross_validation(
    x_train,
    y_train,
    max_components=MAX_COMPONENTS,
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
)

print(f"Optimal # of components: {cv_opt_A} | CV RMSECV: {cv_min_rmsecv:.4f}")

# === Fit on the training set and score the unseen-variety test set ===
pls_model_variety, results_variety, rmsep_variety, r2_variety, acc_variety = test_pls_model(
    x_train, y_train, x_test_variety, y_train_variety, opt_A=cv_opt_A
)

# === Fit on the training set and score the unseen-season test set ===
pls_model_season, results_season, rmsep_season, r2_season, acc_season = test_pls_model(
    x_train, y_train, x_test_season, y_train_season, opt_A=cv_opt_A
)

# === Side-by-side summary of both evaluations ===
print("\n--- Test Results Summary ---")
print(f"Variety Test  | RMSEP: {rmsep_variety:.4f} | R²: {r2_variety:.4f} | Acc: {acc_variety:.2f}")
print(f"Season Test   | RMSEP: {rmsep_season:.4f}  | R²: {r2_season:.4f}  | Acc: {acc_season:.2f}")

### _Inference Time Analysis_

In [None]:
def get_inference_sample_set(
    df_variety: pd.DataFrame,
    df_season: pd.DataFrame,
    random_state: int,
    sample_size: int = 1000
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Combine two test sets (variety and season), sample rows randomly, and return X and y arrays.

    Sampling is without replacement. If the combined test sets contain fewer than
    sample_size rows, all available rows are returned (in shuffled order) instead
    of raising an error.

    Parameters:
        df_variety   : DataFrame for variety-based test set
        df_season    : DataFrame for season-based test set
        random_state : Random seed for reproducibility
        sample_size  : Maximum number of rows to sample from the combined test set

    Returns:
        x_sample : NumPy array of shape (n_sampled, n_features) with spectral features
        y_sample : NumPy array of shape (n_sampled, 1) with the corresponding Brix values,
                   where n_sampled = min(sample_size, number of combined rows)
    """
    # Combine the two test sets
    df_combined = pd.concat([df_variety, df_season], axis=0)

    # DataFrame.sample raises when asked for more rows than exist (replace=False),
    # so cap the request at the number of available rows
    n_rows = min(sample_size, len(df_combined))

    # Randomly sample rows from the combined test set
    df_sample = df_combined.sample(
        n=n_rows,
        random_state=random_state
    )

    # Split into X and y arrays
    x_sample, y_sample = split_x_y(df_sample)

    return x_sample, y_sample

def test_pls_inference_time(
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_test: np.ndarray,
    y_test: np.ndarray,
    n_components: int
) -> float:
    """
    Train PLS with given number of components and return average inference time per sample (in ms).

    Each test sample is predicted on its own (batch size 1) and timed with
    time.perf_counter(), a monotonic high-resolution clock intended for measuring
    short durations. Model fitting is not included in the measurement.

    Parameters:
        x_train      : Training features
        y_train      : Training targets
        x_test       : Test features; one timed prediction is made per row
        y_test       : Test targets (not used; kept for a uniform call signature)
        n_components : Number of PLS components (LVs)

    Returns:
        avg_inference_time_ms : Average inference time per sample in milliseconds
    """
    # Fit the PLS model (outside the timed region)
    pls = PLSRegression(n_components=n_components)
    pls.fit(x_train, y_train)

    times = []

    # Predict each sample individually and time it
    for x in x_test:
        # Reshape outside the timed region so only predict() is measured
        x_input = x.reshape(1, -1)
        start = time.perf_counter()
        _ = pls.predict(x_input)
        times.append(time.perf_counter() - start)

    # Compute average inference time in milliseconds
    avg_inference_time_ms = np.mean(times) * 1000

    # Print only the inference time
    print(f"Average inference time: {avg_inference_time_ms:.3f} ms/sample")

    return avg_inference_time_ms

In [None]:
# === Draw a random sample from both test sets for the timing run ===
x_inference_time, y_inference_time = get_inference_sample_set(
    df_test_variety, df_test_season, random_state=RANDOM_STATE
)

# === Time single-sample predictions with the CV-selected component count ===
inference_time_ms = test_pls_inference_time(
    x_train, y_train, x_inference_time, y_inference_time, n_components=cv_opt_A
)