### _Setup_

In [None]:
# Reset memory: clear all user-defined names from the interactive namespace so
# the notebook starts from a clean state (-f skips the confirmation prompt).
%reset -f

In [None]:
# Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from typing import List, Union, Tuple
from scipy.spatial.distance import pdist, squareform, cdist
from sklearn.model_selection import KFold
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
from kneed import KneeLocator
from sklearn.model_selection import StratifiedKFold

In [None]:
# Data
# Load the full dataset. Later cells read the columns "Variety",
# "Scan Date Year" and "Brix (Position)", and treat every column from the
# first one whose name parses as a float (e.g. "730.5") onwards as spectra.
df = pd.read_csv('data.csv')

### _Functions_

In [None]:
def find_col_index_of_spectra(
    df: pd.DataFrame
) -> int:
    """
    Locate the first column whose label can be interpreted as a number.

    Spectral columns are expected to be labelled with their wavelength
    (e.g., "730.5", "731.0"), so the first label that float() accepts marks
    the start of the spectral block.

    Parameters:
        df : Input DataFrame

    Returns:
        Position of the first numeric-labelled column, or -1 when no column
        label can be converted to float.
    """
    def _parses_as_float(label) -> bool:
        # float() raises ValueError for non-numeric text and TypeError for
        # labels of unsupported types (e.g. None or tuples).
        try:
            float(label)
        except (ValueError, TypeError):
            return False
        return True

    return next(
        (position for position, label in enumerate(df.columns) if _parses_as_float(label)),
        -1,
    )

def split_train_test(
    df: pd.DataFrame,
    test_variety: str,
    test_season: int,
    variety_test_year: int = 2024
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into one training set and two test sets:

    - Variety test set: Variety == test_variety AND Year == variety_test_year
    - Season test set : Year == test_season, restricted to training varieties

    The training set drops EVERY row of the test variety (all years, not only
    variety_test_year) and every row scanned in the test season, so it never
    overlaps with either test set. Because the test variety is absent from the
    training set, it is also absent from the season test set.

    Parameters:
        df                : Full pandas DataFrame with "Variety" and
                            "Scan Date Year" columns
        test_variety      : Variety held out for the variety test
        test_season       : Year held out for the season test
        variety_test_year : Year from which the variety test rows are taken
                            (default: 2024)

    Returns:
        df_train        : Training set
        df_test_variety : Test set for the held-out variety in variety_test_year
        df_test_season  : Test set for the held-out season (training varieties only)
    """
    variety = df["Variety"]
    year = df["Scan Date Year"]

    # Held-out variety, restricted to the requested year
    df_test_variety = df[
        (variety == test_variety) &
        (year == variety_test_year)
    ]

    # Training set: neither the held-out variety nor the held-out season
    df_train = df[
        (variety != test_variety) &
        (year != test_season)
    ]

    # Held-out season, keeping only varieties the model has seen in training
    train_varieties = df_train["Variety"].unique()
    df_test_season = df[
        (year == test_season) &
        variety.isin(train_varieties)
    ]

    return df_train, df_test_variety, df_test_season

def split_x_y(
    df: pd.DataFrame,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a DataFrame into x (spectral features) and y (target) arrays.
    Assumes find_col_index_of_spectra() is defined globally.

    Every column from the first spectral column to the end of the frame is
    used as a feature; the target is the 'Brix (Position)' column.

    Parameters:
        df : DataFrame containing the target column and the spectral columns.

    Returns:
        x : NumPy array of shape (n_samples, n_spectral_columns) with the features.
        y : NumPy array of shape (n_samples, 1) with the target values.

    Raises:
        ValueError : If no spectral column is found in df.
    """
    first_spectral_idx = find_col_index_of_spectra(df)

    # The helper signals "not found" with -1; slicing with -1 would silently
    # select the last column as the only feature, so fail loudly instead.
    if first_spectral_idx < 0:
        raise ValueError(
            "No spectral columns found: no column name can be converted to float."
        )

    spectra_cols = list(df.columns[first_spectral_idx:])
    target_cols = ['Brix (Position)']

    x = df[spectra_cols].values
    y = df[target_cols].values  # list selection keeps y two-dimensional

    return x, y

def compute_distance_matrix(
    matrix_one: np.ndarray,
    matrix_two: np.ndarray,
    metric: str = "euclidean"
) -> np.ndarray:
    """
    Computes the pairwise distance matrix between two input matrices.

    Inputs are cast to float32 before the distances are computed. When both
    inputs are the same object, or hold identical values after the cast, only
    the condensed (upper-triangle) form is computed and then expanded to the
    full symmetric matrix.

    Parameters:
        matrix_one : array-like of shape (n_samples_one, n_features)
        matrix_two : array-like of shape (n_samples_two, n_features)
        metric     : str, distance metric to use (default: 'euclidean')

    Returns:
        distance_matrix : np.ndarray of shape (n_samples_one, n_samples_two)
    """
    # Record identity BEFORE converting: the conversion may allocate new
    # arrays, after which an `is` check could never succeed and every
    # symmetric call would pay for a full element-wise comparison.
    same_object = matrix_one is matrix_two

    # Cast inputs to float32 (convert only once when both are the same object)
    matrix_one = np.asarray(matrix_one, dtype=np.float32)
    matrix_two = matrix_one if same_object else np.asarray(matrix_two, dtype=np.float32)

    # Same data on both sides: compute the symmetric matrix from the condensed form
    if same_object or np.array_equal(matrix_one, matrix_two):
        print("Detected symmetric distance matrix. Optimizing computation...")
        dist_condensed = pdist(matrix_one, metric=metric)   # Condensed distance form
        return squareform(dist_condensed)                   # Full symmetric distance matrix

    # Different data: compute the full rectangular distance matrix
    return cdist(matrix_one, matrix_two, metric=metric)

def compute_test_weights(
    distances: np.ndarray,
    h: float
) -> np.ndarray:
    """
    Turn distances into similarity weights with an exponential decay.

    Each weight is exp(-d / (s * h)), where s is the standard deviation of
    ALL entries of `distances` (so the scaling depends on the whole array
    that is passed in).

    Parameters:
        distances : np.ndarray
            Distance matrix of shape (n_samples_test, n_samples_train)
        h         : float
            Similarity decay parameter that controls sharpness of the weighting function

    Returns:
        weights : np.ndarray
            Similarity weights of shape (n_samples_test, n_samples_train)
    """
    # Spread of the distances; replaced by a tiny constant when all distances
    # are identical so the division below stays defined.
    spread = np.std(distances)
    scale = 1e-6 if spread == 0 else spread

    return np.exp(-distances / (scale * h))

def perform_weighted_pls_crossvalidation(
    x_train: np.ndarray,
    y_train: np.ndarray,
    distance_matrix: np.ndarray,
    n_splits: int,
    n_latent_variables: int,
    h: float,
    random_state: int
) -> float:
    """
    Estimate the cross-validated RMSE of similarity-weighted PLS.

    Folds are stratified on the target, binned into (up to) 10 quantile bins.
    Within a fold, one weight per training sample is obtained by averaging its
    similarity to all validation samples; the training data are centered on
    the weighted means, multiplied by those weights, and a PLS model is fitted
    on the result. Squared errors are pooled over all folds.

    Parameters:
        x_train            : Training features of shape (n_samples, n_features)
        y_train            : Training targets; flattened to (n_samples,) internally
        distance_matrix    : Precomputed distance matrix of shape (n_samples, n_samples)
        n_splits           : Number of CV folds
        n_latent_variables : Number of latent variables (PLS components)
        h                  : Similarity decay parameter
        random_state       : Random seed for reproducibility

    Returns:
        rmse_cv            : Root mean squared error over all validation folds
    """
    # One-dimensional targets and their quantile bins (used only for stratification)
    targets = y_train.flatten()
    strata = pd.qcut(targets, q=10, labels=False, duplicates='drop')

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    pooled_squared_errors = []

    for fit_rows, holdout_rows in splitter.split(x_train, strata):
        x_fit, y_fit = x_train[fit_rows], targets[fit_rows]
        x_holdout, y_holdout = x_train[holdout_rows], targets[holdout_rows]

        # Distances arranged as (n_val, n_train), then converted to weights
        holdout_to_fit = distance_matrix[np.ix_(fit_rows, holdout_rows)].T
        holdout_weights = compute_test_weights(holdout_to_fit, h)

        # One weight per training sample: mean similarity to the validation set
        sample_weights = holdout_weights.mean(axis=0)                         # (n_train,)

        # Weighted means used as the centering point
        x_center = np.average(x_fit, axis=0, weights=sample_weights)
        y_center = np.average(y_fit, weights=sample_weights)

        # Fit PLS on the centered training data scaled by the sample weights
        model = PLSRegression(n_components=n_latent_variables)
        model.fit(
            (x_fit - x_center) * sample_weights[:, None],
            (y_fit - y_center) * sample_weights,
        )

        # Predict on validation features centered with the training means
        predicted = model.predict(x_holdout - x_center).flatten() + y_center

        pooled_squared_errors.extend((y_holdout - predicted) ** 2)

    # RMSE over the pooled validation errors
    rmse_cv = np.sqrt(np.mean(pooled_squared_errors))
    print(f"Finished Stratified CV: RMSE = {rmse_cv:.4f}")

    return rmse_cv

def perform_gridsearch(
    x_train: np.ndarray,
    y_train: np.ndarray,
    distance_metrics: list,
    h_values: list,
    max_n_latent_variables: int,
    n_splits: int,
    random_state: int
) -> pd.DataFrame:
    """
    Exhaustively cross-validate similarity-weighted PLS over every combination
    of distance metric, h-value and number of latent variables (1 up to
    max_n_latent_variables).

    The training-set distance matrix is computed once per metric and reused
    for all h-values and component counts.

    Parameters:
        x_train                : Training feature matrix (n_samples, n_features)
        y_train                : Training target array
        distance_metrics       : List of distance metric names (e.g., ['euclidean', 'cosine'])
        h_values               : List of h-values controlling similarity decay
        max_n_latent_variables : Maximum number of PLS components to evaluate
        n_splits               : Number of folds for stratified K-Fold CV
        random_state           : Random seed for reproducibility

    Returns:
        pd.DataFrame : One row per combination with columns
                       'distance_metric', 'h', 'n_components' and 'RMSECV'
    """
    records = []
    component_counts = range(1, max_n_latent_variables + 1)

    for metric in distance_metrics:
        print(f"\nPrecomputing distance matrix for metric: {metric}")
        # Train-vs-train distances depend only on the metric
        pairwise_distances = compute_distance_matrix(x_train, x_train, metric=metric)

        for h in h_values:
            print(f"\nRunning grid search for h = {h:.2f} with distance metric: {metric}")

            for n_lv in component_counts:
                print(f"   Evaluating with {n_lv} latent variables...")

                score = perform_weighted_pls_crossvalidation(
                    x_train=x_train,
                    y_train=y_train,
                    distance_matrix=pairwise_distances,
                    n_splits=n_splits,
                    n_latent_variables=n_lv,
                    h=h,
                    random_state=random_state
                )

                records.append(
                    dict(distance_metric=metric, h=h, n_components=n_lv, RMSECV=score)
                )

    # One row per evaluated combination
    results_df = pd.DataFrame(records)
    print("\nGrid search completed.")
    return results_df

def evaluate_gridsearch(
    gridsearch_df: pd.DataFrame
) -> Tuple[pd.DataFrame, dict]:
    """
    Pick the number of latent variables (A) for every (distance_metric, h)
    pair and report the overall best configuration.

    For each pair, A is taken at the knee of the RMSECV-vs-A curve; when no
    knee is detected, the A with the minimum RMSECV is used instead. The curve
    and the selected A are plotted for every pair. The overall best
    configuration is the selected point with the lowest RMSECV.

    Parameters:
        gridsearch_df : DataFrame with columns ['distance_metric', 'h', 'n_components', 'RMSECV']

    Returns:
        knees_df      : DataFrame listing the selected A and its RMSECV for each (distance_metric, h)
        best_result   : Dictionary containing the overall best configuration

    Raises:
        ValueError    : If gridsearch_df yields no (distance_metric, h) pairs.
    """
    selected = []

    # Metrics in order of first appearance, h-values in ascending order
    for metric in gridsearch_df["distance_metric"].unique():
        metric_rows = gridsearch_df[gridsearch_df["distance_metric"] == metric]

        for h_val in sorted(metric_rows["h"].unique()):
            # RMSECV curve for this pair, ordered by number of components
            curve = metric_rows[metric_rows["h"] == h_val].sort_values("n_components")
            A_vals = curve["n_components"].values
            rmse_vals = curve["RMSECV"].values

            # Knee of the decreasing, convex RMSECV curve (None when not found)
            knee = KneeLocator(
                A_vals, rmse_vals,
                curve="convex", direction="decreasing", S=1.5
            ).knee

            if knee is None:
                best_idx = int(np.argmin(rmse_vals))  # Fallback to minimum RMSECV
            else:
                best_idx = np.where(A_vals == knee)[0][0]

            selected.append({
                "distance_metric": metric,
                "h": h_val,
                "n_components": A_vals[best_idx],
                "RMSECV": rmse_vals[best_idx]
            })

            # Plot the curve together with the selected number of components
            plt.figure(figsize=(6, 4))
            plt.plot(A_vals, rmse_vals, marker="o", label=f"h = {h_val}")
            plt.axvline(x=A_vals[best_idx], color="red", linestyle="--", label=f"Knee at A = {A_vals[best_idx]}")
            plt.title(f"RMSECV vs. Latent Variables\nDistance: {metric}, h = {h_val}")
            plt.xlabel("Number of Latent Variables (A)")
            plt.ylabel("RMSECV")
            plt.legend()
            plt.grid(True)
            plt.show()

    knees_df = pd.DataFrame(selected)

    if knees_df.empty:
        raise ValueError("No valid knee points found in the grid search results.")

    # Overall winner: selected point with the lowest RMSECV
    winner = knees_df.loc[knees_df["RMSECV"].idxmin()]
    best_result = {
        "distance_metric": winner["distance_metric"],
        "h": winner["h"],
        "n_components": int(winner["n_components"]),
        "RMSECV": float(winner["RMSECV"])
    }

    print("\n=== Best Configuration Found ===")
    for k, v in best_result.items():
        print(f"{k:<17}: {v}")

    return knees_df, best_result

def test_lwpls(
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_test: np.ndarray,
    y_test: np.ndarray,
    h: float,
    n_components: int,
    distance_metric: str
) -> Tuple[pd.DataFrame, float, float, float]:
    """
    Predict a test set with locally weighted PLS and report its accuracy.

    A separate PLS model is fitted for every test sample: the training data
    are centered on means weighted by similarity to that sample, multiplied by
    the same weights, and the model predicts the centered test sample.
    Similarity weights are derived from the full test-vs-train distance matrix.

    Parameters:
        x_train        : Training feature matrix
        y_train        : Training target array
        x_test         : Test feature matrix
        y_test         : Test target array
        h              : Similarity decay factor
        n_components   : Number of PLS components
        distance_metric: Distance metric to use ('euclidean', 'cosine', etc.)

    Returns:
        df_results         : DataFrame with observed and predicted values
        rmsep              : Root mean squared error of prediction
        r2                 : Coefficient of determination
        practical_accuracy : % predictions within ±20% relative error
    """
    # Work with one-dimensional targets
    y_train = y_train.flatten()
    y_true = y_test.flatten()

    # Similarity of every test sample to every training sample: (n_test, n_train)
    test_weights = compute_test_weights(
        compute_distance_matrix(x_test, x_train, distance_metric),
        h,
    )

    predictions = []

    # One local model per test sample
    for i, (sample, sample_weights) in enumerate(zip(x_test, test_weights)):
        # Weighted means act as the local centering point
        x_mean = np.average(x_train, axis=0, weights=sample_weights)
        y_mean = np.average(y_train, weights=sample_weights)

        # Centered training data scaled by the similarity weights
        local_model = PLSRegression(n_components=n_components)
        local_model.fit(
            (x_train - x_mean) * sample_weights[:, None],   # (n_train, n_features)
            (y_train - y_mean) * sample_weights,            # (n_train,)
        )

        # Predict the centered test sample and undo the target centering
        centered_sample = (sample - x_mean).reshape(1, -1)
        predictions.append(local_model.predict(centered_sample).flatten()[0] + y_mean)

        # Running RMSEP every 10 samples and after the last one
        if (i + 1) % 10 == 0 or (i + 1) == x_test.shape[0]:
            current_rmsep = np.sqrt(mean_squared_error(y_true[:i + 1], predictions))
            print(f"{i + 1}/{x_test.shape[0]} samples done - current RMSEP: {current_rmsep:.4f}")

    y_pred = np.array(predictions)

    # Final performance metrics
    rmsep = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    relative_error = np.abs(y_pred - y_true) / np.abs(y_true)
    practical_accuracy = np.mean(relative_error <= 0.2) * 100       # share within ±20%

    df_results = pd.DataFrame({
        "observed": y_true,
        "predicted": y_pred
    })

    print(f"Test RMSEP: {rmsep:.4f}")
    print(f"Test R2: {r2:.4f}")
    print(f"Practical accuracy (±20% error): {practical_accuracy:.2f}%")

    return df_results, rmsep, r2, practical_accuracy


### _Parameters_

In [None]:
# Dataset to split into training and test sets
DF                  = df
# Variety held out for the variety test set
TEST_VARIETY        = "TestVariety"
# Scan year held out for the season test set
TEST_SEASON         = 2025
# Seed for the CV shuffling and the inference-time sampling
RANDOM_STATE        = 27
# Similarity decay values to search: 0.1 to 2.0 in steps of 0.1 (rounded to 2 decimals)
H_VALUES            = [round(h, 2) for h in np.arange(0.1, 2.01, 0.1)]
# Largest number of PLS latent variables evaluated in the grid search
MAX_COMPONENTS      = 20
# Number of stratified CV folds
N_SPLITS            = 3
# Distance metrics evaluated in the grid search
DISTANCE_METRICS    = ['euclidean', 'correlation', 'cosine']

### _Run_

In [None]:
# === Split into train and test sets ===
# Use the DF parameter (not the raw `df` global) so the dataset is configured
# in the parameters cell like every other setting.
df_train, df_test_variety, df_test_season = split_train_test(
    DF,
    test_variety=TEST_VARIETY,
    test_season=TEST_SEASON
)

# === Convert to x and y arrays ===
x_train, y_train = split_x_y(
    df_train,
)
x_test_variety, y_test_variety = split_x_y(
    df_test_variety,
)
x_test_season, y_test_season = split_x_y(
    df_test_season,
)

# === Hyperparameter Search ===
# Cross-validates every (distance metric, h, number of components) combination
cv_results_all = perform_gridsearch(
    x_train=x_train,
    y_train=y_train,
    distance_metrics=DISTANCE_METRICS,
    h_values=H_VALUES,
    max_n_latent_variables=MAX_COMPONENTS,
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE
)

# === Evaluate Grid Search ===
# Selects the number of components per (metric, h) pair and the overall best setup
knees_df, best_result = evaluate_gridsearch(
    cv_results_all
)

print("Best Configuration Summary:")
print(best_result)

# === Final Test Set Evaluation ===
# Both test sets are predicted with the same training data and hyperparameters
df_results_variety, lwpls_rmsep_variety, lwpls_r2_variety, lwpls_practical_accuracy_variety = test_lwpls(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test_variety,
    y_test=y_test_variety,
    h=best_result["h"],
    n_components=best_result["n_components"],
    distance_metric=best_result["distance_metric"]
)

df_results_season, lwpls_rmsep_season, lwpls_r2_season, lwpls_practical_accuracy_season = test_lwpls(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test_season,
    y_test=y_test_season,
    h=best_result["h"],
    n_components=best_result["n_components"],
    distance_metric=best_result["distance_metric"]
)

# Side-by-side summary of both test sets
results_overview = pd.DataFrame({
    "RMSEP": [lwpls_rmsep_variety, lwpls_rmsep_season],
    "R²": [lwpls_r2_variety, lwpls_r2_season],
    "Practical Accuracy (%)": [lwpls_practical_accuracy_variety, lwpls_practical_accuracy_season]
}, index=["Variety Test", "Season Test"])

print(results_overview)

### _Inference Time Analysis_

In [None]:
def get_inference_sample_set(
    df_variety: pd.DataFrame,
    df_season: pd.DataFrame,
    random_state: int,
    sample_size: int = 1000
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Combine two test sets (variety and season), sample rows randomly without
    replacement, and return X and y arrays.

    If the combined test set holds fewer than sample_size rows, all available
    rows are used (in random order) instead of raising an error.

    Parameters:
        df_variety   : DataFrame for variety-based test set
        df_season    : DataFrame for season-based test set
        random_state : Random seed for reproducibility
        sample_size  : Maximum number of rows to sample from the combined test set

    Returns:
        x_sample : NumPy array of shape (n_sampled, n_features) with spectral features
        y_sample : NumPy array of shape (n_sampled, 1) with corresponding Brix values,
                   where n_sampled = min(sample_size, number of combined rows)
    """
    # Combine the two test sets
    df_combined = pd.concat([df_variety, df_season], axis=0)

    # Sampling without replacement cannot return more rows than exist, so cap
    # the request rather than letting DataFrame.sample raise.
    n_available = len(df_combined)
    if sample_size > n_available:
        print(
            f"Requested {sample_size} samples but only {n_available} rows are "
            f"available; using all {n_available} rows."
        )
        sample_size = n_available

    # Randomly sample rows from the combined test set
    df_sample = df_combined.sample(
        n=sample_size,
        random_state=random_state
    )

    # Split into X and y arrays
    return split_x_y(df_sample)

def test_lwpls_inference_time(
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_test: np.ndarray,
    y_test: np.ndarray,
    h: float,
    n_components: int,
    distance_metric: str
) -> float:
    """
    Compute average inference time per LWPLS prediction (in milliseconds).
    Includes distance calculation, weight computation, model fitting, and prediction
    for each test sample individually. Predictions themselves are discarded.

    Parameters:
        x_train        : Training features (n_train, n_features)
        y_train        : Training targets; flattened to (n_train,) internally
        x_test         : Test features (n_test, n_features)
        y_test         : Test targets (not used by this function)
        h              : Similarity decay parameter
        n_components   : Number of PLS components
        distance_metric: Distance metric to use ('euclidean', 'cosine', etc.)

    Returns:
        avg_inference_time_ms : Average inference time per sample (in milliseconds)
    """
    # Flattening happens once, outside the timed region
    y_train = y_train.flatten()
    inference_times = []

    for i in range(x_test.shape[0]):
        # Everything between the two perf_counter() calls counts as inference time
        start_time = time.perf_counter()

        # Compute distances and similarity weights for this single sample.
        # The timed region therefore includes whatever compute_distance_matrix
        # does per call (it converts both inputs to float32 each time).
        # NOTE(review): weights are scaled by the std of this one row of
        # distances, whereas test_lwpls passes the full test-vs-train matrix to
        # compute_test_weights - the resulting weights may differ; confirm this
        # is acceptable for a timing-only measurement.
        dists_i = compute_distance_matrix(x_test[i].reshape(1, -1), x_train, distance_metric)[0]
        weights_i = compute_test_weights(dists_i.reshape(1, -1), h)[0]

        # Weighted means for centering
        x_mean = np.average(x_train, axis=0, weights=weights_i)
        y_mean = np.average(y_train, weights=weights_i)

        # Center and weight training data
        x_centered = x_train - x_mean
        y_centered = y_train - y_mean
        x_w = x_centered * weights_i[:, None]
        y_w = y_centered * weights_i

        # Fit model and predict
        pls = PLSRegression(n_components=n_components)
        pls.fit(x_w, y_w)

        # The prediction is computed only so that its cost is measured
        x_test_centered = x_test[i] - x_mean
        _ = pls.predict(x_test_centered.reshape(1, -1)).flatten()[0] + y_mean

        end_time = time.perf_counter()
        inference_times.append(end_time - start_time)
 

    # Mean of the per-sample durations, converted from seconds to milliseconds
    avg_inference_time_ms = np.mean(inference_times) * 1000

    print(f"Average inference time: {avg_inference_time_ms:.3f} ms/sample")

    return avg_inference_time_ms


In [None]:
# === Draw the random sample of test rows used for timing ===
x_inference_time, y_inference_time = get_inference_sample_set(
    df_variety=df_test_variety,
    df_season=df_test_season,
    random_state=RANDOM_STATE,
)

# === Time per-sample LWPLS inference with the selected hyperparameters ===
inference_time_ms = test_lwpls_inference_time(
    x_train,
    y_train,
    x_inference_time,
    y_inference_time,
    h=best_result["h"],
    n_components=best_result["n_components"],
    distance_metric=best_result["distance_metric"],
)