## Import Libraries

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
import requests
from io import StringIO
import GPy

In [11]:
import numpy
import scipy
import sklearn
print(f"NumPy version: {numpy.__version__}")
print(f"SciPy version: {scipy.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")

NumPy version: 1.25.2
SciPy version: 1.11.1
Scikit-learn version: 1.3.0


## Data Load (PIMA)

In [None]:
def load_pima_dataset_full_features():
    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
    col_names = ['pregnancies', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
    df = pd.read_csv(url, header=None, names=col_names)
    
    feature_names = col_names[:-1]
    X = df[feature_names].values
    y = df['label'].values
    y = np.where(y == 0, -1, 1)
    
    for col in ['glucose', 'bp', 'skin', 'insulin', 'bmi']:
        col_index = col_names.index(col)
        non_zero_values = X[:, col_index][X[:, col_index] != 0]
        if non_zero_values.size > 0:
            mean_val = non_zero_values.mean()
            X[:, col_index][X[:, col_index] == 0] = mean_val
        else:
            X[:, col_index][X[:, col_index] == 0] = 0 
    
    return X, y, feature_names

## Data Load (CRABS)

In [None]:
def load_crabs_dataset_all_features():
    url = "https://raw.githubusercontent.com/fernandomayer/data/master/crabs.csv"
    
    df = pd.read_csv(
        url,
        sep=';',           # Use semicolon as the delimiter
        header=0,          # The first row is the header
        decimal=',',       # Use comma as the decimal separator
        quotechar='"',     # Handle double quotes around fields
        engine='python'    # Python engine for robustness with custom separators/quotes
    )

    df = df.rename(columns={'especie': 'sp', 'sexo': 'sex'})

    df['sp'] = df['sp'].astype(str).str.strip()
    df['sex'] = df['sex'].astype(str).str.strip()
    
    print(f"Crabs dataset: Unique 'sp' values before filtering: {df['sp'].unique()}")

    df = df[df['sp'].isin(['azul', 'laranja'])]
    
    if df.empty:
        raise ValueError(
            "DataFrame is empty after filtering 'sp' column. "
            "This indicates 'azul' or 'laranja' species might not be present or are malformed."
        )

    feature_columns = ['FL', 'RW', 'CL', 'CW', 'BD']

    for col in feature_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    X = df[feature_columns].values
    
    if np.isnan(X).any():
        print("Warning: NaNs found in Crabs features. Imputing with mean...")
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        X = imputer.fit_transform(X)

    y = np.where(df['sp'] == 'azul', 1, -1)
    
    return X, y, feature_columns

## Sigmoid Function

In [None]:
def sigmoid(x):
    x_clamped = np.clip(x, -500, 500) 
    return np.where(x_clamped >= 0, 1 / (1 + np.exp(-x_clamped)), np.exp(x_clamped) / (1 + np.exp(x_clamped)))

## Prediction with GP

In [None]:
def train_and_predict_gp_with_gpy(X_train, y_train, X_test, feature_names):
    n_features = X_train.shape[1]
    
    kernel = GPy.kern.Matern32(input_dim=n_features, variance=1.0, 
                               lengthscale=np.ones(n_features), ARD=True)
    
    # Définir des bornes pour les hyperparamètres
    kernel.variance.constrain_bounded(np.exp(-3.0), np.exp(3.0))
    kernel.lengthscale.constrain_bounded(np.exp(-3.0), np.exp(3.0))
    
    model = GPy.models.GPClassification(X_train, y_train.reshape(-1, 1), kernel=kernel)
    
    print("\n--- Training Gaussian Process Model (using GPy) ---")
    print("Optimizing GP hyperparameters with GPy's internal optimizer...")
    
    try:
        model.optimize_restarts(num_restarts=50, optimizer='lbfgsb', max_iters=2000, verbose=False)
    except np.linalg.LinAlgError as e:
        print(f"Optimization failed with LinAlgError: {e}. Trying again with default settings.")
        model.optimize(messages=False, max_iters=2000)
    except Exception as e:
        print(f"Optimization failed with unexpected error: {e}. Trying again with default settings.")
        model.optimize(messages=False, max_iters=2000)


    print("GP Optimization complete.")
    
    optimal_sigma_f = np.sqrt(model.Mat32.variance.values[0])
    optimal_length_scales = model.Mat32.lengthscale.values
    
    print(f"  - Final Optimal sigma_f: {optimal_sigma_f:.3f}")
    for i, ls in enumerate(optimal_length_scales):
        if i < len(feature_names):
            print(f"  - Final Optimal length_scale for '{feature_names[i]}': {ls:.3f}")
        else:
            print(f"  - Final Optimal length_scale {i}: {ls:.3f} (No corresponding feature name)")

    mean_f_s, var_f_s = model.predict(X_test)
    
    denominator_term = 1 + np.pi * var_f_s / 8
    gp_probs = sigmoid(mean_f_s / np.sqrt(denominator_term + 1e-10))
    
    gp_probs = gp_probs.flatten()
    
    gp_y_pred = np.where(gp_probs > 0.5, 1, -1)
    
    return gp_y_pred, model

## Classification Function

In [None]:
def run_classification_pipeline(dataset_loader_func, visualize=False):
    X, y, feature_names = dataset_loader_func()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # ---  MinMaxScaler ---
    scaler = MinMaxScaler(feature_range=(-1, 1)) 
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    dataset_name = dataset_loader_func.__name__.replace("load_", "").replace("_dataset", "").replace("_full_features", "").replace("_all_features", "").title()
    print(f"================== {dataset_name} Dataset ==================")
    print(f"Training on {X_train.shape[0]} samples, testing on {X_test.shape[0]} samples.")
    print(f"Number of features: {X_train.shape[1]}")

    # --- Gaussian Process Model (using GPy) ---
    gp_y_pred, gp_model = train_and_predict_gp_with_gpy(X_train, y_train, X_test, feature_names)
    
    gp_accuracy = accuracy_score(y_test, gp_y_pred)
    print(f"\n>>> GP Test Accuracy: {gp_accuracy:.4f} <<<")

    
    print("\n--- Training Support Vector Machine (SVM) Model for Comparison ---")
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf']
    }
    
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=0, cv=5)
    print("Running GridSearchCV for SVM...")
    grid.fit(X_train, y_train)
    
    print("SVM GridSearch complete.")
    print(f"  - Best SVM Parameters: {grid.best_params_}")
    
    svm_y_pred = grid.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_y_pred)
    print(f"\n>>> SVM Test Accuracy: {svm_accuracy:.4f} <<<")
    print("=" * 60 + "\n")

In [56]:
run_classification_pipeline(load_pima_dataset_full_features, visualize=False)
run_classification_pipeline(load_crabs_dataset_all_features, visualize=False)

reconstraining parameters Mat32.variance
reconstraining parameters Mat32.lengthscale


Training on 537 samples, testing on 231 samples.
Number of features: 8

--- Training Gaussian Process Model (using GPy) ---
Optimizing GP hyperparameters with GPy's internal optimizer...
GP Optimization complete.
  - Final Optimal sigma_f: 2.458
  - Final Optimal length_scale for 'pregnancies': 20.076
  - Final Optimal length_scale for 'glucose': 2.868
  - Final Optimal length_scale for 'bp': 11.079
  - Final Optimal length_scale for 'skin': 20.071
  - Final Optimal length_scale for 'insulin': 0.913
  - Final Optimal length_scale for 'bmi': 2.902
  - Final Optimal length_scale for 'pedigree': 3.847
  - Final Optimal length_scale for 'age': 2.453

>>> GP Test Accuracy: 0.6537 <<<

--- Training Support Vector Machine (SVM) Model for Comparison ---
Running GridSearchCV for SVM...
SVM GridSearch complete.
  - Best SVM Parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

>>> SVM Test Accuracy: 0.7446 <<<



reconstraining parameters Mat32.variance
reconstraining parameters Mat32.lengthscale


Crabs dataset: Unique 'sp' values before filtering: ['azul' 'laranja']
Training on 109 samples, testing on 47 samples.
Number of features: 5

--- Training Gaussian Process Model (using GPy) ---
Optimizing GP hyperparameters with GPy's internal optimizer...
GP Optimization complete.
  - Final Optimal sigma_f: 4.482
  - Final Optimal length_scale for 'FL': 1.187
  - Final Optimal length_scale for 'RW': 20.086
  - Final Optimal length_scale for 'CL': 20.082
  - Final Optimal length_scale for 'CW': 0.959
  - Final Optimal length_scale for 'BD': 1.992

>>> GP Test Accuracy: 0.5319 <<<

--- Training Support Vector Machine (SVM) Model for Comparison ---
Running GridSearchCV for SVM...
SVM GridSearch complete.
  - Best SVM Parameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}

>>> SVM Test Accuracy: 0.9787 <<<

