In [8]:
# path_kernel_toxicity.py
# Requirements: numpy, pandas, scikit-learn, tqdm
# Run: python path_kernel_toxicity.py

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import trange, tqdm
import urllib.request
import io



# Kernel hyperparams
N_SLICES = 40         # number of discrete time slices in each path (including endpoints)
M_SAMPLES = 2000     # Monte Carlo samples per pair (lower for speed; increase for accuracy)
BURN_IN = 50         # burn-in steps for each path chain
H_BAR = 1.0          # effective Planck constant (tune)
MASS = 1.0           # mass parameter in kinetic term
DELTA_T = 1.0 / (N_SLICES - 1)  # time step
# For speed on laptop, we'll subsample dataset
MAX_SAMPLES = 300    # <= number of rows to use (set to None to use all)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

dataset_save = "Dataset/Toxicity-13F.csv"

# ---------- Preprocess ----------
print("Loading data...")
df = pd.read_csv(dataset_save)
print("Original shape:", df.shape)
# Inspect columns: we assume last column is label or there's a 'label' column; adapt as needed.
# For this UCI CSV the labels may be in a column named 'toxicity' or similar — try to infer:
if 'toxicity' in df.columns:
    label_col = 'toxicity'
else:
    # fallback: try the last column as label
    label_col = df.columns[-1]

# Drop rows with missing label
df = df.dropna(subset=[label_col])
# Select numeric columns only for simplicity
num_df = df.select_dtypes(include=[np.number]).copy()
# remove the label column from features if present
if label_col in num_df.columns:
    y = num_df[label_col].values
    X = num_df.drop(columns=[label_col]).values
else:
    y = df[label_col].values
    # take numeric features from original df
    X = num_df.values

print("Numeric features shape:", X.shape, "labels shape:", y.shape)

# Optionally subsample for speed
if MAX_SAMPLES is not None and X.shape[0] > MAX_SAMPLES:
    idx = np.random.choice(X.shape[0], MAX_SAMPLES, replace=False)
    X = X[idx]
    y = y[idx]
    print(f"Subsampled to {MAX_SAMPLES} rows for speed.")

# Standardize and PCA to small d
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
pca = PCA(n_components=min(8, Xs.shape[1]))
Xred = pca.fit_transform(Xs)
print("Reduced feature shape:", Xred.shape)

# split
X_train, X_test, y_train, y_test = train_test_split(Xred, y, test_size=0.3, random_state=RANDOM_SEED, stratify=(y if len(np.unique(y))>1 else None))

# ---------- Path integral action ----------
# We'll map each data vector x -> a potential V(q; x) using radial basis functions.
# Precompute centers and widths for phi_j(q).
def make_phi_basis(num_basis=8):
    centers = np.linspace(-2.0, 2.0, num_basis)
    width = 0.6
    def phi(q):
        # q can be array
        vals = np.exp(-((q[:, None] - centers[None, :])**2) / (2*width**2))
        return vals  # shape (len(q), num_basis)
    return phi, centers

phi_func, centers = make_phi_basis(num_basis=Xred.shape[1] + 2)

def potential(q, x):
    # q: scalar or array. x: feature vector (dim matches number of basis used)
    # V(q; x) = base + sum_j x_j * phi_j(q)
    phis = np.exp(-((q[:, None] - centers[None, :])**2) / (2 * 0.6**2))  # shape (len(q), B)
    # Map x to first B components (pad or slice)
    xpad = np.zeros(phis.shape[1])
    k = min(len(x), phis.shape[1])
    xpad[:k] = x[:k]
    V = phis.dot(xpad)
    return V  # shape (len(q),)

def euclidean_action(path_q, x):
    # path_q: array of length N_SLICES, boundary included
    # x: data vector for potential
    kinetic = 0.0
    potential_sum = 0.0
    for n in range(1, len(path_q)):
        dq = (path_q[n] - path_q[n-1]) / DELTA_T
        kinetic += 0.5 * MASS * dq * dq * DELTA_T
    # potential evaluated at interior points
    qs = np.array(path_q)
    Vvals = potential(qs, x)
    potential_sum = np.sum(Vvals) * DELTA_T
    return kinetic + potential_sum

# ---------- Brownian bridge Metropolis sampler ----------
def brownian_bridge_proposal(curr_path, sigma=0.5):
    # propose by adding Gaussian noise to interior points (not endpoints)
    proposal = curr_path.copy()
    interior = np.arange(1, len(curr_path) - 1)
    proposal[interior] += np.random.normal(scale=sigma, size=interior.shape[0])
    return proposal

def sample_paths_between(x_from, x_to, M=M_SAMPLES):
    # build initial linear bridge between boundary projections
    b0 = np.dot(x_from, np.ones_like(centers)[:len(x_from)]) if len(x_from)>0 else 0.0
    bT = np.dot(x_to, np.ones_like(centers)[:len(x_to)]) if len(x_to)>0 else 0.0
    # Simple boundary projection: take first component of xred as scalar BCs
    # Better: learn projection; here we use first feature as scalar BC
    try:
        b0 = float(x_from[0])
        bT = float(x_to[0])
    except:
        b0 = 0.0; bT = 0.0
    path0 = np.linspace(b0, bT, N_SLICES)
    curr_path = path0.copy()
    curr_S = euclidean_action(curr_path, (x_from + x_to) / 2.0)
    weights = []
    # Burn-in
    for _ in range(BURN_IN):
        prop = brownian_bridge_proposal(curr_path, sigma=0.6)
        prop[0], prop[-1] = b0, bT
        prop_S = euclidean_action(prop, (x_from + x_to) / 2.0)
        if np.random.rand() < np.exp(-(prop_S - curr_S) / H_BAR):
            curr_path, curr_S = prop, prop_S
    # Collect samples
    for _ in range(M):
        prop = brownian_bridge_proposal(curr_path, sigma=0.4)
        prop[0], prop[-1] = b0, bT
        prop_S = euclidean_action(prop, (x_from + x_to) / 2.0)
        if np.random.rand() < np.exp(-(prop_S - curr_S) / H_BAR):
            curr_path, curr_S = prop, prop_S
        weights.append(np.exp(-curr_S / H_BAR))
    return np.array(weights)

# ---------- Build kernel matrix (pairwise) ----------
def compute_kernel_matrix(Xdata):
    N = Xdata.shape[0]
    K = np.zeros((N, N))
    print("Computing kernel matrix (this may take time). N =", N)
    for i in tqdm(range(N)):
        for j in range(i, N):
            w = sample_paths_between(Xdata[i], Xdata[j], M=M_SAMPLES)
            K_ij = np.mean(w)
            K[i, j] = K_ij
            K[j, i] = K_ij
    # Normalize kernel (optional)
    # Make PSD by adding small jitter
    K += 1e-8 * np.eye(N)
    return K

# For speed, compute kernel on a limited subset of train data
TRAIN_SUB = min(120, X_train.shape[0])
TEST_SUB = min(60, X_test.shape[0])

Xtr_small = X_train[:TRAIN_SUB]
ytr_small = y_train[:TRAIN_SUB]
Xte_small = X_test[:TEST_SUB]
yte_small = y_test[:TEST_SUB]

K_train = compute_kernel_matrix(Xtr_small)
# For test kernel we compute cross-kernel between test and train
def compute_cross_kernel(Xa, Xb):
    A = Xa.shape[0]; B = Xb.shape[0]
    K = np.zeros((A, B))
    for i in tqdm(range(A)):
        for j in range(B):
            w = sample_paths_between(Xa[i], Xb[j], M=int(M_SAMPLES/4))  # fewer samples for cross
            K[i, j] = np.mean(w)
    return K

K_test = compute_cross_kernel(Xte_small, Xtr_small)

# ---------- Train SVM with precomputed kernel ----------
print("Training SVM with path-integral kernel...")
clf = SVC(kernel='precomputed', probability=True)
clf.fit(K_train, ytr_small)
y_pred = clf.predict(K_test)
y_score = clf.decision_function(K_test) if hasattr(clf, "decision_function") else clf.predict_proba(K_test)[:, 1]
print("Path-kernel test acc:", accuracy_score(yte_small, y_pred))
try:
    print("Path-kernel test AUC:", roc_auc_score(yte_small, y_score))
except:
    pass

# ---------- Baseline: RBF on PCA features ----------
print("Training baseline RBF SVM on reduced features...")
rbf = SVC(kernel='rbf', probability=True)
rbf.fit(Xtr_small, ytr_small)
y_pred_rbf = rbf.predict(Xte_small)
y_score_rbf = rbf.decision_function(Xte_small)
print("RBF test acc:", accuracy_score(yte_small, y_pred_rbf))
try:
    print("RBF test AUC:", roc_auc_score(yte_small, y_score_rbf))
except:
    pass

print("Done.")


Loading data...
Original shape: (171, 14)
Numeric features shape: (171, 13) labels shape: (171,)
Reduced feature shape: (171, 8)
Computing kernel matrix (this may take time). N = 119


100%|██████████| 119/119 [10:03<00:00,  5.07s/it]
100%|██████████| 52/52 [02:14<00:00,  2.59s/it]

Training SVM with path-integral kernel...
Path-kernel test acc: 0.6538461538461539
Path-kernel test AUC: 0.6924369747899161
Training baseline RBF SVM on reduced features...
RBF test acc: 0.6538461538461539
RBF test AUC: 0.6235294117647059
Done.





In [15]:
np.save("K_train.npy", K_train)
np.save("K_test.npy", K_test)
np.save("y_train.npy", ytr_small)   # or your training labels variable
np.save("y_test.npy", yte_small)
# optionally save reduced features for RBF baseline:
np.save("X_train_red.npy", Xtr_small)
np.save("X_test_red.npy", Xte_small)

In [12]:
# path_kernel_diagnostics.py
# Purpose: normalize/center path kernel, grid-search SVM C, compute alignment, eigenspectrum, and bootstrap AUC CI.
# Assumes you already computed K_train, K_test and have y_train_subset / y_test_subset available.
# If K_train.npy and K_test.npy exist, script will load them. Otherwise it will call compute_kernel_matrix/compute_cross_kernel
# from your existing code (you can paste the compute_kernel_matrix & compute_cross_kernel functions or set RECOMPUTE=True).
#
# Requirements: numpy, scipy, scikit-learn, joblib, tqdm, pandas
# Run: python path_kernel_diagnostics.py

import os
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import pairwise
import scipy.linalg as la
import joblib
from tqdm import trange
import random

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# ---------- User-editable paths ----------
K_TRAIN_PATH = "K_train.npy"
K_TEST_PATH = "K_test.npy"
Y_TRAIN_PATH = "y_train.npy"
Y_TEST_PATH = "y_test.npy"

RECOMPUTE = False  # set True if you want this script to recompute kernels (expensive)
# If RECOMPUTE=True, ensure compute_kernel_matrix(Xtr_small) and compute_cross_kernel(Xte_small,Xtr_small)
# are available in scope (paste from your previous script or import them).

# ---------- Utility functions ----------
def center_kernel(K):
    """Center kernel matrix K (NxN)."""
    n = K.shape[0]
    one = np.ones((n, n)) / n
    return K - one.dot(K) - K.dot(one) + one.dot(K).dot(one)

def normalize_kernel(K):
    """Normalize kernel to cosine (K_ij -> K_ij / sqrt(K_ii K_jj))."""
    diag = np.diag(K).copy()
    # numerical safety
    diag[diag <= 0] = 1e-12
    D = np.sqrt(diag)
    K_norm = K / np.outer(D, D)
    return K_norm

def kernel_alignment(K, y):
    """Compute (centered) kernel alignment with label kernel L = y y^T (y in {-1,1})."""
    Kc = center_kernel(K)
    # convert labels to +/-1 (if 0/1 present)
    yv = np.array(y)
    if set(np.unique(yv)) == {0,1}:
        yv = 2*yv - 1
    L = np.outer(yv, yv)
    # center L as well
    Lc = center_kernel(L)
    num = np.sum(Kc * Lc)
    den = np.sqrt(np.sum(Kc * Kc) * np.sum(Lc * Lc))
    return float(num / den) if den > 0 else 0.0

def kernel_eigenspectrum(K, top_k=10):
    """Return sorted eigenvalues (desc) and fraction explained by top components."""
    # Use symmetric eigensolver
    vals = la.eigvalsh(K)
    vals_sorted = np.sort(vals)[::-1]
    total = np.sum(vals_sorted)
    rel = vals_sorted[:top_k] / total if total != 0 else vals_sorted[:top_k]
    explained = np.cumsum(vals_sorted) / total if total != 0 else np.cumsum(vals_sorted)
    return vals_sorted, rel, explained

# Bootstrapping AUC for a precomputed kernel (train/test)
def bootstrap_auc_precomputed(K_train, K_test, y_train, y_test, C=1.0, n_boot=200, seed=RANDOM_SEED):
    """
    Bootstrap resampling of training set to estimate distribution of test AUC.
    We resample with replacement from the training indices, retrain SVM, evaluate on fixed test set.
    Returns list of AUCs.
    """
    n_train = K_train.shape[0]
    aucs = []
    rng = np.random.RandomState(seed)
    svc = SVC(kernel='precomputed', probability=True, C=C)
    for b in trange(n_boot, desc="Bootstraps"):
        idx = rng.randint(0, n_train, size=n_train)  # bootstrap indices
        # build bootstrap training kernel (submatrix)
        Kb = K_train[np.ix_(idx, idx)]
        yb = y_train[idx]
        # train on bootstrap set
        try:
            svc.fit(Kb, yb)
        except Exception as e:
            print("SVM fit error on bootstrap, skipping:", e)
            continue
        # build cross-kernel between test and bootstrap-train (test rows correspond to K_test columns)
        # But SVC expects columns corresponding to training set ordering. We must compute K_test_boot where
        # K_test_boot[i, j] = K_test[i, original_index_of_bootstrap_j]
        Kt_boot = K_test[:, idx]
        # predict and evaluate
        try:
            scores = svc.decision_function(Kt_boot)
        except Exception:
            scores = svc.predict_proba(Kt_boot)[:, 1]
        auc = roc_auc_score(y_test, scores)
        aucs.append(auc)
    return aucs

# ---------- Load data / kernels ----------
if os.path.exists(K_TRAIN_PATH) and os.path.exists(K_TEST_PATH) and os.path.exists(Y_TRAIN_PATH) and os.path.exists(Y_TEST_PATH) and not RECOMPUTE:
    print("Loading precomputed kernel and labels from disk...")
    K_train = np.load(K_TRAIN_PATH)
    K_test = np.load(K_TEST_PATH)
    from sklearn.preprocessing import LabelEncoder

    # after loading y_train and y_test (possibly with allow_pickle=True)
    y_train_raw = np.load(Y_TRAIN_PATH, allow_pickle=True)
    y_test_raw  = np.load(Y_TEST_PATH, allow_pickle=True)

    le = LabelEncoder()
    # fit only on train labels (safe); transform both
    y_train = le.fit_transform(y_train_raw)
    y_test  = le.transform(y_test_raw)
else:
    if RECOMPUTE:
        # Placeholder: user should paste in compute_kernel_matrix & compute_cross_kernel functions
        print("RECOMPUTE=True but compute_kernel_matrix not found in this script. Please paste your kernel-building functions or precompute kernels and save them to files.")
        raise RuntimeError("Kernel recomputation requested but compute functions are not available here.")
    else:
        raise FileNotFoundError("Precomputed kernel files not found. Either set RECOMPUTE=True and provide compute functions, or run your path-kernel script and save K_train.npy, K_test.npy, y_train.npy, y_test.npy.")

print("Shapes: K_train", K_train.shape, "K_test", K_test.shape, "y_train", y_train.shape, "y_test", y_test.shape)

# ---------- Normalize, center, and compare scaling ----------
print("\n--- Kernel normalization & centering ---")
K_train_norm = normalize_kernel(K_train)
K_train_centered = center_kernel(K_train_norm)
# For test kernel we must normalize with training diagonal entries:
# Normalize K_test such that K_test_norm[i,j] = K_test[i,j] / sqrt(K_ii_train * K_jj_train)
train_diag = np.sqrt(np.diag(K_train))
train_diag[train_diag <= 0] = 1e-12
K_test_norm = K_test / np.outer(np.ones(K_test.shape[0]), train_diag)

# Center training kernel for SVM CV (note: centering precomputed kernels for SVM is common practice,
# but when using precomputed kernel with scikit-learn you must center explicitly before passing.)
K_train_used = K_train_centered.copy()
# For SVM precomputed with gridsearch, we'll use K_train_used.

print("Train kernel - min/max/mean before:", np.min(K_train), np.max(K_train), np.mean(K_train))
print("Train kernel normalized - min/max/mean after:", np.min(K_train_norm), np.max(K_train_norm), np.mean(K_train_norm))

# ---------- Grid search for SVM C (5-fold CV on precomputed kernel) ----------
print("\n--- GridSearchCV for SVM C (5-fold) ---")
param_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}
svc = SVC(kernel='precomputed')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
grid = GridSearchCV(svc, param_grid=param_grid, cv=cv, scoring='roc_auc', n_jobs=1)
# GridSearchCV expects features shaped (n_samples, n_features) - for precomputed kernels it expects the kernel matrix
# as X when fitting; pass the centered kernel.
grid.fit(K_train_used, y_train)
print("Best params:", grid.best_params_, "best CV score (AUC):", grid.best_score_)
bestC = grid.best_params_['C']

# ---------- Train final SVM on centered kernel and evaluate on test set ----------
print("\n--- Final training with best C on full train set ---")
clf = SVC(kernel='precomputed', C=bestC, probability=True)
clf.fit(K_train_used, y_train)

# Build test kernel matrix to match centered training set normalization/centering.
# Note: proper kernel centering for cross-kernel K_test requires:
# Kc_test = K_test_norm - 1_n_test 1_n_train^T K_train_norm - K_test_norm 1_n_train 1_n_train^T + 1_n_test 1_n_train^T K_train_norm 1_n_train 1_n_train^T
# But a simpler approximate approach used in practice: center K_test using train mean vectors.
def center_cross_kernel(K_cross, K_train):
    # K_cross: (n_test, n_train)
    n_train = K_train.shape[0]
    one_train = np.ones((n_train, n_train)) / n_train
    K_train_mean_rows = np.mean(K_train, axis=0)  # vector length n_train
    K_train_mean_all = np.mean(K_train)
    K_cross_centered = K_cross - np.mean(K_cross, axis=1)[:, None] - K_train_mean_rows[None, :] + K_train_mean_all
    return K_cross_centered

K_test_used = center_cross_kernel(K_test_norm, K_train_norm)

# Evaluate
try:
    test_scores = clf.decision_function(K_test_used)
except Exception:
    test_scores = clf.predict_proba(K_test_used)[:, 1]
test_pred = clf.predict(K_test_used)
test_acc = accuracy_score(y_test, test_pred)
test_auc = roc_auc_score(y_test, test_scores)
print("Final path-kernel test acc: {:.4f}, AUC: {:.4f}".format(test_acc, test_auc))

# ---------- Kernel diagnostics: alignment and eigenspectrum ----------
print("\n--- Kernel diagnostics ---")
align = kernel_alignment(K_train, y_train)
print("Kernel alignment (centered) with labels:", align)
vals_sorted, top_rel, explained = kernel_eigenspectrum(K_train_used, top_k=10)
print("Top 10 eigenvalue fractions:", top_rel)
print("Cumulative explained by top eigenvalues (first 10):", explained[:10])

# ---------- Bootstrap AUC CI ----------
print("\n--- Bootstrap AUC (retrain on bootstrap samples) ---")
n_boot = 200
aucs = bootstrap_auc_precomputed(K_train_used, K_test_used, y_train, y_test, C=bestC, n_boot=n_boot, seed=RANDOM_SEED)
aucs = np.array(aucs)
mean_auc = np.mean(aucs)
ci_low = np.percentile(aucs, 2.5)
ci_high = np.percentile(aucs, 97.5)
print(f"Bootstrap AUC mean: {mean_auc:.4f}, 95% CI: [{ci_low:.4f}, {ci_high:.4f}] (n_boot={len(aucs)})")

# ---------- Compare to RBF baseline (optional if you have X_train/X_test) ----------
print("\n--- If you have reduced features X_train/X_test, you can run an RBF baseline for comparison ---")
# If user has saved reduced features arrays, load them and compute RBF
if os.path.exists("X_train_red.npy") and os.path.exists("X_test_red.npy"):
    Xtr = np.load("X_train_red.npy")
    Xte = np.load("X_test_red.npy")
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    # simple grid search on gamma and C (small grid)
    param_grid_rbf = {'C':[0.1,1,10], 'gamma':[0.01, 0.1, 1.0]}
    rbf = SVC(kernel='rbf', probability=True)
    gs_rbf = GridSearchCV(rbf, param_grid=param_grid_rbf, cv=cv, scoring='roc_auc', n_jobs=1)
    gs_rbf.fit(Xtr, y_train)
    print("RBF best params:", gs_rbf.best_params_, "CV AUC:", gs_rbf.best_score_)
    rbf_best = gs_rbf.best_estimator_
    y_pred_rbf = rbf_best.predict(Xte)
    try:
        scores_rbf = rbf_best.decision_function(Xte)
    except:
        scores_rbf = rbf_best.predict_proba(Xte)[:,1]
    print("RBF test acc:", accuracy_score(y_test, y_pred_rbf), "RBF test AUC:", roc_auc_score(y_test, scores_rbf))
else:
    print("No X_train_red.npy / X_test_red.npy found — skip RBF auto-eval. If you want this, save reduced PCA features as X_train_red.npy and X_test_red.npy.")

# Save outputs for future use
np.save("K_train_centered.npy", K_train_used)
np.save("K_test_centered.npy", K_test_used)
print("\nSaved centered kernels to K_train_centered.npy and K_test_centered.npy.")
print("Diagnostics complete.")


Loading precomputed kernel and labels from disk...
Shapes: K_train (119, 119) K_test (52, 119) y_train (119,) y_test (52,)

--- Kernel normalization & centering ---
Train kernel - min/max/mean before: 1.2857643360120744e-32 70.19064776471305 0.7661542881461306
Train kernel normalized - min/max/mean after: 1.2764776744356858e-32 2.611212156780805 0.4667110179603607

--- GridSearchCV for SVM C (5-fold) ---
Best params: {'C': 0.01} best CV score (AUC): 0.6566964285714285

--- Final training with best C on full train set ---
Final path-kernel test acc: 0.6731, AUC: 0.6773

--- Kernel diagnostics ---
Kernel alignment (centered) with labels: 0.011850028735979968
Top 10 eigenvalue fractions: [0.54181511 0.31416716 0.14418591 0.08323632 0.04523573 0.03235044
 0.02047996 0.01784213 0.01572442 0.01012907]
Cumulative explained by top eigenvalues (first 10): [0.54181511 0.85598228 1.00016819 1.08340451 1.12864024 1.16099068
 1.18147064 1.19931278 1.2150372  1.22516627]

--- Bootstrap AUC (retrain 

Bootstraps: 100%|██████████| 200/200 [00:00<00:00, 444.27it/s]


Bootstrap AUC mean: 0.5512, 95% CI: [0.3966, 0.6808] (n_boot=200)

--- If you have reduced features X_train/X_test, you can run an RBF baseline for comparison ---
RBF best params: {'C': 0.1, 'gamma': 1.0} CV AUC: 0.7377232142857142
RBF test acc: 0.6730769230769231 RBF test AUC: 0.5915966386554622

Saved centered kernels to K_train_centered.npy and K_test_centered.npy.
Diagnostics complete.


In [21]:
import pandas as pd
import numpy as np

# ------------------ Config ------------------
csv_path = "sweep_results/sweep_summary.csv"  # path to your sweep CSV
top_k = 3                                    # how many top results to show

# ------------------ Load CSV ------------------
df = pd.read_csv(csv_path)

# Ensure 'label' column exists
if 'label' not in df.columns:
    df['label'] = df.index.astype(str)

# ------------------ Best single result by test_auc ------------------
best_row = df.loc[df['test_auc'].idxmax()]

print("=== Best Result by test_auc ===")
print(f"H_BAR    : {best_row['H_BAR']}")
print(f"MASS     : {best_row['MASS']}")
print(f"N_SLICES : {best_row['N_SLICES']}")
print(f"M_SAMPLES: {best_row.get('M_SAMPLES', 'N/A')}")
print(f"test_auc : {best_row['test_auc']}")
print(f"alignment: {best_row['alignment']}")
print(f"bestCV_AUC: {best_row.get('bestCV_AUC', 'N/A')}")
print(f"bestC    : {best_row.get('bestC', 'N/A')}")
print(f"label    : {best_row.get('label', 'N/A')}")
print("\n")

# ------------------ Top-K by test_auc ------------------
top_auc = df.sort_values(by='test_auc', ascending=False).head(top_k)
print(f"=== Top {top_k} Results by test_auc ===")
print(top_auc[['H_BAR','MASS','N_SLICES','M_SAMPLES','test_auc','alignment','bestCV_AUC','bestC','label']])
print("\n")

# ------------------ Combined score (test_auc + alignment) ------------------
def combined_score(df, w_auc=0.75, w_align=0.25):
    s = df.copy()
    # fill NaNs
    s['test_auc_filled'] = s['test_auc'].fillna(-999)
    s['alignment_filled'] = s['alignment'].fillna(-999)
    # min-max scale to [0,1]
    def minmax_col(arr):
        a = np.array(arr, dtype=float)
        amin = np.nanmin(a)
        amax = np.nanmax(a)
        if amax <= amin:
            return np.zeros_like(a)
        return (a - amin) / (amax - amin)
    s['auc_scaled'] = minmax_col(s['test_auc_filled'])
    s['align_scaled'] = minmax_col(s['alignment_filled'])
    s['combined_score'] = w_auc * s['auc_scaled'] + w_align * s['align_scaled']
    return s

df_combined = combined_score(df)
top_combined = df_combined.sort_values(by='combined_score', ascending=False).head(top_k)

print(f"=== Top {top_k} Results by Combined Score (0.75*test_auc + 0.25*alignment) ===")
print(top_combined[['H_BAR','MASS','N_SLICES','M_SAMPLES','test_auc','alignment','combined_score','bestC','label']])


=== Best Result by test_auc ===
H_BAR    : 0.5
MASS     : 0.5
N_SLICES : 12
M_SAMPLES: 200
test_auc : 0.8854166666666667
alignment: 0.0515012447334938
bestCV_AUC: 0.6733333333333333
bestC    : 10.0
label    : H0.5_M0.5_S12_Ms200


=== Top 3 Results by test_auc ===
     H_BAR  MASS  N_SLICES  M_SAMPLES  test_auc  alignment  bestCV_AUC  bestC  \
53     0.5   0.5        12        200  0.885417   0.051501    0.673333  10.00   
125    0.8   0.5        12        200  0.843750   0.027572    0.643333   0.01   
116    0.7   2.0         8        200  0.833333   0.033307    0.246667  10.00   

                   label  
53   H0.5_M0.5_S12_Ms200  
125  H0.8_M0.5_S12_Ms200  
116   H0.7_M2.0_S8_Ms200  


=== Top 3 Results by Combined Score (0.75*test_auc + 0.25*alignment) ===
     H_BAR  MASS  N_SLICES  M_SAMPLES  test_auc  alignment  combined_score  \
53     0.5   0.5        12        200  0.885417   0.051501        0.882388   
153    0.9   1.0        10        200  0.708333   0.131988        0.830

In [24]:
import pennylane as qml
from pennylane import numpy as np
from itertools import product

# ------------------ Hyperparameters ------------------
N_QUBITS = 4             # number of qubits (can reduce by PCA on features)
N_SLICES = 12            # discretized time slices (from classical sweep)
H_BAR = 0.5              # effective Planck constant
MASS = 0.5               # mass parameter
np.random.seed(42)

# ------------------ Device ------------------
dev = qml.device("default.qubit", wires=N_QUBITS)

# ------------------ Feature map ------------------
def kinetic_layer():
    """Single-qubit rotations representing kinetic term."""
    for w in range(N_QUBITS):
        qml.RX(H_BAR / MASS, wires=w)

def potential_layer(x):
    """Multi-qubit rotations representing potential term (couplings)."""
    for i, j in product(range(N_QUBITS), repeat=2):
        if i < j:
            # simple two-qubit rotation encoding interactions
            qml.CRZ((x[i] * x[j]) * H_BAR, wires=[i, j])

def path_integral_feature_map(x):
    """Encode input x into a path-integral quantum state."""
    x = x[:N_QUBITS]  # truncate if needed
    for w in range(N_QUBITS):
        qml.RY(x[w], wires=w)  # angle encoding of features
    
    for _ in range(N_SLICES):
        kinetic_layer()
        potential_layer(x)

# ------------------ Kernel evaluation ------------------
@qml.qnode(dev)
def kernel_circuit(x1, x2):
    # Prepare first state |psi(x1)>
    path_integral_feature_map(x1)
    # Inverse of second state |psi(x2)>^\dagger
    qml.adjoint(path_integral_feature_map)(x2)
    return qml.probs(wires=range(N_QUBITS))

def compute_kernel_matrix(X1, X2):
    """Compute kernel matrix K_ij = |<psi(x_i)|psi(x_j)>|^2"""
    N1 = len(X1)
    N2 = len(X2)
    K = np.zeros((N1, N2))
    for i, x_i in enumerate(X1):
        for j, x_j in enumerate(X2):
            probs = kernel_circuit(x_i, x_j)
            K[i, j] = probs[0]  # probability of |00...0> state
    return K

# ------------------ Example usage ------------------
# Small random dataset (replace with your real features)
X_train = np.random.rand(5, N_QUBITS) * np.pi
X_test  = np.random.rand(2, N_QUBITS) * np.pi

K_train = compute_kernel_matrix(X_train, X_train)
K_test  = compute_kernel_matrix(X_test, X_train)

print("K_train:\n", K_train)
print("K_test:\n", K_test)

# Now K_train and K_test can be fed to a classical SVM:
from sklearn.svm import SVC
y_train = np.array([0, 1, 0, 1, 0])
y_test  = np.array([1, 0])

clf = SVC(kernel="precomputed", C=10.0)
clf.fit(K_train, y_train)
y_pred = clf.predict(K_test)
print("Predictions:", y_pred)


K_train:
 [[1.         0.20628742 0.07631253 0.17544535 0.08758749]
 [0.20628742 1.         0.00742173 0.20372521 0.20779223]
 [0.07631253 0.00742173 1.         0.01793638 0.01669364]
 [0.17544535 0.20372521 0.01793638 1.         0.08076974]
 [0.08758749 0.20779223 0.01669364 0.08076974 1.        ]]
K_test:
 [[0.1003442  0.3334711  0.02144049 0.13818103 0.57093522]
 [0.01816577 0.16751916 0.07865527 0.1443786  0.34000603]]
Predictions: [0 0]
