In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import kneighbors_graph
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from scipy import sparse
from warnings import filterwarnings

filterwarnings('ignore')

# -----------------------
# Define Polynomial Kernel
# -----------------------
def polynomial_kernel(X1, X2, **kwargs):
    gamma = kwargs.get('gamma', 1)
    coef0 = kwargs.get('coef0', 1)
    degree = kwargs.get('degree', 2)
    return (gamma * X1 @ X2.T + coef0) ** degree

# -----------------------
# Define LapRLS Model
# -----------------------
class LapRLS:
    def __init__(self, opt):
        self.opt = opt

    def fit(self, X, Y):
        self.X_labeled = X
        self.Y_labeled = Y.flatten()

        if self.opt['neighbor_mode'] == 'distance':
            W = kneighbors_graph(self.X_labeled, self.opt['n_neighbor'], mode='distance', include_self=False)
            W = W.maximum(W.T)
            W = sparse.csr_matrix(
                (np.exp(-W.data ** 2 / (4 * self.opt['t'])), W.indices, W.indptr),
                shape=(self.X_labeled.shape[0], self.X_labeled.shape[0])
            )
        else:
            raise Exception("Only 'distance' mode supported")

        L = sparse.diags(np.array(W.sum(0))[0]).tocsr() - W
        K = self.opt['kernel_function'](self.X_labeled, self.X_labeled, **self.opt['kernel_parameters'])
        l = X.shape[0]
        I_l = np.identity(l)

        self.alpha = np.linalg.inv(
            K + self.opt['gamma_A'] * l * I_l + (self.opt['gamma_I'] * l / (l ** 2)) * L.dot(K)
        ).dot(self.Y_labeled)

    def decision_function(self, X):
        new_K = self.opt['kernel_function'](self.X_labeled, X, **self.opt['kernel_parameters'])
        return np.squeeze(self.alpha.dot(new_K))

# -----------------------
# Hyperparameters
# -----------------------
opt = {
    'neighbor_mode': 'distance',
    'n_neighbor': 3,
    't': 1,
    'gamma_A': 0.0001,
    'gamma_I': 1,
    'kernel_function': polynomial_kernel,
    'kernel_parameters': {
        'gamma': 1,
        'coef0': 1,
        'degree': 3
    }
}

# -----------------------
# Load and preprocess data
# -----------------------
DATA_PATH = '1000_cylindrical_anomalies.csv'  # Ensure this file is in your working directory
df = pd.read_csv(DATA_PATH)
X = np.abs(df.iloc[:, :22].values)
y = df.iloc[:, -1].values
X = MinMaxScaler().fit_transform(X)

# -----------------------
# Run experiments on seeds 1 to 5
# -----------------------
results = []
seeds = [99, 64, 94, 78, 87]

for seed in seeds:
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.8, random_state=seed)
    _, X_test, _, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed)

    model = LapRLS(opt)
    model.fit(X_train, y_train)
    y_pred = model.decision_function(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    results.append({'Seed': seed, 'R2 Score': r2, 'MSE': mse})

# -----------------------
# Compute mean and std
# -----------------------
results_df = pd.DataFrame(results)
mean_row = results_df[['R2 Score', 'MSE']].mean()
std_row = results_df[['R2 Score', 'MSE']].std()

# Create labeled rows for mean and std
mean_row_df = pd.DataFrame([['Mean', *mean_row]], columns=results_df.columns)
std_row_df = pd.DataFrame([['Std', *std_row]], columns=results_df.columns)

# Append to results
results_df = pd.concat([results_df, mean_row_df, std_row_df], ignore_index=True)

# -----------------------
# Show Results
# -----------------------
print("\nPerformance across seeds with Polynomial Kernel (degree=3):")
print(results_df)


Performance across seeds with Polynomial Kernel (degree=3):
   Seed  R2 Score       MSE
0    99  0.913597  0.299652
1    64  0.943673  0.208199
2    94  0.925197  0.240109
3    78  0.901850  0.332412
4    87  0.910133  0.329675
5  Mean  0.918890  0.282009
6   Std  0.016194  0.055514
