In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
from vkoga.vkoga_2L import VKOGA_2L
from vkoga import tkernels
from vkoga import kernels
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
import pandas as pd

In [2]:
# Zur Reproduzierbarkeit der Daten
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Daten laden und skalieren oder Standatisieren je nach Vergleich
protein = pd.read_csv("Datasets/CASP.csv")
# print(protein.head())
X = protein.iloc[:, 1:].values
y = protein.iloc[:, :1].values

# K-fold mit k=5
kf = KFold(n_splits=5, shuffle=True, random_state=42)

standartisieren = False
if standartisieren:
    X = (X - X.mean(axis=0, keepdims=True)) / (X.std(axis=0, keepdims=True) + 1e-30)
    X = 5 * np.tanh(0.2 * X)

    y = (y - y.mean(axis=0, keepdims=True)) / (y.std(axis=0, keepdims=True) + 1e-30)
scale = True
if scale:
    #y = y.reshape(-1, 1)
    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    X = X_scaled

    #scaler_y = StandardScaler()
    #y_scaled = scaler_y.fit_transform(y)

#print(X.shape)
#print(y.shape)
#print(X[42])

In [4]:
# 2L VKOGA und VKOGA initialisieren und fitten über kFold
rmse_values_matrix = []
rmse_values_noMatrix = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model_matrix = VKOGA_2L(
        kernel=[kernels.Matern(k=1), tkernels.Matern(k=1)], # quadratic Matern kernel used
        flag_2L_optimization=True,
        verbose=False,
        greedy_type='f_greedy',
        reg_par=0,
        restr_par=1e-2,
        tol_f=1e-10,
        tol_p=1e-10,
        reg_para_optim=1e-5,
        learning_rate=5e-3,
        n_epochs_optim=25,
        batch_size=64,
        flag_optim_verbose=False
    )

    model_matrix.fit(X_train, y_train, maxIter=1000)
    
    predictions_matrix = model_matrix.predict(X_test)
    
    rmse_matrix = np.sqrt(np.mean((predictions_matrix - y_test) ** 2))
    rmse_values_matrix.append(rmse_matrix)
    
    print(f"Fold 2L RMSE: {rmse_matrix:.6f}")
    
    
    ep_values = [0.01, 0.05, 0.1, 0.5, 1.0, 10.0]
    rmse_list_noMatrix = []

    for i, ep in enumerate(ep_values):
        model_noMatrix = VKOGA_2L(
            kernel=kernels.Matern(k=1, ep=ep),
            flag_2L_optimization=False,
            verbose=False,
            greedy_type='f_greedy',
            reg_par=0,
            restr_par=1e-2,
            tol_f=1e-10,
            tol_p=1e-10,
            flag_optim_verbose=False
        )
        model_noMatrix.fit(X_train, y_train, maxIter=1000)
        predictions_noMatrix = model_noMatrix.predict(X_test)
        rmse_noMatrix = np.sqrt(np.mean((predictions_noMatrix - y_test) ** 2))
        rmse_list_noMatrix.append(rmse_noMatrix)
    
    min_rmse_noMatrix = min(rmse_list_noMatrix)
    rmse_values_noMatrix.append(min_rmse_noMatrix)
    best_ep = rmse_list_noMatrix.index(min_rmse_noMatrix)
    print("Kleinster MSE von VKOGA mit ", min_rmse_noMatrix.round(decimals=8), "von ep Wert: ", ep_values[best_ep])
    print("Kleinster MSE von 2L-VKOGA mit ", rmse_matrix.round(decimals=8))
    
    
    
mean_rmse_matrix = np.mean(rmse_values_matrix)
std_rmse_matrix = np.std(rmse_values_matrix)

print(f"\nRMSE pro Fold: {rmse_values_matrix}")
print(f"Durchschnittlicher 2L RMSE: {mean_rmse_matrix:.6f} ± {std_rmse_matrix:.6f}")

mean_rmse_noMatrix = np.mean(rmse_values_noMatrix)
std_rmse_noMatrix = np.std(rmse_values_noMatrix)

print(f"\nRMSE pro Fold: {rmse_values_noMatrix}")
print(f"Durchschnittlicher RMSE: {mean_rmse_noMatrix:.6f} ± {std_rmse_noMatrix:.6f}")

Fold 2L RMSE: 5.340219
Kleinster MSE von VKOGA mit  5.55577846 von ep Wert:  1.0
Kleinster MSE von 2L-VKOGA mit  5.34021898
Fold 2L RMSE: 5.271903
Kleinster MSE von VKOGA mit  5.45869623 von ep Wert:  1.0
Kleinster MSE von 2L-VKOGA mit  5.27190314
Fold 2L RMSE: 5.420064
Kleinster MSE von VKOGA mit  5.53298274 von ep Wert:  1.0
Kleinster MSE von 2L-VKOGA mit  5.42006421
Fold 2L RMSE: 5.242145
Kleinster MSE von VKOGA mit  5.59223383 von ep Wert:  1.0
Kleinster MSE von 2L-VKOGA mit  5.24214512
Fold 2L RMSE: 5.405743
Kleinster MSE von VKOGA mit  5.52936842 von ep Wert:  1.0
Kleinster MSE von 2L-VKOGA mit  5.40574341

RMSE pro Fold: [5.34021897827658, 5.27190314468769, 5.42006421061713, 5.242145121836921, 5.405743406352335]
Durchschnittlicher 2L RMSE: 5.336015 ± 0.070521

RMSE pro Fold: [5.555778458053573, 5.4586962303548985, 5.532982744760309, 5.592233830308974, 5.529368421635444]
Durchschnittlicher RMSE: 5.533812 ± 0.043723
