In [2]:
# 📦 Vereiste imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn preprocessing, feature selection, and decomposition
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, RFECV, SequentialFeatureSelector, f_classif

# Scikit-learn models and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import (
    StratifiedKFold,
    train_test_split,
    cross_validate,
    cross_val_score,
    GridSearchCV,
)
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Additional imports
import pickle


# load data + feature_mask

data = pd.read_excel(r"..\..\TrainData.xlsx")

with open(r"..\..\mask_selected_features.pkl", "rb") as f:
    feature_mask = pickle.load(f)


# Data preprocessing

data = data.drop_duplicates()

# 🎯 Stap 3 – Split in features en labels
X = data.drop(columns="label")
y = data["label"]

# 🔁 Zet y om naar numeriek met behoud van pandas Series
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(y), index=y.index)

# ❓ Hoeveel missende waarden blijven over?
print("Totaal aantal missende waarden:", X.isnull().sum().sum())

# ⚠️ Stap 4 – NaNs imputer (mediaan)
X = X.fillna(X.median(numeric_only=True))


#Data selectie + normalisatie

X_Scaled = StandardScaler().fit_transform(X)
X_selected = X_Scaled[:, feature_mask]



Totaal aantal missende waarden: 0


In [54]:
#baseline test
clf = GaussianNB(var_smoothing=0.000043)
score = cross_val_score(clf, X_selected, y, cv=StratifiedKFold(5), scoring="accuracy")
print(score)
print(np.mean(score))

[0.64285714 0.71428571 0.64285714 0.82142857 0.74074074]
0.7124338624338623


In [55]:
# (DEZE GEBRUIKEN) difficult nested score to see best params. Perform grid-search with nested cross-validation for hyperparameter tuning

NUM_TRIALS = 100
nested_scores = np.zeros(NUM_TRIALS)
Gaussian = GaussianNB()


param_grid = {
    #"var_smoothing": [2,3,4, 1, 0.1, 0.001, 0.0001],
    "var_smoothing": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
}

nested_scores = np.zeros(NUM_TRIALS)
all_best_params = []

for i in range(NUM_TRIALS):
    # Create outer and inner cross-validation objects
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # Store the best parameters and scores for each outer fold
    best_params = []
    outer_scores = []

    for train_idx, test_idx in outer_cv.split(X_selected, y):
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Create a new GridSearchCV for each inner fold
        clf = GridSearchCV(Gaussian, param_grid, cv=inner_cv, scoring="accuracy", n_jobs=-1)
        
        # Fit GridSearchCV on the training data
        clf.fit(X_train, y_train)
        
        # Get the best model from inner CV
        best_model = clf.best_estimator_
        
        # Evaluate the best model on the outer test fold
        y_pred = best_model.predict_proba(X_test)[:, 1]
        outer_score = roc_auc_score(y_test, y_pred)
        
        # Append the best parameters and outer score
        best_params.append(clf.best_params_)
        outer_scores.append(outer_score)
    
    # Store the mean score for this trial
    nested_scores[i] = np.mean(outer_scores)
    all_best_params.append([i, best_params])

test = [best_params for _, best_params in all_best_params]

df = pd.DataFrame(test, columns=["inner_1", "inner_2", "inner_3", "inner_4", "inner_5"])

with open(r"output_grid_searches\Bayes_nested_scores.pkl", "wb") as f:
    pickle.dump(nested_scores, f)

with open(r"output_grid_searches\Bayes_best_params.pkl", "wb") as f:
    pickle.dump(df, f)

# Print results
print(f"Average performance across {NUM_TRIALS} trials: {np.mean(nested_scores):.4f} ± {np.std(nested_scores):.4f}")


Average performance across 100 trials: 0.8552 ± 0.0227


In [56]:
df

Unnamed: 0,inner_1,inner_2,inner_3,inner_4,inner_5
0,{'var_smoothing': 1},{'var_smoothing': 2},{'var_smoothing': 2},{'var_smoothing': 1},{'var_smoothing': 3}
1,{'var_smoothing': 1},{'var_smoothing': 1},{'var_smoothing': 3},{'var_smoothing': 1},{'var_smoothing': 8}
2,{'var_smoothing': 1},{'var_smoothing': 1},{'var_smoothing': 1},{'var_smoothing': 3},{'var_smoothing': 1}
3,{'var_smoothing': 1},{'var_smoothing': 5},{'var_smoothing': 1},{'var_smoothing': 7},{'var_smoothing': 1}
4,{'var_smoothing': 1},{'var_smoothing': 5},{'var_smoothing': 4},{'var_smoothing': 3},{'var_smoothing': 1}
...,...,...,...,...,...
95,{'var_smoothing': 1},{'var_smoothing': 4},{'var_smoothing': 2},{'var_smoothing': 1},{'var_smoothing': 8}
96,{'var_smoothing': 1},{'var_smoothing': 2},{'var_smoothing': 3},{'var_smoothing': 1},{'var_smoothing': 1}
97,{'var_smoothing': 3},{'var_smoothing': 2},{'var_smoothing': 1},{'var_smoothing': 5},{'var_smoothing': 1}
98,{'var_smoothing': 3},{'var_smoothing': 2},{'var_smoothing': 2},{'var_smoothing': 4},{'var_smoothing': 1}


In [None]:
##TEST met standardScaler. 

# difficult nested score to see best params. Perform grid-search with nested cross-validation for hyperparameter tuning

NUM_TRIALS = 100
nested_scores = np.zeros(NUM_TRIALS)
Gaussian = GaussianNB()

pipeline = Pipeline([
    ("scaler", StandardScaler()),  # Add a scaler to the pipeline
    ("clf", GaussianNB()),
])

param_grid = {
    #"var_smoothing": [2,3,4, 1, 0.1, 0.001, 0.0001],
    "clf__var_smoothing": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
}

nested_scores = np.zeros(NUM_TRIALS)
all_best_params = []

for i in range(NUM_TRIALS):
    # Create outer and inner cross-validation objects
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # Store the best parameters and scores for each outer fold
    best_params = []
    outer_scores = []

    for train_idx, test_idx in outer_cv.split(X_selected, y):
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Create a new GridSearchCV for each inner fold
        clf = GridSearchCV(pipeline, param_grid, cv=inner_cv, scoring="roc_auc", n_jobs=-1)
        
        # Fit GridSearchCV on the training data
        clf.fit(X_train, y_train)
        
        # Get the best model from inner CV
        best_model = clf.best_estimator_
        
        # Evaluate the best model on the outer test fold
        y_pred = best_model.predict_proba(X_test)[:, 1]
        outer_score = roc_auc_score(y_test, y_pred)
        
        # Append the best parameters and outer score
        best_params.append(clf.best_params_)
        outer_scores.append(outer_score)
    
    # Store the mean score for this trial
    nested_scores[i] = np.mean(outer_scores)
    all_best_params.append([i, best_params])

test = [best_params for _, best_params in all_best_params]

df = pd.DataFrame(test, columns=["inner_1", "inner_2", "inner_3", "inner_4", "inner_5"])

# Print results
print(f"Average performance across {NUM_TRIALS} trials: {np.mean(nested_scores):.4f} ± {np.std(nested_scores):.4f}")


Average performance across 100 trials: 0.8702 ± 0.0212


In [7]:
df

Unnamed: 0,inner_1,inner_2,inner_3,inner_4,inner_5
0,{'clf__var_smoothing': 7},{'clf__var_smoothing': 7},{'clf__var_smoothing': 15},{'clf__var_smoothing': 4},{'clf__var_smoothing': 6}
1,{'clf__var_smoothing': 7},{'clf__var_smoothing': 8},{'clf__var_smoothing': 11},{'clf__var_smoothing': 9},{'clf__var_smoothing': 2}
2,{'clf__var_smoothing': 8},{'clf__var_smoothing': 11},{'clf__var_smoothing': 15},{'clf__var_smoothing': 14},{'clf__var_smoothing': 12}
3,{'clf__var_smoothing': 15},{'clf__var_smoothing': 14},{'clf__var_smoothing': 13},{'clf__var_smoothing': 14},{'clf__var_smoothing': 3}
4,{'clf__var_smoothing': 10},{'clf__var_smoothing': 6},{'clf__var_smoothing': 15},{'clf__var_smoothing': 12},{'clf__var_smoothing': 10}
...,...,...,...,...,...
95,{'clf__var_smoothing': 14},{'clf__var_smoothing': 9},{'clf__var_smoothing': 8},{'clf__var_smoothing': 12},{'clf__var_smoothing': 5}
96,{'clf__var_smoothing': 7},{'clf__var_smoothing': 3},{'clf__var_smoothing': 15},{'clf__var_smoothing': 5},{'clf__var_smoothing': 12}
97,{'clf__var_smoothing': 11},{'clf__var_smoothing': 14},{'clf__var_smoothing': 6},{'clf__var_smoothing': 14},{'clf__var_smoothing': 12}
98,{'clf__var_smoothing': 14},{'clf__var_smoothing': 12},{'clf__var_smoothing': 9},{'clf__var_smoothing': 11},{'clf__var_smoothing': 12}


In [36]:
# Calculate the confidence interval for the nested scores

confidence = 0.95  # 95% confidence level
mean_score = np.mean(nested_scores)
std_error = np.std(nested_scores, ddof=1) / np.sqrt(len(nested_scores))  # Standard error of the mean
confidence_interval_lower = mean_score - (std_error * 1.96)  # Z-score for 95% confidence interval
confidence_interval_upper = mean_score + (std_error * 1.96)  # Z-score for 95% confidence interval

print(f"gemiddeld = {mean_score} 95% Confidence Interval: {confidence_interval_lower:.4f} to {confidence_interval_upper:.4f}")

gemiddeld = 0.8674458398744112 95% Confidence Interval: 0.8633 to 0.8716


In [57]:
best_params = pd.Series(df.values.ravel())
best_params.value_counts()[0:10]

{'var_smoothing': 1}     287
{'var_smoothing': 2}      97
{'var_smoothing': 3}      42
{'var_smoothing': 4}      23
{'var_smoothing': 5}      21
{'var_smoothing': 6}       9
{'var_smoothing': 8}       5
{'var_smoothing': 7}       4
{'var_smoothing': 13}      3
{'var_smoothing': 12}      2
Name: count, dtype: int64

In [45]:
# train and save final model

# Save the final trained bayes model along with feature masks and scaler

# Step 1: Standardize and select features
scaler_standard = StandardScaler()
scaler_standard.fit(X)
X_Scaled = scaler_standard.transform(X)
X_selected = X_Scaled[:, feature_mask]



# Step 3: Initialize the custom bayes classifier
Bayes = GaussianNB(var_smoothing=2)


# Step 4: Train the final KNN model on the selected features
Bayes.fit(X_selected, y)

# Step 7: Save the trained KNN model, PCA transformer and scaler to disk
with open(r'BayesFinal\Bayes_model.pkl', 'wb') as file:
    pickle.dump(Bayes, file)  # Save the trained KNN model


with open(r'BayesFinal\Bayes_Scaler.pkl', 'wb') as file:
    pickle.dump(scaler_standard, file)  # Save the scaler used for standardization
