In [1]:
# 📦 Vereiste imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn preprocessing, feature selection, and decomposition
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, RFECV, SequentialFeatureSelector, f_classif

# Scikit-learn models and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import (
    StratifiedKFold,
    train_test_split,
    cross_validate,
    cross_val_score,
    GridSearchCV,
)
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.pipeline import Pipeline


# Additional imports
import pickle

# 🧾 Stap 1 – Laad de data
data = pd.read_excel("..\..\TrainData.xlsx")

# 🔍 Eerste inspectie
print("Vorm van de data:", data.shape)
print("Kolommen:", data.columns.tolist())
print("Aantal duplicaten:", data.duplicated().sum())
print("Missende waarden per kolom:")
print(data.isnull().sum())

# 🧼 Stap 2 – Dubbele rijen verwijderen
data = data.drop_duplicates()

# 🎯 Stap 3 – Split in features en labels
X = data.drop(columns="label")
y = data["label"]

# 🔁 Zet y om naar numeriek met behoud van pandas Series
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(y), index=y.index)

# ❓ Hoeveel missende waarden blijven over?
print("Totaal aantal missende waarden:", X.isnull().sum().sum())

# ⚠️ Stap 4 – NaNs imputer (mediaan)
X = X.fillna(X.median(numeric_only=True))


#Data normalisatie

X_Scaled = StandardScaler().fit_transform(X)




  data = pd.read_excel("..\..\TrainData.xlsx")
  data = pd.read_excel("..\..\TrainData.xlsx")


FileNotFoundError: [Errno 2] No such file or directory: '..\\..\\TrainData.xlsx'

In [None]:
#Extra function to include RandomForest feature importance in KNN

class KNNWithFeatureImportance(KNeighborsClassifier):
    def __init__(self, n_neighbors=5, **kwargs):
        super().__init__(n_neighbors=n_neighbors, **kwargs)
        self.feature_importances_ = None
    
    def fit(self, X, y):
        # Call the original fit method
        super().fit(X, y)
        
        # Create a simple feature importance based on feature variance
        # This is just one possible approach - you could use other methods
        rf_model = RandomForestClassifier(n_estimators=50)
        rf_model.fit(X, y)
        
        self.feature_importances_ = rf_model.feature_importances_
        
        return self



In [None]:
# The following code defines two pipelines for hyperparameter tuning using GridSearchCV.
# Both pipelines aim to optimize the K-Nearest Neighbors (KNN) classifier with different feature selection methods.

# Pipeline 1:
# - Uses SelectKBest for univariate feature selection.
# - Applies Recursive Feature Elimination with Cross-Validation (RFECV) to further refine feature selection.
# - Reduces dimensionality using PCA.
# - Trains the KNN classifier with a custom feature importance implementation (KNNWithFeatureImportance).
# The hyperparameters for SelectKBest, PCA, and KNN are tuned using GridSearchCV.

# Pipeline 2:
# - Similar to Pipeline 1 but replaces RFECV with Sequential Feature Selector for feature selection.
# - Trains a standard KNN classifier.
# The hyperparameters for SelectKBest, PCA, and KNN are also tuned using GridSearchCV.

# Both pipelines save their respective GridSearchCV results


knnIMPORTANCE = KNNWithFeatureImportance()

# Define the pipeline 1
pipeline = Pipeline([
    ('feature_selection_1', SelectKBest()), 
    ('feature_selection_2', RFECV(knnIMPORTANCE, step=1, cv=StratifiedKFold(5), scoring='roc_auc')),
    ('pca', PCA()),  # PCA to reduce dimensionality
    ('knn', knnIMPORTANCE)
])

param_grid = {
    "feature_selection_1__k": [80, 90, 100, 110, 120],
    "knn__n_neighbors": [5, 7, 9, 11, 15, 20],
    "pca__n_components": [10, 12, 14, 14+2, 18, 20, 0.9999]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)

#new tested_score
nested_score = cross_val_score(grid_search, X=, y=y_iris, cv=outer_cv)
nested_scores[i] = nested_score.mean()


grid_search.fit(X_Scaled, y)

results_df = pd.DataFrame(grid_search.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel("Results_grid_search\grid_search_KNN_results_RFECV.xlsx", index=False)

#PIPELINE 2
KNN = KNeighborsClassifier()

pipeline_2 = Pipeline([
    ('feature_selection_1', SelectKBest()), 
    ('feature_selection_2',SequentialFeatureSelector(KNN, direction='forward', scoring='roc_auc', n_jobs=-1)),
    ('pca', PCA()),  # PCA to reduce dimensionality
    ('knn', KNN)
])

param_grid_2 = {
    "feature_selection_1__k": [80, 90, 100, 110, 120],
    "knn__n_neighbors": [5, 7, 9, 11, 15, 20],
    "pca__n_components": [10, 12, 14, 14+2, 18, 20, 0.9999]
}

grid_search_2 = GridSearchCV(pipeline_2, param_grid_2, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)

grid_search_2.fit(X_Scaled, y)

results_df = pd.DataFrame(grid_search_2.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel("Results_grid_search\grid_search_KNN_results_forward.xlsx", index=False)

In [None]:
# This code performs a final grid search to fine-tune the hyperparameters of the KNN classifier.
# It uses a pipeline with the following steps:
# - SelectKBest: Selects the top 80 features based on univariate statistical tests.
# - RFECV: Applies Recursive Feature Elimination with Cross-Validation to further refine feature selection.
# - PCA: Reduces the dimensionality of the data to 20 components.
# - KNNWithFeatureImportance: A custom KNN classifier that incorporates feature importance.

# The grid search optimizes the hyperparameters for SelectKBest, PCA, and KNN:
# - "feature_selection_1__k": Number of features to select in SelectKBest.
# - "knn__n_neighbors": Number of neighbors for the KNN classifier.
# - "pca__n_components": Number of principal components to retain in PCA.

# The results of the grid search are saved to an Excel file for further analysis.

knnIMPORTANCE = KNNWithFeatureImportance()

# Define the pipeline 1
pipeline = Pipeline([
    ('feature_selection_1', SelectKBest()), 
    ('feature_selection_2', RFECV(knnIMPORTANCE, step=1, cv=StratifiedKFold(5), scoring='roc_auc')),
    ('pca', PCA()),  # PCA to reduce dimensionality
    ('knn', knnIMPORTANCE)
])

param_grid = {
    "feature_selection_1__k": [50, 55, 60, 65, 70, 75, 80, 85],
    "knn__n_neighbors": [3, 5],
    "pca__n_components": [10, 12, 14, 16, 18, 20]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-2)

grid_search.fit(X_Scaled, y)

results_df = pd.DataFrame(grid_search.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel("grid_search_KNN_results_RECV_2.xlsx", index=False)

In [None]:
# This code defines a pipeline for further hyperparameter tuning of a KNN classifier with SequentialFeatureSelector:
# - Step 1: SelectKBest selects the top 80 features based on univariate statistical tests.
# - Step 2: SequentialFeatureSelector performs forward feature selection to refine the feature set.
# - Step 3: PCA reduces the dimensionality of the data.
# - Step 4: KNN classifier is trained on the processed features.
# A GridSearchCV is used to optimize the hyperparameters for SelectKBest, SequentialFeatureSelector, PCA, and KNN.
# The results of the grid search are saved to an Excel file for further analysis.



KNN = KNeighborsClassifier()

clf = SelectKBest(k=80)
clf.fit(X_Scaled, y)
X_k_best = clf.transform(X_Scaled)
selected_features = clf.get_support(indices=True)


pipeline_4 = Pipeline([
  #  ('feature_selection_1', SelectKBest()), 
    ('feature_selection_2',SequentialFeatureSelector(KNN, direction='forward',cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)),
  #  ('pca', PCA()),  # PCA to reduce dimensionality
    ('knn', KNN)
])

param_grid_4 = {
    #"feature_selection_1__k": [80],
    'feature_selection_2__n_features_to_select': [2, 3, 4, 5, 6, 7, 8, 9],
    "knn__n_neighbors": [5, 7, 9, 11, 15, 20],
   # "pca__n_components": [0.9999]
}

grid_search_4 = GridSearchCV(pipeline_4, param_grid_4, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)

grid_search_4.fit(X_k_best, y)

results_df = pd.DataFrame(grid_search_4.cv_results_)

# Save the DataFrame to an Excel file
results_df.to_excel(r"Results_grid_search\grid_search_KNN_results_forward_2.xlsx", index=False)

In [23]:
# Save the final trained KNN model along with feature masks and scaler

# Step 1: Standardize the features using StandardScaler
scaler_standard = StandardScaler()
X_Scaled = scaler_standard.fit_transform(X)

# Step 2: Perform feature selection using SelectKBest
selector_KBest = SelectKBest(k=80)  # Select the top 80 features
X_kbest = selector_KBest.fit_transform(X_Scaled, y)
X_kbest_features_mask = selector_KBest.get_support()  # Boolean mask of selected features

# Step 3: Initialize the custom KNN classifier with feature importance
#KNN = KNNWithFeatureImportance(n_neighbors=5)
KNN = KNeighborsClassifier(n_neighbors=9)

# Step 4: Perform Recursive Feature Elimination with Cross-Validation (RFECV)
#selector_RFECV = RFECV(KNN, step=1, cv=StratifiedKFold(5), scoring='roc_auc')
#X_selected = selector_RFECV.fit_transform(X_kbest, y)
#X_RFECV_features_mask = selector_RFECV.get_support()  # Boolean mask of selected features after RFECV
selector_forward = SequentialFeatureSelector(KNN, n_features_to_select= 9, direction='forward', cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
X_selected = selector_forward.fit_transform(X_kbest, y)
X_forward_features_mask = selector_forward.get_support()  # Boolean mask of selected features after RFECV

# Step 5: Combine the feature masks from SelectKBest and selector
X_features_mask_total = np.zeros(len(X_kbest_features_mask), dtype=bool)  # Initialize a mask with all False
indices_after_X_kbest_features_mask = np.where(X_kbest_features_mask)[0]  # Indices of features selected by SelectKBest
indices_to_keep = indices_after_X_kbest_features_mask[X_forward_features_mask]  # Apply  mask to SelectKBest features
X_features_mask_total[indices_to_keep] = True  # Update the combined mask

# Step 6: Train the final KNN model on the selected features
KNN.fit(X_selected, y)

# Step 7: Save the trained KNN model, combined feature mask, and scaler to disk
with open(r'KNNFinal\KNN_model.pkl', 'wb') as file:
    pickle.dump(KNN, file)  # Save the trained KNN model

with open(r'KNNFinal\selected_features_mask.pkl', 'wb') as file:
    pickle.dump(X_features_mask_total, file)  # Save the combined feature mask

with open(r'KNNFinal\KNN_Scaler.pkl', 'wb') as file:
    pickle.dump(scaler_standard, file)  # Save the scaler used for standardization

  f = msb / msw


In [None]:
#The following code was used to try different things and plot the graphs

#k-best
clf = SelectKBest(k=120)
clf.fit(X_Scaled, y)
X_k_best = clf.transform(X_Scaled)
selected_features = clf.get_support(indices=True)

#RFECV met KNN
clf = KNNWithFeatureImportance(n_neighbors=5)
selector = RFECV(clf, step=1, cv=StratifiedKFold(5), scoring='accuracy')
X_selected = selector.fit_transform(X_k_best, y)


# make a plot of the KNNWithFeatureImportance
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (ROC AUC)")
plt.title("Recursive Feature Elimination with Cross-Validation (KNN=5)")
plt.grid(True)
plt.plot(range(1, len(selector.cv_results_['mean_test_score']) + 1), selector.cv_results_['mean_test_score'])
plt.show()

#SequentialFeatureSelector
clf = KNeighborsClassifier(n_neighbors=9)
selector = SequentialFeatureSelector(clf, direction='forward', cv=StratifiedKFold(5))
X_selected = selector.fit_transform(X_k_best, y)

# make a plot of the KNNWithFeatureImportance
cv = StratifiedKFold(5)
scores = []
for n_features in range(1, X_k_best.shape[1] + 1):
    selector = SequentialFeatureSelector(clf, n_features_to_select=n_features, direction='forward', cv=cv)
    X_selected = selector.fit_transform(X_k_best, y)
    score = np.mean(cross_val_score(clf, X_selected, y, cv=cv, scoring='roc_auc'))
    scores.append(score)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (ROC AUC)")
plt.title("Forward Feature Elimination with Cross-Validation (KNN=9)")
plt.grid(True)
plt.plot(range(1, len(scores) + 1), scores)
plt.show()

#general testcode
KNN = KNeighborsClassifier(n_neighbors=4+1)
scores = cross_validate(KNN, X_selected, y, cv=StratifiedKFold(10), scoring=["roc_auc", "accuracy"])
print(f"AUC = {scores['test_roc_auc'].mean()} and accuracy = {scores['test_accuracy'].mean()}")


# using PCA to reduce the dimensionality of the data
from sklearn.decomposition import PCA
for i in [22]:
    pca = PCA(n_components=i) 
    X_pca = pca.fit_transform(X_selected)
    KNN = KNeighborsClassifier(n_neighbors=4+1)
    scores = cross_validate(KNN, X_pca, y, cv=StratifiedKFold(10), scoring=["roc_auc", "accuracy"])
    print(f"AUC = {scores['test_roc_auc'].mean()} and accuracy = {scores['test_accuracy'].mean()}")


# baseline test
scores = cross_validate(KNN, X, y, cv=StratifiedKFold(10), scoring=["roc_auc", "accuracy"])
print(f"AUC = {scores['test_roc_auc'].mean()} and accuracy = {scores['test_accuracy'].mean()}")

In [None]:
X_k_best.shape[1]

In [2]:
#k-best
clf = SelectKBest(k=80)
clf.fit(X_Scaled, y)
X_k_best = clf.transform(X_Scaled)
selected_features = clf.get_support(indices=True)


KNN = KNeighborsClassifier(n_neighbors=9)

  f = msb / msw


In [21]:
cv = StratifiedKFold(5)

selector = SequentialFeatureSelector(KNN, n_features_to_select=9, direction='forward', cv=cv, n_jobs=-1)
X_selected = selector.fit_transform(X_k_best, y)
np.mean(cross_val_score(KNN, X_selected, y, cv=cv, scoring='roc_auc'))

np.float64(0.8478806907378335)

In [22]:
pca = PCA(n_components=0.99) 
X_pca = pca.fit_transform(X_selected)


np.mean(cross_val_score(KNN, X_pca, y, cv=cv, scoring='roc_auc'))

np.float64(0.8552197802197803)

In [None]:
# make a plot of the SequentialFeatureSelector

cv = StratifiedKFold(5)
scores = []
for n_features in range(1, X_k_best.shape[1] + 1):
    selector = SequentialFeatureSelector(KNN, n_features_to_select=n_features, direction='forward', cv=cv, n_jobs=-1)
    X_selected = selector.fit_transform(X_k_best, y)
    score = np.mean(cross_val_score(KNN, X_selected, y, cv=cv, scoring='roc_auc'))
    scores.append(score)
    print(score)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (ROC AUC)")
plt.title("Forward Feature Elimination with Cross-Validation (KNN=9)")
plt.grid(True)
plt.plot(range(1, len(scores) + 1), scores)
plt.savefig(r"pictures\KNN_SequentialFeatureSelector.png")
plt.show()
