In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance

In [2]:
df = pd.read_csv("dataset/gas_turbine_fault_detection_simulated3.csv")  

df_fault = df[df["Fault"] == 1].copy()

cols_drop = ["Turbine ID", "TTC", "Fault", "Fault Mode"]  
X = df_fault.drop(columns=cols_drop)
y = df_fault["Fault Mode"]  # multiclase

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [3]:
f1_macro = make_scorer(f1_score, average="macro")

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier())
])

param_grid = [
    {
        "clf": [KNeighborsClassifier()],
        "clf__n_neighbors": [3, 5, 7, 9, 11, 15, 21],
        
        "clf__weights": ['uniform', 'distance'],
        
        "clf__metric": ['minkowski', 'euclidean', 'manhattan'],
        "clf__p": [1, 2],
        
        "clf__algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        
        "clf__leaf_size": [20, 30, 40]
    }
]

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=f1_macro,        
    cv=5,                    
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

grid.fit(X_train, y_train)

print("Mejores hiperparámetros:", grid.best_params_)
print("Mejor F1_macro CV:", grid.best_score_)


Fitting 5 folds for each of 1008 candidates, totalling 5040 fits
Mejores hiperparámetros: {'clf': KNeighborsClassifier(), 'clf__algorithm': 'auto', 'clf__leaf_size': 20, 'clf__metric': 'minkowski', 'clf__n_neighbors': 11, 'clf__p': 1, 'clf__weights': 'distance'}
Mejor F1_macro CV: 0.9468623598298421


In [4]:
best_clf = grid.best_estimator_

In [5]:
results_df = pd.DataFrame(grid.cv_results_)
results_df.columns = [col.replace('param_clf__', '').replace('param_', '') for col in results_df.columns]
results_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,clf,algorithm,leaf_size,metric,n_neighbors,p,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.036508,0.004031,0.833708,0.191803,KNeighborsClassifier(),auto,20,minkowski,3,1,...,0.942278,0.003452,469,0.971089,0.971346,0.972606,0.971041,0.970034,0.971223,0.000824
1,0.038454,0.002446,0.519755,0.098809,KNeighborsClassifier(),auto,20,minkowski,3,1,...,0.943224,0.003133,397,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.035422,0.005351,0.516195,0.113188,KNeighborsClassifier(),auto,20,minkowski,3,2,...,0.936045,0.004428,973,0.964977,0.96553,0.966485,0.965928,0.96437,0.965458,0.000734
3,0.035105,0.003724,0.343426,0.064435,KNeighborsClassifier(),auto,20,minkowski,3,2,...,0.936869,0.00391,937,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.030996,0.001645,0.609132,0.035909,KNeighborsClassifier(),auto,20,minkowski,5,1,...,0.945162,0.003125,289,0.963114,0.963058,0.964379,0.964319,0.963418,0.963658,0.000578
