In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV

# src
from src.medidas_desempeno_biclase import calcula_medidas_biclase

In [28]:
semilla = 42

ruta = 'data/Electricity_limpio.csv'
df = pd.read_csv(ruta)
df.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,class
0,0.830492,0.5,0.234043,0.143371,0.259799,1
1,0.033902,1.0,0.553191,0.025308,0.26832,0
2,0.864394,0.0,0.765957,0.432321,0.805719,1
3,0.118751,0.333333,0.93617,0.339449,0.552735,1
4,0.813637,0.166667,0.170213,0.010216,0.138988,0


In [29]:
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size = .20,
    stratify = y,
    random_state = semilla)

print('===== Dimensiones =====')
print(f'X       : {X.shape}')
print(f'y       : {y.shape}')
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test  : {X_test.shape}')
print(f'y_test  : {y_test.shape}')

===== Dimensiones =====
X       : (2400, 5)
y       : (2400,)
X_train : (1920, 5)
y_train : (1920,)
X_test  : (480, 5)
y_test  : (480,)


In [30]:
pipeline = Pipeline([
    ('knn', KNeighborsClassifier())
])

param_grid = {
     'knn__n_neighbors': [3, 5, 7, 9],
     'knn__metric': ['euclidean']
}

skf = StratifiedGroupKFold(n_splits = 10, 
                           shuffle = True,
                           random_state = semilla)

grid_search = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    cv = 10,
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 1,
    return_train_score = True
)

print(grid_search)

GridSearchCV(cv=10, estimator=Pipeline(steps=[('knn', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'knn__metric': ['euclidean'],
                         'knn__n_neighbors': [3, 5, 7, 9]},
             return_train_score=True, scoring='accuracy', verbose=1)


In [31]:
grid_search.fit(X_train, y_train)
print(f'Mejores parámetros: {grid_search.best_params_}')
print(f'Mejor score       : {np.round(grid_search.best_score_, 4)}')

Fitting 10 folds for each of 4 candidates, totalling 40 fits


Mejores parámetros: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3}
Mejor score       : 0.8885


In [34]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [36]:
medidas = calcula_medidas_biclase(y_test, y_pred)
medidas

Unnamed: 0,Medida,Valor
0,Accuracy,0.89375
1,Error Rate,0.10625
2,Recall (Sensitivity),0.860825
3,Specificity,0.916084
4,Balanced Accuracy,0.888454
5,Precision,0.874346
6,F1 Score,0.867532
7,MCC,0.778912
