In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

# Modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC  
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB  
from sklearn.neural_network import MLPClassifier   
from sklearn.ensemble import AdaBoostClassifier              
from lightgbm import LGBMClassifier

# src
from src.medidas_desempeno_biclase import calcula_medidas_biclase

# Shapiro-Wilk
from scipy.stats import shapiro

In [64]:
# Seed for the stochastic steps of the notebook (e.g. the train/test split).
semilla = 42

# Load the dataset from the CSV at `ruta` and preview its first rows.
ruta = 'data/Electricity_limpio.csv'
df = pd.read_csv(filepath_or_buffer=ruta)
df.head(n=5)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,class
0,0.830492,0.5,0.234043,0.143371,0.259799,1
1,0.033902,1.0,0.553191,0.025308,0.26832,0
2,0.864394,0.0,0.765957,0.432321,0.805719,1
3,0.118751,0.333333,0.93617,0.339449,0.552735,1
4,0.813637,0.166667,0.170213,0.010216,0.138988,0


In [65]:
# Separate the target column `class` from the predictor columns.
y = df['class']
X = df.drop(columns='class')

# Hold out 20% of the rows for testing; the split is stratified on `y`
# and seeded with `semilla` so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=semilla
)

# Report the shape of every array (labels left-aligned to 8 characters).
print('===== Dimensiones =====')
print('\n'.join(
    f'{nombre:<8}: {datos.shape}'
    for nombre, datos in (
        ('X', X),
        ('y', y),
        ('X_train', X_train),
        ('y_train', y_train),
        ('X_test', X_test),
        ('y_test', y_test),
    )
))

===== Dimensiones =====
X       : (2400, 5)
y       : (2400,)
X_train : (1920, 5)
y_train : (1920,)
X_test  : (480, 5)
y_test  : (480,)


## Modelos

In [66]:
print('========== KNN ==========')

# Single-step pipeline: 3-nearest-neighbours with Euclidean distance.
pipeline = Pipeline(steps=[
    ('knn', KNeighborsClassifier(n_neighbors=3, metric='euclidean')),
])

# 5-fold cross-validation on the training split only. Train scores are
# returned too, so they can be compared with the validation scores.
results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'recall', 'f1'],
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# One row per fold, one column per timing/score.
df_KNN = pd.DataFrame(data=results)
df_KNN



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.009514,0.034689,0.835938,0.952474,0.76129,0.917874,0.789298,0.939819
1,0.008922,0.032217,0.888021,0.944661,0.877419,0.922705,0.863492,0.93095
2,0.009722,0.054658,0.888021,0.94987,0.832258,0.924316,0.857143,0.937143
3,0.009707,0.035704,0.877604,0.953776,0.812903,0.937198,0.842809,0.94251
4,0.008127,0.03205,0.882812,0.946615,0.852564,0.925806,0.855305,0.933333


In [67]:
print('========== LR ==========')

# Single-step pipeline: L2-regularised logistic regression with C = 1.0.
pipeline = Pipeline(steps=[
    ('log_reg', LogisticRegression(C=1.0, penalty='l2')),
])

# 5-fold cross-validation on the training split only. Train scores are
# returned too, so they can be compared with the validation scores.
results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'recall', 'f1'],
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# One row per fold, one column per timing/score.
df_LR = pd.DataFrame(data=results)
df_LR

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.038125,0.021,0.802083,0.799479,0.645161,0.636071,0.724638,0.71949
1,0.023911,0.014997,0.815104,0.797526,0.658065,0.639291,0.741818,0.718552
2,0.031104,0.015197,0.776042,0.80013,0.6,0.653784,0.683824,0.725648
3,0.032924,0.021593,0.786458,0.794271,0.580645,0.636071,0.687023,0.714286
4,0.022521,0.011658,0.799479,0.794922,0.679487,0.632258,0.733564,0.713376


In [68]:
print('========== RF ==========')

# Random forest with 50 trees and depth capped at 10.
# The seed is pinned to the notebook-wide `semilla`: without it the forest
# is re-randomised on every execution and the per-fold scores below (and
# everything derived from them) cannot be reproduced.
pipeline = Pipeline([
    ('rf', RandomForestClassifier(
        n_estimators = 50,
        max_depth = 10,
        random_state = semilla
    ))
])

# 5-fold cross-validation on the training split only; train scores are
# returned as well so over-fitting can be spotted.
results = cross_validate(
    estimator = pipeline,
    X = X_train,
    y = y_train,
    cv = 5,
    scoring = ['accuracy', 'recall', 'f1'],
    n_jobs = -1,
    verbose = 1,
    return_train_score = True
)

# One row per fold, one column per timing/score.
df_RF = pd.DataFrame(results)
df_RF

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.5s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.171813,0.020681,0.966146,0.998047,0.948387,0.996779,0.957655,0.997583
1,0.171324,0.016459,0.947917,0.998047,0.948387,0.99839,0.936306,0.997586
2,0.240626,0.029508,0.955729,0.998698,0.941935,0.99839,0.944984,0.99839
3,0.212316,0.025669,0.96875,0.996094,0.954839,0.995169,0.961039,0.995169
4,0.2237,0.020538,0.934896,0.996094,0.923077,0.993548,0.920128,0.995153


In [69]:
print('========== XGB ==========')

# Gradient-boosted trees: 100 boosting rounds, depth-3 trees.
# The seed is set explicitly from `semilla` so this model is seeded the
# same way as the other seedable models in the notebook and stays
# reproducible if sub-sampling options are enabled later.
pipeline = Pipeline([
    ('xgb', XGBClassifier(
        n_estimators = 100,
        max_depth = 3,
        random_state = semilla
    ))
])

# 5-fold cross-validation on the training split only; train scores are
# returned as well so over-fitting can be spotted.
results = cross_validate(
    estimator = pipeline,
    X = X_train,
    y = y_train,
    cv = 5,
    scoring = ['accuracy', 'recall', 'f1'],
    n_jobs = -1,
    verbose = 1,
    return_train_score = True
)

# One row per fold, one column per timing/score.
df_XGB = pd.DataFrame(results)
df_XGB

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.067405,0.014657,0.963542,0.992188,0.929032,0.983897,0.953642,0.990276
1,0.079535,0.01342,0.955729,0.994141,0.96129,0.990338,0.946032,0.992736
2,0.043428,0.012864,0.963542,0.991536,0.948387,0.985507,0.954545,0.989491
3,0.060918,0.017303,0.966146,0.992839,0.954839,0.985507,0.957929,0.991093
4,0.043438,0.012915,0.950521,0.992839,0.935897,0.987097,0.938907,0.991093


In [70]:
print('========== SVM ==========')

# Two-step pipeline: standardise the features, then fit an RBF-kernel SVC.
# Keeping the scaler inside the pipeline means it is re-fitted on the
# training part of each CV fold.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=1.0, gamma='scale')),
])

# 5-fold cross-validation on the training split only. Train scores are
# returned too, so they can be compared with the validation scores.
results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'recall', 'f1'],
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# One row per fold, one column per timing/score.
df_SVM = pd.DataFrame(data=results)
df_SVM



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.06405,0.024056,0.820312,0.844401,0.729032,0.73591,0.766102,0.792715
1,0.066823,0.025262,0.835938,0.834635,0.767742,0.755233,0.790698,0.786913
2,0.080187,0.033706,0.822917,0.845052,0.709677,0.747182,0.763889,0.795883
3,0.082754,0.034106,0.820312,0.834635,0.709677,0.761675,0.761246,0.788333
4,0.08046,0.034212,0.828125,0.832031,0.730769,0.716129,0.77551,0.774869


In [71]:
print('========== DECISION TREE ==========')

# Unpruned decision tree (no depth limit, minimum split/leaf sizes of 2/1).
# The seed comes from the notebook-wide `semilla` instead of a second
# hard-coded 42, so changing the seed in one place affects every model.
pipeline = Pipeline([
    ('dt', DecisionTreeClassifier(
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=semilla
    ))
])

# 5-fold cross-validation on the training split only; train scores are
# returned as well so over-fitting can be spotted.
results = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'recall', 'f1'],
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# One row per fold, one column per timing/score.
df_DT = pd.DataFrame(results)
df_DT



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.009784,0.011522,0.953125,1.0,0.922581,1.0,0.940789,1.0
1,0.012041,0.017084,0.955729,1.0,0.948387,1.0,0.945338,1.0
2,0.011855,0.00909,0.950521,1.0,0.954839,1.0,0.939683,1.0
3,0.01029,0.010569,0.971354,1.0,0.96129,1.0,0.964401,1.0
4,0.013036,0.014746,0.960938,1.0,0.955128,1.0,0.952077,1.0


In [72]:
print('========== NAIVE BAYES ==========')

# Two-step pipeline: standardise the features, then fit Gaussian naive Bayes
# with a variance-smoothing term of 1e-9.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('nb', GaussianNB(var_smoothing=1e-9)),
])

# 5-fold cross-validation on the training split only. Train scores are
# returned too, so they can be compared with the validation scores.
results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'recall', 'f1'],
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# One row per fold, one column per timing/score.
df_NB = pd.DataFrame(data=results)
df_NB

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.011367,0.017383,0.802083,0.789714,0.63871,0.607085,0.722628,0.700093
1,0.011674,0.01721,0.809896,0.789062,0.645161,0.616747,0.732601,0.702752
2,0.01102,0.016906,0.783854,0.797526,0.612903,0.628019,0.695971,0.71494
3,0.008124,0.012843,0.78125,0.792969,0.56129,0.623188,0.674419,0.708791
4,0.008736,0.013058,0.789062,0.796224,0.641026,0.622581,0.711744,0.711521


In [73]:
print('========== MLP ==========')

# Two-step pipeline: standardise the features, then fit a multilayer
# perceptron with one hidden layer of 100 ReLU units trained with Adam.
# The seed comes from the notebook-wide `semilla` instead of a second
# hard-coded 42, so changing the seed in one place affects every model.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(
        hidden_layer_sizes=(100,),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        random_state=semilla
    ))
])

# 5-fold cross-validation on the training split only; train scores are
# returned as well so over-fitting can be spotted.
results = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'recall', 'f1'],
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# One row per fold, one column per timing/score.
df_MLP = pd.DataFrame(results)
df_MLP



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,1.547797,0.009113,0.841146,0.851562,0.774194,0.792271,0.797342,0.811881
1,1.049342,0.010641,0.84375,0.846354,0.793548,0.780998,0.803922,0.804312
2,1.426073,0.009799,0.830729,0.854167,0.729032,0.793881,0.776632,0.814876
3,1.196667,0.01081,0.825521,0.851562,0.741935,0.784219,0.774411,0.810316
4,1.444928,0.010778,0.830729,0.845703,0.794872,0.779032,0.792332,0.802993


In [74]:
print('========== ADABOOST ==========')

# AdaBoost with 50 boosting rounds and a learning rate of 1.0.
# The seed comes from the notebook-wide `semilla` instead of a second
# hard-coded 42, so changing the seed in one place affects every model.
pipeline = Pipeline([
    ('ab', AdaBoostClassifier(
        n_estimators=50,
        learning_rate=1.0,
        random_state=semilla
    ))
])

# 5-fold cross-validation on the training split only; train scores are
# returned as well so over-fitting can be spotted.
results = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'recall', 'f1'],
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# One row per fold, one column per timing/score.
df_ABOOST = pd.DataFrame(results)
df_ABOOST

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,0.224412,0.028964,0.880208,0.876302,0.877419,0.884058,0.855346,0.852484
1,0.179239,0.025048,0.869792,0.878255,0.858065,0.867955,0.841772,0.852174
2,0.191079,0.031874,0.854167,0.899089,0.806452,0.874396,0.816993,0.875101
3,0.230786,0.022311,0.880208,0.872396,0.858065,0.856683,0.852564,0.844444
4,0.21152,0.023192,0.861979,0.883464,0.852564,0.866129,0.833856,0.857143


In [84]:
print('========== LGBMClassifier ==========')

# LightGBM: 100 boosting rounds, no depth limit, learning rate 0.1.
# - The seed comes from the notebook-wide `semilla` instead of a second
#   hard-coded 42, so changing the seed in one place affects every model.
# - The estimator itself runs single-threaded (n_jobs=1): the folds are
#   already dispatched in parallel by cross_validate(n_jobs=-1), and letting
#   every fold also spawn one thread per core oversubscribes the CPU, which
#   slows each fit down instead of speeding it up.
pipeline = Pipeline([
    ('lgbm', LGBMClassifier(
        n_estimators=100,
        max_depth=-1,
        learning_rate=0.1,
        random_state=semilla,
        verbose=-1,
        n_jobs=1
    ))
])

# 5-fold cross-validation on the training split only; train scores are
# returned as well so over-fitting can be spotted.
results = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'recall', 'f1'],
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

# One row per fold, one column per timing/score.
df_LGBMC = pd.DataFrame(results)
df_LGBMC



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished


Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_f1,train_f1
0,2.198808,0.015555,0.973958,1.0,0.948387,1.0,0.967105,1.0
1,2.057239,0.014994,0.955729,1.0,0.941935,1.0,0.944984,1.0
2,2.13337,0.019451,0.973958,1.0,0.96129,1.0,0.967532,1.0
3,2.164692,0.018468,0.971354,1.0,0.96129,1.0,0.964401,1.0
4,2.117628,0.018116,0.960938,1.0,0.948718,1.0,0.951768,1.0


### Shapiro-Wilk: ¿conviene usar un test paramétrico o uno no paramétrico?

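* H₀: los datos provienen de una distribución normal.
* Si p > 0.05 → no se rechaza H₀ → no hay evidencia contra la normalidad; se pueden usar pruebas paramétricas.
* Si p ≤ 0.05 → se rechaza H₀ → la distribución NO es normal → usar pruebas no paramétricas.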

In [86]:
# Per-fold cross-validation score tables of every model, keyed by the short
# label printed in the report below.
dfs = {
    'KNN': df_KNN,
    'LR' : df_LR,
    'RF': df_RF,
    'XGBC': df_XGB,
    'SVM': df_SVM,
    'DT': df_DT,
    'NB': df_NB,
    'MLP': df_MLP,
    'ABOOST': df_ABOOST,
    'LGBMC': df_LGBMC
}

# Derive the model list from the dict so the two can never drift apart
# (dict insertion order is preserved, so the report order is unchanged).
modelos = list(dfs)
medidas = ['test_accuracy', 'test_recall', 'test_f1']

# Run a Shapiro-Wilk normality test on each validation metric of each model.
# NOTE(review): every sample holds only the 5 fold scores produced by cv=5,
# so the test has little power -- treat "not rejected" with caution.
for modelo in modelos:
    print(f'\n{"="*40}')
    print(f'MODELO: {modelo}')
    print("="*40)
    # Use a dedicated name here: binding the results table to `df` would
    # overwrite the dataset loaded at the top of the notebook and break any
    # later re-run of the cells that read it.
    df_resultados = dfs[modelo]

    for medida in medidas:
        data = df_resultados[medida]
        stat, p = shapiro(data)
        print(f'\n-->{medida}')
        print(f'p value: {np.round(p, 4)}')
        # 0.05 significance level: above it H0 (normality) is not rejected.
        if p > 0.05:
            print('Tomar Ho, SI sigue distribución Normal')
        else:
            print('Tomar H1, NO sigue distribución Normal xxxxx')


MODELO: KNN

-->test_accuracy
p value: 0.0133
Tomar H1, NO sigue distribución Normal xxxxx

-->test_recall
p value: 0.8912
Tomar Ho, SI sigue distribución Normal

-->test_f1
p value: 0.0398
Tomar H1, NO sigue distribución Normal xxxxx

MODELO: LR

-->test_accuracy
p value: 0.9276
Tomar Ho, SI sigue distribución Normal

-->test_recall
p value: 0.6584
Tomar Ho, SI sigue distribución Normal

-->test_f1
p value: 0.2013
Tomar Ho, SI sigue distribución Normal

MODELO: RF

-->test_accuracy
p value: 0.6846
Tomar Ho, SI sigue distribución Normal

-->test_recall
p value: 0.2373
Tomar Ho, SI sigue distribución Normal

-->test_f1
p value: 0.7026
Tomar Ho, SI sigue distribución Normal

MODELO: XGBC

-->test_accuracy
p value: 0.314
Tomar Ho, SI sigue distribución Normal

-->test_recall
p value: 0.7821
Tomar Ho, SI sigue distribución Normal

-->test_f1
p value: 0.5156
Tomar Ho, SI sigue distribución Normal

MODELO: SVM

-->test_accuracy
p value: 0.2073
Tomar Ho, SI sigue distribución Normal

-->test