In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV


## 1. Cargue la base de datos riesgo_sobrecarga.csv a un DataFrame y mostrar 5 pimeras filas

In [13]:
df = pd.read_csv("riesgo_sobrecarga.csv")
df.head()

Unnamed: 0,Consumo_Promedio_kWh,Pico_Maximo_kW,Antiguedad_Instalacion_anios,Numero_Electrodomesticos,Uso_AireAcondicionado_horas,Uso_Calefaccion_horas,Variacion_Voltaje_pct,Carga_Nocturna_pct,Temperatura_Promedio_C,Riesgo_Sobrecarga
0,165.4,9.98,23.3,7,5.3,1.0,2.12,10.8,24.6,0
1,505.48,5.98,26.3,16,10.6,6.1,3.82,24.7,17.7,1
2,613.57,10.09,2.3,20,11.3,7.9,4.05,63.3,19.8,1
3,595.8,8.81,11.0,12,2.2,5.8,3.54,84.5,20.3,1
4,580.32,4.89,1.6,9,11.1,9.7,4.34,10.2,20.5,1


## 2. Calcule la proporción de cada clase de la variable objetivo

In [14]:
conteos = df["Riesgo_Sobrecarga"].value_counts()
proporciones = df[y_col].value_counts(normalize=True)

print("Conteos por clase:")
print(conteos)

print("\nProporciones por clase:")
print(proporciones)

Conteos por clase:
Riesgo_Sobrecarga
0    240
1    240
Name: count, dtype: int64

Proporciones por clase:
Riesgo_Sobrecarga
0    0.5
1    0.5
Name: proportion, dtype: float64


## 3. Defina variables predictoras y variable objetivo

In [15]:
X = df.drop(columns=[y_col])
y = df[y_col]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (480, 9)
y shape: (480,)


## 4. Realizar un split train-test (80% – 20%) estratificado

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y)

print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)

Train: (384, 9) (384,)
Test : (96, 9) (96,)


## 5. Crear pipelines para:
• KNN  

• SVC

• Decision Tree

• Naive Bayes

(Incluir escalado donde sea necesario).

In [18]:
pipelines = {
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier())
    ]),
    "SVC": Pipeline([
        ("scaler", StandardScaler()),
        ("model", SVC())
    ]),
    "DecisionTree": Pipeline([
        ("model", DecisionTreeClassifier(random_state=42))
    ]),
    "NaiveBayes": Pipeline([
        ("model", GaussianNB())
    ])}

## 6. Crear el param_grid adecuado para cada modelo.

In [19]:
param_grids = {
    "KNN": {
        "model__n_neighbors": [3, 5, 7, 9],
        "model__weights": ["uniform", "distance"]
    },
    "SVC": {
        "model__C": [0.1, 1, 10],
        "model__kernel": ["rbf", "linear"],
        "model__gamma": ["scale", "auto"]
    },
    "DecisionTree": {
        "model__max_depth": [None, 3, 5, 10],
        "model__min_samples_split": [2, 5, 10]
    },
    "NaiveBayes": {
        "model__var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
    }}

## 7. Usar validación cruzada interna (cv=5).

In [20]:
grid_searches = {}

for name in pipelines:

    gs = GridSearchCV(
        estimator=pipelines[name],
        param_grid=param_grids[name],
        cv=5,
        scoring="accuracy")

    gs.fit(X_train, y_train)
    grid_searches[name] = gs

    print(f"{name} | best CV acc = {gs.best_score_:.4f}")
    print(f"best params: {gs.best_params_}\n")


KNN | best CV acc = 0.7474
best params: {'model__n_neighbors': 5, 'model__weights': 'uniform'}

SVC | best CV acc = 0.7968
best params: {'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'linear'}

DecisionTree | best CV acc = 0.7318
best params: {'model__max_depth': 5, 'model__min_samples_split': 2}

NaiveBayes | best CV acc = 0.7994
best params: {'model__var_smoothing': 1e-09}



## 8. Construir tabla con:
• Mejores hiperparámetros

• CV Accuracy

In [21]:
rows = []
for name, gs in grid_searches.items():
    rows.append({
        "Modelo": name,
        "Mejores_hiperparametros": gs.best_params_,
        "CV_Accuracy": gs.best_score_
    })

tabla_cv = pd.DataFrame(rows).sort_values("CV_Accuracy", ascending=False).reset_index(drop=True)
tabla_cv


Unnamed: 0,Modelo,Mejores_hiperparametros,CV_Accuracy
0,NaiveBayes,{'model__var_smoothing': 1e-09},0.799419
1,SVC,"{'model__C': 0.1, 'model__gamma': 'scale', 'mo...",0.796822
2,KNN,"{'model__n_neighbors': 5, 'model__weights': 'u...",0.747368
3,DecisionTree,"{'model__max_depth': 5, 'model__min_samples_sp...",0.731784


## 9. Evaluar test externo y obtener accuracy y classification report.

In [27]:
from sklearn.metrics import accuracy_score, classification_report

rows = []
for name, gs in grid_searches.items():
    y_pred = gs.best_estimator_.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    rows.append({"Modelo": name, "Test_Accuracy": acc})

    print(f"\n\n{name} | Test accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

tabla_test = pd.DataFrame(rows).sort_values("Test_Accuracy", ascending=False).reset_index(drop=True)
tabla_test




KNN | Test accuracy: 0.6875
              precision    recall  f1-score   support

           0       0.65      0.81      0.72        48
           1       0.75      0.56      0.64        48

    accuracy                           0.69        96
   macro avg       0.70      0.69      0.68        96
weighted avg       0.70      0.69      0.68        96



SVC | Test accuracy: 0.7396
              precision    recall  f1-score   support

           0       0.70      0.83      0.76        48
           1       0.79      0.65      0.71        48

    accuracy                           0.74        96
   macro avg       0.75      0.74      0.74        96
weighted avg       0.75      0.74      0.74        96



DecisionTree | Test accuracy: 0.6562
              precision    recall  f1-score   support

           0       0.64      0.73      0.68        48
           1       0.68      0.58      0.63        48

    accuracy                           0.66        96
   macro avg       0.66      

Unnamed: 0,Modelo,Test_Accuracy
0,SVC,0.739583
1,NaiveBayes,0.739583
2,KNN,0.6875
3,DecisionTree,0.65625


## 10. ¿Cuál modelo sería el adecuado para esta base de datos en particular? Justifique numéricamente su respuesta.

In [28]:
mejor_modelo = tabla_test.loc[0, "Modelo"]
mejor_test_acc = tabla_test.loc[0, "Test_Accuracy"]
mejor_cv_acc = tabla_cv.set_index("Modelo").loc[mejor_modelo, "CV_Accuracy"]

print("Modelo recomendado:", mejor_modelo)
print(f"CV Accuracy (cv=5): {mejor_cv_acc:.4f}")
print(f"Test Accuracy     : {mejor_test_acc:.4f}")

# (Opcional) mostrar diferencia contra el segundo lugar en test, si existe
if len(tabla_test) > 1:
    segundo = tabla_test.loc[1]
    print(f"Diferencia vs 2do en test: {(mejor_test_acc - segundo['Test_Accuracy']):.4f}")


y_pred_best = grid_searches[mejor_modelo].best_estimator_.predict(X_test)

print("\nClassification report (mejor modelo):")
print(classification_report(y_test, y_pred_best))


Modelo recomendado: SVC
CV Accuracy (cv=5): 0.7968
Test Accuracy     : 0.7396
Diferencia vs 2do en test: 0.0000

Classification report (mejor modelo):
              precision    recall  f1-score   support

           0       0.70      0.83      0.76        48
           1       0.79      0.65      0.71        48

    accuracy                           0.74        96
   macro avg       0.75      0.74      0.74        96
weighted avg       0.75      0.74      0.74        96

