# **Librerías**

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, auc, brier_score_loss, matthews_corrcoef, log_loss, precision_recall_curve
import statsmodels.api as sm

# **Dataset**

In [3]:
df = pd.read_csv("../data/dataset_limpio.csv")
df.head()

Unnamed: 0,COMBUSTIBLE,MODELO,FECHA_REGISTRO,AÑO_REGISTRO,CLASE,MARCA,LINEA,CARROCERIA,CILINDRAJE,MODALIDAD,MUNICIPIO,DEPARTAMENTO,PESO,POTENCIA
0,ELECTRICO,2023,10/21/2022 12:00:00 AM,2022,CAMIONETA,BYD,YUAN PRO EV,WAGON,1798.0,PASAJEROS,PEREIRA,RISARALDA,1980.0,134.0
1,ELECTRICO,2022,10/25/2021 12:00:00 AM,2021,CAMIONETA,BYD,SONG PRO EV,WAGON,1798.0,PASAJEROS,EL ROSAL,CUNDINAMARCA,2120.0,161.0
2,ELECTRICO,2023,08/03/2022 12:00:00 AM,2022,AUTOMOVIL,MINI,COOPER SE,HATCH BACK,1798.0,PASAJEROS,SABANETA,ANTIOQUIA,1440.0,184.0
3,ELECTRICO,2020,10/31/2019 12:00:00 AM,2019,AUTOMOVIL,BMW,I3,HATCH BACK,1798.0,PASAJEROS,BOGOTA,BOGOTA D.C.,1345.0,170.0
4,ELECTRICO,2022,08/27/2021 12:00:00 AM,2021,AUTOMOVIL,BYD,E1,HATCH BACK,1798.0,PASAJEROS,ENVIGADO,ANTIOQUIA,1445.0,60.0


# **Entrenamiento de modelos**

## **Modelo para predecir si un auto es híbrido o eléctrico**

In [4]:
df['HIBRIDO'] = df['COMBUSTIBLE'].apply(lambda x: 1 if x in ['GASO ELEC', 'DIES ELEC'] else 0)

print(df['HIBRIDO'].value_counts())
print(f"Percentage of hybrid vehicles: {df['HIBRIDO'].mean() * 100:.2f}%")

HIBRIDO
1    46076
0     3645
Name: count, dtype: int64
Percentage of hybrid vehicles: 92.67%


In [38]:
X = df[['POTENCIA', 'PESO', 'CILINDRAJE', 'MODELO']]
y = df['HIBRIDO']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

l_model = LogisticRegression(class_weight='balanced')
l_model.fit(X_train, y_train)


l_pred = l_model.predict(X_test)
l_probs = l_model.predict_proba(X_test)[:, 1]

# Resultados de Regresión Logística
print("Logistic Regression Results:")
l_accuracy = accuracy_score(y_test, l_pred)
print(f'Accuracy: {l_accuracy}')
l_mse = mean_squared_error(y_test, l_pred)
print(f"MSE: {l_mse}")


# ROC-AUC
print("\nROC-AUC Scores:")
print(f"Logistic Regression ROC-AUC: {roc_auc_score(y_test, l_probs)}")

Logistic Regression Results:
Accuracy: 0.7493212669683258
MSE: 0.2506787330316742

ROC-AUC Scores:
Logistic Regression ROC-AUC: 0.7599941878230495


In [40]:
def evaluate_model(model, X_test, y_test, name):
    probs = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)
    
    metrics = {
        'Accuracy': accuracy_score(y_test, preds),
        'Precision': precision_score(y_test, preds),
        'Recall': recall_score(y_test, preds),
        'F1': f1_score(y_test, preds),
        'ROC-AUC': roc_auc_score(y_test, probs),
        'PR-AUC': auc(*precision_recall_curve(y_test, probs)[1::-1]),
        'Brier Score': brier_score_loss(y_test, probs),
        'MCC': matthews_corrcoef(y_test, preds),
        'Log Loss': log_loss(y_test, probs)
    }
    
    print(f"\n{name} Metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
        
    return metrics

log_metrics = evaluate_model(l_model, X_test, y_test, "Logistic Regression")


Logistic Regression Metrics:
Accuracy: 0.7493
Precision: 0.9655
Recall: 0.7562
F1: 0.8481
ROC-AUC: 0.7600
PR-AUC: 0.9661
Brier Score: 0.1754
MCC: 0.2471
Log Loss: 0.5280


In [41]:
X_train_sm = sm.add_constant(X_train)

model_sm = sm.Logit(y_train, X_train_sm).fit(disp=0)

print(model_sm.summary())

                           Logit Regression Results                           
Dep. Variable:                HIBRIDO   No. Observations:                39776
Model:                          Logit   Df Residuals:                    39771
Method:                           MLE   Df Model:                            4
Date:                Wed, 02 Apr 2025   Pseudo R-squ.:                  0.2214
Time:                        10:58:36   Log-Likelihood:                -8094.2
converged:                       True   LL-Null:                       -10396.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1229.4906     30.067    -40.892      0.000   -1288.421   -1170.560
POTENCIA      -0.0093      0.001    -17.630      0.000      -0.010      -0.008
PESO           0.0031   8.41e-05     37.093      0.0

Debo evaluar luego el modelo.

## **Modelo para... (continuar)**