In [None]:
# ===========================
# Regresión logística binaria (clasificación)
# ===========================

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pprint import pprint
from sklearn.metrics import roc_curve, auc


# Binary logistic-regression workflow: split, correlation-based feature
# filtering, fit, and evaluation (accuracy, confusion matrix, report, ROC).
# Expects a DataFrame named `df` with a 'target' column to already exist.

# Split features and target, holding out 20% of the rows for testing.
X = df.drop(['target'], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# EDA on the training split only: absolute correlation of every column with
# the target, used to discard weakly related features.
df_eda = X_train.copy()
df_eda["target"] = y_train.copy()
corr = np.abs(df_eda.corr()['target']).sort_values(ascending=False)
# Features whose absolute correlation with the target is below 0.05.
bad_corr_feat = corr[corr < 0.05].index.values
# Keep only the remaining (better correlated) columns.
df_eda.drop(columns=bad_corr_feat, inplace=True)

# Pairplot of the "NumberOf*" columns coloured by target (slow on large data).
sns.pairplot(df_eda[[col for col in df_eda.columns if "NumberOf" in col or col == "target"]], hue="target")

# Selected feature names: every surviving column whose name does not contain
# "target".
features_num = [col for col in df_eda.columns if "target" not in col]
X_train = X_train[features_num].copy()

# Fit the model.
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)

# Inspect the fitted parameters.
print(log_reg.coef_)
print(log_reg.intercept_)
print(log_reg.classes_)

# Rank features by absolute coefficient size.
# NOTE(review): magnitudes are only comparable between features if the
# features share a similar scale; no scaling is applied in this block.
intercept = log_reg.intercept_
coefs = log_reg.coef_.ravel()
features = pd.DataFrame({'coefficient': np.abs(coefs)}, index=X_train.columns)
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(features.sort_values('coefficient', ascending=False).head())

# Give the test split the same columns (and column order) as the training split.
X_test = X_test[X_train.columns].copy()

# Predict once and reuse the predictions for every metric below.
y_pred_train = log_reg.predict(X_train)
y_pred_test = log_reg.predict(X_test)

# Accuracy on both splits (arguments in the documented y_true, y_pred order).
acc_train = round(accuracy_score(y_train, y_pred_train), 3)
acc_test = round(accuracy_score(y_test, y_pred_test), 3)
print("Accuracy train:", acc_train)
print("Accuracy test:", acc_test)

# Class balance of the target in each split, as proportions.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

# Confusion matrix: true/false positives and negatives on the test split.
# `labels` pins the row/column order to the classes the model was fitted on,
# so the matrix stays 2x2 even if a class is missing from the test split.
# NOTE(review): the axis names assume classes_[0] means "pays" and
# classes_[1] means "default" - confirm against the dataset encoding.
c_matrix = confusion_matrix(y_test, y_pred_test, labels=log_reg.classes_)
c_matrix_df = pd.DataFrame(c_matrix, columns=['Pred_Pays', 'Pred_Default'],
                           index=['True_Pays', 'True_Default'])

# Each heatmap gets its own axes; otherwise both would be drawn on whatever
# axes are current (the pairplot's) and on top of each other.
fig_counts, ax_counts = plt.subplots()
sns.heatmap(c_matrix_df, annot=True, fmt='g', ax=ax_counts)

# Same matrix as a share of all test samples. Divide by the scalar grand
# total: np.sum on a DataFrame may return per-column sums instead.
fig_pct, ax_pct = plt.subplots()
sns.heatmap(c_matrix_df / c_matrix.sum(), annot=True,
            fmt='.2%', cmap='Blues', ax=ax_pct)

# Precision, recall and F1 per class.
print(classification_report(y_test, y_pred_test))

# ROC curve, using the predicted probability of the second class as the score.
scores = log_reg.predict_proba(X_test)
# pos_label matches the probability column used (column 1 <-> classes_[1]).
fpr, tpr, thresholds = roc_curve(y_test, scores[:, 1], pos_label=log_reg.classes_[1])
roc_auc = auc(fpr, tpr)
print(f"AUROC: {roc_auc:.2f}")
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, linewidth=2, color="blue", label=f"ROC Logistic Reg (area = {roc_auc:0.2f})")
plt.plot([0, 1], [0, 1], 'k:', label="Random classifier's ROC curve")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.grid()
plt.axis([0, 1, 0, 1])
plt.legend(loc="lower right", fontsize=13)