In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve

In [2]:
DATASET = "../data/train_clean.csv"
COLUMN_TARGET = "SeriousDlqin2yrs"

df = pd.read_csv(DATASET)
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,MonthlyIncome_missing,Dependents_missing
0,1,0.766127,45,2,0.802982,9120.0,13,0,4,0,2.0,0,0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,0,0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,0,0
4,0,0.907239,49,1,0.024926,23000.0,7,0,1,0,0.0,0,0


In [3]:
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,MonthlyIncome_missing,Dependents_missing
count,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0
mean,0.06684,0.320491,52.295555,0.421029,316.551377,6142.446263,8.404016,0.107921,0.992593,0.240388,0.728132,0.198208,0.02616
std,0.249746,0.352149,14.771298,4.192795,906.966934,3835.691425,4.946412,0.635483,0.985802,4.155193,1.070425,0.398651,0.159612
min,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3903.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.154176,52.0,0.0,0.366503,5400.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,0.559044,63.0,0.0,0.868257,7400.0,11.0,0.0,2.0,0.0,1.0,0.0,0.0
max,1.0,1.092958,109.0,98.0,4979.08,23000.0,24.0,10.0,4.0,98.0,4.0,1.0,1.0


In [4]:
 # Separar target (y) e variáveis explicativas (X)
 # y = o que queremos prever: 1 = default / 0 = não default
 # Conversão para não prever valores flaot
y = df[COLUMN_TARGET].astype(int)

 # X = todas as colunas menos o target
X = df.drop(columns=[COLUMN_TARGET])
print("Shape X (linhas, colunas):", X.shape)
print("Taxa de default (y.mean):", y.mean())

 # Split treino/teste
 # test_size=0.2: 80% treino, 20% teste
 # stratify=y: garante que a proporção de default no treino e teste fique parecida
 # como apenas 6% é inadimplente, se o stratify=y não for selecionado isso pode acontecer 
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=50, stratify=y
)
print("\nTaxa de default treino:", round(y_train.mean(), 3))
print("Taxa de default teste:", round(y_test.mean(), 3))

# Taxa de default treino: 0.067
# Taxa de default teste: 0.067
# Está proporcional!

# Modelo baseline: Regressão Logística
# A regressão logistica é usada quando um resultado é binário
# class_weight='balanced' ajuda o modelo a não ignorar default,
# Com o default é minoritario, ele pode ser ignorado

model = LogisticRegression(max_iter=2000, class_weight="balanced")
model.fit(X_train, y_train)
# Previsão de PD (probabilidade de default)
# predict_proba retorna uma lista com[probabilidade de 0, probabilidade de 1]

pd_test = model.predict_proba(X_test)[:, 1]
# print(f"pd_test={pd_test}")

# AUC (área sob a curva ROC)
# Mede a capacidade do modelo de ranquear bons vs ruins.
# O AUC responde: “Se eu pegar 1 pessoa que deu default e 1 pessoa que não deu, qual a chance do modelo dar um PD maior para a pessoa que deu default?”
# 0.5 = aleatório (ou seja, não sabe nada), 0.7 (Acerta 70 % das vezes), 1.0 = Acerta 100%
# Quanto maior, melhor
auc = roc_auc_score(y_test, pd_test)
print("\nAUC:", auc)
#O Modelo acerta 83% das vezes

# KS (Kolmogorov-Smirnov)
# Métrica muito usada em crédito.
# Responde: “em qual ponto eu separo melhor os ruins dos bons, e quão grande é essa separação?”
# TPR : "Dos que realmente são 1 (defaults), quantos eu capturei como 1"
# FPR : "Dos que realmente são 0 (não-default), quantos eu errei chamando de 1?"
# KS = maior diferença entre TPR e FPR ao longo do ROC.
fpr, tpr, _ = roc_curve(y_test, pd_test)
ks = np.max(tpr - fpr)
print("KS:", ks)


# Tabela por decis (10 grupos por risco)
# Irei ordenar as pessoas pelo risco e divide em 10 grupos (decis)
# Se o modelo estiver bom, o 1º decil (maior PD) deve ter bad_rate bem maior que o último decil (menor PD)
# O primeiro decil (maior PD) deve ter maior bad rate se o modelo estiver separando risco.
# qcut divide a coluna pd em 10 faixas com o mesmo número de pessoas (quantis)
# cada decil tem 10% dos clientes.

tmp = pd.DataFrame({"y": y_test.values, "pd": pd_test})

 # qcut separa em quantis (mesmo número de pessoas por grupo)
tmp["decil"] = pd.qcut(tmp["pd"], 10, duplicates="drop")

# Dicionário
# total: quantas pessoas tem no grupo
# bads: quantos defaults reais aconteceram no grupo (y=1)
# bad_rate: % de default real no grupo (bads/total)
# avg_pd: PD média prevista pelo modelo no grupo (deve ser maior no topo e menor no fim)
decis = tmp.groupby("decil").agg(
    total=("y", "size"),
    bads=("y", "sum"),
    bad_rate=("y", "mean"),
    avg_pd=("pd", "mean"),
).reset_index()

 # Ordena do maior risco para o menor risco
decis = decis.sort_values("avg_pd", ascending=False).reset_index(drop=True)
# Lift vs base: mede “quantas vezes” o decil é mais/menos arriscado que a média.
# base_rate é a taxa média de default na base, lift = bad_rate_do_decil / base_rate (
base_rate = tmp["y"].mean()
decis["lift_vs_base"] = decis["bad_rate"] / (base_rate + 1e-9)
print("\nTabela por decis maior> menor):")
print(decis)
# Interpretação (coeficientes do modelo)
# Na regressão logística:
# coeficiente positivo -> aumenta a PD (mais risco)
# coeficiente negativo -> diminui a PD (menos risco)
coef = pd.Series(model.coef_[0], index=X.columns).sort_values()
print("\nVariáveis que REDUZEM risco (coef negativo):")
print(coef.head(6))
print("\nVariáveis que AUMENTAM risco (coef positivo):")
print(coef.tail(6))


Shape X (linhas, colunas): (149999, 12)
Taxa de default (y.mean): 0.06684044560297069

Taxa de default treino: 0.067
Taxa de default teste: 0.067

AUC: 0.8350583177704961
KS: 0.5191775873764418

Tabela por decis maior> menor):
             decil  total  bads  bad_rate    avg_pd  lift_vs_base
0     (0.713, 1.0]   3000   960  0.320000  0.831482      4.788030
1   (0.577, 0.713]   3000   375  0.125000  0.643820      1.870324
2   (0.444, 0.577]   3000   231  0.077000  0.509035      1.152120
3   (0.344, 0.444]   3000   144  0.048000  0.391331      0.718204
4   (0.278, 0.344]   3000    93  0.031000  0.308220      0.463840
5   (0.231, 0.278]   3000    67  0.022333  0.253033      0.334165
6   (0.195, 0.231]   3000    50  0.016667  0.212698      0.249377
7   (0.162, 0.195]   3000    40  0.013333  0.178068      0.199501
8   (0.129, 0.162]   3000    28  0.009333  0.145809      0.139651
9  (0.0321, 0.129]   3000    17  0.005667  0.106043      0.084788

Variáveis que REDUZEM risco (coef negativo):
N

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  decis = tmp.groupby("decil").agg(


In [5]:
"""
OUTPUT

Shape X (linhas, colunas): (149999, 12)
Taxa de default (y.mean): 0.06684044560297069

Taxa de default treino: 0.067
Taxa de default teste: 0.067

AUC: 0.8393155974147504
KS: 0.5251071285886018

Tabela por decis maior> menor):
            decil  total  bads  bad_rate    avg_pd  lift_vs_base
0     (0.72, 1.0]   3000   995  0.331667  0.835071      4.962593
1   (0.584, 0.72]   3000   351  0.117000  0.653370      1.750623
2  (0.448, 0.584]   3000   233  0.077667  0.513764      1.162095
3  (0.347, 0.448]   3000   143  0.047667  0.393959      0.713217
4  (0.278, 0.347]   3000    97  0.032333  0.309957      0.483791
5  (0.233, 0.278]   3000    55  0.018333  0.254175      0.274314
6  (0.196, 0.233]   3000    57  0.019000  0.214017      0.284289
7  (0.163, 0.196]   3000    30  0.010000  0.178981      0.149626
8   (0.13, 0.163]   3000    26  0.008667  0.146599      0.129676
9  (0.0369, 0.13]   3000    18  0.006000  0.107523      0.089776

Variáveis que REDUZEM risco (coef negativo):
NumberOfTime60-89DaysPastDueNotWorse   -0.299235
Dependents_missing                     -0.055449
age                                    -0.017301
DebtRatio                              -0.000107
MonthlyIncome                          -0.000044
MonthlyIncome_missing                   0.039141
dtype: float64

Variáveis que AUMENTAM risco (coef positivo):
NumberOfOpenCreditLinesAndLoans         0.047236
NumberOfDependents                      0.048321
NumberRealEstateLoansOrLines            0.128114
NumberOfTime30-59DaysPastDueNotWorse    0.291752
NumberOfTimes90DaysLate                 1.135203
RevolvingUtilizationOfUnsecuredLines    2.360617

"""

'\nOUTPUT\n\nShape X (linhas, colunas): (149999, 12)\nTaxa de default (y.mean): 0.06684044560297069\n\nTaxa de default treino: 0.067\nTaxa de default teste: 0.067\n\nAUC: 0.8393155974147504\nKS: 0.5251071285886018\n\nTabela por decis maior> menor):\n            decil  total  bads  bad_rate    avg_pd  lift_vs_base\n0     (0.72, 1.0]   3000   995  0.331667  0.835071      4.962593\n1   (0.584, 0.72]   3000   351  0.117000  0.653370      1.750623\n2  (0.448, 0.584]   3000   233  0.077667  0.513764      1.162095\n3  (0.347, 0.448]   3000   143  0.047667  0.393959      0.713217\n4  (0.278, 0.347]   3000    97  0.032333  0.309957      0.483791\n5  (0.233, 0.278]   3000    55  0.018333  0.254175      0.274314\n6  (0.196, 0.233]   3000    57  0.019000  0.214017      0.284289\n7  (0.163, 0.196]   3000    30  0.010000  0.178981      0.149626\n8   (0.13, 0.163]   3000    26  0.008667  0.146599      0.129676\n9  (0.0369, 0.13]   3000    18  0.006000  0.107523      0.089776\n\nVariáveis que REDUZEM 

In [None]:
import numpy as np


# Quanto maior o score, menos risco 
# Multiplicador de score setado para 100

score = np.round((1 - np.array(pd_test)) * 1000).astype(int) # Inversamente proporcial pois o PD é chance de default, o Score alto é comumente visto com algo bom

print("Quantidade de scores:", len(score))
print("Score min/max:", score.min(), score.max())
print("Primeiros 10 scores:", score[:10])

CUTOFF = 500  # nota de corte - aprovado se score >= 500 

approved = (score > CUTOFF).astype(int)
print("\nCutoff:", CUTOFF)
print(f"Aprovados: {round(approved.mean(),2)*100}%")

Quantidade de scores: 30000
Score min/max: 0 967
Primeiros 10 scores: [670 665 531 600 776 251 510 763 712 358]

Cutoff: 500
Aprovados: 74.0%
