## Dependencias

In [132]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import roc_auc_score,classification_report

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.neural_network import MLPClassifier

from CreditScoringToolkit.frequency_table import frequency_table
from CreditScoringToolkit.DiscreteNormalizer import DiscreteNormalizer

from functools import reduce
pd.set_option('display.max_columns', None)  

## Lectura de Datos 

In [26]:
df = pd.read_csv('../data/lending_club_clean.csv').sample(frac=0.1, random_state=42)
df.shape

(155391, 27)

In [27]:
df.head(2)

Unnamed: 0,id,loan_amnt,term,grade,emp_title,emp_length,home_ownership,annual_inc,issue_d,loan_status,addr_state,dti,fico_range_high,inq_last_6mths,mths_since_last_delinq,application_type,acc_now_delinq,open_acc_6m,il_util,max_bal_bc,total_rev_hi_lim,inq_last_12m,acc_open_past_24mths,bc_open_to_buy,bc_util,num_accts_ever_120_pd,tob
24244,66464843,24000.0,60 months,C,sr_accountant,4 years,MORTGAGE,65000.0,2015-12-01,0,NM,20.38,684.0,2.0,15.0,Individual,0.0,,,,17700.0,,3.0,1376.0,89.9,1.0,160
159487,59239775,21000.0,60 months,D,designer,10+ years,RENT,150000.0,2015-09-01,0,IL,8.75,674.0,1.0,7.0,Individual,0.0,,,,18600.0,,3.0,420.0,96.3,0.0,132


## Naturaleza de las variables

In [35]:
um = ['id']
varc = ['acc_now_delinq',
'acc_open_past_24mths',
'annual_inc',
'bc_open_to_buy',
'bc_util',
'dti',
'fico_range_high',
'il_util',
'inq_last_12m',
'inq_last_6mths',
'loan_amnt',
'max_bal_bc',
'mths_since_last_delinq',
'num_accts_ever_120_pd',
'open_acc_6m',
'total_rev_hi_lim','tob']

vard = ['addr_state',
'emp_length',
'grade',
'home_ownership','application_type','term']


vart = ['loan_status']


## Partición de los datos 

In [36]:
train,valid = train_test_split(df, test_size=0.3, random_state=20241130)
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
train.shape,valid.shape

((108773, 27), (46618, 27))

## Creación de matriz de predictoras $\mathcal{X}$

In [89]:
X = train[um+varc].copy()

In [90]:
X.head()

Unnamed: 0,id,acc_now_delinq,acc_open_past_24mths,annual_inc,bc_open_to_buy,bc_util,dti,fico_range_high,il_util,inq_last_12m,inq_last_6mths,loan_amnt,max_bal_bc,mths_since_last_delinq,num_accts_ever_120_pd,open_acc_6m,total_rev_hi_lim,tob
0,115049390,0.0,5.0,48000.0,9952.0,66.0,28.01,754.0,49.0,8.0,0.0,16625.0,10013.0,58.0,0.0,0.0,31600.0,335
1,128104798,0.0,6.0,65000.0,61371.0,11.8,12.37,739.0,,2.0,0.0,10000.0,2119.0,,0.0,1.0,76200.0,192
2,138451806,0.0,3.0,39000.0,1381.0,88.9,12.12,674.0,,2.0,2.0,10000.0,5021.0,,0.0,1.0,25000.0,53
3,132241458,0.0,4.0,27348.0,6528.0,63.1,16.45,664.0,,5.0,2.0,13500.0,5339.0,25.0,0.0,0.0,24800.0,120
4,50635602,0.0,0.0,35000.0,1829.0,55.4,2.43,734.0,,,0.0,4000.0,,,0.0,,4100.0,78


In [92]:
dn = DiscreteNormalizer(normalization_threshold=0.05,default_category='SMALL')
dn.fit(train[vard])
Xd = dn.transform(train[vard])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  aux['mapping'].replace({default_category:mode},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  aux['mapping'].replace({default_category:mode},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

In [93]:
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore',drop='if_binary')
oh.fit(Xd[vard])

In [94]:
varoh = oh.get_feature_names_out()
len(varoh)

24

In [95]:
Xd = pd.DataFrame(oh.transform(Xd[vard]),columns=varoh)

In [96]:
Xd.shape,X.shape

((108773, 24), (108773, 18))

In [97]:
X = pd.concat([X,Xd],axis=1)

In [100]:
var = varc+varoh.tolist()

In [101]:
X.head()

Unnamed: 0,id,acc_now_delinq,acc_open_past_24mths,annual_inc,bc_open_to_buy,bc_util,dti,fico_range_high,il_util,inq_last_12m,inq_last_6mths,loan_amnt,max_bal_bc,mths_since_last_delinq,num_accts_ever_120_pd,open_acc_6m,total_rev_hi_lim,tob,addr_state_CA,addr_state_FL,addr_state_NY,addr_state_SMALL,addr_state_TX,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_< 1 year,emp_length_MISSING,emp_length_SMALL,grade_A,grade_B,grade_C,grade_D,grade_E,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,application_type_Individual,term_ 60 months
0,115049390,0.0,5.0,48000.0,9952.0,66.0,28.01,754.0,49.0,8.0,0.0,16625.0,10013.0,58.0,0.0,0.0,31600.0,335,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,128104798,0.0,6.0,65000.0,61371.0,11.8,12.37,739.0,,2.0,0.0,10000.0,2119.0,,0.0,1.0,76200.0,192,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,138451806,0.0,3.0,39000.0,1381.0,88.9,12.12,674.0,,2.0,2.0,10000.0,5021.0,,0.0,1.0,25000.0,53,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,132241458,0.0,4.0,27348.0,6528.0,63.1,16.45,664.0,,5.0,2.0,13500.0,5339.0,25.0,0.0,0.0,24800.0,120,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,50635602,0.0,0.0,35000.0,1829.0,55.4,2.43,734.0,,,0.0,4000.0,,,0.0,,4100.0,78,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


## Imputación de valores ausentes 

In [102]:
im = SimpleImputer(strategy='median')
im.fit(X[var])

In [103]:
X[var] = pd.DataFrame(im.transform(X[var]),columns=var)

In [104]:
X.dropna().shape,X.shape

((108773, 42), (108773, 42))

## Selección de predictoras 

In [105]:
y = X[um].merge(train[um+vart],on=um,how='inner')

In [106]:
sk = SelectKBest(k=len(var))
sk.fit(X[var],y[vart[0]])

  f = msb / msw


In [113]:
poderPredictivo = pd.Series(dict(zip(var,sk.scores_))).sort_values(ascending=False)

In [114]:
poderPredictivo

grade_A                        2175.365054
fico_range_high                1680.501505
grade_E                        1264.759379
grade_D                        1065.683289
acc_open_past_24mths           1002.037322
inq_last_6mths                  856.018048
bc_open_to_buy                  710.440724
term_ 60 months                 631.962616
dti                             572.429967
open_acc_6m                     468.066193
bc_util                         421.457036
total_rev_hi_lim                396.149755
inq_last_12m                    307.070137
home_ownership_MORTGAGE         270.481883
il_util                         260.872322
home_ownership_RENT             252.437165
max_bal_bc                      194.144369
grade_B                         193.538857
grade_C                         168.722822
annual_inc                      153.233208
loan_amnt                       114.177982
tob                              85.960410
emp_length_MISSING               62.840860
emp_length_

In [115]:
sk = SelectKBest(k=10)
sk.fit(X[var],y[vart[0]])


Features [39] are constant.


invalid value encountered in divide



In [117]:
best = [v for v,s in zip(var,sk.get_support()) if s]

In [119]:
X = X[um+best]

In [120]:
X.head()

Unnamed: 0,id,acc_open_past_24mths,bc_open_to_buy,dti,fico_range_high,inq_last_6mths,open_acc_6m,grade_A,grade_D,grade_E,term_ 60 months
0,115049390,5.0,9952.0,28.01,754.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128104798,6.0,61371.0,12.37,739.0,0.0,1.0,1.0,0.0,0.0,0.0
2,138451806,3.0,1381.0,12.12,674.0,2.0,1.0,0.0,0.0,0.0,0.0
3,132241458,4.0,6528.0,16.45,664.0,2.0,0.0,0.0,1.0,0.0,1.0
4,50635602,0.0,1829.0,2.43,734.0,0.0,1.0,0.0,0.0,0.0,0.0


## Ajuste de escala, híper-caja $\mathcal{X}_{[0,1]}$

In [121]:
sc = MinMaxScaler() 
sc.fit(X[best])
Xs = pd.DataFrame(sc.transform(X[best]),columns=best)
Xs[um] = X[um]

In [122]:
Xs.describe()

Unnamed: 0,acc_open_past_24mths,bc_open_to_buy,dti,fico_range_high,inq_last_6mths,open_acc_6m,grade_A,grade_D,grade_E,term_ 60 months,id
count,108773.0,108773.0,108773.0,108773.0,108773.0,108773.0,108773.0,108773.0,108773.0,108773.0,108773.0
mean,0.076005,0.032646,0.365913,0.208005,0.104522,0.068242,0.201668,0.134491,0.054462,0.276043,99993890.0
std,0.054022,0.046437,0.171543,0.17666,0.162937,0.074809,0.401247,0.341181,0.226928,0.44704,29491260.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365090.0
25%,0.032787,0.005469,0.236979,0.080645,0.0,0.0,0.0,0.0,0.0,0.0,73745720.0
50%,0.065574,0.016364,0.355168,0.16129,0.0,0.071429,0.0,0.0,0.0,0.0,98164530.0
75%,0.098361,0.040927,0.487981,0.295699,0.2,0.071429,0.0,0.0,0.0,1.0,127962000.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,145642500.0


## Híper-parametrización

In [None]:
hp = {
    'hidden_layer_sizes': [(a,b,c,) for a in range(5,21) for b in range(5,21) for c in range(5,21)],
    'activation': ['tanh', 'relu','logistic','identity'],
    'solver': ['sgd', 'adam','lbfgs'],
    'alpha': np.arange(0.0001,0.001,0.0001),
    'learning_rate': ['constant', 'adaptive', 'invscaling'],
    'max_iter': [10000]
}

In [None]:
reduce(lambda x,y: x*y, [len(hp[i]) for i in hp])

1327104

In [133]:
mod = MLPClassifier()

In [148]:
grid = RandomizedSearchCV(estimator=mod, 
                          param_distributions=hp, 
                          n_iter=100, 
                          cv=3, 
                          scoring='roc_auc', 
                          n_jobs=-1)

In [149]:
grid.fit(Xs, y[vart[0]])

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL_TERMINATION

In [150]:
ann = grid.best_estimator_

## Aprendizaje 

In [151]:
ann.fit(Xs[best], y[vart[0]])

In [152]:
[w.shape for w in ann.coefs_]

[(10, 19), (19, 20), (20, 14), (14, 1)]

In [153]:
roc_auc_score(y[vart[0]],ann.predict_proba(Xs[best])[:,1])

np.float64(0.6947196861733405)

## Generalización del modelo 

In [154]:
Xv = valid[um+varc].copy()
Xdv = dn.transform(valid[vard])
Xdv = pd.DataFrame(oh.transform(Xdv[vard]),columns=varoh)
Xv = pd.concat([Xv,Xdv],axis=1)
Xv[var] = pd.DataFrame(im.transform(Xv[var]),columns=var)
Xv = Xv[um+best]
Xv[best] = sc.transform(Xv[best])

In [155]:
roc_auc_score(valid[vart[0]],ann.predict_proba(Xv[best])[:,1])

np.float64(0.6929302688340737)