In [1]:
import numpy as np
import pandas as pd
import sklearn
import warnings

In [2]:
datos = pd.read_csv('./loan_data.csv')

datos.columns = ['person_age', 'person_gender', 'person_education', 'person_income', 'person_emp_exp', 'person_home_ownership', 
                 'loan_amnt', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score',
                 'previous_loan_defaults_on_file', 'loan_status']

In [3]:
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [7]:
# Variables categóricas 

datos.select_dtypes(include=['object']).describe()

Unnamed: 0,person_gender,person_education,person_home_ownership,loan_intent,previous_loan_defaults_on_file
count,45000,45000,45000,45000,45000
unique,2,5,4,6,2
top,male,Bachelor,RENT,EDUCATION,Yes
freq,24841,13399,23443,9153,22858


In [10]:
# Variables cuantitativas int64

datos.select_dtypes(include=['int64']).describe()

Unnamed: 0,person_emp_exp,credit_score,loan_status
count,45000.0,45000.0,45000.0
mean,5.410333,632.608756,0.222222
std,6.063532,50.435865,0.415744
min,0.0,390.0,0.0
25%,1.0,601.0,0.0
50%,4.0,640.0,0.0
75%,8.0,670.0,0.0
max,125.0,850.0,1.0


In [11]:
# Variables cuantitativas float64

datos.select_dtypes(include=['float64']).describe()

Unnamed: 0,person_age,person_income,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,9583.157556,11.006606,0.139725,5.867489
std,6.045108,80422.5,6314.886691,2.978808,0.087212,3.879702
min,20.0,8000.0,500.0,5.42,0.0,2.0
25%,24.0,47204.0,5000.0,8.59,0.07,3.0
50%,26.0,67048.0,8000.0,11.01,0.12,4.0
75%,30.0,95789.25,12237.25,12.99,0.19,8.0
max,144.0,7200766.0,35000.0,20.0,0.66,30.0


In [12]:
from sklearn.model_selection import train_test_split
N = len(datos)
cTrain = int(N*0.8)
cTest = N - cTrain
print(N,cTrain,cTest)
train_data, test_data = sklearn.model_selection.train_test_split(datos, train_size=cTrain, test_size=cTest)

45000 36000 9000


In [13]:
train_data.shape

(36000, 14)

In [14]:
train_data.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
6833,23.0,male,Associate,83922.0,2,RENT,6000.0,MEDICAL,7.88,0.07,2.0,624,Yes,0
5837,23.0,male,Associate,41783.0,2,RENT,5500.0,DEBTCONSOLIDATION,11.01,0.13,3.0,594,Yes,0
44408,25.0,male,Associate,53788.0,1,RENT,23982.0,DEBTCONSOLIDATION,10.43,0.45,4.0,588,No,1
18017,28.0,female,Master,241208.0,5,RENT,25000.0,VENTURE,15.33,0.1,7.0,653,Yes,0
26877,27.0,female,High School,140273.0,6,OWN,6000.0,MEDICAL,7.29,0.04,7.0,547,Yes,0


**Pipeline para atributos categóricos**

In [30]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

cat_attribs = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse_output=False)) #me recomienda usar sparse_output en vez de sparce porque fue renombrado a este
])

**Pipeline para atributos numéricos**

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

num_attribs = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 
               'cb_person_cred_hist_length', 'credit_score']

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", StandardScaler()) # puede ser MinMaxScaler()
])

**Pipeline completo**

In [32]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [33]:
X_train = full_pipeline.fit_transform(train_data)

In [34]:
X_train.shape

(36000, 27)

In [35]:
X_train[0,:]

array([-0.78385477,  0.04254   , -0.55794504, -0.56716856, -1.04783058,
       -0.8005294 , -0.99558595, -0.17155156,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  1.        ])

In [36]:
y_train = train_data['loan_status']
y_train

6833     0
5837     0
44408    1
18017    0
26877    0
        ..
40220    0
2405     0
41542    0
23513    1
34980    0
Name: loan_status, Length: 36000, dtype: int64

**Entrenamiento de las redes neuronales** \
Se construyen 5 modelos variando en la topología de la red, la cantidad de capas ocultas y neuronas por capa oculta.

In [37]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

**RED 1**

In [38]:
modelo1 = MLPClassifier(activation='relu',solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10,5), random_state=123)
modelo1.fit(X_train, y_train)
scores1 = cross_val_score(modelo1, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(scores1)
scores1.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

[-0.07708333 -0.07777778 -0.08569444 -0.07569444 -0.07888889]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


-0.07902777777777778

In [72]:
X_test=full_pipeline.transform(test_data)
X_test

array([[-0.61879989,  0.08338662, -0.72238104, ...,  1.        ,
         0.        ,  1.        ],
       [-0.28869014, -0.59645925, -0.06463705, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.86669397, -0.25824072,  0.59310694, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.0414196 ,  0.47021629, -0.22907305, ...,  0.        ,
         1.        ,  0.        ],
       [-0.61879989, -0.0874093 , -0.39350904, ...,  0.        ,
         0.        ,  1.        ],
       [-0.28869014,  0.43707502, -0.55794504, ...,  1.        ,
         0.        ,  1.        ]])

**Usando los modelos**

In [49]:
y_pred1=modelo1.predict(X_test)
y_pred1

array([0, 0, 0, ..., 0, 0, 0])

**Metricas**

In [62]:
y_test = test_data['loan_status']
y_test

11177    0
6790     0
21385    0
29158    1
42556    1
        ..
9821     0
36568    0
38732    0
35967    0
34332    0
Name: loan_status, Length: 9000, dtype: int64

In [92]:
from sklearn.metrics import mean_absolute_error, accuracy_score
#mean absolute error
mae1=mean_absolute_error(
    y_true = y_test,
    y_pred = y_pred1
) 


#accuracy
acc1 = accuracy_score(
    y_true = y_test,
    y_pred = y_pred1
)
