In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import scipy
import math
import os
import zipfile
import warnings
import joblib


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Load the historical data CSV. A comma is already read_csv's default
# delimiter, so it does not need to be passed explicitly.
df = pd.read_csv('data/datos_historicos.csv')
# Bare last expression: rich display of the loaded frame.
df

Unnamed: 0,ID,CreditScore,DebtRatio,Assets,Age,NumberOfDependents,NumberOfOpenCreditLinesAndLoans,MonthlyIncome,NumberOfTimesPastDue,EmploymentLength,HomeOwnership,Education,MaritalStatus,YearsAtCurrentAddress,NoPaidPerc
0,1,384,0.016560,37950,20,0,2,17355,9,28,Mortgage,Bachelor,Divorced,18,0.196566
1,2,516,0.138825,22909,46,5,8,7175,13,0,Rent,Bachelor,Married,9,0.264438
2,3,638,0.391149,126521,80,5,7,1910,6,25,Rent,Bachelor,Single,9,0.287568
3,4,363,0.463521,39543,54,9,0,19339,5,22,Own,High School,Widowed,3,0.181348
4,5,804,0.504824,38522,72,1,10,7928,18,19,Own,PhD,Married,5,0.343376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,448,0.602547,57758,22,9,19,17797,19,5,Rent,Bachelor,Widowed,2,0.190319
9996,9997,643,0.633910,67051,85,3,1,7034,4,0,Rent,High School,Married,1,0.168900
9997,9998,695,0.019713,65309,42,4,3,9105,18,4,Rent,Bachelor,Divorced,6,0.237987
9998,9999,387,0.174242,115144,20,3,10,19388,0,11,Mortgage,PhD,Divorced,5,0.180394


In [5]:
# Print each column's name, non-null count and dtype to check the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               10000 non-null  int64  
 1   CreditScore                      10000 non-null  int64  
 2   DebtRatio                        10000 non-null  float64
 3   Assets                           10000 non-null  int64  
 4   Age                              10000 non-null  int64  
 5   NumberOfDependents               10000 non-null  int64  
 6   NumberOfOpenCreditLinesAndLoans  10000 non-null  int64  
 7   MonthlyIncome                    10000 non-null  int64  
 8   NumberOfTimesPastDue             10000 non-null  int64  
 9   EmploymentLength                 10000 non-null  int64  
 10  HomeOwnership                    10000 non-null  object 
 11  Education                        10000 non-null  object 
 12  MaritalStatus      

In [6]:
# Duplicated rows: flag every row that repeats an earlier row in all columns,
# then count the flags.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [7]:
# Missing values per column: build the boolean null mask, then sum it
# column-wise to get one count per variable.
missing_mask = df.isnull()
missing_mask.sum()

ID                                 0
CreditScore                        0
DebtRatio                          0
Assets                             0
Age                                0
NumberOfDependents                 0
NumberOfOpenCreditLinesAndLoans    0
MonthlyIncome                      0
NumberOfTimesPastDue               0
EmploymentLength                   0
HomeOwnership                      0
Education                          0
MaritalStatus                      0
YearsAtCurrentAddress              0
NoPaidPerc                         0
dtype: int64

In [11]:
# Drop the row identifier column so it is not carried into the later
# encoding and scaling steps. errors='ignore' keeps this cell idempotent:
# re-running it after 'ID' is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [13]:
# One-hot encode the categorical columns as 0/1 integer indicator columns.
# Only columns still present are encoded, so re-running this cell after the
# originals have been replaced by their dummies is a no-op instead of a KeyError.
categorical_cols = [
    col
    for col in ['HomeOwnership', 'Education', 'MaritalStatus']
    if col in df.columns
]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, dtype=int)

In [14]:
# Display the frame after dropping ID and one-hot encoding the categorical columns.
df

Unnamed: 0,CreditScore,DebtRatio,Assets,Age,NumberOfDependents,NumberOfOpenCreditLinesAndLoans,MonthlyIncome,NumberOfTimesPastDue,EmploymentLength,YearsAtCurrentAddress,...,HomeOwnership_Own,HomeOwnership_Rent,Education_Bachelor,Education_High School,Education_Masters,Education_PhD,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed
0,384,0.016560,37950,20,0,2,17355,9,28,18,...,0,0,1,0,0,0,1,0,0,0
1,516,0.138825,22909,46,5,8,7175,13,0,9,...,0,1,1,0,0,0,0,1,0,0
2,638,0.391149,126521,80,5,7,1910,6,25,9,...,0,1,1,0,0,0,0,0,1,0
3,363,0.463521,39543,54,9,0,19339,5,22,3,...,1,0,0,1,0,0,0,0,0,1
4,804,0.504824,38522,72,1,10,7928,18,19,5,...,1,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,448,0.602547,57758,22,9,19,17797,19,5,2,...,0,1,1,0,0,0,0,0,0,1
9996,643,0.633910,67051,85,3,1,7034,4,0,1,...,0,1,0,1,0,0,0,1,0,0
9997,695,0.019713,65309,42,4,3,9105,18,4,6,...,0,1,1,0,0,0,1,0,0,0
9998,387,0.174242,115144,20,3,10,19388,0,11,5,...,0,0,0,0,0,1,1,0,0,0


In [16]:
# Rescale every column to the [0, 1] range. At this point the frame is fully
# numeric, so the one-hot indicator columns and NoPaidPerc are scaled too.
# NOTE(review): if NoPaidPerc is the prediction target, confirm that scaling
# it (and fitting the scaler before any train/test split) is intended.
scaler = MinMaxScaler()
# fit_transform returns a bare NumPy array; pass the original index as well as
# the column names so df_scaled stays row-aligned with df.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [17]:
# Display the min-max scaled frame built in the previous cell.
df_scaled

Unnamed: 0,CreditScore,DebtRatio,Assets,Age,NumberOfDependents,NumberOfOpenCreditLinesAndLoans,MonthlyIncome,NumberOfTimesPastDue,EmploymentLength,YearsAtCurrentAddress,...,HomeOwnership_Own,HomeOwnership_Rent,Education_Bachelor,Education_High School,Education_Masters,Education_PhD,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed
0,0.153005,0.016528,0.099691,0.028169,0.000000,0.105263,0.860918,0.473684,0.965517,0.620690,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.393443,0.138818,0.016119,0.394366,0.555556,0.421053,0.325016,0.684211,0.000000,0.310345,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.615665,0.391192,0.591818,0.873239,0.555556,0.368421,0.047852,0.315789,0.862069,0.310345,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.114754,0.463579,0.108542,0.507042,1.000000,0.000000,0.965361,0.263158,0.758621,0.103448,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.918033,0.504890,0.102869,0.760563,0.111111,0.526316,0.364656,0.947368,0.655172,0.172414,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.269581,0.602633,0.209750,0.056338,1.000000,1.000000,0.884186,1.000000,0.172414,0.068966,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9996,0.624772,0.634003,0.261385,0.943662,0.333333,0.052632,0.317593,0.210526,0.000000,0.034483,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9997,0.719490,0.019682,0.251706,0.338028,0.444444,0.157895,0.426616,0.947368,0.137931,0.206897,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9998,0.158470,0.174241,0.528604,0.028169,0.333333,0.526316,0.967941,0.000000,0.379310,0.172414,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
