# **PREPARACIÓN DE DATOS**


## Importar librerías

In [None]:
# pandas: manejo de dataFrame
import pandas as pd
# sklearn: scikit-learn - Librería de python para Machine Learning
# Normalizar: permite escalar los datos a un rango de [0...1]
from sklearn.preprocessing import MinMaxScaler

## Importación de datos **riesgo_credito.csv**

In [None]:
# Load the credit-risk dataset; fields in the file are separated by ';'
datos = pd.read_csv("riesgo_credito.csv", sep=';')
# Show the first 10 rows
datos.head(10)

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,no checking account,18,all credits at this bank paid back duly,car (used),1049,unknown/no savings account,< 1 yr,< 20,female : non-single or male : single,none,...,car or other,21,none,for free,1,skilled employee/official,0 to 2,no,no,good
1,no checking account,9,all credits at this bank paid back duly,others,2799,unknown/no savings account,1 <= ... < 4 yrs,25 <= ... < 35,male : married/widowed,none,...,unknown / no property,36,none,for free,02-Mar,skilled employee/official,3 or more,no,no,good
2,... < 0 DM,12,no credits taken/all credits paid back duly,retraining,841,... < 100 DM,4 <= ... < 7 yrs,25 <= ... < 35,female : non-single or male : single,none,...,unknown / no property,23,none,for free,1,unskilled - resident,0 to 2,no,no,good
3,no checking account,12,all credits at this bank paid back duly,others,2122,unknown/no savings account,1 <= ... < 4 yrs,20 <= ... < 25,male : married/widowed,none,...,unknown / no property,39,none,for free,02-Mar,unskilled - resident,3 or more,no,yes,good
4,no checking account,12,all credits at this bank paid back duly,others,2171,unknown/no savings account,1 <= ... < 4 yrs,< 20,male : married/widowed,none,...,car or other,38,bank,rent,02-Mar,unskilled - resident,0 to 2,no,yes,good
5,no checking account,10,all credits at this bank paid back duly,others,2241,unknown/no savings account,< 1 yr,>= 35,male : married/widowed,none,...,unknown / no property,48,none,for free,02-Mar,unskilled - resident,3 or more,no,yes,good
6,no checking account,8,all credits at this bank paid back duly,others,3398,unknown/no savings account,4 <= ... < 7 yrs,>= 35,male : married/widowed,none,...,unknown / no property,39,none,rent,02-Mar,unskilled - resident,0 to 2,no,yes,good
7,no checking account,6,all credits at this bank paid back duly,others,1361,unknown/no savings account,< 1 yr,25 <= ... < 35,male : married/widowed,none,...,unknown / no property,40,none,rent,1,unskilled - resident,3 or more,no,yes,good
8,... >= 200 DM / salary for at least 1 year,18,all credits at this bank paid back duly,furniture/equipment,1098,unknown/no savings account,unemployed,< 20,female : non-single or male : single,none,...,building soc. savings agr./life insurance,65,none,rent,02-Mar,unemployed/unskilled - non-resident,0 to 2,no,no,good
9,... < 0 DM,24,no credits taken/all credits paid back duly,furniture/equipment,3758,100 <= ... < 500 DM,unemployed,>= 35,female : non-single or male : single,none,...,real estate,23,none,for free,1,unemployed/unskilled - non-resident,0 to 2,no,no,good


# **1. ELIMINACIÓN DE DATOS FALTANTES**

## Determinar si existen datos faltantes

In [None]:
# Show summary information about the dataset (columns, non-null counts, dtypes)
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   status                   1000 non-null   object
 1   duration                 1000 non-null   int64 
 2   credit_history           1000 non-null   object
 3   purpose                  1000 non-null   object
 4   amount                   1000 non-null   int64 
 5   savings                  1000 non-null   object
 6   employment_duration      1000 non-null   object
 7   installment_rate         1000 non-null   object
 8   personal_status_sex      1000 non-null   object
 9   other_debtors            1000 non-null   object
 10  present_residence        1000 non-null   object
 11  property                 1000 non-null   object
 12  age                      1000 non-null   int64 
 13  other_installment_plans  1000 non-null   object
 14  housing                  1000 non-null   

In [None]:
# Number of null values per attribute
datos.isnull().sum()

status                     0
duration                   0
credit_history             0
purpose                    0
amount                     0
savings                    0
employment_duration        0
installment_rate           0
personal_status_sex        0
other_debtors              0
present_residence          0
property                   0
age                        0
other_installment_plans    0
housing                    0
number_credits             0
job                        0
people_liable              0
telephone                  0
foreign_worker             0
credit_risk                0
dtype: int64

In [None]:
# Select the rows that have at least one null attribute.
# `axis` is passed by keyword: the positional form `any(1)` is deprecated in
# pandas 1.x and rejected by pandas 2.x.
nulos = datos[datos.isnull().any(axis=1)]
nulos

  nulos = datos[datos.isnull().any(1)]


Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk


In [None]:
# Report how many rows contain at least one null value
print ('número de registros con valores nulos: ', len(nulos))

número de registros con valores nulos:  0


## Eliminación de valores nulos

No se requiere eliminar registros: el conjunto de datos no contiene valores nulos.

# **2. CONVERSIÓN DE DATOS CATEGÓRICOS A NUMÉRICOS**


## Implementación de un módulo que convierte un atributo categórico a numérico utilizando las funciones **astype("category")** y **cat.codes**

In [None]:
def Categorico_a_numerico(atributo):
  """Encode a categorical attribute as integer codes.

  Parameters
  ----------
  atributo : pandas.Series
      Column to encode. It is not modified; a new Series is returned.

  Returns
  -------
  pandas.Series
      One integer code per row. Codes follow the sorted order of the
      distinct values (not any domain-specific order), and missing values
      are encoded as -1.
  """
  # A single cast is enough; the previous version cast to "category" twice.
  return atributo.astype("category").cat.codes

## Convertir los atributos categóricos (status, credit_history, purpose, savings, ..., credit_risk) a numérico

In [None]:
# Data before the conversion
datos.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,no checking account,18,all credits at this bank paid back duly,car (used),1049,unknown/no savings account,< 1 yr,< 20,female : non-single or male : single,none,...,car or other,21,none,for free,1,skilled employee/official,0 to 2,no,no,good
1,no checking account,9,all credits at this bank paid back duly,others,2799,unknown/no savings account,1 <= ... < 4 yrs,25 <= ... < 35,male : married/widowed,none,...,unknown / no property,36,none,for free,02-Mar,skilled employee/official,3 or more,no,no,good
2,... < 0 DM,12,no credits taken/all credits paid back duly,retraining,841,... < 100 DM,4 <= ... < 7 yrs,25 <= ... < 35,female : non-single or male : single,none,...,unknown / no property,23,none,for free,1,unskilled - resident,0 to 2,no,no,good
3,no checking account,12,all credits at this bank paid back duly,others,2122,unknown/no savings account,1 <= ... < 4 yrs,20 <= ... < 25,male : married/widowed,none,...,unknown / no property,39,none,for free,02-Mar,unskilled - resident,3 or more,no,yes,good
4,no checking account,12,all credits at this bank paid back duly,others,2171,unknown/no savings account,1 <= ... < 4 yrs,< 20,male : married/widowed,none,...,car or other,38,bank,rent,02-Mar,unskilled - resident,0 to 2,no,yes,good


In [None]:
# Convert the categorical attributes to numeric codes.
# A single loop replaces eighteen copy-pasted assignments, so the column list
# below is the only place to edit when attributes are added or removed.
# NOTE(review): 'number_credits' shows values such as '02-Mar' in the preview,
# which look like a range mangled into a date by a spreadsheet -- verify the
# source CSV before relying on this column.
columnas_categoricas = [
  'status',
  'credit_history',
  'purpose',
  'savings',
  'employment_duration',
  'installment_rate',
  'personal_status_sex',
  'other_debtors',
  'present_residence',
  'property',
  'other_installment_plans',
  'housing',
  'number_credits',
  'job',
  'people_liable',
  'telephone',
  'foreign_worker',
  'credit_risk',
]
for columna in columnas_categoricas:
  datos[columna] = Categorico_a_numerico(datos[columna])

In [None]:
# Data after the conversion
datos.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,3,18,0,2,1049,4,2,2,0,2,...,1,21,1,0,2,1,0,0,0,1
1,3,9,0,5,2799,4,0,1,3,2,...,3,36,1,0,0,1,1,0,0,1
2,0,12,4,8,841,0,1,1,0,2,...,3,23,1,0,2,3,0,0,0,1
3,3,12,0,5,2122,4,0,0,3,2,...,3,39,1,0,0,3,1,0,1,1
4,3,12,0,5,2171,4,0,2,3,2,...,1,38,0,2,0,3,0,0,1,1


# **3. NORMALIZACIÓN DE DATOS**

## Implementar la normalización de datos por amplitud y distribución

In [None]:
# Range (min-max) transformation to [0..1]
def Normalizacion_Amplitud(atributo):
  """Rescale a numeric attribute linearly so its values span [0, 1].

  Parameters
  ----------
  atributo : pandas.Series
      Numeric column to rescale. It is not modified.

  Returns
  -------
  pandas.Series
      ``(atributo - min) / (max - min)``. A constant column (max == min)
      is returned as all zeros instead of NaN.
  """
  minimo = atributo.min()
  rango = atributo.max() - minimo
  # A constant column would give 0/0 = NaN for every row; treat a zero range
  # as 1 so the result is 0, the same convention MinMaxScaler uses.
  if rango == 0:
    rango = 1
  return (atributo - minimo) / rango

In [None]:
# Distribution (z-score) transformation: zero mean, unit standard deviation
def Normalizacion_Distribucion(atributo):
  """Standardize a numeric attribute to zero mean and unit standard deviation.

  Parameters
  ----------
  atributo : pandas.Series
      Numeric column to standardize. It is not modified.

  Returns
  -------
  pandas.Series
      ``(atributo - mean) / std``. The result is not bounded to [-1, 1].
      A constant column (std == 0) is returned as all zeros instead of NaN.
  """
  media = atributo.mean()
  # Divide by the standard deviation, not the variance: dividing by the
  # variance leaves the result in units of 1/x and does not give unit spread.
  desviacion = atributo.std()
  # A constant column would give 0/0 = NaN for every row; treat a zero
  # deviation as 1 so the result is 0.
  if desviacion == 0:
    desviacion = 1
  return (atributo - media) / desviacion

## Normalizar todos los atributos por amplitud

In [None]:
# List the column names of the dataset
datos.columns

Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'personal_status_sex',
       'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'number_credits', 'job',
       'people_liable', 'telephone', 'foreign_worker', 'credit_risk'],
      dtype='object')

In [None]:
# Apply min-max normalization to every attribute.
# A single loop replaces twenty-one copy-pasted assignments; the explicit list
# keeps exactly the same set of columns as before. Note that the last entry,
# 'credit_risk', is rescaled together with the other attributes.
columnas_a_normalizar = [
  'status',
  'duration',
  'credit_history',
  'purpose',
  'amount',
  'savings',
  'employment_duration',
  'installment_rate',
  'personal_status_sex',
  'other_debtors',
  'present_residence',
  'property',
  'age',
  'other_installment_plans',
  'housing',
  'number_credits',
  'job',
  'people_liable',
  'telephone',
  'foreign_worker',
  'credit_risk',
]
for columna in columnas_a_normalizar:
  datos[columna] = Normalizacion_Amplitud(datos[columna])

In [None]:
# Data after the min-max (range) normalization
datos.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1.0,0.205882,0.0,0.222222,0.043964,1.0,0.5,0.666667,0.0,1.0,...,0.333333,0.035714,0.5,0.0,0.666667,0.333333,0.0,0.0,0.0,1.0
1,1.0,0.073529,0.0,0.555556,0.140255,1.0,0.0,0.333333,1.0,1.0,...,1.0,0.303571,0.5,0.0,0.0,0.333333,1.0,0.0,0.0,1.0
2,0.0,0.117647,1.0,0.888889,0.032519,0.0,0.25,0.333333,0.0,1.0,...,1.0,0.071429,0.5,0.0,0.666667,1.0,0.0,0.0,0.0,1.0
3,1.0,0.117647,0.0,0.555556,0.103004,1.0,0.0,0.0,1.0,1.0,...,1.0,0.357143,0.5,0.0,0.0,1.0,1.0,0.0,1.0,1.0
4,1.0,0.117647,0.0,0.555556,0.1057,1.0,0.0,0.666667,1.0,1.0,...,0.333333,0.339286,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0


# **4. GRABACIÓN DE DATOS PRE-PROCESADOS**


In [None]:
# Save the preprocessed data to a comma-separated CSV file, with a header row
# and without the index column
datos.to_csv("datos_riesgo_credito_norm.csv", sep=',', header=True, index=False)