In [1]:
#conda install -c conda-forge feature_engine

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer
from feature_engine.transformation import PowerTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn.metrics import mean_squared_error 

import joblib

In [2]:
#Librería creada
import my_preprocessors as mypp 

In [3]:
data = pd.read_csv("dataP.csv")
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
cat_vars = [var for var in data.columns if data[var].dtype == 'O']
#cat_vars

#Agregamos variables que por definición son categóricas
cat_vars = cat_vars + ['FLAG_MOBIL'] + ['FLAG_EMP_PHONE'] + ['FLAG_WORK_PHONE'] + ['FLAG_CONT_MOBILE'] + ['FLAG_PHONE'] + ['FLAG_EMAIL'] + ['FLAG_WORK_PHONE'] + ['FLAG_PHONE'] + ['REGION_RATING_CLIENT_W_CITY'] + ['REG_CITY_NOT_LIVE_CITY'] + ['REG_CITY_NOT_WORK_CITY'] + ['ORGANIZATION_TYPE'] + ['FLAG_DOCUMENT_3'] + ['OCCUPATION_TYPE'] + ['WALLSMATERIAL_MODE'] +['NAME_CONTRACT_TYPE'] + ['CODE_GENDER'] + ['FLAG_OWN_CAR'] + ['NAME_INCOME_TYPE'] + ['NAME_EDUCATION_TYPE']
#cat_vars

data[cat_vars] = data[cat_vars].astype('O')

# Cantidad de variables categóricas
len(cat_vars)

36

In [5]:
# Se separa data para train y test
X_train, X_test, y_train, y_test = train_test_split(
                        data.drop(['SK_ID_CURR', 'TARGET'], axis=1),
                        data['TARGET'],
                        test_size=0.15,
                        random_state=2021)

X_train.shape, X_test.shape

((261384, 120), (46127, 120))

In [6]:
# No se usa np.log por datos negatvos, genera error

y_train = np.sqrt(np.power(y_train,2))
y_test = np.sqrt(np.power(y_test,2))
#y_train
#y_test

## Configuración del Machine Learning Pipeline

In [7]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['OCCUPATION_TYPE']

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = ['WALLSMATERIAL_MODE']


#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'DAYS_ID_PUBLISH',
 'DAYS_BIRTH'
]


#Varibles para transformación logaritmia
NUMERICALS_LOG_VARS = ['DAYS_LAST_PHONE_CHANGE']

#Variables para binarización por sesgo fuerte
BINARIZE_VARS = []

#Variables para hacer mapeo categorico por codificación ordinal
NAME_CONTRACT_TYPE_VARS = ['NAME_CONTRACT_TYPE']

CODE_GENDER_VARS = ['CODE_GENDER']

FLAG_OWN_CAR_VARS = ['FLAG_OWN_CAR']

NAME_INCOME_TYPE_VARS = ['NAME_INCOME_TYPE']

NAME_EDUCATION_TYPE_VARS = ['NAME_EDUCATION_TYPE']

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ['FLAG_WORK_PHONE','FLAG_PHONE', 'REGION_RATING_CLIENT_W_CITY', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FLAG_DOCUMENT_3']

#Mapeos de variables categoricas
NAME_CONTRACT_TYPE_MAPPINGS = {'Cash loans':1, 'Revolving loans':0}

CODE_GENDER_MAPPINGS = {'M':1, 'F':0, 'XNA':2}

FLAG_OWN_CAR_MAPPINGS = {'Y':1, 'N':0}

NAME_INCOME_TYPE_MAPPINGS = {'Businessman':0, 'Unemployed':1, 'Commercial associate':2, 'Maternity leave':3, 'Pensioner':4, 'State servant':5, 'Student':6, 'Working':7}

NAME_EDUCATION_TYPE_MAPPINGS = {'Academic degree':0, 'Higher education':1, 'Incomplete higher':2, 'Lower secondary':3, 'Secondary / secondary special':4}

#Variables seleccionadas según análisis de Lasso
FEATURES = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'OCCUPATION_TYPE',
       'REGION_RATING_CLIENT_W_CITY', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'WALLSMATERIAL_MODE',
       'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_3',
       'AMT_REQ_CREDIT_BUREAU_MON']

In [13]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,DAYS_BIRTH,DAYS_ID_PUBLISH,FLAG_WORK_PHONE,FLAG_PHONE,OCCUPATION_TYPE,...,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,WALLSMATERIAL_MODE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_MON
257757,Cash loans,F,Y,Working,Secondary / secondary special,-11553,-1657,0,0,Cleaning staff,...,0,0,Industry: type 7,0.477186,0.584956,0.422370,,-2785.0,1,0.0
258861,Cash loans,M,N,Commercial associate,Higher education,-9925,-2570,0,0,Sales staff,...,0,0,Trade: type 3,,0.394221,0.461482,Block,-196.0,0,0.0
203236,Cash loans,F,N,Pensioner,Secondary / secondary special,-21820,-4599,0,0,,...,0,0,XNA,,0.634678,0.452534,Panel,-1429.0,1,1.0
156431,Cash loans,M,N,State servant,Secondary / secondary special,-21809,-4267,0,0,Laborers,...,0,0,Transport: type 2,0.562275,0.191526,0.665855,Panel,-1252.0,1,0.0
14722,Revolving loans,M,Y,Commercial associate,Higher education,-11332,-3496,0,0,,...,0,0,Business Entity Type 3,0.292984,0.594132,0.222581,Panel,-597.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219094,Cash loans,F,N,Working,Secondary / secondary special,-16026,-4501,1,1,Sales staff,...,0,0,Self-employed,0.613478,0.437516,0.593718,,-901.0,1,1.0
151390,Cash loans,F,Y,Commercial associate,Higher education,-7786,-461,1,1,Core staff,...,0,0,Trade: type 2,,0.401676,0.192942,,-112.0,0,0.0
133741,Cash loans,M,N,Commercial associate,Secondary / secondary special,-13744,-4561,0,0,Sales staff,...,0,0,Business Entity Type 3,,0.648341,0.579727,Block,-1365.0,0,0.0
224597,Revolving loans,M,N,Working,Secondary / secondary special,-8347,-1007,0,0,Laborers,...,0,1,Business Entity Type 2,,0.279090,,,-895.0,0,


## Machine Learing PipeLine

In [9]:
Target_pipeline = Pipeline([
    
    #============= IMPUTACIONES ===================#
    
    #1. Imputación de varaibles categoricas
    ('missing_imputation', 
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de variables categoricas con NA basado en frequiencia.
    ('frequent_imputation', 
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicamos Faltante en variables numéricas para imputar
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #4. Imputación de mediana para variables categoricas
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #5. Transformación logaritmica
    ('pow', PowerTransformer(variables=NUMERICALS_LOG_VARS)),
    
    #6. Binarización de Variables con Sesgo Fuerte
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)),
    
    #=============== CODIFICACION DE VARIABLES CATEGORICAS ORDINALES ==============
    ('mapper_name', mypp.Mapper(
        variables=NAME_CONTRACT_TYPE_VARS, mappings=NAME_CONTRACT_TYPE_MAPPINGS)),
    
    ('mapper_gender', mypp.Mapper(
        variables=CODE_GENDER_VARS, mappings=CODE_GENDER_MAPPINGS)),
    
    ('mapper_car', mypp.Mapper(
        variables=FLAG_OWN_CAR_VARS, mappings=FLAG_OWN_CAR_MAPPINGS)),
    
        
    ('mapper_income', mypp.Mapper(
        variables=NAME_INCOME_TYPE_VARS, mappings=NAME_INCOME_TYPE_MAPPINGS)),
    
    ('mapper_education', mypp.Mapper(
        variables=NAME_EDUCATION_TYPE_VARS, mappings=NAME_EDUCATION_TYPE_MAPPINGS)),
    
    #============ CODIFICACION DE VARIABLES CATEGORICAS NOMINALES ============
    
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=1, variables=CATEGORICAL_VARS)),
    
    ('categorical_encoder', OrdinalEncoder(
        encoding_method='ordered', variables=CATEGORICAL_VARS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
    
]) 

In [10]:
Target_pipeline.fit(X_train, y_train)

ValueError: Some of the variables to transform contain NaN. Check and remove those before using this transformer.

In [None]:
# ['FLAG_WORK_PHONE','FLAG_PHONE', 'REGION_RATING_CLIENT_W_CITY', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FLAG_DOCUMENT_3']

#X_train['FLAG_WORK_PHONE'].value_counts()
#X_train['FLAG_WORK_PHONE'].value_counts()
#X_train['FLAG_PHONE'].value_counts()
#X_train['REGION_RATING_CLIENT_W_CITY'].value_counts()
#X_train['REG_CITY_NOT_LIVE_CITY'].value_counts()
#X_train['REG_CITY_NOT_WORK_CITY'].value_counts()
#X_train['ORGANIZATION_TYPE'].value_counts()
#X_train['FLAG_DOCUMENT_3'].value_counts()


# ['EXT_SOURCE_1',  'EXT_SOURCE_2',  'EXT_SOURCE_3',  'DAYS_LAST_PHONE_CHANGE',  'AMT_REQ_CREDIT_BUREAU_MON',  'DAYS_ID_PUBLISH',  'DAYS_BIRTH' 


#X_train['EXT_SOURCE_1'].value_counts()
#X_train['EXT_SOURCE_2'].value_counts()
#X_train['EXT_SOURCE_3'].value_counts()
#X_train['DAYS_LAST_PHONE_CHANGE'].value_counts()
#X_train['AMT_REQ_CREDIT_BUREAU_MON'].value_counts()
#X_train['DAYS_ID_PUBLISH'].value_counts()
#X_train['DAYS_BIRTH'].value_counts()

In [None]:
#Seleccionamos variables para predicción
X_test = X_test[FEATURES]

In [None]:
preds = Target_pipeline.predict(X_test)

In [None]:
mean_squared_error(np.exp(y_test), np.exp(preds), squared=False)

In [None]:
X_test

In [None]:
#Guardamos pipeline
joblib.dump(Target_pipeline, 'Target_pipeline.pkl')

In [11]:
X_train

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,DAYS_BIRTH,DAYS_ID_PUBLISH,FLAG_WORK_PHONE,FLAG_PHONE,OCCUPATION_TYPE,...,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,WALLSMATERIAL_MODE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_MON
257757,Cash loans,F,Y,Working,Secondary / secondary special,-11553,-1657,0,0,Cleaning staff,...,0,0,Industry: type 7,0.477186,0.584956,0.422370,,-2785.0,1,0.0
258861,Cash loans,M,N,Commercial associate,Higher education,-9925,-2570,0,0,Sales staff,...,0,0,Trade: type 3,,0.394221,0.461482,Block,-196.0,0,0.0
203236,Cash loans,F,N,Pensioner,Secondary / secondary special,-21820,-4599,0,0,,...,0,0,XNA,,0.634678,0.452534,Panel,-1429.0,1,1.0
156431,Cash loans,M,N,State servant,Secondary / secondary special,-21809,-4267,0,0,Laborers,...,0,0,Transport: type 2,0.562275,0.191526,0.665855,Panel,-1252.0,1,0.0
14722,Revolving loans,M,Y,Commercial associate,Higher education,-11332,-3496,0,0,,...,0,0,Business Entity Type 3,0.292984,0.594132,0.222581,Panel,-597.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219094,Cash loans,F,N,Working,Secondary / secondary special,-16026,-4501,1,1,Sales staff,...,0,0,Self-employed,0.613478,0.437516,0.593718,,-901.0,1,1.0
151390,Cash loans,F,Y,Commercial associate,Higher education,-7786,-461,1,1,Core staff,...,0,0,Trade: type 2,,0.401676,0.192942,,-112.0,0,0.0
133741,Cash loans,M,N,Commercial associate,Secondary / secondary special,-13744,-4561,0,0,Sales staff,...,0,0,Business Entity Type 3,,0.648341,0.579727,Block,-1365.0,0,0.0
224597,Revolving loans,M,N,Working,Secondary / secondary special,-8347,-1007,0,0,Laborers,...,0,1,Business Entity Type 2,,0.279090,,,-895.0,0,


In [15]:
var_with_na = [var for var in X_train.columns if(X_train[var].isnull().sum() > 0)]
len(var_with_na)

7

In [16]:
data[var_with_na].isnull().mean().sort_values(ascending=False)

EXT_SOURCE_1                 0.563811
WALLSMATERIAL_MODE           0.508408
OCCUPATION_TYPE              0.313455
EXT_SOURCE_3                 0.198253
AMT_REQ_CREDIT_BUREAU_MON    0.135016
EXT_SOURCE_2                 0.002146
DAYS_LAST_PHONE_CHANGE       0.000003
dtype: float64