# Libraries

In [100]:
import pandas as pd
import numpy as np
import gc
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import mlflow
from mlflow import log_metric, log_param
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, fbeta_score

import time

# Dataset

## Import

In [43]:
# Load the modelling dataset from a CSV file located in the working directory.
data = pd.read_csv('Home_credit_risk_data_modeling.csv')

In [44]:
# Preview the first two rows to check how the file was parsed.
data.head(2)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,...,,,,,,,,,,
1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,...,,,,,,,,,,


In [64]:
# Drop the first column by position ('Unnamed: 0').
# NOTE(review): presumably a row index written by an earlier CSV export — confirm.
df = data.iloc[:,1:]

In [65]:
# Preview the frame after dropping the first column.
df.head(2)

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,,,,,,,,,,
1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,...,,,,,,,,,,


In [66]:
# Use the applicant id as the row index.
# The guard keeps the cell re-runnable: once SK_ID_CURR is the index it is no
# longer a column, and calling set_index again would raise a KeyError.
if df.index.name != 'SK_ID_CURR':
    df = df.set_index('SK_ID_CURR')
df.head(2)

Unnamed: 0_level_0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,,,,,,,,,,
100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,,,,,,,,,,


In [67]:
# Keep the (n_rows, n_columns) shape of the full frame.
df_size = df.shape

In [68]:
# Summary of index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 356251 entries, 100002 to 456250
Columns: 796 entries, TARGET to CC_COUNT
dtypes: bool(133), float64(606), int64(41), object(16)
memory usage: 1.8+ GB


In [71]:
# Turn +inf / -inf into NaN so that the later imputation step treats them
# like any other missing value.
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [72]:
# Number of missing values per column.
df.isna().sum()

TARGET                               48744
CODE_GENDER                              0
FLAG_OWN_CAR                             0
FLAG_OWN_REALTY                          0
CNT_CHILDREN                             0
                                     ...  
CC_NAME_CONTRACT_STATUS_nan_MAX     252693
CC_NAME_CONTRACT_STATUS_nan_MEAN    252693
CC_NAME_CONTRACT_STATUS_nan_SUM     252693
CC_NAME_CONTRACT_STATUS_nan_VAR     253385
CC_COUNT                            252693
Length: 796, dtype: int64

Split the dataset into a train sample (rows where TARGET is known) and a test sample (rows where TARGET is missing).

In [73]:
# Rows with a known TARGET form the train sample; rows without one form the test sample.
target_missing = df['TARGET'].isna()
df_train_sample = df.loc[~target_missing]
df_test_sample = df.loc[target_missing]

In [74]:
# Drop the reference to the full frame now that it has been split, then force a
# garbage-collection pass. Note: `data` (the raw load) is not deleted in the
# cells shown here, so it still holds its memory.
del df
gc.collect()

1318

In [75]:
# Class balance of the train sample, as percentages rounded to one decimal.
n_train_rows = df_train_sample.shape[0]
n_target_1 = int((df_train_sample['TARGET'] == 1).sum())
n_target_0 = int((df_train_sample['TARGET'] == 0).sum())
pct_target_1 = round(n_target_1 / n_train_rows * 100, 1)
pct_target_0 = round(n_target_0 / n_train_rows * 100, 1)
print(f"Train dataset has {pct_target_1}% of target 1 and {pct_target_0}% of target 0")

Train dataset has 8.1% of target 1 and 91.9% of target 0


## Preprocessing

In [76]:
# Separate the label from the features; both are independent copies of the train sample.
y = df_train_sample['TARGET'].copy()
X = df_train_sample.drop(columns='TARGET').copy()

In [77]:
def list_features_att(df, threshold=10):
    '''Split the columns of a dataframe into continuous and discrete features.

    A column is classified as discrete when it has fewer than ``threshold``
    distinct non-missing values (an all-NaN column has 0 distinct values and
    is therefore discrete); every other column is classified as continuous.
    The column dtype is not inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    threshold : int, default 10
        Number of distinct values below which a column is treated as discrete.

    Returns
    -------
    numerical_list : list
        Names of the continuous columns, in column order.
    numerical_discrete_list : list
        Names of the discrete columns, in column order.
    '''
    numerical_list = []
    numerical_discrete_list = []
    # DataFrame.nunique() counts the distinct non-NaN values of every column in one call
    for name, n_distinct in df.nunique().items():
        if n_distinct < threshold:
            numerical_discrete_list.append(name)
        else:
            numerical_list.append(name)
    return numerical_list, numerical_discrete_list

In [78]:
# Classify the feature columns: continuous (10 or more distinct values) vs discrete (fewer than 10).
numerical_list, numerical_disc_list = list_features_att(X)

Use of SimpleImputer to impute missing values:

In [79]:
# NOTE(review): both imputers are fitted on the whole of X, i.e. before the
# train/test split performed further down, so the held-out rows contribute to
# the imputation statistics — confirm this is acceptable.
time_init = time.time()

# Continuous columns: fill missing values with the column mean.
mean_imputer = SimpleImputer(strategy='mean')
X[numerical_list] = mean_imputer.fit_transform(X[numerical_list])

# Discrete columns: fill missing values with the most frequent value.
mode_imputer = SimpleImputer(strategy='most_frequent')
X[numerical_disc_list] = mode_imputer.fit_transform(X[numerical_disc_list])

print (f'time for imputation: {time.time() - time_init:.2f}')

time for imputation: 128.69


## Import / Export csv

In [99]:
# Run once to export the imputed features and the target, so the slow steps above
# can be skipped later. to_csv also writes the index (SK_ID_CURR) as the first column.
X.to_csv('./X_train_data.csv')
y.to_csv('./y_train_data.csv')

# To reload the exported data (state before the second train/test split and the
# normalization), restore SK_ID_CURR as the index and read y back as a Series:
#X = pd.read_csv('X_train_data.csv', index_col='SK_ID_CURR')
#y = pd.read_csv('y_train_data.csv', index_col='SK_ID_CURR')['TARGET']

In [82]:
# To keep experiments fast, models are only tried on a stratified 10% sample of
# the train data; the remaining 90% is discarded.
X1, _, y1, _ = train_test_split(
    X,
    y,
    train_size=0.10,
    stratify=y,
    random_state=42,
)

In [83]:
# Stratified 80/20 split of the 10% sample into train and test folds.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    train_size=0.8,
    shuffle=True,
    stratify=y1,
    random_state=42,
)

In [86]:
# Standardization: the scaler is fitted on the train fold only, then applied to
# both folds.
std_scaler = StandardScaler().fit(X1_train)
X1_train = std_scaler.transform(X1_train)
X1_test = std_scaler.transform(X1_test)

# MLFlow setup

In [40]:
# Store MLflow runs on the local filesystem, under ./mlruns relative to the
# working directory. mlflow is already imported in the Libraries cell at the
# top of the notebook, so it is not imported again here.
mlflow.set_tracking_uri("./mlruns")

# Modeling