In [44]:
import os
import re
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTENC
from joblib import dump, load
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import functions as fct


In [45]:
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

 # Import des données

In [46]:
# Load the modelling dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("../01_DATA/data_for_modelling.csv")
# Companion CSV; presumably its column labels are the names of the
# quantitative features (it is later turned into a list of column names).
liste_quanti = pd.read_csv("../01_DATA/liste_quanti.csv")

In [47]:
def load_joblibs(job_dir):
    """Load the persisted encoders and imputer stored in ``job_dir``.

    The directory must contain ``onehot_enc.joblib`` and
    ``mean_imputer.joblib``; every file whose name starts with
    ``label_encoder-`` is loaded as well and keyed by the feature name
    found between the first ``-`` and the first following ``.``.

    Parameters
    ----------
    job_dir : str
        Path of the directory holding the joblib files.

    Returns
    -------
    tuple
        ``(onehot_encoder, label_encoders, mean_imputer)`` where
        ``label_encoders`` is a dict mapping feature name -> loaded object.

    Notes
    -----
    ``joblib.load`` unpickles arbitrary objects: only point this function
    at directories containing trusted files.
    """
    onehot_encoder = load(os.path.join(job_dir, 'onehot_enc.joblib'))
    mean_imputer = load(os.path.join(job_dir, 'mean_imputer.joblib'))

    prefix = 'label_encoder-'
    label_encoders = {}
    # Scan job_dir itself: the directory that is listed must be the one the
    # files are loaded from, whatever the current working directory is.
    # sorted() makes the loading order deterministic across platforms.
    for file_name in sorted(os.listdir(job_dir)):
        if file_name.startswith(prefix):
            feature = file_name.split('-')[1].split('.')[0]
            label_encoders[feature] = load(os.path.join(job_dir, file_name))
    return onehot_encoder, label_encoders, mean_imputer


In [48]:
# Restore the persisted one-hot encoder, label encoders and mean imputer from ./joblib
onehot_encoder, label_encoders, mean_imputer = load_joblibs('./joblib')

In [49]:
# Preview the first rows of the loaded dataset
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Transport: type 3,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick"
0,100002,1.0,0,1,0,0.0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,100003,0.0,0,0,0,0.0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,100004,0.0,1,1,1,0.0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0.0,0,0,0,0.0,135000.0,312682.5,29686.5,297000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0.0,0,1,0,0.0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Préparation des données

In [50]:
# The row index is the ID of the current loan application
data = data.set_index('SK_ID_CURR')

# Split the target off from the training features
target = data['TARGET']
data = data.drop(columns='TARGET')

In [51]:
# Keep only the column labels, discarding the leftover CSV index column.
# Iterating a DataFrame yields its column labels and iterating a list yields
# its items, so this cell can be re-run safely (the previous in-place drop
# failed on a second run, once liste_quanti had become a list).
liste_quanti = [col for col in liste_quanti if col != 'Unnamed: 0']

In [52]:
# Split the data into a training set (80%) and a test set (remaining rows),
# with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data, target, train_size=0.8, random_state=0
)
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(label + " :", part.shape)

X_train : (243612, 159)
y_train : (243612,)
X_test : (60903, 159)
y_test : (60903,)


In [55]:
# Report the training-set dimensions (rows = individuals, columns = variables)
print("Notre échantillon d'entraînement contient %d individus avec %d variables" % X_train.shape)

Notre échantillon d'entraînement contient 243612 individus avec 159 variables


# Preprocessing

In [42]:
# Pipeline applied to the quantitative columns: standard scaling only
quanti_pipeline = make_pipeline(
    StandardScaler(),
)

# Preprocessor routing the columns named in liste_quanti through that pipeline.
# NOTE(review): make_column_transformer defaults to remainder='drop', so every
# column not listed in liste_quanti is discarded from the output - confirm
# this is intended.
preprocessor = make_column_transformer(
    (quanti_pipeline, liste_quanti),
)

In [58]:
# Print the full per-column summary of the feature table (verbose=True lists every column)
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304515 entries, 100002 to 456255
Data columns (total 159 columns):
 #    Column                                                         Dtype  
---   ------                                                         -----  
 0    NAME_CONTRACT_TYPE                                             int64  
 1    CODE_GENDER                                                    int64  
 2    FLAG_OWN_CAR                                                   int64  
 3    CNT_CHILDREN                                                   float64
 4    AMT_INCOME_TOTAL                                               float64
 5    AMT_CREDIT                                                     float64
 6    AMT_ANNUITY                                                    float64
 7    AMT_GOODS_PRICE                                                float64
 8    REGION_POPULATION_RELATIVE                                     float64
 9    DAYS_BIRTH                    