# **Création du dataset**

Notebook de base (non édité) : https://www.kaggle.com/code/hikmetsezen/micro-model-174-features-0-8-auc-on-home-credit

> **Imports**

In [None]:
!pip install ppscore

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load every CSV table of the Home Credit dataset from the Kaggle input directory.
# Each table is kept in its own module-level DataFrame, reused by the cells below.
sample_submission = pd.read_csv('/kaggle/input/home-credit-default-risk/sample_submission.csv')

previous_application = pd.read_csv('/kaggle/input/home-credit-default-risk/previous_application.csv')

installments_payments = pd.read_csv('/kaggle/input/home-credit-default-risk/installments_payments.csv')

credit_card_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/credit_card_balance.csv')

bureau_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau_balance.csv')

bureau = pd.read_csv('/kaggle/input/home-credit-default-risk/bureau.csv')

application_train = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')

application_test = pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv')

POS_CASH_balance = pd.read_csv('/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv')

In [None]:
# Every loaded table, listed in the same order as its display name below.
datasets = [application_train, application_test, bureau, bureau_balance, POS_CASH_balance,
            credit_card_balance, installments_payments, previous_application, sample_submission]

dataset_names = ['application_train', 'application_test', 'bureau', 'bureau_balance',
                 'POS_CASH_balance', 'credit_card_balance', 'installments_payments',
                 'previous_application', 'sample_submission']

# All column names across the tables, duplicates included.
columns = [column for dataset in datasets for column in dataset.columns]

# dict.fromkeys removes duplicates in linear time while keeping first-seen order.
unique_columns = list(dict.fromkeys(columns))
print(f"Nombre de features uniques : {len(unique_columns)}")
for name, dataset in zip(dataset_names, datasets):
    print(f"\nDimensions de {name} : {dataset.shape}")

On doit se retrouver avec un dataset d'entraînement de taille 307511 * n

et un dataset de test de taille 48744 * n avec n <= 195.

> **JOINTURES**

In [None]:
# Descriptions of the variables of the files loaded above: used to find the join keys between the files
desc = pd.read_csv('/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv', encoding="ISO-8859-1")
# Show the rows whose 'Row' value already appeared earlier in the table, sorted by 'Row'
desc.loc[desc.Row.duplicated(), ['Table', 'Row', 'Description']].sort_values(by='Row')

On peut joindre les fichiers grâce aux clés d'identification : SK_ID_CURR, SK_ID_BUREAU, SK_ID_PREV.

Le fichier sample_submission ne sera pas utilisé dans la jointure des fichiers.

In [None]:
# Keep the application identifiers of both sets and the training labels in their own variables.
train_ids = application_train['SK_ID_CURR']
test_ids = application_test['SK_ID_CURR']

labels = application_train['TARGET']

def merge_data():
    """Inner-join the auxiliary tables on their shared keys.

    Reads the module-level DataFrames. ``credit_card_balance`` is joined in
    turn with ``installments_payments``, ``POS_CASH_balance`` and
    ``previous_application`` on ``SK_ID_PREV``; ``bureau`` is joined with
    ``bureau_balance`` on ``SK_ID_BUREAU``.

    Returns:
        tuple: ``(prev, bur)``, the two joined DataFrames.
    """
    prev = credit_card_balance
    for other in (installments_payments, POS_CASH_balance, previous_application):
        prev = prev.merge(other, how='inner', on='SK_ID_PREV')

    bur = bureau.merge(bureau_balance, how='inner', on='SK_ID_BUREAU')
    return prev, bur

In [None]:
'''
prev, bur = merge_data()
print(prev.shape, bur.shape)
'''
# Prends trop de RAM

A cause des duplicatas de la variable SK_ID_CURR de bureau, une jointure des fichiers modifie le nombre de demandes dans le dataframe 'applications', il y a beaucoup trop de lignes à stocker.

On abandonne la jointure directe : on prend les deux datasets application_train/test et on rajoute des features avec le reste des fichiers via la méthode .groupby().

In [None]:
# Groupby for bureau/bureau_balance

# One-hot encode CREDIT_ACTIVE so that each credit status can be counted per client.
ohe = OneHotEncoder(sparse_output=False)
array_status = np.reshape(np.array(bureau['CREDIT_ACTIVE']), (-1,1))
# Name each indicator column after its category: categories_[0] lists the
# categories of the single encoded feature in output-column order, which avoids
# parsing the generated 'x0_<category>' feature names.
ohe_status = pd.DataFrame(ohe.fit_transform(array_status),
                          columns=[str(category) for category in ohe.categories_[0]])
# Positional index so that the column-wise concat below lines up row for row.
bureau.reset_index(drop=True, inplace=True)
# A single concat appends all indicator columns at once instead of copying the
# whole table once per column.
bureau = pd.concat([bureau, ohe_status], axis=1)
    
def bureau_groupby_merge(data):
    """Add per-client aggregates of the ``bureau`` table to ``data``.

    Reads the module-level ``bureau`` DataFrame, which must already contain the
    one-hot columns 'Active', 'Bad debt', 'Closed' and 'Sold'.

    All features are computed in a single ``groupby`` pass with named
    aggregation. The aggregated values stay attached to their ``SK_ID_CURR``
    label instead of being matched to the ids by position.

    Args:
        data: DataFrame with an ``SK_ID_CURR`` column.

    Returns:
        DataFrame: ``data`` left-joined with the aggregates on ``SK_ID_CURR``.
        Clients without any bureau record get NaN in the new columns.
    """
    application = (
        bureau.groupby('SK_ID_CURR')
        .agg(
            NB_CREDIT_ACTIVE=('Active', 'sum'),
            NB_CREDIT_DEBT=('Bad debt', 'sum'),
            NB_CREDIT_CLOSED=('Closed', 'sum'),
            NB_CREDIT_SOLD=('Sold', 'sum'),
            MEAN_DAYS_BETWEEN_APP=('DAYS_CREDIT', 'mean'),
            MEAN_CREDIT_DAY_OVERDUE=('CREDIT_DAY_OVERDUE', 'mean'),
            REMAINING_CREDIT_DAYS=('DAYS_CREDIT_ENDDATE', 'max'),
            MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', 'max'),
            NB_PROLONGATIONS=('CNT_CREDIT_PROLONG', 'sum'),
            TOT_CREDIT_AMT=('AMT_CREDIT_SUM', 'sum'),
            TOT_CREDIT_DEBT=('AMT_CREDIT_SUM_DEBT', 'sum'),
            MAX_CREDIT_LIMIT=('AMT_CREDIT_SUM_LIMIT', 'max'),
            TOT_CREDIT_OVERDUE=('AMT_CREDIT_SUM_OVERDUE', 'sum'),
            NB_CREDIT_TYPES=('CREDIT_TYPE', 'nunique'),
            MEAN_DAYS_SINCE_UPDATE=('DAYS_CREDIT_UPDATE', 'mean'),
            TOT_ANNUITY=('AMT_ANNUITY', 'sum'),
        )
        .reset_index()
    )

    merged = data.merge(application, how='left', on='SK_ID_CURR')
    return merged

# Enrich both application tables with the bureau aggregates.
data_train = bureau_groupby_merge(application_train)
data_test = bureau_groupby_merge(application_test)

# Delete the raw tables now that data_train/data_test have been built
# (presumably to free memory).
del bureau
del bureau_balance
del application_train
del application_test

print(data_train.shape)
print(data_test.shape)

In [None]:
# Groupby for previous_application

# One-hot encode NAME_CONTRACT_STATUS so that each application outcome can be counted per client.
ohe = OneHotEncoder(sparse_output=False)
array_status = np.reshape(np.array(previous_application['NAME_CONTRACT_STATUS']), (-1,1))
# Name each indicator column after its category: categories_[0] lists the
# categories of the single encoded feature in output-column order, which avoids
# parsing the generated 'x0_<category>' feature names.
ohe_status = pd.DataFrame(ohe.fit_transform(array_status),
                          columns=[str(category) for category in ohe.categories_[0]])
# Positional index so that the column-wise concat below lines up row for row.
previous_application.reset_index(drop=True, inplace=True)
# A single concat appends all indicator columns at once instead of copying the
# whole table once per column.
previous_application = pd.concat([previous_application, ohe_status], axis=1)

def previous_groupby_merge(data):
    """Add per-client aggregates of ``previous_application`` to ``data``.

    Reads the module-level ``previous_application`` DataFrame, which must
    already contain the one-hot columns 'Approved', 'Canceled', 'Refused' and
    'Unused offer'.

    The status counters use ``sum``: ``count`` on a 0/1 indicator column only
    counts non-null rows, i.e. the group size, whatever the status. For the
    same reason NB_PREV_INSURED sums the insurance flag, and NB_PREV_GOODSTYPE
    uses ``nunique`` to count distinct goods categories.

    Args:
        data: DataFrame with an ``SK_ID_CURR`` column.

    Returns:
        DataFrame: ``data`` left-joined with the aggregates on ``SK_ID_CURR``.
        Clients without any previous application get NaN in the new columns.
    """
    application = (
        previous_application.groupby('SK_ID_CURR')
        .agg(
            NB_PREV_CONTRACT_TYPE=('NAME_CONTRACT_TYPE', 'nunique'),
            MEAN_PREV_ANNUITY=('AMT_ANNUITY', 'mean'),
            MEAN_PREV_APP=('AMT_APPLICATION', 'mean'),
            MEAN_PREV_CREDIT=('AMT_CREDIT', 'mean'),
            MEAN_PREV_DOWNPAYMENT=('AMT_DOWN_PAYMENT', 'mean'),
            MEAN_PREV_GOODSPRICE=('AMT_GOODS_PRICE', 'mean'),
            NB_PREV_APPROVED=('Approved', 'sum'),
            NB_PREV_CANCELED=('Canceled', 'sum'),
            NB_PREV_REFUSED=('Refused', 'sum'),
            NB_PREV_UNUSED=('Unused offer', 'sum'),
            NB_PREV_GOODSTYPE=('NAME_GOODS_CATEGORY', 'nunique'),
            DAYS_FIRST_DRAWING=('DAYS_FIRST_DRAWING', 'max'),
            DAYS_FIRST_DUE=('DAYS_FIRST_DUE', 'min'),
            NB_PREV_INSURED=('NFLAG_INSURED_ON_APPROVAL', 'sum'),
        )
        .reset_index()
    )

    merged = data.merge(application, how='left', on='SK_ID_CURR')
    return merged

# Enrich both application tables with the previous-application aggregates.
data_train = previous_groupby_merge(data_train)
data_test = previous_groupby_merge(data_test)

# The raw table is deleted once its features have been merged (presumably to free memory).
del previous_application

print(data_train.shape)
print(data_test.shape)

In [None]:
# Groupby pour POS_CASH_balance

def POS_CASH_groupby_merge(data):
    """Add per-client aggregates of ``POS_CASH_balance`` to ``data``.

    Reads the module-level ``POS_CASH_balance`` DataFrame and computes, per
    ``SK_ID_CURR``: the mean of CNT_INSTALMENT, the max of
    CNT_INSTALMENT_FUTURE and the mean of SK_DPD.

    Args:
        data: DataFrame with an ``SK_ID_CURR`` column.

    Returns:
        DataFrame: ``data`` left-joined with the three aggregates on
        ``SK_ID_CURR`` (NaN for clients absent from ``POS_CASH_balance``).
    """
    per_client = POS_CASH_balance.groupby('SK_ID_CURR')
    features = per_client.agg(
        MEAN_INSTALMENTS=('CNT_INSTALMENT', 'mean'),
        MAX_INSTALMENTS_LEFT=('CNT_INSTALMENT_FUTURE', 'max'),
        MEAN_DAYS_PAST_DUE=('SK_DPD', 'mean'),
    )
    # Turn the group key back into a regular column so it can serve as the join key.
    features = features.reset_index()
    return data.merge(features, how='left', on='SK_ID_CURR')

# Enrich both application tables with the POS_CASH_balance aggregates.
data_train = POS_CASH_groupby_merge(data_train)
data_test = POS_CASH_groupby_merge(data_test)

# The raw table is deleted once its features have been merged (presumably to free memory).
del POS_CASH_balance

print(data_train.shape)
print(data_test.shape)

In [None]:
# Groupby pour credit_card_balance

def credit_card_groupby_merge(data):
    """Add per-client aggregates of ``credit_card_balance`` to ``data``.

    Reads the module-level ``credit_card_balance`` DataFrame.

    The credit-limit feature is named ``MAX_CC_CREDIT_LIMIT``: the bureau
    aggregation already produces a ``MAX_CREDIT_LIMIT`` column, and reusing
    that name makes the merge silently rename both columns to
    ``MAX_CREDIT_LIMIT_x`` / ``MAX_CREDIT_LIMIT_y``.

    Args:
        data: DataFrame with an ``SK_ID_CURR`` column.

    Returns:
        DataFrame: ``data`` left-joined with the aggregates on ``SK_ID_CURR``
        (NaN for clients absent from ``credit_card_balance``).
    """
    application = (
        credit_card_balance.groupby('SK_ID_CURR')
        .agg(
            MEAN_BALANCE=('AMT_BALANCE', 'mean'),
            MAX_CC_CREDIT_LIMIT=('AMT_CREDIT_LIMIT_ACTUAL', 'max'),
            MEAN_DRAWING=('AMT_DRAWINGS_CURRENT', 'mean'),
            MIN_INSTALMENT=('AMT_INST_MIN_REGULARITY', 'min'),
            MEAN_MONTHLY_PAYMENT=('AMT_PAYMENT_TOTAL_CURRENT', 'mean'),
            MEAN_DRAWING_FREQUENCY=('CNT_DRAWINGS_CURRENT', 'mean'),
        )
        .reset_index()
    )

    merged = data.merge(application, how='left', on='SK_ID_CURR')
    return merged

# Enrich both application tables with the credit_card_balance aggregates.
data_train = credit_card_groupby_merge(data_train)
data_test = credit_card_groupby_merge(data_test)

# The raw table is deleted once its features have been merged (presumably to free memory).
del credit_card_balance

print(data_train.shape)
print(data_test.shape)

In [None]:
# Pas de feature pertinente à extraire
del installments_payments

Fin de la jointure. Nos datasets finaux sont data_train et data_test.

# **Nettoyage des données**

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# One pie chart per dataset: share (and number) of columns for each dtype.
for ax, frame, title in zip(axs, (data_train, data_test), ('Data_train dtypes', 'Data_test dtypes')):
    colors = sns.color_palette('pastel')[0:frame.dtypes.nunique()]
    counts = frame.dtypes.value_counts()
    ax.pie(counts.values, labels=counts.index, colors=colors,
           autopct=lambda p : '{:.2f}%  ({:,.0f})'.format(p, p*sum(counts.values)/100))
    ax.set_title(title)

plt.show()

16 variables à encoder si l'étude des valeurs manquantes ne nous amène pas à supprimer des variables catégoriques.

In [None]:
# Summary statistics of the int64 columns of data_train.
int64_col = data_train.dtypes[data_train.dtypes=='int64'].index
describe = data_train[int64_col].describe()
# Display the first 20 columns of the summary.
describe.iloc[:, :20]

In [None]:
describe.iloc[:, 20:]

Les variables TARGET, les 6 variables FLAG_, REG_REGION_NOT_LIVE_REGION, REG_REGION_NOT_WORK_REGION, 
LIVE_REGION_NOT_WORK_REGION, REG_CITY_NOT_LIVE_CITY, REG_CITY_NOT_WORK_CITY, LIVE_CITY_NOT_WORK_CITY et les 20 variables FLAG_DOCUMENT_ sont des variables catégoriques (binaires).

In [None]:
# Categorical variables: every object-typed column, followed by the
# integer-coded binary columns listed by name.
train_cat = [name for name, dtype in data_train.dtypes.items() if dtype == object]
train_cat.extend([
    'TARGET', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
])
# FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
train_cat.extend(f'FLAG_DOCUMENT_{number}' for number in range(2, 22))

# Every remaining column is treated as numerical.
train_num = [name for name in data_train.columns if name not in train_cat]

print(f"Nombre de variables catégoriques : {len(train_cat)}")
print(f"Nombre de variables numériques : {len(train_num)}")

In [None]:
# Names of the 20 document flags: FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21.
FLAG_DOCUMENT = [f'FLAG_DOCUMENT_{number}' for number in range(2, 22)]

def feature_engineering(data):
    """Add three derived features to ``data``, modifying it in place.

    - NB_FLAG_DOCUMENTS: row-wise sum of the FLAG_DOCUMENT_* columns, which
      are then dropped.
    - AGE: DAYS_BIRTH divided by -365.
    - CREDIT_INCOME_RATIO: AMT_CREDIT divided by AMT_INCOME_TOTAL.

    Args:
        data: DataFrame holding the FLAG_DOCUMENT_* columns, DAYS_BIRTH,
            AMT_CREDIT and AMT_INCOME_TOTAL.

    Returns:
        None.
    """
    document_flags = data[FLAG_DOCUMENT]
    data['NB_FLAG_DOCUMENTS'] = document_flags.sum(axis=1)
    data.drop(columns=FLAG_DOCUMENT, inplace=True)

    data['AGE'] = data['DAYS_BIRTH'].div(-365)

    data['CREDIT_INCOME_RATIO'] = data['AMT_CREDIT'].div(data['AMT_INCOME_TOTAL'])
    
# Both frames are modified in place.
feature_engineering(data_train)
feature_engineering(data_test)

# Keep the bookkeeping lists in sync: three new numerical features, and the
# document flags no longer exist as categorical variables.
train_num += ['NB_FLAG_DOCUMENTS', 'AGE', 'CREDIT_INCOME_RATIO']
# NOTE: the loop variable `cat` stays bound at module level after this loop.
for cat in FLAG_DOCUMENT:
    train_cat.remove(cat)

print(f"Nombre de variables numériques : {len(train_num)}\nNombre de variables catégoriques : {len(train_cat)}")
print("Nouvelles dimensions")
print(f"data_train : {data_train.shape}\ndata_test  {data_test.shape}")

> **Valeurs manquantes**

In [None]:
def missing_pct(data):
    """Drop the mostly-empty columns of ``data`` and plot the remaining gaps.

    Columns whose share of missing values, rounded to two decimals, exceeds
    60% are dropped from ``data`` in place and removed from the module-level
    lists ``train_num`` / ``train_cat``. A horizontal bar chart then shows the
    percentage of missing values of every column that still has some.

    Args:
        data: DataFrame to clean; modified in place.

    Returns:
        None.
    """
    # Drop the variables that are more than 60% empty
    n_cols_before = data.shape[1]
    n_rows = data.shape[0]
    na_counts = data.isna().sum()
    too_empty = [col for col in data.columns
                 if round(na_counts[col]/n_rows*100, 2) > 60]
    data.drop(columns=too_empty, inplace=True)
    for col in too_empty:
        if col in train_num:
            train_num.remove(col)
        elif col in train_cat:
            train_cat.remove(col)
    print(f"Nombre de variables supprimées : {n_cols_before-data.shape[1]}")
    print(f"Variables catégoriques restantes : {len(train_cat)}")     
    print(f"Variables numériques restantes : {len(train_num)}")

    # Plot missing values
    plt.figure(figsize=(15, 15))
    remaining_na = data.isna().sum()
    counts = remaining_na[remaining_na > 0].sort_values()
    print(f"Nombre de variables avec des valeurs manquantes : {counts.shape[0]}")
    percentages = np.round(counts.values/data.shape[0]*100, 2)
    ax = sns.barplot(x=percentages, y=counts.index)
    # Plot with % labels
    for container in ax.containers:
        plt.bar_label(container, )
    plt.show()

missing_pct(data_train)

In [None]:
missing_pct(data_test)

Il y a une grande portion des datasets qui est vide : 83 variables en entraînement et 80 en test ont des valeurs manquantes.
La majorité de ces variables sont à moitié vides.

Il y avait 23 variables dans chaque dataset avec plus de 60% de valeurs manquantes qui ont été supprimées.

On choisit d'abord d'utiliser un SimpleImputer avec moyenne pour les float et médiane pour les entiers.

On se réserve l'option d'y revenir si les résultats du classifieur ne sont pas satisfaisants.

In [None]:
# Strategy = mean for floats
si = SimpleImputer(missing_values=np.nan, strategy='mean')

float_64 = list(data_train.dtypes[data_train.dtypes == float].index)
data_train[float_64] = si.fit_transform(data_train[float_64])
# Fill the test set with the means learned on the TRAINING set, so both sets go
# through the same transformation (the imputer is not re-fitted on test data).
train_means = pd.Series(si.statistics_, index=float_64)
float_64 = list(data_test.dtypes[data_test.dtypes == float].index)
# Start from the test means as a fallback for float columns that only exist in
# the test set, then overwrite them with the training means wherever available.
test_fill = data_test[float_64].mean()
test_fill.update(train_means)
data_test[float_64] = data_test[float_64].fillna(test_fill)

# Strategy = median for ints
# NOTE(review): int64 columns cannot hold NaN, so nothing is actually imputed
# here; the step is kept as it was (the imputer output presumably comes back as
# floats — TODO confirm whether this cast is relied upon).
si = SimpleImputer(missing_values=np.nan, strategy='median')

data = data_train.drop(columns=['TARGET'])
int_64 = list(data.dtypes[data.dtypes == int].index)
data_train[int_64] = si.fit_transform(data_train[int_64])
int_64 = list(data_test.dtypes[data_test.dtypes == int].index)
data_test[int_64] = si.fit_transform(data_test[int_64])

missing_pct(data_train)

In [None]:
missing_pct(data_test)

In [None]:
def fillna_cat(data):
    """Fill the missing values of five categorical columns of ``data`` in place.

    Each column is reassigned (``data[col] = data[col].fillna(...)``) rather
    than filled with ``inplace=True`` on the selected column: the latter is
    chained assignment, which emits a FutureWarning on recent pandas and does
    not modify ``data`` under Copy-on-Write.

    Args:
        data: DataFrame holding NAME_TYPE_SUITE, OCCUPATION_TYPE,
            EMERGENCYSTATE_MODE, HOUSETYPE_MODE and WALLSMATERIAL_MODE.

    Returns:
        None.
    """
    constant_fills = {
        # NaN in NAME_TYPE_SUITE = nobody accompanying the client
        'NAME_TYPE_SUITE': 'Unaccompanied',
        # NaN in OCCUPATION_TYPE = other occupation OR no occupation
        'OCCUPATION_TYPE': 'Other/No occupation',
        # NaN in HOUSETYPE_MODE = other type of housing OR no housing
        'HOUSETYPE_MODE': 'Other/No housetype',
        # NaN in WALLSMATERIAL_MODE = other material: an existing category
        'WALLSMATERIAL_MODE': 'Others',
    }
    for column, value in constant_fills.items():
        data[column] = data[column].fillna(value)

    # EMERGENCYSTATE_MODE ('Yes' or 'No'): replace with the most frequent value.
    observed = data['EMERGENCYSTATE_MODE'].value_counts()
    # An entirely empty column has no most frequent value: leave it untouched
    # instead of failing on index[0].
    if not observed.empty:
        data['EMERGENCYSTATE_MODE'] = data['EMERGENCYSTATE_MODE'].fillna(observed.index[0])

# Fill the categorical gaps of both sets, then print the total number of missing values left in each.
fillna_cat(data_train)
fillna_cat(data_test)

print("Nombre de valeurs manquantes")
print(f"- dans data_train : {data_train.isna().sum().sum()}")
print(f"- dans data_test : {data_test.isna().sum().sum()}")

> **DATA ANALYSIS**

In [None]:
counts = labels.value_counts()

plt.pie(counts.values, labels=counts.index,
        autopct=lambda p :'{:.2f}%  ({:,.0f})'.format(p, p*sum(counts.values)/100))
plt.show()

Rappel :

0 -> Crédit remboursé dans les temps

1 -> Défaut de paiement

La variable cible est très déséquilibrée

Pour analyser les corrélations on encode toutes les variables de type objet.

Pour minimiser le nombre de colonnes ajoutées :

- Si le nombre de catégories est inférieur ou égal à 2 : LabelEncoder
- Sinon : OneHotEncoder

In [None]:
data_train = data_train.loc[data_train.CODE_GENDER != 'XNA', :]
le = LabelEncoder()

def encoding(data):
    """Encode the object-typed columns of ``data``.

    Object columns with at most two distinct values are label-encoded with the
    module-level ``le`` encoder; the others are one-hot encoded with
    ``pd.get_dummies``.

    The work is done on a copy, so the caller's frame is left untouched and
    writing into a frame obtained by row filtering no longer triggers pandas'
    SettingWithCopyWarning.

    Args:
        data: DataFrame to encode.

    Returns:
        tuple: ``(encoded, new_columns)`` where ``encoded`` is the encoded
        DataFrame and ``new_columns`` lists the columns created by the
        one-hot encoding.
    """
    data = data.copy()
    original_columns = set(data.columns)
    object_columns = data.dtypes[data.dtypes==object].index
    inf2 = [cat for cat in object_columns if data[cat].nunique() <= 2]
    sup2 = [cat for cat in object_columns if data[cat].nunique() > 2]

    for cat in inf2:
        data[cat] = le.fit_transform(data[cat])

    data = pd.get_dummies(data, columns=sup2)

    new_columns = [c for c in data.columns if c not in original_columns]
    return data, new_columns

data_train, new_columns_tr = encoding(data_train)
data_test, new_columns_te = encoding(data_test)

print(f"Nombre de colonnes ajoutées à data_train : {len(new_columns_tr)}")
print(f"Dimensions data_train : {data_train.shape}")
print(f"Nombre de colonnes ajoutées à data_test : {len(new_columns_te)}")
print(f"Dimensions data_test : {data_test.shape}")

In [None]:
print("Variables présentes dans le dataset d'entraînement et non dans le dataset de test :")
print()

var_to_pop = [var for var in data_train.columns if var not in data_test.columns]
# Drop from train every column that test lacks, except the label. TARGET is
# excluded by name: skipping the first list element only works while TARGET
# happens to come before all the other train-only columns.
data_train.drop(columns=[var for var in var_to_pop if var != 'TARGET'], inplace=True)

print(var_to_pop)
print(f"Dimensions data_train : {data_train.shape}")
print(f"Dimensions data_test : {data_test.shape}")

FINAL : 235 features avec variable d'identification.

In [None]:
# Pearson correlation matrix of data_train, with TARGET moved to the first position.
pearson = data_train[['TARGET'] + [col for col in data_train.columns if col != 'TARGET']].corr()

# Display the matrix formatted with "{:.1}" and a 'coolwarm' background gradient applied with axis=1.
pearson.style.format("{:.1}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

Les 3 variables EXT_SOURCE_ sont les seules à avoir un coefficient de corrélation absolu supérieur ou égal à 0.1 avec TARGET.

Globalement, la variable TARGET n'a pas l'air d'être influencée par les variables du dataset d'entraînement.

De plus, ces coefficients nous montrent qu'il n'y a pas de data leakage.

In [None]:
# RobustScaler is only imported further down (modelling section); import it
# here so that this cell also runs when the notebook is executed top to bottom.
from sklearn.preprocessing import RobustScaler

# Columns whose maximum is 1 are treated as binary/categorical and left unscaled.
cat_test = [col for col in data_test.columns if np.max(data_test[col])==1]
# Exclude the columns of `cat_test`. The name `cat` must not be used here: at
# this point it is the leftover loop variable of an earlier cell (a single
# column name), so `col not in cat` would be a substring test that excludes
# nothing, and every column would be scaled.
binary_columns = set(cat_test)
num = [col for col in data_test.columns if col not in binary_columns]
num.remove('SK_ID_CURR')
data_test_sc = data_test.copy()
# NOTE(review): the scaler is fitted on the test set itself, while data_train
# is scaled by a separate scaler fitted on train; consider reusing the
# train-fitted scaler so both sets share the same scale.
data_test_sc[num] = RobustScaler().fit_transform(data_test_sc[num])

data_test_sc['SK_ID_CURR'] = data_test_sc['SK_ID_CURR'].astype(int)
data_test_sc[cat_test] = data_test_sc[cat_test].astype(int)

data_test_sc.to_csv(r'app_test.csv', index=False) # upload of the clean data

In [None]:
# RobustScaler is only imported further down (modelling section); import it
# here so that this cell also runs when the notebook is executed top to bottom.
from sklearn.preprocessing import RobustScaler

# Columns whose maximum is 1 are treated as binary/categorical and left unscaled.
cat = [col for col in data_train.columns if np.max(data_train[col])==1]
num = [col for col in data_train.columns if col not in cat]
num.remove('SK_ID_CURR')
data_train_sc = data_train.copy()
data_train_sc[num] = RobustScaler().fit_transform(data_train_sc[num])

data_train_sc['SK_ID_CURR'] = data_train_sc['SK_ID_CURR'].astype(int)
data_train_sc[cat] = data_train_sc[cat].astype(int)

data_train_sc.to_csv(r'data_train.csv', index=False)

> **Feature importance**

In [None]:
import ppscore as pps
# Reload the scaled training set written to data_train.csv.
data_train = pd.read_csv('data_train.csv')
# TARGET is cast to object before scoring — presumably so that ppscore treats
# it as a categorical target (TODO confirm).
data_train['TARGET'] = data_train['TARGET'].astype(object)

predictors = pps.predictors(data_train, 'TARGET')
# Restore the integer dtype of TARGET.
data_train['TARGET'] = data_train['TARGET'].astype(int)
# Keep only the predictors with a strictly positive score, best first.
predictors = predictors.loc[predictors['ppscore'] > 0].sort_values(by='ppscore', ascending = False)
predictors

In [None]:
predictive_var = list(predictors.loc[:, 'x'].values)

176 variables sur 235 ont un Predictive Power Score non nul. Il sera nécessaire d'essayer un modèle avec pénalisation et de faire une étude de Feature Importance pour faire une sélection de feature si le modèle performe mal.

Aucun de ces scores ne dépasse 0.02, sachant qu'un ppscore parfait est égal à 1.

# **Modélisation**

In [None]:
# !pip install mlflow : sur terminal
!pip install xgboost

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Metrics adaptés au problème de déséquilibre des coûts
from sklearn.metrics import confusion_matrix, roc_auc_score, fbeta_score, recall_score, precision_score

#Modèles
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

#gestion du desiquilibre de TARGET
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn.preprocessing import RobustScaler

import shap
from collections import Counter

> **Avec over sampling**

In [None]:
# Reload the scaled training set saved earlier.
data_train = pd.read_csv('data_train.csv')

# Features: every column except the label and the client identifier.
X = data_train.drop(columns=['TARGET', 'SK_ID_CURR']).copy()
y = data_train['TARGET'].copy()

# Stratified 80/20 split.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

# The number of loans in default being fairly low (25 000): 
# we choose an over-sampling method
smote = SMOTE()
# Only the training part is resampled; the validation set is left as split.
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

print('y_train', Counter(y_train_over))
print('y_val', Counter(y_val))

Il n'y a plus de déséquilibre des classes mais il faudra faire attention à l'overfitting possible des futurs modèles (conséquence de l'over-sampling)

On commence par un DummyClassifier :

In [None]:
"""
def make_model(classifier, classifier__param, list_param):
    pipeline = Pipeline(steps = [['scaler', MinMaxScaler()],
                                 ['classifier', classifier]])
    
    param_grid = { classifier__param : list_param }
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               scoring='roc_auc',
                               cv=3,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    print("Best params : ")
    print(grid_search.best_params_)
    return grid_search.best_estimator_
"""

In [None]:
# Comparison table: one row per metric, one column per evaluated model.
comp = pd.DataFrame(index = ['precision', 'recall', 'Score métier', 'AUC'])

def score_métier(model, model_str, X=None, y=None):
    """Evaluate ``model`` on validation data and record its scores in ``comp``.

    The decision threshold is picked among 17 values between 0.1 and 0.9 as the
    first one maximising the F-beta score with beta=2. Precision, recall,
    F-beta with beta=10 and ROC AUC are then printed and stored in the
    module-level ``comp`` table under the column ``model_str``.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        model_str: column name under which the scores are stored in ``comp``.
        X: validation features; defaults to the module-level ``X_val``.
        y: validation labels; defaults to the module-level ``y_val``.

    Returns:
        DataFrame: the confusion matrix at the selected threshold (rows are
        the true classes, columns the predicted classes).
    """
    X = X_val if X is None else X
    y = y_val if y is None else y
    y_proba = model.predict_proba(X)[:, 1]

    # Threshold tuned to favour positive predictions, which have a lower cost.
    # The comparison is vectorised (one array operation per threshold) and the
    # threshold grid is built once.
    seuils = np.linspace(0.1, 0.9, 17)
    scores = [fbeta_score(y, (y_proba > seuil).astype(int), beta=2) for seuil in seuils]
    # np.argmax returns the first maximum, like list.index(max) did.
    seuil_opt = seuils[int(np.argmax(scores))]
    y_pred = (y_proba > seuil_opt).astype(int)
    print(f"Seuil optimal = {seuil_opt}")

    conf_mat = pd.DataFrame(confusion_matrix(y, y_pred),
                            columns = ['pred_0', 'pred_1'],
                            index = ['true_0', 'true_1'])

    precision = round(precision_score(y, y_pred), 3)
    recall = round(recall_score(y, y_pred), 3)  # Recall => priority over precision
    fbeta = round(fbeta_score(y, y_pred, beta=10), 3)  # Cost FN = 10 * Cost FP
    auc = round(roc_auc_score(y, y_proba), 3)

    print(f"Precision score       : {precision}")
    print(f"Recall score          : {recall}")
    print(f"FBêta score           : {fbeta}")
    print(f"ROC AUC score         : {auc}")

    comp[model_str] = [precision, recall, fbeta, auc]

    return conf_mat

In [None]:
#dummy = make_model(DummyClassifier(), 'classifier__strategy', ["most_frequent", "prior", "stratified", "uniform"])
# Baseline model fitted on the over-sampled training set.
dummy = DummyClassifier(strategy='stratified')
dummy.fit(X_train_over, y_train_over)

# Scores are stored in `comp` under the column 'Baseline'.
score_métier(dummy, 'Baseline')

In [None]:
# Logistic regression fitted on the over-sampled training set.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_over, y_train_over)

score_métier(lr, 'Logistic Regression') #3min

Il y a moins de faux négatifs et tous les metrics ont été améliorés ; par contre, il n'y a pas convergence.

In [None]:
# Random forest with default hyper-parameters, fitted on the over-sampled training set.
rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(X_train_over, y_train_over)

score_métier(rfc, 'Random Forest') # about 8min

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with default hyper-parameters, fitted on the over-sampled training set.
xgbc = XGBClassifier()
xgbc.fit(X_train_over, y_train_over)

score_métier(xgbc, 'XGradient Boosting') # 20min...

Le nombre de faux négatifs a augmenté.

In [None]:
import re
# Copies of the feature frames whose column names keep only letters, digits and
# underscores (every other character is removed by the regex).
X_train_over_lgbm = X_train_over.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X_val_lgbm = X_val.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# LightGBM with default hyper-parameters; AUC is tracked on both the training and validation sets.
lgbm = LGBMClassifier()
lgbm.fit(X_train_over_lgbm, y_train_over, eval_set=[(X_train_over_lgbm, y_train_over), (X_val_lgbm, y_val)],
         eval_metric='auc')


# NOTE(review): score_métier predicts on X_val, whose column names were not
# sanitised like the training frame — confirm LightGBM matches the features as intended.
score_métier(lgbm, 'LGBM Classifier') 

Pareil que XGBClassifier.

In [None]:
comp

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
# One panel per metric (row of comp), filled row by row; models sorted by score.
for ax, index in zip(axs.flat, comp.index):
    val = comp.loc[index, :].sort_values()
    sns.barplot(x=val.values, y=val.index, ax=ax)
    ax.set_title(f"{index}")

plt.show()

In [None]:
comp = comp[['LGBM Classifier', 'Random Forest']] # LGBMClassifier au lieu de XGBClassifier car XGB beaucoup plus long

> **Avec under sampling**

In [None]:
# Reuse the train/validation split made for the over-sampling experiments
# instead of drawing a new one. A new random split would:
#  - leave X_val_lgbm (built from the previous X_val) out of sync with y_val;
#  - put rows used to train the over-sampled models into the validation set,
#    so the models tuned later on X_train_over would be scored on data they
#    have already seen.

# we try an under-sampling method
nearmiss = NearMiss()
X_train_under, y_train_under = nearmiss.fit_resample(X_train, y_train)

print('y_train', Counter(y_train_under))
print('y_val', Counter(y_val))

In [None]:
# Logistic regression fitted on the under-sampled training set.
lr_under = LogisticRegression(max_iter=1000)
lr_under.fit(X_train_under, y_train_under)

score_métier(lr_under, 'LR under')

In [None]:
# Random forest fitted on the under-sampled training set.
rf_under = RandomForestClassifier(n_jobs=-1)
rf_under.fit(X_train_under, y_train_under)

score_métier(rf_under, 'RF under')

In [None]:
# XGBoost classifier fitted on the under-sampled training set.
xgb_under = XGBClassifier()
xgb_under.fit(X_train_under, y_train_under)

score_métier(xgb_under, 'XGB under')

In [None]:
X_train_under_lgbm = X_train_under.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
# Rebuild the sanitised validation frame from the current X_val, so that the
# eval_set below always pairs features and labels coming from the same split.
X_val_lgbm = X_val.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# LightGBM fitted on the under-sampled training set; AUC is tracked on both sets.
lgbm_under = LGBMClassifier()
lgbm_under.fit(X_train_under_lgbm, y_train_under, eval_set=[(X_train_under_lgbm, y_train_under), (X_val_lgbm, y_val)],
               eval_metric='auc')

score_métier(lgbm_under, 'LGBM under')

In [None]:
comp

L'approche under-sampling nous ramène à un roc_auc d'environ 0.5.

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
# One panel per metric (row of comp), filled row by row; models sorted by score.
for ax, index in zip(axs.flat, comp.index):
    val = comp.loc[index, :].sort_values()
    sns.barplot(x=val.values, y=val.index, ax=ax)
    ax.set_title(f"{index}")

plt.show()

On va essayer d'optimiser :
- les hyperparamètres du LGBMClassifier pour améliorer le recall
- les hyperparamètres du RandomForest pour améliorer le recall

> **Optimisation des modèles choisis**

In [None]:
# Search space for the random forest.
param_dict = {'n_estimators': np.linspace(20, 200, 10).astype(int),
              'max_depth': np.linspace(2, 20, 10).astype(int),
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],}
                
# Randomised search scored on recall with 3-fold cross-validation.
# NOTE(review): both the forest and the search use n_jobs=-1 — check for CPU over-subscription.
# NOTE(review): the folds are cut from the already over-sampled data, so the
# validation folds contain SMOTE-generated samples; the CV recall may be optimistic.
rs = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), 
                        param_distributions = param_dict,
                        scoring = 'recall',
                        cv = 3,
                        verbose = 3,
                        n_jobs = -1)
                        
rs.fit(X_train_over, y_train_over)

print(rs.best_params_)

In [None]:
# Refit with fixed hyper-parameters (written by hand here, not read from rs.best_params_).
rfc_opt = RandomForestClassifier(n_estimators=50, max_depth=12, n_jobs=-1)
rfc_opt.fit(X_train_over, y_train_over)

score_métier(rfc_opt, 'RandomForest tuned') # 3min

In [None]:
from sklearn.metrics import accuracy_score
# Compare accuracy on the (over-sampled) training set and on the validation set.
print('Training Accuracy : ',
      accuracy_score(y_train_over, rfc_opt.predict(X_train_over))*100)
print('Validation Accuracy : ',
      accuracy_score(y_val, rfc_opt.predict(X_val))*100) # No overfitting

In [None]:
# Search space for LightGBM.
# NOTE(review): uniform_drop and xgboost_dart_mode look like DART-specific
# options while the estimator uses its default boosting type — confirm they have any effect.
param_dict = {'uniform_drop': [True, False],
              'xgboost_dart_mode': [True, False],
              'n_estimators': np.linspace(20, 200, 10).astype(int),
              'num_leaves': [2, 20, 200],
              'min_data_in_leaf': [1, 10, 100, 1000],
              'max_depth': np.linspace(1, 10, 10).astype(int)}
    
# Randomised search scored on recall with 2-fold cross-validation.
rs1 = RandomizedSearchCV(LGBMClassifier(), 
                         param_distributions = param_dict,
                         scoring = 'recall',
                         cv = 2,
                         verbose = 3,
                         n_jobs = -1)
                        
rs1.fit(X_train_over_lgbm, y_train_over)

print(rs1.best_params_)

In [None]:
# Refit with fixed hyper-parameters (written by hand here, not read from rs1.best_params_).
lgbm_opt = LGBMClassifier(xgboost_dart_mode = False, uniform_drop = False, num_leaves = 200,
                          n_estimators=160, min_data_in_leaf = 1000, max_depth = 10)
lgbm_opt.fit(X_train_over_lgbm, y_train_over)

score_métier(lgbm_opt, 'LGBM tuned') # 1min

In [None]:
# Compare accuracy on the (over-sampled) training set and on the validation set.
print('Training Accuracy : ',
      accuracy_score(y_train_over, lgbm_opt.predict(X_train_over_lgbm))*100)
print('Validation Accuracy : ',
      accuracy_score(y_val, lgbm_opt.predict(X_val_lgbm))*100) # No overfitting

In [None]:
comp = comp[['Random Forest', 'RandomForest tuned', 
             'LGBM Classifier', 'LGBM tuned']]

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
# One panel per metric (row of comp), filled row by row; models sorted by score.
for ax, index in zip(axs.flat, comp.index):
    val = comp.loc[index, :].sort_values()
    sns.barplot(x=val.values, y=val.index, ax=ax)
    ax.set_title(f"{index}")

plt.show()

In [None]:
fig, axs = plt.subplots(2, 1, figsize=(15, 30))
for i, model in enumerate([rfc_opt, lgbm_opt]):
    fi = model.feature_importances_
    # The two estimators store their column names under different attributes;
    # `is` states the intended identity check explicitly.
    names = model.feature_names_in_ if model is rfc_opt else model.feature_name_
    df_fi = pd.DataFrame({'Feature importance': fi, 'Feature name': names})
    # Importance expressed as a percentage of the most important feature.
    df_fi['Relative importance'] = round((df_fi['Feature importance']/np.max(df_fi['Feature importance']))*100, 2)
    # Keep the features above 5% and sort them in one chained expression: sorting
    # a filtered slice in place triggers pandas' SettingWithCopyWarning.
    df_fi = df_fi[df_fi['Relative importance'] > 5].sort_values(by='Relative importance')

    sns.barplot(data=df_fi, x='Relative importance', y='Feature name', orient='h', ax=axs[i])
    for j in axs[i].containers:
        axs[i].bar_label(j, )
    axs[i].set_xlabel('Importance relative')
    axs[i].set_ylabel('Variables')

axs[0].set_title(f'Importance relative des variabless\nRandomForest')
axs[1].set_title(f'Importance relative des variabless\nLGBMClassifier')

plt.show()

In [None]:
# Random forest features sorted by decreasing importance, truncated to as many rows as predictive_var.
df_fi = pd.DataFrame(rfc_opt.feature_importances_, columns=['Feature importance RFC'])
df_fi['Feature name RFC'] = rfc_opt.feature_names_in_
df_fi = df_fi.sort_values(by='Feature importance RFC', ascending=False).iloc[:len(predictive_var), :]
df_fi.reset_index(drop=True, inplace=True)

# Put the variables ranked by ppscore next to them.
# NOTE(review): this assignment assumes the model has at least len(predictive_var) features.
df_fi['predictive variables'] = predictive_var

# Same ranking for the LightGBM model.
df_fi2 = pd.DataFrame(lgbm_opt.feature_importances_, columns=['Feature importance LGBM'])
df_fi2['Feature name LGBM'] = lgbm_opt.feature_name_
df_fi2 = df_fi2.sort_values(by='Feature importance LGBM', ascending=False).iloc[:len(predictive_var), :]
df_fi2.reset_index(drop=True, inplace=True)

# Top 10 of the three rankings side by side.
pd.concat([df_fi, df_fi2], axis=1)[:10]

Choix du modèle final : LGBMClassifier() avec over sampling

Il y a des features qui peuvent être supprimées:

In [None]:
import time
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Recursive feature elimination with cross-validation: 5 features are
# dropped per iteration and each subset is scored by ROC AUC on 5
# stratified folds, using an LGBMClassifier as the ranking estimator.
# NOTE(review): the selector is fitted on the oversampled training set;
# if oversampling duplicates rows, copies may land in both the train and
# validation folds -- confirm this is acceptable.
start = time.time()

rfecv = RFECV(
    estimator=LGBMClassifier(
        xgboost_dart_mode=False,
        uniform_drop=False,
        num_leaves=200,
        n_estimators=160,
        min_data_in_leaf=1000,
        max_depth=10,
    ),
    step=5,
    cv=StratifiedKFold(5),
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
)
rfecv.fit(X_train_over_lgbm, y_train_over)

# Report the wall-clock duration and the size of the best subset.
print('Time Elapsed: {}'.format(time.time()-start))
print("Optimal number of features : %d" % rfecv.n_features_)

In [None]:
# Cross-validated ROC AUC as a function of the number of features kept.
mean_scores = rfecv.cv_results_['mean_test_score']

# Read the elimination settings back from the fitted selector instead of
# hard-coding them, so the plot stays in sync with the RFECV call.
step = rfecv.step
n_total = rfecv.n_features_in_
if 0.0 < step < 1.0:
    # A fractional step is a share of the features removed per iteration.
    step = int(max(1, step * n_total))

# `cv_results_` is ordered from the smallest subset to the full feature set:
# the last entry used all features, each earlier one `step` fewer, and the
# count never drops below `min_features_to_select`. Plotting the result
# index (1, 2, 3, ...) would mislabel the x-axis, so the real feature
# counts are reconstructed.
n_subsets = len(mean_scores)
n_features_tested = np.maximum(
    n_total - step * np.arange(n_subsets - 1, -1, -1),
    rfecv.min_features_to_select,
)

plt.figure(figsize=(20, 5))
plt.plot(n_features_tested, mean_scores)
plt.xlabel('Number of features selected')
plt.ylabel('Cross-validation score (roc_auc)')
plt.show()

In [None]:
# Rank every input feature as reported by the fitted RFECV selector and keep
# those with rank 1.
ranking = pd.DataFrame({'Features': rfecv.feature_names_in_})
ranking['RANK'] = np.asarray(rfecv.ranking_)
ranking.sort_values('RANK', inplace=True)

features_selection = ranking[ranking.RANK == 1]['Features'].to_list()

# Restrict the training and validation matrices to the selected features.
X_train_over_lgbm = X_train_over_lgbm.loc[:, features_selection]
X_val_lgbm = X_val_lgbm.loc[:, features_selection]

# Load the test set and strip from its column names every character that is
# not an ASCII letter, a digit or an underscore.
# NOTE(review): presumably this mirrors the renaming applied to the training
# columns so that `features_selection` matches -- confirm upstream.
data_test = pd.read_csv('/kaggle/working/app_test.csv')
data_test_lgbm = data_test.rename(columns=lambda x: re.sub(r'[^A-Za-z0-9_]+', '', x))

# Explicit copy: a column is added just below, and assigning into a bare
# selection raises pandas' SettingWithCopyWarning.
data_test = data_test_lgbm.loc[:, features_selection].copy()

# Keep the client identifier next to the selected features.
data_test['SK_ID_CURR'] = data_test_lgbm['SK_ID_CURR']
data_test.shape

In [None]:
# Fit the final LightGBM model on the RFECV-selected features, choose the
# decision threshold on the validation set, then report the metrics and
# store them as a new column of `comp`.
lgbm_fin = LGBMClassifier(
    max_depth=10,
    min_data_in_leaf=1000,
    n_estimators=160,
    num_leaves=200,
    uniform_drop=False,
    xgboost_dart_mode=False,
)
lgbm_fin.fit(X_train_over_lgbm, y_train_over)

# Probability of the positive class for every validation row.
y_proba = lgbm_fin.predict_proba(X_val_lgbm)[:, 1]

# Grid of 17 candidate thresholds from 0.1 to 0.9; keep the first one that
# maximises the F-beta score with beta=2.
# NOTE(review): the threshold is tuned with beta=2 but the F-beta reported
# below uses beta=10 -- confirm the mismatch is intended.
seuils = np.linspace(0.1, 0.9, 17)
scores = [
    fbeta_score(y_val, (y_proba > seuil).astype(int), beta=2)
    for seuil in seuils
]
seuil_opt = seuils[int(np.argmax(scores))]
y_pred = pd.Series((y_proba > seuil_opt).astype(int))
print(f"Seuil optimal = {seuil_opt}")

conf_mat = pd.DataFrame(
    confusion_matrix(y_val, y_pred),
    index=['true_0', 'true_1'],
    columns=['pred_0', 'pred_1'],
)

precision = round(precision_score(y_val, y_pred), 3)
# Recall takes priority over precision.
recall = round(recall_score(y_val, y_pred), 3)
# beta=10: a false negative is taken to cost 10x a false positive.
fbeta = round(fbeta_score(y_val, y_pred, beta=10), 3)
auc = round(roc_auc_score(y_val, y_proba), 3)

print(f"Precision score       : {precision}")
print(f"Recall score          : {recall}")
print(f"FBêta score           : {fbeta}")
print(f"ROC AUC score         : {auc}")

# NOTE(review): the "49" in the label is hard-coded -- presumably the number
# of features RFECV selected; confirm it matches len(features_selection).
comp['LGBM 49 features'] = [precision, recall, fbeta, auc]
conf_mat

In [None]:
# Display the metrics of the tuned model next to those of the reduced-feature model.
comp[['LGBM tuned', 'LGBM 49 features']]

In [None]:
# Display the list of feature names kept after RFECV (rank 1).
features_selection

> **SHAP**

Vue globale : summary et dependence

In [None]:
import shap

# Renumber the test rows 0..n-1 so that index labels and row positions
# coincide (later cells use labels as positions).
data_test = data_test.reset_index(drop=True)

# Model inputs only: the client identifier is not a feature.
data = data_test.drop('SK_ID_CURR', axis=1)

# Tree explainer for the final LightGBM model, and SHAP values for every
# test row.
explainer_lgbm = shap.TreeExplainer(lgbm_fin)
shap_value_lgbm = explainer_lgbm.shap_values(data)

In [None]:
# Initialise SHAP's JavaScript plotting support in the notebook.
shap.initjs()
# Global view: summary plot of the SHAP values stored under index 0.
# NOTE(review): indexing with [0] assumes `shap_values` returned one array
# per class -- confirm for the installed shap version.
shap.summary_plot(shap_value_lgbm[0], data) # class 0: loan application accepted

In [None]:
# Dependence plot of the index-0 SHAP values for the EXT_SOURCE_1 feature.
shap.dependence_plot("EXT_SOURCE_1", shap_value_lgbm[0], data)

Vue locale : force_plot

In [None]:
# Local explanation for a single client, looked up by SK_ID_CURR.
client_ID = 100005

# Index label(s) of the client's row; they are used as positions below,
# which relies on `data_test` and `data` having a 0..n-1 index.
index = data_test.index[data_test.SK_ID_CURR == client_ID]
client_features = data.iloc[index, :]

# Printed value is the probability of class 1, whereas the force plot
# below explains the index-0 SHAP values.
pred = lgbm_fin.predict_proba(client_features)[:, 1]
print(pred)

shap.plots.force(
    explainer_lgbm.expected_value[0],
    shap_value_lgbm[0][index, :],
    client_features,
    matplotlib=True,
)

Exemple d'un prêt refusé.

Sérialisations :

In [None]:
import pickle

# Model: serialise the final LightGBM classifier to the Kaggle working
# directory. The `with` block closes the file on exit, so no explicit
# close() call is needed afterwards.
with open(r'/kaggle/working/LGBM_final.pkl', 'wb') as model_file:
    pickle.dump(lgbm_fin, model_file)

In [None]:
# SHAP objects for the dashboard: pickle the explainer and the precomputed
# SHAP values, each to its own file in the current directory.
for target_path, payload in (
    ('shap_explainer_lgbm.p', explainer_lgbm),
    ('shap_values_lgbm.p', shap_value_lgbm),
):
    with open(target_path, 'wb') as f4:
        pickle.dump(payload, f4)

In [None]:
# Loan applications: dump the prepared test set (selected features plus
# SK_ID_CURR).
# NOTE(review): despite the .csv extension the file contains a pickled
# object, not CSV text -- readers must use pickle.load; confirm the consumer.
with open('applications.csv', mode='wb') as applications_file:
    pickle.dump(data_test, applications_file)

In [None]:
import os

# Print the full path of every file found under the Kaggle working directory.
working_files = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/working')
    for name in names
)
for path in working_files:
    print(path)