# Projet 7 : Implémentez un modèle de scoring : Selection des features

## Problématique

Le but de ce notebook est la selection des features les plus importantes qu'on a calculées dans le notebook précedent, pour ensuite les utiliser dans le notebook suivant pour la mise en place de notre application sur le dashboard.



## Importation des modules

In [1]:
pip install scikit-plot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [2]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (575 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.9/575.9 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [3]:
import numpy as np 
import pandas as pd

## PLOT
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

## Resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from collections import Counter

##Split
from sklearn.model_selection import train_test_split

## Modelisation
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

## Scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_recall_fscore_support
import scikitplot as skplt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  make_scorer

## feature importance
import shap

## Threshold
from yellowbrick.classifier.threshold import discrimination_threshold

## Export
import pickle

## Warning
import warnings

In [4]:
import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform, randint
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning

In [5]:
warnings.filterwarnings("ignore")

##  Importation des données

In [6]:
use_colab = True 

if use_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    PATH ='/content/drive/MyDrive/'
else:
    PATH ='/data/'

Mounted at /content/drive


In [7]:
dataset = pd.read_csv(PATH + 'df_final.csv')

In [8]:
dataset.shape

(307511, 47)

In [9]:
dataset.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,DAYS_DECISION,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_LAST_DUE,DAYS_TERMINATION,NAME_CONTRACT_STATUS,CODE_REJECT_REASON,NAME_CLIENT_TYPE,CNT_INSTALMENT_FUTURE,SK_DPD_y
0,100002,1,Cash loans,M,0,202500.0,406597.5,Unaccompanied,working,low_educ,...,-606.0,24.0,365243.0,-25.0,-17.0,Approved,XAP,New,15.0,0.0
1,100003,0,Cash loans,F,0,270000.0,1293502.5,Family,working,high_educ,...,-1305.0,10.0,365243.0,-1054.333333,-1047.333333,Approved,XAP,Refreshed,4.909091,0.0
2,100004,0,Revolving loans,M,0,67500.0,135000.0,Unaccompanied,working,low_educ,...,-815.0,4.0,365243.0,-724.0,-714.0,Approved,XAP,New,0.0,0.0
3,100006,0,Cash loans,F,0,135000.0,312682.5,Unaccompanied,working,low_educ,...,-272.444444,23.0,365243.0,182477.5,182481.75,Approved,XAP,Repeater,8.65,0.0
4,100007,0,Cash loans,M,0,121500.0,513000.0,Unaccompanied,working,low_educ,...,-1222.833333,20.666667,365243.0,72136.2,72143.8,Approved,XAP,Repeater,11.666667,0.0


In [10]:
train_len = dataset.shape[0]

In [11]:
train_dataset = dataset[:train_len]
train_ids = train_dataset['SK_ID_CURR']
train_dataset.drop(columns=['SK_ID_CURR'], axis = 1, inplace=True)

* Définir les features et la variable cible pour la modélisation

In [12]:
# separate training data
train_dataset['TARGET'] = train_dataset['TARGET'].astype(int)
target = train_dataset['TARGET']
features = train_dataset.drop(columns=['TARGET'], axis = 1)
features= features[["CNT_FAM_MEMBERS", "NAME_CONTRACT_TYPE","CNT_CHILDREN","AMT_CREDIT_SUM","DAYS_INSTALMENT_delay","AMT_INCOME_TOTAL","CREDIT_ACTIVE","AMT_REQ_CREDIT_BUREAU_YEAR"]]
print('x_train data shape: ', features.shape)
print('y_train data shape: ', target.shape)

x_train data shape:  (307511, 8)
y_train data shape:  (307511,)


In [13]:
target_sample = target
target_sample.shape

(307511,)

In [14]:
features_sample = features
features_sample.shape


(307511, 8)

In [15]:
features.head()

Unnamed: 0,CNT_FAM_MEMBERS,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,AMT_INCOME_TOTAL,CREDIT_ACTIVE,AMT_REQ_CREDIT_BUREAU_YEAR
0,1.0,Cash loans,0,108131.945625,-20.421053,202500.0,Closed,1.0
1,2.0,Cash loans,0,254350.125,-7.0,270000.0,Closed,0.0
2,1.0,Revolving loans,0,94518.9,-3.0,67500.0,Closed,0.0
3,2.0,Cash loans,0,,-19.375,135000.0,,
4,1.0,Cash loans,0,146250.0,-6.32,121500.0,Closed,0.0


In [16]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(features_sample, target_sample, test_size = 0.4)

In [17]:
X_test.head()

Unnamed: 0,CNT_FAM_MEMBERS,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,AMT_INCOME_TOTAL,CREDIT_ACTIVE,AMT_REQ_CREDIT_BUREAU_YEAR
179043,2.0,Cash loans,0,287019.22125,-5.815789,180000.0,Closed,4.0
7747,2.0,Cash loans,0,120600.0,-12.916667,45000.0,Closed,1.0
20618,3.0,Cash loans,1,151595.4375,-4.857143,225000.0,Closed,3.0
250331,2.0,Cash loans,0,742050.0,-4.75,248782.5,Closed,0.0
31579,3.0,Cash loans,1,54000.0,-19.842105,193500.0,Closed,8.0


In [40]:
dataset['AMT_REQ_CREDIT_BUREAU_YEAR'].unique()

array([ 1.,  0., nan,  2.,  4.,  5.,  3.,  8.,  6.,  9.,  7., 10., 11.,
       13., 16., 12., 25., 23., 15., 14., 22., 17., 19., 18., 21., 20.])

In [38]:
X_test['CREDIT_ACTIVE'].unique()

array([1, 0, 3, 2])

In [18]:
pd.DataFrame(X_test).to_csv('/content/drive/MyDrive/X_test.csv',index=False)

In [19]:
# Define categorical columns
categoric_attribute = list(features_sample.select_dtypes(exclude=["number"]).columns)
# Define numerical columns
numeric_attribute = list(features_sample.select_dtypes(exclude=["bool_","object_"]).columns)


In [20]:
numeric_attribute

['CNT_FAM_MEMBERS',
 'CNT_CHILDREN',
 'AMT_CREDIT_SUM',
 'DAYS_INSTALMENT_delay',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

In [21]:
X_test.head()

Unnamed: 0,CNT_FAM_MEMBERS,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,AMT_INCOME_TOTAL,CREDIT_ACTIVE,AMT_REQ_CREDIT_BUREAU_YEAR
179043,2.0,Cash loans,0,287019.22125,-5.815789,180000.0,Closed,4.0
7747,2.0,Cash loans,0,120600.0,-12.916667,45000.0,Closed,1.0
20618,3.0,Cash loans,1,151595.4375,-4.857143,225000.0,Closed,3.0
250331,2.0,Cash loans,0,742050.0,-4.75,248782.5,Closed,0.0
31579,3.0,Cash loans,1,54000.0,-19.842105,193500.0,Closed,8.0


In [22]:
pd.DataFrame(X_test).to_csv('/content/drive/MyDrive/testapi.csv',index=False)

In [23]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
import joblib

In [26]:
import category_encoders as ce

from category_encoders import TargetEncoder

encoder = LabelEncoder()
joblib.dump(encoder,PATH + 'label_encoder.joblib')
for col in categoric_attribute:
   X_train[col] = encoder.fit_transform(X_train[col])
   X_test[col] = encoder.fit_transform(X_test[col])

display(X_train)

Unnamed: 0,CNT_FAM_MEMBERS,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,AMT_INCOME_TOTAL,CREDIT_ACTIVE,AMT_REQ_CREDIT_BUREAU_YEAR
118287,2.0,0,0,208125.000000,-3.306452,135000.0,0,5.0
111248,2.0,0,0,321465.687188,-18.888889,256500.0,1,4.0
294084,2.0,0,0,41736.600000,-29.750000,225000.0,0,0.0
270663,4.0,0,2,185700.000000,,135000.0,1,0.0
196961,1.0,0,0,154125.000000,-5.565217,81000.0,0,4.0
...,...,...,...,...,...,...,...,...
42596,2.0,0,0,248157.000000,-7.666667,315000.0,1,4.0
58836,2.0,0,0,516141.000000,-11.916667,81000.0,1,1.0
234532,4.0,0,2,186830.212500,19.571429,76500.0,1,0.0
182,5.0,0,3,271328.400000,-2.840909,225000.0,0,6.0


In [27]:
X_train['NAME_CONTRACT_TYPE'].unique()

array([0, 1])

In [28]:
joblib.dump(RobustScaler, PATH+'robust_scaler.joblib')

['/content/drive/MyDrive/robust_scaler.joblib']

In [29]:
# Pipeline data transformation (Imputation / Scaling / Encoding):
def Preprocessing (numeric):
    numeric_transfs = [('imputer',SimpleImputer(missing_values= np.NAN, strategy= 'median')),('scaler', RobustScaler())]
    numeric_pipeline = Pipeline(numeric_transfs)
    all_transfs = [("numeric",numeric_pipeline,numeric)]
    full_preprocessor = ColumnTransformer(all_transfs, remainder='passthrough')
    return full_preprocessor

In [30]:
# Data Transformed
preprocessor_fitted = Preprocessing(numeric_attribute).fit(X_train)
X_train_transformed = preprocessor_fitted.transform(X_train)
X_test_transformed = preprocessor_fitted.transform(X_test)

In [31]:
from imblearn.under_sampling import TomekLinks

tl = RandomUnderSampler(sampling_strategy=0.9)

# fit predictor and target variable
X_train_smtomek, y_train_smtomek = tl.fit_resample(X_train_transformed, y_train)

print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_train_smtomek))

Original dataset shape Counter({0: 169567, 1: 14939})
Resample dataset shape Counter({0: 16598, 1: 14939})


On exporte ensuite les Datasets pour les réutiliser dans les autres notebooks

In [32]:
pd.DataFrame(X_train_smtomek).to_csv('/content/drive/MyDrive/X_train_smtomek_bis.csv',index=False)
pd.DataFrame(y_train_smtomek).to_csv('/content/drive/MyDrive/y_train_smtomek_bis.csv',index=False)

In [33]:
pd.DataFrame(X_test_transformed).to_csv('/content/drive/MyDrive/X_test_smtomek_bis.csv',index=False)
pd.DataFrame(y_test).to_csv('/content/drive/MyDrive/y_test_smtomek_bis.csv',index=False)