# Projet 7 : Implémentez un modèle de scoring : Selection des features

## Problématique

Le but de ce notebook est la selection des features les plus importantes qu'on a calculées dans le notebook précedent, pour ensuite les utiliser dans le notebook suivant pour la mise en place de notre application sur le dashboard.



## Importation des modules

In [1]:
pip install scikit-plot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [2]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (569 kB)
[K     |████████████████████████████████| 569 kB 6.6 MB/s 
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [3]:
import numpy as np 
import pandas as pd

## PLOT
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

## Resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from collections import Counter

##Split
from sklearn.model_selection import train_test_split

## Modelisation
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

## Scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_recall_fscore_support
import scikitplot as skplt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  make_scorer

## feature importance
import shap

## Threshold
from yellowbrick.classifier.threshold import discrimination_threshold

## Export
import pickle

## Warning
import warnings

In [4]:
import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform, randint
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning

In [5]:
warnings.filterwarnings("ignore")

##  Importation des données

In [6]:
use_colab = True 

if use_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    PATH ='/content/drive/MyDrive/'
else:
    PATH ='/data/'

Mounted at /content/drive


In [7]:
dataset = pd.read_csv(PATH + 'df_final.csv')

In [8]:
dataset.shape

(307511, 47)

In [9]:
train_len = dataset.shape[0]

In [10]:
train_dataset = dataset[:train_len]
train_ids = train_dataset['SK_ID_CURR']
train_dataset.drop(columns=['SK_ID_CURR'], axis = 1, inplace=True)

* Définir les features et la variable cible pour la modélisation

In [11]:
# separate training data
train_dataset['TARGET'] = train_dataset['TARGET'].astype(int)
target = train_dataset['TARGET']
features = train_dataset.drop(columns=['TARGET'], axis = 1)
features= features[["AMT_CREDIT_x", "NAME_CONTRACT_TYPE","CNT_CHILDREN","AMT_CREDIT_SUM","DAYS_INSTALMENT_delay","REGION_RATING_CLIENT","AMT_INCOME_TOTAL","AMT_REQ_CREDIT_BUREAU_YEAR"]]
print('x_train data shape: ', features.shape)
print('y_train data shape: ', target.shape)

x_train data shape:  (307511, 8)
y_train data shape:  (307511,)


In [12]:
target_sample = target
target_sample.shape

(307511,)

In [13]:
features_sample = features
features_sample.shape


(307511, 8)

In [14]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(features_sample, target_sample, test_size = 0.4)

In [15]:
X_test.head()

Unnamed: 0,AMT_CREDIT_x,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,REGION_RATING_CLIENT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR
223840,450000.0,Cash loans,0,999000.0,-23.5,2,382500.0,3.0
173338,835380.0,Cash loans,0,330362.1,-3.639344,1,315000.0,5.0
40015,990000.0,Cash loans,0,284379.75,-37.0,2,193500.0,1.0
15357,225000.0,Cash loans,1,300810.0,0.142857,2,157500.0,2.0
150995,547272.0,Cash loans,2,47065.5,,2,180000.0,2.0


In [16]:
pd.DataFrame(X_test).to_csv('/content/drive/MyDrive/X_test.csv',index=False)

In [17]:
# Define categorical columns
categoric_attribute = list(features_sample.select_dtypes(exclude=["number"]).columns)
# Define numerical columns
numeric_attribute = list(features_sample.select_dtypes(exclude=["bool_","object_"]).columns)


In [18]:
numeric_attribute

['AMT_CREDIT_x',
 'CNT_CHILDREN',
 'AMT_CREDIT_SUM',
 'DAYS_INSTALMENT_delay',
 'REGION_RATING_CLIENT',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

In [19]:
X_test.head()

Unnamed: 0,AMT_CREDIT_x,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,REGION_RATING_CLIENT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR
223840,450000.0,Cash loans,0,999000.0,-23.5,2,382500.0,3.0
173338,835380.0,Cash loans,0,330362.1,-3.639344,1,315000.0,5.0
40015,990000.0,Cash loans,0,284379.75,-37.0,2,193500.0,1.0
15357,225000.0,Cash loans,1,300810.0,0.142857,2,157500.0,2.0
150995,547272.0,Cash loans,2,47065.5,,2,180000.0,2.0


In [20]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 771 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
import category_encoders as ce

from category_encoders import TargetEncoder

encoder = LabelEncoder()

for col in categoric_attribute:
   X_train[col] = encoder.fit_transform(X_train[col])
   X_test[col] = encoder.fit_transform(X_test[col])

display(X_train)

Unnamed: 0,AMT_CREDIT_x,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,REGION_RATING_CLIENT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR
151268,148365.0,0,0,,,2,67500.0,
96102,305221.5,0,0,3.237578e+05,-5.000000,3,90000.0,0.0
107493,1078200.0,0,0,5.229789e+05,-2.611111,2,405000.0,1.0
9180,755190.0,0,0,,-16.000000,1,94500.0,
125368,868806.0,0,1,1.160540e+06,-7.846154,1,585000.0,3.0
...,...,...,...,...,...,...,...,...
36127,284400.0,0,0,5.287500e+05,,2,180000.0,0.0
174538,1350000.0,0,0,3.319533e+05,,2,270000.0,2.0
240411,364896.0,0,1,1.378741e+05,-24.111111,2,157500.0,0.0
195472,900000.0,0,0,1.181321e+05,-34.545455,2,157500.0,1.0


In [23]:
# Pipeline data transformation (Imputation / Scaling / Encoding):
def Preprocessing (numeric):
    numeric_transfs = [('imputer',SimpleImputer(missing_values= np.NAN, strategy= 'median')),('scaler', RobustScaler())]
    numeric_pipeline = Pipeline(numeric_transfs)
    all_transfs = [("numeric",numeric_pipeline,numeric)]
    full_preprocessor = ColumnTransformer(all_transfs, remainder='passthrough')
    return full_preprocessor

In [24]:
# Data Transformed
preprocessor_fitted = Preprocessing(numeric_attribute).fit(X_train)
X_train_transformed = preprocessor_fitted.transform(X_train)
X_test_transformed = preprocessor_fitted.transform(X_test)

In [27]:
from imblearn.under_sampling import TomekLinks

tl = RandomUnderSampler(sampling_strategy=0.9)

# fit predictor and target variable
X_train_smtomek, y_train_smtomek = tl.fit_resample(X_train_transformed, y_train)

print('Original dataset shape', Counter(y_train))
print('Resample dataset shape', Counter(y_train_smtomek))

Original dataset shape Counter({0: 169558, 1: 14948})
Resample dataset shape Counter({0: 16608, 1: 14948})


On exporte ensuite les Datasets pour les réutiliser dans les autres notebooks

In [28]:
pd.DataFrame(X_train_smtomek).to_csv('/content/drive/MyDrive/X_train_smtomek_bis.csv',index=False)
pd.DataFrame(y_train_smtomek).to_csv('/content/drive/MyDrive/y_train_smtomek_bis.csv',index=False)

In [29]:
pd.DataFrame(X_test_transformed).to_csv('/content/drive/MyDrive/X_test_smtomek_bis.csv',index=False)
pd.DataFrame(y_test).to_csv('/content/drive/MyDrive/y_test_smtomek_bis.csv',index=False)