# Projet 7 : Implémentez un modèle de scoring : Sélection des features

## Problématique

Le but de ce notebook est la sélection des features les plus importantes parmi celles calculées dans le notebook précédent, afin de les utiliser dans le notebook suivant pour la mise en place de notre application de dashboard.



## Importation des modules

In [None]:
pip install scikit-plot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np 
import pandas as pd

## PLOT
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

## Resampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from collections import Counter

##Split
from sklearn.model_selection import train_test_split

## Modelisation
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

## Scores
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_recall_fscore_support
import scikitplot as skplt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  make_scorer

## feature importance
import shap

## Threshold
from yellowbrick.classifier.threshold import discrimination_threshold

## Export
import pickle

## Warning
import warnings

In [None]:
import sklearn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform, randint
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import FitFailedWarning

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas and scikit-learn warnings — consider narrowing it.
warnings.filterwarnings("ignore")

##  Importation des données

In [None]:
# Toggle between a Google Colab run (data on the mounted Drive) and a local run.
use_colab = True

if not use_colab:
    # Local run: the CSV files live in this directory.
    PATH = '/data/'
else:
    # Colab run: mount Google Drive and use its root folder as data directory.
    from google.colab import drive
    drive.mount('/content/drive')
    PATH = '/content/drive/MyDrive/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset from the df_final.csv file located under PATH.
dataset = pd.read_csv(PATH + 'df_final.csv')

In [None]:
# Display the (rows, columns) dimensions of the loaded dataset.
dataset.shape

(307511, 47)

In [None]:
# Number of rows of the loaded dataset, used below as the slicing bound.
train_len = len(dataset)

In [None]:
# Keep the client identifiers aside, then build the modelling frame without them.
# A new frame is created here (no in-place drop on a slice of `dataset`), so the
# column assignments made later on `train_dataset` never act on a view of `dataset`.
train_ids = dataset['SK_ID_CURR'].iloc[:train_len]
train_dataset = dataset.iloc[:train_len].drop(columns=['SK_ID_CURR'])

* Définir les features et la variable cible pour la modélisation

In [None]:
# Cast the label column to integers and isolate it as the target vector.
train_dataset['TARGET'] = train_dataset['TARGET'].astype(int)
target = train_dataset['TARGET']

# Drop the label, then keep only the selected predictors (in this order).
features = train_dataset.drop(columns=['TARGET'])[[
    "AMT_CREDIT_x",
    "NAME_CONTRACT_TYPE",
    "CNT_CHILDREN",
    "AMT_CREDIT_SUM",
    "DAYS_INSTALMENT_delay",
    "REGION_RATING_CLIENT",
    "AMT_INCOME_TOTAL",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]]

# Report the dimensions of both objects.
print('x_train data shape: ', features.shape)
print('y_train data shape: ', target.shape)

x_train data shape:  (307511, 8)
y_train data shape:  (307511,)


In [None]:
# Alias (not a copy) of the full target vector; no sub-sampling is applied here.
target_sample = target
target_sample.shape

In [None]:
# Alias (not a copy) of the full feature frame; no sub-sampling is applied here.
features_sample = features
features_sample.shape


(307511, 8)

In [None]:
# Train/test split: 40% of the rows go to the test set.
# random_state makes the split (and therefore the exported CSV files) reproducible
# across kernel restarts; stratify keeps the same TARGET class proportions in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    features_sample,
    target_sample,
    test_size=0.4,
    random_state=42,
    stratify=target_sample,
)

In [None]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0,AMT_CREDIT_x,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,REGION_RATING_CLIENT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR
9216,405000.0,Cash loans,0,329734.5,-18.227273,2,270000.0,3.0
155344,518562.0,Cash loans,2,80998.0,-0.32,2,112500.0,0.0
172221,422235.0,Cash loans,0,342858.6,,1,225000.0,0.0
166978,269982.0,Cash loans,0,,-3.473684,2,135000.0,
72232,518562.0,Cash loans,0,969596.357143,-1.32,2,202500.0,3.0


In [None]:
# Export the test features as they are at this point of the notebook.
# PATH is used instead of a hard-coded Drive location so that the export also
# works when use_colab is False.
X_test.to_csv(PATH + 'X_test.csv', index=False)

In [None]:
# Categorical columns: every column whose dtype is not numeric.
categoric_attribute = list(features_sample.select_dtypes(exclude=["number"]).columns)
# Numerical columns: the exact complement of the list above. Selecting with
# include=["number"] guarantees that a column can never land in both lists, whereas
# excluding only bool/object dtypes would also let category or datetime columns through.
numeric_attribute = list(features_sample.select_dtypes(include=["number"]).columns)


In [None]:
# Display the list of columns that will be imputed and scaled.
numeric_attribute

['AMT_CREDIT_x',
 'CNT_CHILDREN',
 'AMT_CREDIT_SUM',
 'DAYS_INSTALMENT_delay',
 'REGION_RATING_CLIENT',
 'AMT_INCOME_TOTAL',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

In [None]:
# Preview the test features again before the categorical encoding step.
X_test.head()

Unnamed: 0,AMT_CREDIT_x,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,REGION_RATING_CLIENT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR
9216,405000.0,Cash loans,0,329734.5,-18.227273,2,270000.0,3.0
155344,518562.0,Cash loans,2,80998.0,-0.32,2,112500.0,0.0
172221,422235.0,Cash loans,0,342858.6,,1,225000.0,0.0
166978,269982.0,Cash loans,0,,-3.473684,2,135000.0,
72232,518562.0,Cash loans,0,969596.357143,-1.32,2,202500.0,3.0


In [None]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
import category_encoders as ce

from category_encoders import TargetEncoder

encoder = LabelEncoder()

# Integer-encode each categorical column.
# The encoder is fitted on the training split only and the same mapping is then applied
# to the test split: refitting on X_test could assign different codes to the same
# category whenever the two splits do not contain exactly the same set of values.
# A category that only exists in X_test raises ValueError instead of being silently mis-coded.
for col in categoric_attribute:
    encoder.fit(X_train[col])
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

display(X_train)

Unnamed: 0,AMT_CREDIT_x,NAME_CONTRACT_TYPE,CNT_CHILDREN,AMT_CREDIT_SUM,DAYS_INSTALMENT_delay,REGION_RATING_CLIENT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR
37381,254700.0,0,0,148284.000000,-7.000000,3,157500.0,9.0
18120,1078200.0,0,0,240706.161000,-8.555556,2,225000.0,3.0
165067,202500.0,1,0,178044.795000,-3.588235,2,90000.0,5.0
127927,270000.0,1,1,181277.505000,-11.000000,2,135000.0,1.0
53163,595903.5,0,2,311250.000000,-13.000000,2,180000.0,4.0
...,...,...,...,...,...,...,...,...
242767,900000.0,0,0,323859.000000,-28.375000,2,270000.0,2.0
247318,654048.0,0,0,189899.100000,-3.296296,2,135000.0,0.0
148480,605439.0,0,1,125048.250000,-4.333333,2,157500.0,5.0
84860,1078200.0,0,0,877316.785714,,1,247500.0,0.0


In [None]:
# Pipeline data transformation (imputation / scaling):
def Preprocessing(numeric):
    """Build the unfitted preprocessing transformer.

    Parameters
    ----------
    numeric : list
        Column selection handed to the ColumnTransformer for the numeric pipeline.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer that applies median imputation followed by
        RobustScaler to the `numeric` columns and passes every other column
        through unchanged (remainder='passthrough').
    """
    # np.nan is used because the np.NAN alias was removed in NumPy 2.0.
    numeric_transfs = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', RobustScaler()),
    ]
    numeric_pipeline = Pipeline(numeric_transfs)
    all_transfs = [("numeric", numeric_pipeline, numeric)]
    full_preprocessor = ColumnTransformer(all_transfs, remainder='passthrough')
    return full_preprocessor

In [None]:
# Fit the preprocessor on the training split only, then apply it to both splits.
preprocessor_fitted = Preprocessing(numeric_attribute)
preprocessor_fitted.fit(X_train)
X_train_transformed, X_test_transformed = (
    preprocessor_fitted.transform(split) for split in (X_train, X_test)
)

On exporte ensuite les datasets pour les réutiliser dans les autres notebooks.

In [None]:
# Export the transformed training features and their labels.
# PATH replaces the hard-coded Drive location so the export also works when use_colab is False.
# NOTE(review): the file names mention "smtomek" but no resampling is applied in the visible cells — confirm.
pd.DataFrame(X_train_transformed).to_csv(PATH + 'X_train_smtomek_bis.csv', index=False)
pd.DataFrame(y_train).to_csv(PATH + 'y_train_smtomek_bis.csv', index=False)

In [None]:
# Export the transformed test features and their labels.
# PATH replaces the hard-coded Drive location so the export also works when use_colab is False.
pd.DataFrame(X_test_transformed).to_csv(PATH + 'X_test_smtomek_bis.csv', index=False)
pd.DataFrame(y_test).to_csv(PATH + 'y_test_smtomek_bis.csv', index=False)