#  Prédire si un client rembourse un crédit

<img src="logo.png" alt="logo" width="200"/>

Pour cette analyse on se limite aux fichiers **"application_train.csv"** et **"application_test.csv"** (et "HomeCredit_columns_description.csv" pour les informations).

## Import

In [None]:
# basics
import pandas as pd
import numpy as np
from random import randint

# data display
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno

# models
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report, confusion_matrix, fbeta_score, make_scorer
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

# feature engineering
from boruta import BorutaPy

# model explanation
import shap

In [None]:
def cleaning(data, columns=None, seuil_Sup=None, seuil_Inf=None):
    """Return a copy of ``data`` without the rows that fall outside per-column bounds.

    For each column in ``columns``, rows whose value is strictly greater than
    the upper bound or strictly lower than the lower bound are removed.
    Rows holding NaN in a column are kept for that column, because NaN never
    compares as out of range.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.
    columns : sequence of column labels, optional
        Columns to filter on. ``None`` or an empty sequence selects every
        column of ``data``.
    seuil_Sup : sequence, optional
        Upper bounds, one per entry of ``columns`` and in the same order.
        A ``None`` entry disables the upper bound of that column. When the
        argument itself is omitted, ``Q3 + 1.5 * IQR`` of each column is used.
    seuil_Inf : sequence, optional
        Lower bounds, with the same conventions as ``seuil_Sup``. When the
        argument itself is omitted, ``Q1 - 1.5 * IQR`` of each column is used.

    Returns
    -------
    pandas.DataFrame
        An independent, filtered copy of ``data``.

    Raises
    ------
    ValueError
        If an explicit bounds sequence has fewer entries than ``columns``.
    """
    if columns is None or len(columns) == 0:
        columns = data.columns
    columns = list(columns)

    def _iqr_bounds(col):
        # Tukey fences computed on the unfiltered input column.
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1
        return q1 - (1.5 * iqr), q3 + (1.5 * iqr)

    # Default upper bounds
    if seuil_Sup is None:
        seuil_Sup = []
        print("default threshold: Q3 + (1.5*IQR) ")
        for col in columns:
            _, limite = _iqr_bounds(col)
            seuil_Sup.append(limite)
            print(f"for {col}, threshold sup is: {limite}")

    # Default lower bounds
    if seuil_Inf is None:
        seuil_Inf = []
        print()
        print("default threshold: Q1 - (1.5*IQR) ")
        for col in columns:
            limite, _ = _iqr_bounds(col)
            seuil_Inf.append(limite)
            print(f"for {col}, threshold inf is: {limite}")

    if len(seuil_Sup) < len(columns) or len(seuil_Inf) < len(columns):
        raise ValueError("seuil_Sup and seuil_Inf need one entry per column")

    res = data
    print()
    for col, upper, lower in zip(columns, seuil_Sup, seuil_Inf):
        # Positional boolean masks are used instead of dropping by index label,
        # so a duplicated index cannot remove rows that are within bounds.
        if upper is not None:
            res = res[~(res[col] > upper).values]
        if lower is not None:
            res = res[~(res[col] < lower).values]
        print(f"ROWS after cleaning {col} : {len(res)}")

    # Always hand back an independent frame the caller can safely assign into.
    return res.copy()

In [None]:
def matrix_conf(test_Y, prediction_report, beta=2):
    """Print a confusion matrix with recall figures and return the key scores.

    Parameters
    ----------
    test_Y : array-like
        True labels. The report is read through the keys "0" and "1".
    prediction_report : array-like
        Predicted labels, same length as ``test_Y``.
    beta : float, optional
        Weight of recall in the F-beta score of class 1. The default of 2
        matches ``ftwo_scorer``, which is used for model selection.

    Returns
    -------
    list
        ``[recall1, recall0, fbeta]``: recall of class 1 and of class 0 as
        percentages rounded to two decimals, then the F-beta score of class 1.
    """
    report = classification_report(test_Y, prediction_report, output_dict=True)
    recall0 = round(report["0"]["recall"] * 100, 2)
    recall1 = round(report["1"]["recall"] * 100, 2)

    # With average='micro' the score of a single-label problem is plain
    # accuracy, whatever beta is, so the majority-class baseline would look
    # like the best compromise. Score the positive class (1) directly instead.
    fbeta = fbeta_score(test_Y, prediction_report, beta=beta, pos_label=1, average='binary')

    # Rows and columns are ordered with class 1 first.
    conf_matrix = confusion_matrix(test_Y, prediction_report, labels=[1, 0])
    cmtx = pd.DataFrame(
        conf_matrix,
        index=['real:1', 'real:0'],
        columns=['pred:1', 'pred:0']
    )

    print("1 = non rembourse")
    print()
    print(cmtx)
    print()
    print("recall 0: "+ str(report["0"]["recall"]))
    print("recall 1: "+ str(report["1"]["recall"]))
    print("f beta score: "+str(fbeta))
    print()
    print("Nos criteres:")
    print("I) on detecte "+str(recall1)+" % de classe 1")
    print("II) on detecte "+str(recall0)+" % de classe 0")

    return [recall1, recall0, fbeta]

In [None]:
# Training data
data_train = pd.read_csv('data/application_train.csv')

# Testing data features
data_test = pd.read_csv('data/application_test.csv')

# Column descriptions (file read with the ISO-8859-1 encoding)
data_info = pd.read_csv('data/HomeCredit_columns_description.csv', encoding='ISO-8859-1')

## Présentation générale

### La classe à predire

In [None]:
print('Training data shape: ', data_train.shape)
data_train.head()

In [None]:
col_dif = [x for x in data_train.columns if x not in data_test.columns]
print("What we want to predict: "+str(col_dif))

In [None]:
sns.histplot(data=data_train, x="TARGET")

Les classes 0 et 1 à prédire sont **disproportionnées**.\
Il faudra agir en conséquence plus tard (voir partie "Modeles")

### Definitions des variables

L'utilisation de ce fichier sera utile pour comprendre le dataset et determiner de nouvelles variables plus tard.

In [None]:
data_info = data_info[data_info["Table"] == "application_{train|test}.csv"]
data_info

### Missing values

In [None]:
def plot_NaN(data, limite = 0):
    """Draw a bar chart of the number of missing values per column.

    Only the columns holding at least ``limite`` NaN values are plotted.
    """
    missing_counts = data.isna().sum()
    # Keep the columns whose NaN count reaches the limit
    missing_counts = missing_counts[missing_counts.values >= limite]

    counts_frame = pd.DataFrame(
        data={'Variables': missing_counts.index, 'Number of NaN': missing_counts.values}
    )

    plt.figure(figsize=(18, 8))
    sns.set_theme(style="whitegrid")

    axes = sns.barplot(x="Variables", y="Number of NaN", data=counts_frame)
    axes.set_xticklabels(axes.get_xticklabels(), rotation=40, ha="right")
    title_font = {'fontsize': 24, 'fontweight': 'bold'}
    axes.set_title('Nombre de \"NaN\" par variable', fontdict=title_font)
    
plot_NaN(data_train, limite = 100)

In [None]:
msno.matrix(data_train)

### Données num (training data)

In [None]:
data_numerical = data_train.select_dtypes(include=np.number)
data_numerical.drop(['SK_ID_CURR'], axis=1, inplace=False).describe()

### Données cat (training data)

In [None]:
data_categorical = data_train.select_dtypes(exclude=np.number)
data_categorical.describe()

### Données normalisées

Visualisation des données normalisées pour vérifier leur distribution.\
Ici, on voit que les distributions ne comportent pas de valeurs extrêmes.\
**Pas besoin de les re-nettoyer**.

In [None]:
col_normalized = data_info[data_info["Special"] == "normalized"]["Row"]

In [None]:
for col in col_normalized:
    if col in data_numerical.columns:
        ax = sns.violinplot(x=data_train[col])
        plt.show()

## Traitement des données

### Sélection des individus (emprunts)

On vérifie qu'aucun ID n'apparaît à la fois dans les datasets train et test

In [None]:
new_df = data_train.loc[data_train.SK_ID_CURR.isin(data_test.SK_ID_CURR)]
print("Same id in train/test:")
new_df

On vérifie qu'aucun ID n'apparaît plusieurs fois dans le dataset train

In [None]:
print("Nombre d'individus original: "+ str(data_train.shape[0]))
data_train.drop_duplicates(subset=['SK_ID_CURR'], keep = 'first', inplace=True)
print("Nombre d'individus en verifiant les doublons dans \'code\': "+ str(data_train.shape[0]))

### OHE 

In [None]:
# one-hot encoding of categorical variables
data_train = pd.get_dummies(data_train, drop_first=True)
data_test = pd.get_dummies(data_test, drop_first=True)

print('Training Features shape: ', data_train.shape)
print('Testing Features shape: ', data_test.shape)
data_train.head()

Garder les mêmes colonnes pour les 2 datasets (l'OHE peut créer des colonnes différentes si des valeurs n'apparaissent pas dans un dataset)

In [None]:
train_labels = data_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
data_train, data_test = data_train.align(data_test, join = 'inner', axis = 1)

print('Training Features shape: ', data_train.shape)
print('Testing Features shape: ', data_test.shape)

# Add the target back in
data_train['TARGET'] = train_labels

Plusieurs methodes de feature engineering.

### Feature engineering 1

In [None]:
data_feature1 = data_train.copy()

#### Eliminer variables fortement corrélées

In [None]:
X = data_feature1.drop(['TARGET'], axis=1, inplace=False)
df = pd.DataFrame(X)
cor_matrix = df.corr().abs()

fig = plt.figure(figsize=[12,6])
sns.heatmap(cor_matrix.loc[["FLOORSMAX_AVG","FLOORSMAX_MEDI","FLOORSMAX_MODE"],["FLOORSMAX_AVG","FLOORSMAX_MEDI","FLOORSMAX_MODE"]], annot=True)
#sns.heatmap(cor_matrix.iloc[0:15,0:15], annot=True)
plt.show()

On élimine les variables présentant **95%** de similarité 

In [None]:
# Keep only the strict upper triangle of the correlation matrix so that each
# pair of columns is examined once and the diagonal is ignored.
# The builtin bool replaces np.bool, which was removed from NumPy.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))

# A column is dropped when it is more than 95% correlated with an earlier column.
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
print("col to drop:"); print(to_drop)

In [None]:
df1 = df.drop(to_drop, axis=1)
print("From " + str(len(data_feature1.columns)) + " columns, to "+ str(len(df1.columns)) + " columns")
data_feature1 = df1
data_feature1["TARGET"] = data_train['TARGET']
data_feature1.head()

#### Garder les variables avec de la variance

On garde les variables avec un minimum de variance

In [None]:
# Feature matrix handed to the variance filter.
# NOTE(review): this is rebuilt from data_train, not from data_feature1, so the
# columns removed by the correlation filter (to_drop) are present again here
# and in everything derived from data_X. Confirm whether that is intended.
data_X = data_train.drop("TARGET",axis=1)
#data_X.var(axis=0)

In [None]:
selector = VarianceThreshold(0.02)
selector.fit(data_X)
#selector.get_support()

In [None]:
# Keep the columns selected by the variance filter. The explicit copy makes
# data_feature1 an independent frame, so adding or dropping columns afterwards
# does not act on a slice of data_X.
data_feature1 = data_X.iloc[:, selector.get_support()].copy()

In [None]:
data_feature1["TARGET"] = data_train["TARGET"].copy()

In [None]:
print("On garde "+ str(sum(selector.get_support())) +" sur "+ str(len(df1.columns)) +" variables")

Les variables conservées:

In [None]:
data_X.columns[selector.get_support()]

On supprime les variables inutiles

In [None]:
data_feature1.drop("SK_ID_CURR",axis=1, inplace=True)

In [None]:
msno.matrix(data_feature1)

#### Nettoyage Valeurs aberrantes

Ci-dessous la description de nos variables actuelles

In [None]:
# Show the full text of each cell. None is the supported "no limit" value;
# the former -1 sentinel is rejected by current pandas versions.
pd.set_option("display.max_colwidth", None)

In [None]:
# Columns with more than 3 distinct values, computed once instead of once per
# row of the description table.
n_unique = data_feature1.nunique()
varied_columns = n_unique[n_unique > 3].index

# Description rows that document one of those columns
mask_column = [info in varied_columns for info in data_info["Row"]]
infos = data_info[mask_column]

display(infos)
data_feature1[varied_columns]

In [None]:
# Restore the default column width (a width of 0 is not the default).
pd.reset_option("display.max_colwidth")

Exemple de valeur aberrante, avec la variable "DAYS_EMPLOYED":

In [None]:
ax = sns.boxplot(x=data_train["DAYS_EMPLOYED"])

On élimine les valeurs aberrantes des variables non normalisées et non categorielles

In [None]:
# Columns filtered by explicit bounds. limites_sup and limites_inf below are
# positional: entry i of each list applies to entry i of col_to_clean.
# NOTE: "HOUR_APPR_PROCESS_START" is listed twice (with the same bounds, 24 and 0).
col_to_clean = ['AMT_REQ_CREDIT_BUREAU_YEAR', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
                'OWN_CAR_AGE', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE', 'DAYS_BIRTH',
                'DAYS_EMPLOYED', 'AMT_CREDIT', 'HOUR_APPR_PROCESS_START', "AMT_INCOME_TOTAL", "CNT_CHILDREN",
                "AMT_ANNUITY", "AMT_GOODS_PRICE", "CNT_FAM_MEMBERS", "HOUR_APPR_PROCESS_START",
                "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON", "AMT_REQ_CREDIT_BUREAU_QRT",
                "OBS_60_CNT_SOCIAL_CIRCLE", "OBS_30_CNT_SOCIAL_CIRCLE"
               ]

# Upper bounds: rows with a value strictly above the bound are removed.
# None entries appear intended as "no upper bound" (TODO confirm against cleaning()).
limites_sup = [99, 9, 9,
               None, 0, 0, 0, 0,
               0, 99999999, 24, 99999999, 99, 
               None, None, 99, 24,
              99, 99, 99,
              99, 99]

# Lower bounds: rows with a value strictly below the bound are removed.
# -36500 is presumably 100 years expressed in days for the DAYS_* columns (verify).
limites_inf = [0, 0, 0,
               0, -36500, -36500, -36500, -36500,
               -36500, 0, 0, 0, 0,
               0, 0, 0, 0,
              0, 0, 0,
              0, 0]

data_feature1 = cleaning(data_feature1, col_to_clean, seuil_Sup = limites_sup, seuil_Inf = limites_inf)

#### Nettoyage Valeurs atypiques

In [None]:
data_feature1.nunique()[data_feature1.nunique() > 3].index

In [None]:
for col in data_feature1.nunique()[data_feature1.nunique() > 3].index:
    print(col + " :")
    #ax = sns.boxplot(x=data_numerical[col])
    #plt.show()
    df = data_feature1[col]
    fig = px.box(df, y=col)#, log_y=True)
    fig.show()

On exclut certaines colonnes du nettoyage des valeurs atypiques (ces colonnes ayant des valeurs atypiques justifiées)

In [None]:
# DAYS_EMPLOYED  ? FLOORSMIN_.. ?
col_to_not_clean = ["CNT_CHILDREN","CNT_FAM_MEMBERS", "HOUR_APPR_PROCESS_START",
                    "DEF_30_CNT_SOCIAL_CIRCLE", "DEF_60_CNT_SOCIAL_CIRCLE","AMT_REQ_CREDIT_BUREAU_WEEK",
                    "AMT_REQ_CREDIT_BUREAU_MON","AMT_REQ_CREDIT_BUREAU_QRT","AMT_REQ_CREDIT_BUREAU_YEAR"]

In [None]:
# Take each excluded column out of the list of columns to clean.
# list.remove() deletes the first matching entry only.
for excluded in col_to_not_clean:
    col_to_clean.remove(excluded)

In [None]:
# "HOUR_APPR_PROCESS_START" is listed twice in col_to_clean and list.remove()
# only deletes the first match, so one occurrence survives the previous cell.
# Removing until none is left also keeps this cell safe to re-run.
while "HOUR_APPR_PROCESS_START" in col_to_clean:
    col_to_clean.remove("HOUR_APPR_PROCESS_START")

In [None]:
col_to_clean

In [None]:
data_feature1 = cleaning(data_feature1, col_to_clean)

#### Imputation Valeurs manquante

In [None]:
msno.matrix(data_feature1)

On impute les variables catégorielles par leur mode, puis les variables numériques par leur moyenne

In [None]:
 data_feature1.nunique()

In [None]:
col_nb_unique = data_feature1.nunique()

col_categorical = col_nb_unique[col_nb_unique <= 3].index
col_not_categorical = col_nb_unique[col_nb_unique > 3].index

In [None]:
# Numerical columns: replace NaN with the column mean.
for col in col_not_categorical:
    data_feature1[col] = data_feature1[col].fillna(data_feature1[col].mean())

# Categorical columns: replace NaN with the most frequent value.
# Series.mode() returns a Series; passing it to fillna() aligns on the index
# and fills only the rows whose label matches, so the scalar is extracted.
for col in col_categorical:
    modes = data_feature1[col].mode()
    if not modes.empty:  # a column holding only NaN has no mode
        data_feature1[col] = data_feature1[col].fillna(modes.iloc[0])

#### Visualisation post-traitement

profil de **classe 1**

In [None]:
data_feature1[data_feature1["TARGET"] == 1].mean()

profil de **classe 0**

In [None]:
data_feature1[data_feature1["TARGET"] == 0].mean()

In [None]:
msno.matrix(data_feature1)

Regardons la relation entre les variables non categoriels

In [None]:
dataPrint = data_feature1[col_not_categorical]
dataPrint

In [None]:
ax = sns.violinplot(x=data_numerical["EXT_SOURCE_1"])

In [None]:
#Column_to_visualize = data_feature1.columns
Column_to_visualize = ["EXT_SOURCE_1","EXT_SOURCE_2","EXT_SOURCE_3"]

for col in Column_to_visualize:
    ax = sns.violinplot(x="TARGET", y=col, data=data_feature1)
    plt.show()

In [None]:
# First 1000 rows, first 5 non-categorical columns. The copy makes the sample
# an independent frame before the TARGET column is added to it.
toPrint = dataPrint.iloc[:1000,:5].copy()
toPrint["TARGET"] = data_feature1["TARGET"]
sns.pairplot(toPrint)

In [None]:
# First 1000 rows, non-categorical columns 5 to 9. The copy makes the sample
# an independent frame before the TARGET column is added to it.
toPrint = dataPrint.iloc[:1000,5:10].copy()
toPrint["TARGET"] = data_feature1["TARGET"]
sns.pairplot(toPrint)

In [None]:
# First 1000 rows, non-categorical columns from position 10 on. The copy makes
# the sample an independent frame before the TARGET column is added to it.
toPrint = dataPrint.iloc[:1000,10:].copy()
toPrint["TARGET"] = data_feature1["TARGET"]
sns.pairplot(toPrint)

### Feature engineering 2

In [None]:
X = data_feature1.drop("TARGET",axis=1)
y = data_feature1["TARGET"]

#### Boruta

Attention : l'exécution de la cellule ci-dessous est longue (environ 30 min ?)

In [None]:
# Fixed seeds keep the selected features the same from one run to the next.
BORUTA_SEED = 42

# Estimator handed to BorutaPy.
# NOTE(review): y is the binary TARGET; RandomForestClassifier (already
# imported) may be the more natural estimator here. Confirm before switching.
forest = RandomForestRegressor(
    n_jobs=-1,
    max_depth=5,
    random_state=BORUTA_SEED,
)
boruta = BorutaPy(
    estimator=forest,
    n_estimators='auto',
    max_iter=50,  # number of trials to perform
    random_state=BORUTA_SEED,
)

# fit Boruta (it accepts np.array, not pd.DataFrame)
boruta.fit(np.array(X), np.array(y))

# print results
green_area = X.columns[boruta.support_].to_list()
blue_area = X.columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

In [None]:
'''features in the green area: ['DAYS_REGISTRATION', 'DAYS_BIRTH', 'EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'DAYS_EMPLOYED', 'AMT_CREDIT']
features in the blue area: ['FLAG_DOCUMENT_3', 'DAYS_ID_PUBLISH']

features in the green area: ['AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
features in the blue area: ['AMT_GOODS_PRICE']

features in the green area: ['AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
features in the blue area: ['AMT_GOODS_PRICE', 'DAYS_REGISTRATION', 'FLAG_DOCUMENT_3']

'''

On garde les variables utiles lors des classifications avec la methode "boruta"

In [None]:
# Keep the features retained by Boruta. The copy makes data_feature2 an
# independent frame, so the columns added to it below do not act on a slice
# of data_feature1.
data_feature2 = data_feature1[green_area+blue_area].copy()
data_feature2["TARGET"] = data_feature1["TARGET"]

#### Combinaison de variables avec la documentation

nombre de mois* pour rembourser avec le salaire en entier

In [None]:
data_feature2["mois_remboursement"] = data_feature1["AMT_CREDIT"]/(data_feature1["AMT_INCOME_TOTAL"]/12)
data_feature2["mois_remboursement"]

nombre de mois* pour rembourser avec le salaire divisé par (nombre d'enfants + 1)

In [None]:
# Monthly income divided by (CNT_CHILDREN + 1), the same quantity as "salaire_enfant".
monthly_income_per_head = (data_feature1["AMT_INCOME_TOTAL"]/12)/(data_feature1["CNT_CHILDREN"]+1)

# Months needed to repay the credit with that income. The income must be
# DIVIDED by (children + 1); multiplying it made larger families look like
# faster repayers.
data_feature2["mois_remboursement_enfant"] = data_feature1["AMT_CREDIT"]/monthly_income_per_head
data_feature2["mois_remboursement_enfant"]

le salaire mensuel divisé par (nombre d'enfants + 1)

In [None]:
data_feature2["salaire_enfant"] = (data_feature1["AMT_INCOME_TOTAL"]/12)/(data_feature1["CNT_CHILDREN"]+1)
data_feature2["salaire_enfant"]

### features SCALE train/test

split data train/test

In [None]:
# The target is heavily imbalanced, so the split is stratified to keep the same
# share of class 1 in train and test. The shared seed makes the split
# reproducible and gives both feature sets the same row partition.
SPLIT_SEED = 42

train_X1, test_X1, train_Y1, test_Y1 = train_test_split(
    data_feature1.drop("TARGET", axis=1), data_feature1["TARGET"],
    test_size=0.3, stratify=data_feature1["TARGET"], random_state=SPLIT_SEED)
train_X2, test_X2, train_Y2, test_Y2 = train_test_split(
    data_feature2.drop("TARGET", axis=1), data_feature2["TARGET"],
    test_size=0.3, stratify=data_feature2["TARGET"], random_state=SPLIT_SEED)

In [None]:
features1 = train_X1.columns
features2 = train_X2.columns

scale data train/test

In [None]:
# One scaler per feature set, each fitted on its training rows only, so both
# remain usable afterwards to transform new data.
sc1 = StandardScaler()
sc2 = StandardScaler()
sc = sc2  # previous name, which ended up fitted on feature set 2

# Train and test are both kept as DataFrames with their column names and their
# original row index. Fitting on a bare array and predicting on a DataFrame
# makes scikit-learn warn about mismatching feature names.
scaled_train1 = sc1.fit_transform(train_X1)
scaled_test1 = sc1.transform(test_X1)
train_X1 = pd.DataFrame(scaled_train1, columns=features1, index=train_X1.index)
test_X1 = pd.DataFrame(scaled_test1, columns=features1, index=test_X1.index)

scaled_train2 = sc2.fit_transform(train_X2)
scaled_test2 = sc2.transform(test_X2)
train_X2 = pd.DataFrame(scaled_train2, columns=features2, index=train_X2.index)
test_X2 = pd.DataFrame(scaled_test2, columns=features2, index=test_X2.index)

## Modeles

Pour contrer le **déséquilibre des classes** dans nos données, on associe des poids (une importance) plus grands à la classe sous représentée (la classe 1)

In [None]:
nbr_1 = sum(data_feature2["TARGET"])
print("number of class 1: "+ str(nbr_1)+"/"+str(data_feature2["TARGET"].shape[0]))
proportion1 = nbr_1/data_feature2["TARGET"].shape[0]
proportion0 = 1-proportion1
print("=> "+str(round(proportion1*100,2))+"% of the data")

In [None]:
# all results for all the choices of features
resultats = []

# all results for a choice of features
ALL_RES = {}

### Choix des features:

In [None]:
#train_X, test_X, train_Y, test_Y, features = train_X1, test_X1, train_Y1, test_Y1, features1
train_X, test_X, train_Y, test_Y, features = train_X2, test_X2, train_Y2, test_Y2, features2

Quelles sont les métriques qui nous intéressent ?

**1) accorder un minimum de prêts qui ne seront pas remboursés**\
 => détecter les personnes non fiables\
 => maximiser le recall de la classe 1 (non-rembourseurs de prêt)

**2) accorder un maximum de prêts qui seront remboursés**\
 => détecter les personnes fiables\
 => maximiser le recall de la classe 0 (rembourseurs de prêt)

**Un compromis:**

The **F-beta score** is the weighted harmonic mean of precision and recall, reaching its optimal value at 1 and its worst value at 0.

The beta parameter determines the weight of recall in the combined score. beta < 1 lends more weight to precision, while beta > 1 favors recall (beta -> 0 considers only precision, beta -> +inf only recall).

In [None]:

ftwo_scorer = make_scorer(fbeta_score, beta=2)

### Naive model (most freq)

In [None]:

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_X, train_Y)
pred_dummy = dummy_clf.predict(test_X)

In [None]:
ALL_RES["most freq"] = matrix_conf(test_Y, pred_dummy)

### Logistic regression gridSearch

On fait varier la regularisation du modele avec un grid search

In [None]:
# C = smaller values specify stronger regularization.
grid={"C":np.logspace(-3,3,7)}#, "penalty":["l1","l2"]}
lr = LogisticRegression(class_weight={0:1-proportion0,1:1-proportion1})
logreg_cv=GridSearchCV(lr,grid,cv=10, scoring=ftwo_scorer)

logreg_cv.fit(train_X,train_Y)

In [None]:
logreg_cv.best_estimator_

In [None]:
# On récupère la prédiction de la valeur positive
y_prob = logreg_cv.predict_proba(test_X)[:,1] 

# On créé un vecteur de prédiction à partir du vecteur de probabilités
y_pred = np.where(y_prob > 0.5, 1, 0) 

y_prob_train = logreg_cv.predict_proba(train_X)[:,1] 
y_pred_train = np.where(y_prob_train > 0.5, 1, 0)

# no need to display ROC curve (can't compare with other models)

#false_positive_rate, true_positive_rate, thresholds = roc_curve(test_Y, y_prob)
#roc_auc = auc(false_positive_rate, true_positive_rate)
#print(roc_auc)

#### Fonctionnement global 

Variables utilisées par le modele :

In [None]:
features_importance = logreg_cv.best_estimator_.coef_[0]

d = {'features': features, 'score': features_importance}
df = pd.DataFrame(data=d)

df["score"] = df["score"].abs()
df = df.sort_values('score',ascending=False)

In [None]:
#perfs_dic = {'Modeles': res.iloc[2].index, 'fbeta score': res.iloc[2].values}

plt.figure(figsize=(18, 8))

perfs = pd.DataFrame(data=df)

sns.set_theme(style="whitegrid")
ax = sns.barplot(x="features", y="score", data=perfs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_title('Features usage by the model', fontdict= { 'fontsize': 24, 'fontweight':'bold'})

#### Resultats

Sur les données de **train**

In [None]:
matrix_conf(train_Y, y_pred_train)

Sur les données de **test**

In [None]:
ALL_RES["logistic reg"] = matrix_conf(test_Y, y_pred)

#### Fonctionnement local 

exemple de decision (SHAP):

In [None]:
# use Kernel SHAP to explain test set predictions
k_explainer = shap.KernelExplainer(logreg_cv.best_estimator_.predict_proba, pd.DataFrame(train_X).iloc[0:100])

rand = randint(0, len(test_X)-1)
k_shap_values = k_explainer.shap_values(pd.DataFrame(test_X).iloc[rand])

shap.initjs()
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], pd.DataFrame(test_X).iloc[rand])

### tree gridSearch 

Un arbre de decision permet une **interpretation simple** de la prediction du classifieur\
+
Decision trees frequently perform well on imbalanced data\
+
efficace avec des données booléennes (ou categoriels)

In [None]:
# Hyper-parameters explored by the grid search
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6]}

# Selected with the same F2 scorer as the other grid searches, so the models
# are tuned for the same objective. Plain 'recall' favours trees that predict
# class 1 as often as possible.
tree_models = GridSearchCV(
    DecisionTreeClassifier(class_weight={0:1-proportion0,1:1-proportion1}),
    tree_para, cv=5, scoring=ftwo_scorer)
tree_models.fit(train_X, train_Y)

pred_tree_grid = tree_models.predict(test_X)
pred_tree_grid_train = tree_models.predict(train_X)

In [None]:
#clf.best_params_
tree_models.best_estimator_

#### Fonctionnement global 

Variables utilisées par le modele :

In [None]:
features_importance = tree_models.best_estimator_.feature_importances_

d = {'features': features, 'score': features_importance}
df = pd.DataFrame(data=d)

df["score"] = df["score"].abs()
df = df.sort_values('score',ascending=False)

In [None]:
#perfs_dic = {'Modeles': res.iloc[2].index, 'fbeta score': res.iloc[2].values}

plt.figure(figsize=(18, 8))

perfs = pd.DataFrame(data=df)

sns.set_theme(style="whitegrid")
ax = sns.barplot(x="features", y="score", data=perfs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_title('Features usage by the model', fontdict= { 'fontsize': 24, 'fontweight':'bold'})

Affichage de l'arbre de decision

In [None]:
fig = plt.figure(figsize=(25,20))
tree.plot_tree(tree_models.best_estimator_, feature_names=features, filled=True, fontsize=10)
plt.show()

#### Resultats

Sur les données de **train**

In [None]:
matrix_conf(train_Y, pred_tree_grid_train)

Sur les données de **test**

In [None]:
ALL_RES["decision tree"] = matrix_conf(test_Y, pred_tree_grid)

#### Fonctionnement local 

exemple de decision (SHAP):

In [None]:
# Create object that can calculate shap values
explainer = shap.TreeExplainer(tree_models.best_estimator_)

# Calculate Shap values
rand = randint(0, len(test_X)-1)
shap_values = explainer.shap_values(pd.DataFrame(test_X).iloc[rand])

shap.force_plot(explainer.expected_value[1], shap_values[1], pd.DataFrame(test_X).iloc[rand])

### SVM

In [None]:
# Linear SVM with the same inverse-frequency class weights as the other models,
# so the under-represented class 1 is not ignored.
# max_iter caps the solver at 100 iterations (it may stop before converging).
clf = svm.SVC(max_iter = 100, kernel = 'linear', probability = True,
              class_weight={0:1-proportion0,1:1-proportion1})
clf.fit(train_X, train_Y)

pred_SVM = clf.predict(test_X)
pred_SVM_train = clf.predict(train_X)

#### Fonctionnement global 

Variables utilisées par le modele :

In [None]:
#dt = {"coef":clf.coef_[0], "feature":features}
#df = pd.DataFrame(data=dt)
#df.sort_values(by=['coef'], ascending=False)

features_importance = clf.coef_[0]

d = {'features': features, 'score': features_importance}
df = pd.DataFrame(data=d)

df["score"] = df["score"].abs()
df = df.sort_values('score',ascending=False)

In [None]:
#perfs_dic = {'Modeles': res.iloc[2].index, 'fbeta score': res.iloc[2].values}

plt.figure(figsize=(18, 8))

perfs = pd.DataFrame(data=df)

sns.set_theme(style="whitegrid")
ax = sns.barplot(x="features", y="score", data=perfs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_title('Features usage by the model', fontdict= { 'fontsize': 24, 'fontweight':'bold'})

#### Resultats

Sur les données de **train**

In [None]:
matrix_conf(train_Y, pred_SVM_train)

Sur les données de **test**

In [None]:
ALL_RES["SVM"] = matrix_conf(test_Y, pred_SVM)

#### Fonctionnement local 

exemple de decision (SHAP):

In [None]:
# use Kernel SHAP to explain test set predictions
k_explainer = shap.KernelExplainer(clf.predict_proba, pd.DataFrame(train_X).iloc[0:100])

rand = randint(0, len(test_X)-1)
k_shap_values = k_explainer.shap_values(pd.DataFrame(test_X).iloc[rand])
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], pd.DataFrame(test_X).iloc[rand])

### random forest gridSearch

In [None]:
forest_param = {'max_depth':[4,5,6], 'n_estimators':[5,8,10]}
#rf_model = GridSearchCV(RandomForestClassifier(class_weight={0:0.1,1:1}), forest_param, cv=5, scoring = 'recall')
rf_models = GridSearchCV(RandomForestClassifier(class_weight={0:1-proportion0,1:1-proportion1}), forest_param, cv=5, scoring = ftwo_scorer)

# fit your model
rf_models.fit(train_X, train_Y)

pred_forest = rf_models.predict(test_X)
pred_forest_train = rf_models.predict(train_X)

In [None]:
rf_model = rf_models.best_estimator_

#### Fonctionnement global 

Variables utilisées par le modele :

In [None]:
features_importance = rf_model.feature_importances_

d = {'features': features, 'score': features_importance}
df = pd.DataFrame(data=d)

df["score"] = df["score"].abs()
df = df.sort_values('score',ascending=False)

In [None]:
#perfs_dic = {'Modeles': res.iloc[2].index, 'fbeta score': res.iloc[2].values}

plt.figure(figsize=(18, 8))

perfs = pd.DataFrame(data=df)

sns.set_theme(style="whitegrid")
ax = sns.barplot(x="features", y="score", data=perfs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_title('Features usage by the model', fontdict= { 'fontsize': 24, 'fontweight':'bold'})

**Un des arbres** de decision du meilleur model

In [None]:
fig = plt.figure(figsize=(25,20))
tree.plot_tree(rf_model.estimators_[0], feature_names=features, filled=True, fontsize=10)
plt.show()

#### Resultats

Sur les données de **train**

In [None]:
matrix_conf(train_Y, pred_forest_train)

Sur les données de **test**

In [None]:
ALL_RES["forest"] = matrix_conf(test_Y, pred_forest)

#### Fonctionnement local 

exemple de decision (SHAP):

In [None]:
# Create object that can calculate shap values
explainer = shap.TreeExplainer(rf_model)

# Calculate Shap values
rand = randint(0, len(test_X)-1)
shap_values = explainer.shap_values(pd.DataFrame(test_X).iloc[rand])

shap.force_plot(explainer.expected_value[1], shap_values[1], pd.DataFrame(test_X).iloc[rand])

## Conclusion

### Evaluation de nos modeles

In [None]:
resultats.append(pd.DataFrame(ALL_RES, index=['detection des non rembourseurs (recall 1)',
                                         'detection des rembourseurs (recall 0)','compromis (fbeta score)']))

In [None]:
for res in resultats:
    display(res)

In [None]:
# Take the latest results table explicitly rather than relying on the `res`
# variable left over from the display loop in the previous cell.
res = resultats[-1]

# Third row of the table holds the f-beta score of each model
perfs_dic = {'Modeles': res.iloc[2].index, 'fbeta score': res.iloc[2].values}

plt.figure(figsize=(18, 8))

perfs = pd.DataFrame(data=perfs_dic)
perfs = perfs.sort_values('fbeta score',ascending=False)

sns.set_theme(style="whitegrid")
ax = sns.barplot(x="Modeles", y="fbeta score", data=perfs)
ax.set_title('fbeta scores for each model', fontdict= { 'fontsize': 24, 'fontweight':'bold'})


### Le meilleur de nos modeles ?

On peut enfin entraîner le meilleur modèle sur toutes nos données pour l'utiliser sur de nouvelles données.

In [None]:
X, Y = data_feature2.drop("TARGET",axis=1), data_feature2["TARGET"]

In [None]:
tree_models.fit(X, Y)

## Perspective

### Calculer le gain de nos modeles  

Définir clairement les métriques à optimiser.\
Notamment mesurer les pertes/gains pour les différents types d'erreur afin de sélectionner le modèle avec le compromis le plus avantageux.

### Utiliser d'autres approches de pre-traitement des variables

Avec des mesures de corrélation avec la classe (test du chi2 (chi-square), lightgbm)\
Avec les autres fichiers à disposition (liés par l'id "SK_ID_CURR")

<img src="datas.png" alt="logo" width="600"/>

### Utiliser d'autres modeles 

SGD classifier ? kernel approx ? optimisation des modeles avec XGboost ?

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html