In [7]:
#coding: utf-8

In [6]:
# Standard library
import gc
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Variable transformations
from sklearn.preprocessing import LabelEncoder, OneHotEncoder#, RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import auc, roc_curve, roc_auc_score, make_scorer
from sklearn.metrics import precision_recall_curve, confusion_matrix, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

# ML metrics
from sklearn.metrics import auc, roc_curve, roc_auc_score, make_scorer
from sklearn.metrics import precision_recall_curve, confusion_matrix, f1_score

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV, KFold, train_test_split

from sklearn.base import clone
# hyperopt packages for hyperparameter selection
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [9]:
def compar_col(df1, df2, name1='df1', name2='df2'):
    """Print the columns that appear in only one of two dataframes.

    Parameters
    --------
        df1, df2 (pd.DataFrame): dataframes whose column labels are compared.
        name1, name2 (str): display names used for df1 and df2 in the message.

    Return
    --------
        None. The two lists of differing columns are printed, each in the
        column order of its own dataframe.

    Neither dataframe is modified.
    """
    col_1 = list(df1.columns)
    col_2 = list(df2.columns)

    # Sets give constant-time membership tests; the lists keep column order.
    set_1, set_2 = set(col_1), set(col_2)
    diff_col_1 = [col for col in col_1 if col not in set_2]
    diff_col_2 = [col for col in col_2 if col not in set_1]

    # No attribute is set on df1 here: assigning `df1.name` would overwrite an
    # existing column called 'name' with a constant string.
    print(f"Variables differentes dans {name1} : {diff_col_1} \n Variables differentes dans {name2} : {diff_col_2} ")
    

In [10]:
def merge_train_test(df_train, df_test):
    """Stack the training and test sets into a single dataframe.

    A boolean 'Test' column is added (False for training rows, True for test
    rows) and 'TARGET' is set to NaN on every test row.

    Parameters
    --------
        df_train (pd.DataFrame): training set.
        df_test (pd.DataFrame): test set.

    Return
    --------
        pd.DataFrame: training rows followed by test rows, with a fresh
        0..n-1 index.

    The inputs are not modified: the extra columns are added to copies.
    """
    # assign() returns new frames, so the caller's dataframes stay as they were.
    train_part = df_train.assign(Test=False)
    test_part = df_test.assign(Test=True, TARGET=np.nan)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([train_part, test_part], ignore_index=True, sort=False)

In [11]:
# pour comparer valeurs entre train et test sets
def display_uniq_cat(df):
    """Print, for every object-dtype column, its distinct values per split.

    The dataframe must carry a boolean 'Test' column: rows where it is False
    are reported as "Train", rows where it is True as "Test". For each
    object-dtype column the number of distinct values and their sorted string
    representations are printed for both splits.
    """
    for col in df.columns:
        # Only object-dtype (categorical/text) columns are reported.
        if df[col].dtype != object:
            continue

        train_values = df.loc[~df['Test'], col]
        test_values = df.loc[df['Test'], col]

        print(col)
        print('Number Unique in Train:', train_values.nunique())
        print('Number Unique in Test: ', test_values.nunique())
        # Values are cast to str so mixed types (e.g. NaN and text) can be sorted.
        print('Unique in Train: ', sorted(str(value) for value in train_values.unique().tolist()))
        print('Unique in Test: ', sorted(str(value) for value in test_values.unique().tolist()))
        print('\n')

In [12]:
# Encoder les variables categorielles 
def cat_encoder(app):
    """Encode the object-dtype columns of a dataframe.

    Object columns with at most two distinct values are label-encoded;
    object columns with more than two distinct values are one-hot encoded
    with `pd.get_dummies`. The 'Test' and 'TARGET' columns are never encoded.

    Parameters
    --------
        app (pd.DataFrame): dataframe to encode.

    Return
    --------
        pd.DataFrame: a new, encoded dataframe. The input is not modified.
    """
    # Work on a copy: the label-encoding step assigns columns one by one and
    # would otherwise leave the caller's dataframe partially encoded.
    app = app.copy()

    label_encoder = LabelEncoder()

    # Columns that will be one-hot encoded
    cat_features = []

    for col in app:
        if col in ('Test', 'TARGET'):
            continue
        if app[col].dtype != object:
            continue

        # NOTE(review): nunique() ignores NaN, so a two-valued column that also
        # contains missing values is passed to LabelEncoder with its NaNs —
        # confirm this case is handled upstream.
        if app[col].nunique() <= 2:
            app[col] = label_encoder.fit_transform(app[col])
        else:
            cat_features.append(col)

    # One-hot encode the remaining categorical columns
    return pd.get_dummies(app, columns=cat_features)

In [13]:
# missing value retourne df avec pourcentage
def missing_data(data):
    """Summarise missing values per column.

    Returns a dataframe indexed by column name with two columns: 'Total'
    (number of missing values) and 'Percent' (share of rows that are missing,
    in percent), ordered from the most to the least incomplete column.
    """
    null_mask = data.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    # count() on the boolean mask is the number of rows for every column.
    percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary

In [16]:
def bar_plt_cat_var(data, column_name, figsize = (18,6), percentage_display = True, plot_defaulter = True, rotation = 0,
                    horizontal_adjust = 0, fontsize_percent = 'x-small'):
    """Bar plots describing a categorical column.

    Left panel: number of rows per category, optionally annotated with each
    category's share of the non-missing rows. Right panel (optional): for each
    category, the percentage of its rows where `data.TARGET == 1`.

    Parameters
    --------
        data (pd.DataFrame): dataframe holding `column_name` and, when
            `plot_defaulter` is True, a `TARGET` column.
        column_name (str): categorical column to plot.
        figsize (tuple): figure size.
        percentage_display (bool): write the percentage above each bar.
        plot_defaulter (bool): draw the right-hand panel.
        rotation (int): rotation of the x tick labels.
        horizontal_adjust (float): horizontal offset of the percentage labels.
        fontsize_percent: font size of the percentage labels.
    """
    plt.figure(figsize=figsize, tight_layout=False)
    sns.set(style='whitegrid', font_scale=1.2)

    # Left panel: overall distribution of the categories
    plt.subplot(1, 2, 1)
    category_counts = data[column_name].value_counts().sort_values(ascending=False)
    ax = sns.barplot(x=category_counts.index, y=category_counts, palette='Set1')

    if percentage_display:
        # Percentages are relative to the rows where the column is not missing.
        n_valid = len(data[column_name].dropna())
        for bar in ax.patches:
            bar_height = bar.get_height()
            ax.text(bar.get_x() + horizontal_adjust,
                    bar_height + 0.005 * n_valid,
                    '{:1.02f}%'.format(bar_height * 100 / n_valid),
                    fontsize=fontsize_percent)

    plt.xlabel(column_name, labelpad=10)
    plt.title(f'Distribution of {column_name}', pad=20)
    plt.xticks(rotation=rotation)
    plt.ylabel('Counts')

    # Right panel: share of TARGET == 1 rows inside each category
    if plot_defaulter:
        column = data[column_name]
        defaulter_counts = column[data.TARGET == 1].value_counts()
        defaulter_share = (defaulter_counts * 100 / column.value_counts()).dropna().sort_values(ascending=False)

        plt.subplot(1, 2, 2)
        sns.barplot(x=defaulter_share.index, y=defaulter_share, palette='Set2')
        plt.ylabel('Pourcentage des clients non-solvables par categorie')
        plt.xlabel(column_name, labelpad=10)
        plt.xticks(rotation=rotation)
        plt.title(f'Pourcentage des clients non-solvables par categorie {column_name}', pad=20)
    plt.show()

In [17]:
def plot_feature_importances(df, name):
    """Plot the 15 most important features as a horizontal bar chart.

    Works with any importance measure for which higher means more important.

    Parameters
    --------
        df : dataframe
            feature importances. Must have the feature names in a column
            called `feature` and the importances in a column called
            `importance`.
        name : str
            suffix appended to the plot title.

    Return
    -------
        None. The chart is drawn on a new matplotlib figure; the caller's
        dataframe is not modified.
    """
    # Rank features from most to least important
    ranked = df.sort_values('importance', ascending=False).reset_index(drop=True)

    # Normalize the importances so that they add up to one
    ranked['importance_normalized'] = ranked['importance'] / ranked['importance'].sum()

    top = ranked.head(15)
    # Reversed bar positions so the most important feature is drawn at the top
    positions = list(top.index)[::-1]

    plt.figure(figsize=(14, 10))
    ax = plt.subplot()
    ax.barh(positions, top['importance_normalized'], align='center', edgecolor='k')

    # One tick per bar, labelled with the feature name
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])

    plt.xlabel('Normalized Importance')
    plt.title(f'Feature Importances_{name}')

In [18]:
# fonction qui change type de variable 
def change_var_type(df, var_type_keep, new_var_type):
    """Cast every column whose dtype is not in `var_type_keep` to `new_var_type`.

    The dataframe is modified in place and the same object is returned.
    """
    for col in df.columns:
        # Columns that already have an accepted dtype are left alone.
        if df[col].dtype in var_type_keep:
            continue
        df[col] = df[col].astype(new_var_type)
    return df

In [None]:
# fonction test complet de kaggle Mr. KOEHRSEN

def model(features, test_features, n_folds = 5):
    """Train and test a light gradient boosting model using
    cross validation.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the `SK_ID_CURR` and
            `TARGET` columns.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include the
            `SK_ID_CURR` column.
        n_folds (int, default = 5): number of folds to use for cross validation

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model (average over the folds).
        feature_importances (pd.DataFrame):
            dataframe with the feature importances averaged over the folds.
        metrics (pd.DataFrame):
            training and validation ROC AUC for each fold, plus an 'overall'
            row (mean of the fold scores for train, AUC of the out-of-fold
            predictions for valid).

    Notes
    --------
        NOTE(review): `LGBMClassifier` is not imported in the visible import
        cell; it must be in scope (from lightgbm) before this is called.
        NOTE(review): `early_stopping_rounds` and `verbose` are passed to
        `fit()`, which LightGBM 4.0+ no longer accepts — confirm the
        installed version.
    """

    # Extract the test ids for the submission
    test_ids = test_features['SK_ID_CURR']

    # Labels as a plain array: the fold indices produced by KFold are
    # positional, so indexing a Series with them would select by index label
    # and return the wrong rows (or fail) when the index is not 0..n-1.
    labels = np.array(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 0)

    # Accumulators averaged over the folds
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])

    # Out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Validation and training scores of each fold
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the classifier (named `clf` so it does not shadow this function)
        clf = LGBMClassifier(n_estimators=10000, boosting_type = 'goss',
                             objective = 'binary',
                             class_weight = 'balanced', learning_rate = 0.05,
                             reg_alpha = 0.1, reg_lambda = 0.1, n_jobs = -1, random_state = 0)

        # Train the classifier
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Each fold contributes 1/n_splits of the averaged importances
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Each fold contributes 1/n_splits of the averaged test prediction
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics

In [19]:
# fonction pour recuperer train de test app

def get_train(df):
    """Split a merged dataframe back into its training and test parts.

    Parameters
    --------
        df (pd.DataFrame): dataframe with a boolean 'Test' column and a
            'TARGET' column.

    Return
    --------
        train_set (pd.DataFrame): rows where 'Test' is False, without the
            'Test' column.
        test_set (pd.DataFrame): rows where 'Test' is True, without the
            'Test' and 'TARGET' columns.

    The input dataframe is not modified.
    """
    is_test = df["Test"]

    # drop() returns new frames; dropping in place on a .loc slice triggers
    # pandas' SettingWithCopyWarning.
    train_set = df.loc[~is_test, :].drop(columns=["Test"])
    test_set = df.loc[is_test, :].drop(columns=["Test", "TARGET"])

    return train_set, test_set

In [20]:
def remove_missing_columns(train, test, threshold = 90):
    """Drop the columns that are mostly missing in either dataframe.

    A column is removed from BOTH dataframes as soon as its percentage of
    missing values is strictly greater than `threshold` in the training set
    or in the test set. The number of removed columns is printed.

    Parameters
    --------
        train, test (pd.DataFrame): dataframes to clean.
        threshold (number, default = 90): missing-value percentage above
            which a column is dropped.

    Return
    --------
        train, test: new dataframes without the dropped columns.
    """
    def _columns_above_threshold(frame):
        # Percentage of missing values per column
        percent_missing = 100 * frame.isnull().sum() / len(frame)
        return list(percent_missing.index[percent_missing > threshold])

    # Union of the columns flagged in either dataframe
    missing_columns = list(set(_columns_above_threshold(train) + _columns_above_threshold(test)))

    print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))

    train = train.drop(columns=missing_columns)
    test = test.drop(columns=missing_columns)

    return train, test

In [1]:
# fonction model avec split train et AUC test

def model_red(features, test_features, n_folds=5):
    """Cross-validate a LightGBM classifier and score it on a labelled test set.

    Parameters
    --------
        features (pd.DataFrame):
            training data. Must include the `SK_ID_CURR` and `TARGET` columns.
        test_features (pd.DataFrame):
            hold-out data. Must include the `SK_ID_CURR` and `TARGET` columns.
        n_folds (int, default = 5): number of folds to use for cross validation

    Return
    --------
        roc (float):
            ROC AUC on the hold-out set of the fold-averaged predictions.
        feature_importances (pd.DataFrame):
            feature importances averaged over the folds.
        metrics (pd.DataFrame):
            training and validation ROC AUC for each fold, plus an 'overall'
            row (mean of the fold scores for train, AUC of the out-of-fold
            predictions for valid).

    Notes
    --------
        NOTE(review): `LGBMClassifier` is not imported in the visible import
        cell; it must be in scope (from lightgbm) before this is called.
        NOTE(review): `early_stopping_rounds` and `verbose` are passed to
        `fit()`, which LightGBM 4.0+ no longer accepts — confirm the
        installed version.
    """

    # Labels as plain arrays: the fold indices produced by KFold are
    # positional, so indexing a Series with them would select by index label
    # and return the wrong rows (or fail) when the index is not 0..n-1,
    # which is the usual case after a train/test split.
    labels = np.array(features['TARGET'])
    labels_test = np.array(test_features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR', 'TARGET'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # The number of splits follows n_folds so that the metrics table built
    # below always has one row per fold.
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 0)

    # Accumulators averaged over the folds
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])

    # Out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Validation and training scores of each fold
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model
        model = LGBMClassifier(n_estimators=10000, boosting_type = 'goss',
                               objective = 'binary',
                               learning_rate = 0.05, class_weight = 'balanced',
                               reg_alpha = 0.1, reg_lambda = 0.1, n_jobs = -1, random_state = 0)

        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'],
                  early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = model.best_iteration_

        # Each fold contributes 1/n_splits of the averaged importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Each fold contributes 1/n_splits of the averaged test prediction
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features,
                                                         num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(model.best_score_['valid']['auc'])
        train_scores.append(model.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    # AUC of the fold-averaged predictions on the hold-out set
    roc = roc_auc_score(labels_test, test_predictions)
    print(f"AUC-Test: {roc:.5f}")

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return roc, feature_importances, metrics


In [2]:
def imb_pipeline(mod, name_mod, features, test_features, params):
    """Grid-search a classifier inside an impute + SMOTE pipeline.

    The pipeline is: median imputation -> SMOTE oversampling -> `mod`.
    `GridSearchCV` (5 folds) scores every candidate with ROC AUC and F1 and
    refits the best candidate according to ROC AUC. The refitted pipeline is
    then scored on the hold-out set.

    Parameters
    --------
        mod: classifier used as the last pipeline step.
        name_mod (str): display name stored in the score row.
        features (pd.DataFrame): training data. Must include the
            `SK_ID_CURR` and `TARGET` columns.
        test_features (pd.DataFrame): hold-out data. Must include the
            `SK_ID_CURR` and `TARGET` columns.
        params (dict or list of dict): parameter grid for the pipeline
            (step-prefixed names such as `classification__...`).

    Return
    --------
        gcv (GridSearchCV): the fitted search object.
        scores (list): a single row
            `[name_mod, best mean CV AUC, hold-out AUC, elapsed seconds]`.
        best_model: a clone of the best pipeline (same parameters; `clone`
            returns an estimator that has not been fitted).
    """
    scores = []

    # Extract the labels
    labels = features['TARGET']
    labels_test = test_features['TARGET']

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR', 'TARGET'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    start_time = time.time()

    model = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('sampling', SMOTE(random_state=0)),
        ('classification', mod)
    ])

    # Metrics computed for every candidate; 'AUC' drives the refit.
    score = {'AUC': 'roc_auc',
             'F1': 'f1'}

    gcv = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring=score, n_jobs=-1, refit='AUC',
                       return_train_score=True)
    gcv.fit(features, labels)

    # Hold-out performance of the refitted best pipeline
    test_predictions = gcv.predict_proba(test_features)[:, 1]
    roc_test = roc_auc_score(labels_test, test_predictions)
    mean_best_valid = gcv.best_score_

    scores.append([name_mod, mean_best_valid, roc_test, (time.time() - start_time)])

    best_model = clone(gcv.best_estimator_)
    print(f"{gcv.best_estimator_}")
    print(f"{gcv.cv_results_}")

    return gcv, scores, best_model

In [5]:
def imb_best_class(model, name_mod, features, test_features):
    """Cross-validate a model with 5 folds and ROC AUC.

    Parameters
    --------
        model: estimator (or pipeline) to evaluate.
        name_mod (str): display name of the model (currently unused).
        features (pd.DataFrame): training data. Must include the
            `SK_ID_CURR` and `TARGET` columns.
        test_features (pd.DataFrame): hold-out data. Must include the
            `SK_ID_CURR` and `TARGET` columns; only its shape is reported.

    Return
    --------
        cv (dict): output of `cross_validate` (fit/score times, train and
            test scores).
        scores (np.ndarray): validation ROC AUC of each fold.
        mean_score (float): mean of `scores`.
    """
    # Extract the labels for training
    labels = features['TARGET']

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR', 'TARGET'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    cv = cross_validate(model, features, labels, cv=5, scoring='roc_auc', n_jobs=-1,
                        return_train_score=True)

    # The per-fold validation scores are already part of the cross_validate
    # output; running cross_val_score as well would repeat the same fits.
    scores = cv['test_score']
    mean_score = scores.mean()

    return cv, scores, mean_score


In [None]:
skf = StratifiedKFold(n_splits=5)
def model_v1(predictor, features, eval_metric, cv=skf, max_evals=10):
    """Tune a classifier and a decision threshold with hyperopt, then refit.

    The classifier is wrapped in a pipeline (median imputation -> SMOTE ->
    `predictor`). hyperopt's TPE algorithm samples the classifier
    hyperparameters together with a probability threshold; each candidate is
    scored by applying `eval_metric` to the thresholded cross-validated
    predictions. The pipeline is finally refitted on all the data with the
    best hyperparameters.

    Parameters
    --------
        predictor: classifier accepting the parameters `n_estimators`,
            `max_depth`, `learning_rate`, `subsample`, `colsample_bytree`
            and `num_leaves`, and exposing `feature_importances_` once fitted.
        features (pd.DataFrame): training data. Must include the
            `SK_ID_CURR` and `TARGET` columns.
        eval_metric (callable): `eval_metric(y_true, y_pred)`; the search
            minimises `1 - eval_metric(...)`, so higher must mean better.
        cv: cross-validation splitter (default: 5-fold stratified).
        max_evals (int, default = 10): number of hyperopt evaluations.

    Return
    --------
        best_parameters (dict): best values found by hyperopt, keyed by the
            hyperopt labels (including
            `classification__solvability_threshold`).
        feature_importances (pd.DataFrame): `feature` / `importance` columns
            taken from the refitted classifier.
        model: the fitted pipeline.
    """

    # Extract the targets for training
    targets = np.array(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])

    # Extract feature names
    feature_names = list(features.columns)

    model = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('sampling', SMOTE(random_state=0)),
        ('classification', predictor)
    ])

    def _pipeline_params(values, colsample_key):
        """Build the pipeline parameters, casting the integer-valued ones."""
        # hyperopt's quniform returns floats, hence the int() casts.
        return {
            'classification__n_estimators': int(values['classification__n_estimators']),
            'classification__max_depth': int(values['classification__max_depth']),
            'classification__learning_rate': values['classification__learning_rate'],
            'classification__subsample': values['classification__subsample'],
            'classification__colsample_bytree': values[colsample_key],
            'classification__num_leaves': int(values['classification__num_leaves']),
        }

    def optim_score(params):
        """hyperopt objective: 1 - eval_metric on cross-validated predictions."""
        # Inside the objective the values are keyed by the `space` dict keys.
        model.set_params(**_pipeline_params(params, 'classification__colsample_bytree'))

        # Probability threshold above which a sample is predicted positive
        threshold = params['classification__solvability_threshold']

        # Cross-validated probability of the positive class
        y_proba = cross_val_predict(model,
                                    features,
                                    targets,
                                    method='predict_proba',
                                    cv=cv, n_jobs=1)[:, 1]

        # Predict 1 when the probability exceeds the threshold, else 0
        y_pred = (y_proba > threshold).astype(int)

        # hyperopt minimises, so the score is turned into a loss
        return 1 - eval_metric(targets, y_pred)

    # Search space
    space={'classification__n_estimators': hp.quniform('classification__n_estimators', 200, 2000, 200),
           'classification__max_depth' : hp.quniform('classification__max_depth', 2, 30, 2),
           'classification__learning_rate': hp.loguniform('classification__learning_rate', np.log(0.005), np.log(0.2)),
           'classification__subsample': hp.quniform('classification__subsample', 0.1, 1.0, 0.2),
           'classification__colsample_bytree': hp.quniform('classification__colsample_by_tree', 0.6, 1.0, 0.1),
           'classification__num_leaves': hp.quniform('classification__num_leaves', 4, 100, 4),
           'classification__solvability_threshold': hp.quniform('classification__solvability_threshold', 0.0, 1.0, 0.025)
    }

    best = fmin(fn=optim_score,      # function to minimise
                space=space,
                algo=tpe.suggest,    # optimisation algorithm
                max_evals=max_evals, # maximum number of evaluations
    )

    # fmin returns the best values keyed by the hyperopt LABELS, which is why
    # the column-sampling entry is read as 'classification__colsample_by_tree'.
    model.set_params(**_pipeline_params(best, 'classification__colsample_by_tree'))

    # Refit the pipeline on the whole dataset
    model.fit(features, targets)

    # Record the best parameters
    best_parameters = best

    # Feature importances of the classifier (third pipeline step)
    feature_importance_values = model.steps[2][1].feature_importances_

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names,
                                        'importance': feature_importance_values})

    return best_parameters, feature_importances, model


In [None]:
def model_opt(model, features, test_features, n_folds=5):
    """Cross-validate a given model and score it on a labelled test set.

    Parameters
    --------
        model:
            estimator that is refitted on every fold. Its `fit` must accept
            `eval_metric`, `eval_set`, `eval_names`, `early_stopping_rounds`
            and `verbose`, its `predict_proba` must accept `num_iteration`,
            and it must expose `best_iteration_`, `best_score_` and
            `feature_importances_` after fitting.
        features (pd.DataFrame):
            training data. Must include the `SK_ID_CURR` and `TARGET` columns.
        test_features (pd.DataFrame):
            hold-out data. Must include the `SK_ID_CURR` and `TARGET` columns.
        n_folds (int, default = 5): number of folds to use for cross validation

    Return
    --------
        roc (float):
            ROC AUC on the hold-out set of the fold-averaged predictions.
        feature_importances (pd.DataFrame):
            feature importances averaged over the folds.
        metrics (pd.DataFrame):
            training and validation ROC AUC for each fold, plus an 'overall'
            row (mean of the fold scores for train, AUC of the out-of-fold
            predictions for valid).
    """

    # Labels as plain arrays: the fold indices produced by KFold are
    # positional, so indexing a Series with them would select by index label
    # and return the wrong rows (or fail) when the index is not 0..n-1,
    # which is the usual case after a train/test split.
    labels = np.array(features['TARGET'])
    labels_test = np.array(test_features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR', 'TARGET'])

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # The number of splits follows n_folds so that the metrics table built
    # below always has one row per fold.
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 0)

    # Accumulators averaged over the folds
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])

    # Out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Validation and training scores of each fold
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Train the model (the same estimator object is refitted on each fold)
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'],
                  early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = model.best_iteration_

        # Each fold contributes 1/n_splits of the averaged importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Each fold contributes 1/n_splits of the averaged test prediction
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(model.best_score_['valid']['auc'])
        train_scores.append(model.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del train_features, valid_features
        gc.collect()

    # AUC of the fold-averaged predictions on the hold-out set
    roc = roc_auc_score(labels_test, test_predictions)
    print(f"AUC-Test: {roc:.5f}")

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return roc, feature_importances, metrics


In [None]:
def metric_metier(y_true, y_pred, fn_value=-10, fp_value=0, tp_value=0, tn_value=1):
    """Business gain of a set of binary predictions, normalised to [0, 1].

    Each cell of the confusion matrix is weighted by a gain; the total is
    rescaled between the worst case (every sample misclassified) and the best
    case (every sample correctly classified).

    Parameters
    --------
        y_true: true classes, encoded as 0 (negative) and 1 (positive).
        y_pred: predicted classes, with the same encoding.
        fn_value, fp_value, tp_value, tn_value (number): gain attributed to
            each false negative, false positive, true positive and true
            negative.

    Return
    --------
        float: 1 when every prediction is correct, 0 when every prediction
        is wrong. Undefined (division by zero) when the best and worst gains
        are equal.
    """
    # labels=[0, 1] forces a 2x2 matrix even when one class is absent from
    # both y_true and y_pred; ravel() yields tn, fp, fn, tp in that order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Total gain
    J = tp*tp_value + tn*tn_value + fp*fp_value + fn*fn_value

    # Best case: all negatives are true negatives, all positives true positives
    max_J = (fp + tn)*tn_value + (fn + tp)*tp_value

    # Worst case: all negatives are false positives, all positives false negatives
    min_J = (fp + tn)*fp_value + (fn + tp)*fn_value

    # Gain normalised between 0 and 1
    J_normalized = (J - min_J)/(max_J - min_J)
    return J_normalized

In [None]:
# Matrices de confusion
#Fonction permettant d'afficher plusieurs matrices de confusions en fonctions des différentes prédictions

def plot_heatmap_confusion(y_valid, **y_valid_pred):
    """Draw one confusion-matrix heatmap per set of predictions, side by side.

    Parameters
    --------
        y_valid: true classes.
        **y_valid_pred: predictions to compare, passed as keyword arguments;
            each keyword is used as the title of its heatmap.

    Return
    --------
        dict: confusion matrix of each set of predictions, keyed by keyword.
    """
    plt.figure(1, figsize=(12, 4))

    class_names = ["Solvable", "Non Solvable"]
    n_panels = len(y_valid_pred)
    results = {}

    for position, (label, y_pred) in enumerate(y_valid_pred.items(), start=1):
        plt.subplot(1, n_panels, position)

        matrix = confusion_matrix(y_valid, y_pred)
        results[label] = matrix

        matrix_df = pd.DataFrame(matrix, columns=class_names, index=class_names)
        sns.heatmap(matrix_df, annot=True, linewidths=.7, fmt='g')

        plt.title(label)
        plt.ylim(0, 2)
        plt.xlabel("Classes prédites")
        plt.ylabel("Classes réelles")

    return results

In [None]:
# Evolution du gain en fonction du seuil de solvabilité
def gain_seuil(clf, X, y):
    """Plot the business gain as a function of the decision threshold.

    Parameters
    --------
        clf: fitted classifier exposing `predict_proba`.
        X: features to score.
        y: true classes of `X`.

    The gain (`metric_metier`) is evaluated for 20 evenly spaced thresholds
    between 0 and 1 and drawn on the current matplotlib axes.
    """
    threshold_x = np.linspace(0.0, 1, 20)

    # The scores do not depend on the threshold: compute them once.
    y_scores = clf.predict_proba(X)[:, 1]

    # Predict 1 when the score exceeds the threshold, else 0, then evaluate
    cost_function = [
        metric_metier(y, (y_scores > threshold).astype(int))
        for threshold in threshold_x
    ]

    # Gain versus threshold
    plt.plot(threshold_x, cost_function)
    plt.xlabel("Seuil de probabilité")
    plt.ylabel("Indice banquaire")
    plt.xticks(np.linspace(0.1, 1, 10))
    plt.grid()

In [7]:
def plot_roc_curve(classifiers, X, y, n_splits=5, fit=True):
    """Plot the mean cross-validated ROC curve of several classifiers.

    Parameters
    --------
        classifiers (dict): display name -> classifier.
        X (pd.DataFrame): features (rows are selected with `.iloc`).
        y (pd.Series): true classes (rows are selected with `.iloc`).
        n_splits (int, default = 5): number of (unshuffled) K-fold splits.
        fit (bool, default = True):
            True: on every fold each classifier is fitted inside a pipeline
            (median imputation -> SMOTE -> classifier) on the training part.
            False: each entry of `classifiers` is used as-is to predict on
            the held-out part.
            NOTE(review): with fit=False the entries are presumably already
            fitted and able to handle the raw `X` (e.g. fitted pipelines) —
            confirm with the callers.

    The curves are drawn on the current matplotlib axes; the legend shows the
    AUC of the mean curve and the standard deviation of the per-fold AUCs.
    """
    kf = KFold(n_splits=n_splits)
    for name_clf, clf in classifiers.items():
        print(name_clf)
        tprs = []
        aucs = []
        mean_fp_rate = np.linspace(0, 1, 100)
        for train, test in kf.split(X, y):

            if fit:
                # SMOTE is seeded like the other pipelines of this notebook
                # so that the curves are reproducible.
                estimator = Pipeline([('imputer', SimpleImputer(strategy='median')),
                                      ('sampling', SMOTE(random_state=0)),
                                      ('classification', clf)])
                estimator.fit(X.iloc[train, :], y.iloc[train])
            else:
                # A freshly built pipeline is unfitted and cannot predict:
                # use the estimator supplied by the caller directly.
                estimator = clf

            y_score = estimator.predict_proba(X.iloc[test, :])[:, 1]
            fp_rate, tp_rate, _ = roc_curve(y.iloc[test], y_score)

            # Resample the fold's curve on a common grid so folds can be averaged
            interp_tp_rate = np.interp(mean_fp_rate, fp_rate, tp_rate)
            interp_tp_rate[0] = 0.0
            tprs.append(interp_tp_rate)

            aucs.append(auc(fp_rate, tp_rate))

        mean_tp_rate = np.mean(tprs, axis=0)
        mean_tp_rate[-1] = 1.0
        mean_auc = auc(mean_fp_rate, mean_tp_rate)
        std_auc = np.std(aucs)
        plt.plot(mean_fp_rate, mean_tp_rate,
                 label=name_clf + r'(AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
                 lw=2,
                 alpha=.8)

    # Diagonal of a random classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=15)
    plt.ylabel('True Positive Rate', fontsize=15)
    plt.title('Receiver operating characteristic example', fontsize=20)
    plt.legend(loc="lower right")
 
