<h2>Implémentez un modèle de scoring</h2>

<h3>Première partie : modélisation</h3>

<h4>I) Préparation de l'environnement de travail</h4>

In [None]:
import gc
import re
import time
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
from contextlib import contextmanager

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook"
import plotly.offline as pyo
pyo.init_notebook_mode()
import shap
from bayes_opt import BayesianOptimization
from collections import Counter
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from itertools import cycle
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from lime.lime_tabular import LimeTabularExplainer
from plotly.subplots import make_subplots
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_recall_fscore_support,
    fbeta_score,
    precision_recall_curve,
    average_precision_score,
    roc_auc_score
)
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Start every run with an empty log: opening in "w" mode truncates the file.
with open("outputs.txt", "w"):
    pass

In [None]:
@contextmanager
def timer(title):
    """Append the wall-clock duration of the wrapped block to outputs.txt.

    Parameters
    ----------
    title : str
        Label written in front of the elapsed time.

    Notes
    -----
    The line is only written when the wrapped block finishes without
    raising; an exception propagates before anything is logged.
    """
    t0 = time.time()
    yield
    elapsed = time.time() - t0
    # The with-statement guarantees the handle is closed even if the
    # write itself fails.
    with open("outputs.txt", "a") as file:
        print("{} - done in {:.0f} s".format(title, elapsed), file=file)

<h4>II) Importation et prétraitement des données</h4>

One-hot encoding for categorical columns with get_dummies

In [None]:
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``: when true, an
        extra indicator column is created for missing values.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists the
        column names that did not exist in ``df``.
    """
    columns_before = list(df.columns)
    object_columns = [name for name, dtype in df.dtypes.items()
                      if dtype == "object"]
    encoded = pd.get_dummies(df, columns=object_columns,
                             dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns
                     if name not in columns_before]

    return encoded, added_columns

Preprocess application_train.csv and application_test.csv :
 - Read data and merge
 - Optional: Remove 4 applications with XNA CODE_GENDER (train set)
 - Categorical features with Binary encode (0 or 1; two categories)
 - Categorical features with One-Hot encode
 - NaN values for DAYS_EMPLOYED: 365243 -> nan
 - Some simple new features (percentages)

In [None]:
def application_train_test(num_rows=None, nan_as_category=False):
    """Load, merge and preprocess the application train/test tables.

    Parameters
    ----------
    num_rows : int or None, default None
        Forwarded to ``pd.read_csv`` as ``nrows`` for both files.
    nan_as_category : bool, default False
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with the previous row labels
        kept in an ``index`` column, binary flags factorized, object
        columns one-hot encoded and five ratio features added.
    """
    df = pd.read_csv("../../input/application_train.csv", nrows=num_rows)
    test_df = pd.read_csv("../../input/application_test.csv", nrows=num_rows)
    with open("outputs.txt", "a") as file:
        print("Train samples: {}, test samples: {}".format(len(df),
                                                           len(test_df)),
              file=file)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent stacking operation.
    df = pd.concat([df, test_df]).reset_index()
    del test_df
    gc.collect()

    # Explicit copy so the column assignments below act on an independent
    # frame instead of a filtered view of the concatenation.
    df = df[df["CODE_GENDER"] != "XNA"].copy()

    for bin_feature in ["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"]:
        df[bin_feature], _ = pd.factorize(df[bin_feature])

    df, _ = one_hot_encoder(df, nan_as_category)

    # 365243 is treated as a missing-value sentinel. Assign the result
    # back rather than calling replace(inplace=True) on a column selection.
    df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace(365243, np.nan)

    df["DAYS_EMPLOYED_PERC"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
    df["INCOME_CREDIT_PERC"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"]
    df["INCOME_PER_PERSON"] = df["AMT_INCOME_TOTAL"] / df["CNT_FAM_MEMBERS"]
    df["ANNUITY_INCOME_PERC"] = df["AMT_ANNUITY"] / df["AMT_INCOME_TOTAL"]
    df["PAYMENT_RATE"] = df["AMT_ANNUITY"] / df["AMT_CREDIT"]

    return df

Preprocess bureau.csv and bureau_balance.csv :
 - read and one-hot encode
 - Bureau balance: Perform aggregations and merge with bureau.csv
 - Bureau and bureau_balance numeric features
 - Bureau and bureau_balance categorical features
 - new dataframe with aggregations ?
 - Bureau: Active credits - using only numerical aggregations
 - Bureau: Closed credits - using only numerical aggregations

In [None]:
def bureau_and_balance(num_rows=None, nan_as_category=True):
    """Aggregate bureau.csv and bureau_balance.csv per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Forwarded to ``pd.read_csv`` as ``nrows`` for both files.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder`` for both tables.

    Returns
    -------
    pandas.DataFrame
        Grouped by ``SK_ID_CURR``, with ``BURO_`` columns computed over all
        bureau rows and ``ACTIVE_`` / ``CLOSED_`` columns computed over the
        rows flagged ``CREDIT_ACTIVE_Active`` / ``CREDIT_ACTIVE_Closed``.
    """
    def flat_names(prefix, frame):
        # Collapse the (column, statistic) pairs produced by agg() into
        # single "<prefix><column>_<STATISTIC>" labels.
        return pd.Index([prefix + column + "_" + stat.upper()
                         for column, stat in frame.columns.tolist()])

    bureau = pd.read_csv("../../input/bureau.csv", nrows=num_rows)
    bb = pd.read_csv("../../input/bureau_balance.csv", nrows=num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Step 1: summarise the monthly balances per bureau credit and attach
    # the summary to the bureau table.
    balance_spec = {"MONTHS_BALANCE": ["min", "max", "size"]}
    balance_spec.update({col: ["mean"] for col in bb_cat})
    bb_agg = bb.groupby("SK_ID_BUREAU").agg(balance_spec)
    bb_agg.columns = flat_names("", bb_agg)
    bureau = bureau.join(bb_agg, how="left", on="SK_ID_BUREAU")
    bureau = bureau.drop(columns=["SK_ID_BUREAU"])
    del bb, bb_agg
    gc.collect()

    # Step 2: numeric and categorical aggregation specs per applicant.
    num_aggregations = {
        "DAYS_CREDIT": ["min", "max", "mean", "var"],
        "DAYS_CREDIT_ENDDATE": ["min", "max", "mean"],
        "DAYS_CREDIT_UPDATE": ["mean"],
        "CREDIT_DAY_OVERDUE": ["max", "mean"],
        "AMT_CREDIT_MAX_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_DEBT": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
        "AMT_ANNUITY": ["max", "mean"],
        "CNT_CREDIT_PROLONG": ["sum"],
        "MONTHS_BALANCE_MIN": ["min"],
        "MONTHS_BALANCE_MAX": ["max"],
        "MONTHS_BALANCE_SIZE": ["mean", "sum"]
    }
    cat_aggregations = {cat: ["mean"] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ["mean"] for cat in bb_cat})

    full_spec = dict(num_aggregations)
    full_spec.update(cat_aggregations)
    bureau_agg = bureau.groupby("SK_ID_CURR").agg(full_spec)
    bureau_agg.columns = flat_names("BURO_", bureau_agg)

    # Step 3: numeric-only aggregates restricted to active, then closed,
    # credits.
    for prefix, flag in (("ACTIVE_", "CREDIT_ACTIVE_Active"),
                         ("CLOSED_", "CREDIT_ACTIVE_Closed")):
        subset = bureau[bureau[flag] == 1]
        subset_agg = subset.groupby("SK_ID_CURR").agg(num_aggregations)
        subset_agg.columns = flat_names(prefix, subset_agg)
        bureau_agg = bureau_agg.join(subset_agg, how="left",
                                     on="SK_ID_CURR")
        del subset, subset_agg
        gc.collect()

    del bureau
    gc.collect()

    return bureau_agg

Preprocess previous_application.csv :
 - read and one-hot encode
 - Days 365243 values -> nan
 - Add feature: value ask / value received percentage
 - Previous applications numeric features
 - Previous applications categorical features
 - new dataframe with aggregations ?
 - Previous Applications: Approved Applications - only numerical features
 - Previous Applications: Refused Applications - only numerical features

In [None]:
def previous_applications(num_rows=None, nan_as_category=True):
    """Aggregate previous_application.csv per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour
        of a hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Grouped by ``SK_ID_CURR``, with ``PREV_`` columns over all rows and
        ``APPROVED_`` / ``REFUSED_`` numeric aggregates over the rows
        flagged ``NAME_CONTRACT_STATUS_Approved`` / ``..._Refused``.
    """
    prev = pd.read_csv("../../input/previous_application.csv", nrows=num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category=nan_as_category)

    # 365243 is treated as a missing-value sentinel in the day columns.
    # One assignment replaces five chained replace(inplace=True) calls.
    day_cols = [
        "DAYS_FIRST_DRAWING",
        "DAYS_FIRST_DUE",
        "DAYS_LAST_DUE_1ST_VERSION",
        "DAYS_LAST_DUE",
        "DAYS_TERMINATION",
    ]
    prev[day_cols] = prev[day_cols].replace(365243, np.nan)

    # Ratio of the amount asked for to the amount granted.
    prev["APP_CREDIT_PERC"] = prev["AMT_APPLICATION"] / prev["AMT_CREDIT"]

    num_aggregations = {
        "AMT_ANNUITY": ["min", "max", "mean"],
        "AMT_APPLICATION": ["min", "max", "mean"],
        "AMT_CREDIT": ["min", "max", "mean"],
        "APP_CREDIT_PERC": ["min", "max", "mean", "var"],
        "AMT_DOWN_PAYMENT": ["min", "max", "mean"],
        "AMT_GOODS_PRICE": ["min", "max", "mean"],
        "HOUR_APPR_PROCESS_START": ["min", "max", "mean"],
        "RATE_DOWN_PAYMENT": ["min", "max", "mean"],
        "DAYS_DECISION": ["min", "max", "mean"],
        "CNT_PAYMENT": ["mean", "sum"],
    }

    cat_aggregations = {cat: ["mean"] for cat in cat_cols}

    prev_agg = prev.groupby("SK_ID_CURR").agg({**num_aggregations,
                                               **cat_aggregations})
    prev_agg.columns = pd.Index(["PREV_" + e[0] + "_" + e[1].upper()
                                 for e in prev_agg.columns.tolist()])

    approved = prev[prev["NAME_CONTRACT_STATUS_Approved"] == 1]
    approved_agg = approved.groupby("SK_ID_CURR").agg(num_aggregations)
    approved_agg.columns = pd.Index(["APPROVED_" + e[0] + "_" + e[1].upper()
                                     for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how="left", on="SK_ID_CURR")

    refused = prev[prev["NAME_CONTRACT_STATUS_Refused"] == 1]
    refused_agg = refused.groupby("SK_ID_CURR").agg(num_aggregations)
    refused_agg.columns = pd.Index(["REFUSED_" + e[0] + "_" + e[1].upper()
                                    for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how="left", on="SK_ID_CURR")
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()

    return prev_agg

Preprocess POS_CASH_balance.csv :
 - read and one-hot encode
 - Features
 - new dataframe with aggregations ?
 - Count pos cash accounts

In [None]:
def pos_cash(num_rows=None, nan_as_category=True):
    """Aggregate POS_CASH_balance.csv per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour
        of a hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Grouped by ``SK_ID_CURR``, with ``POS_``-prefixed aggregates and a
        ``POS_COUNT`` column holding the number of rows per applicant.
    """
    pos = pd.read_csv("../../input/POS_CASH_balance.csv", nrows=num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category=nan_as_category)

    aggregations = {
        "MONTHS_BALANCE": ["max", "mean", "size"],
        "SK_DPD": ["max", "mean"],
        "SK_DPD_DEF": ["max", "mean"]
    }
    for cat in cat_cols:
        aggregations[cat] = ["mean"]

    pos_agg = pos.groupby("SK_ID_CURR").agg(aggregations)
    pos_agg.columns = pd.Index(["POS_" + e[0] + "_" + e[1].upper()
                                for e in pos_agg.columns.tolist()])

    # Number of POS-cash rows recorded for each applicant.
    pos_agg["POS_COUNT"] = pos.groupby("SK_ID_CURR").size()
    del pos
    gc.collect()

    return pos_agg

Preprocess installments_payments.csv :
 - read and one-hot encode
 - Percentage and difference paid in each installment (amount paid and installment value)
 - Days past due and days before due (no negative values)
 - Features: Perform aggregations
 - new dataframe with aggregations ?
 - Count installments accounts

In [None]:
def installments_payments(num_rows=None, nan_as_category=True):
    """Aggregate installments_payments.csv per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour
        of a hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Grouped by ``SK_ID_CURR``, with ``INSTAL_``-prefixed aggregates and
        an ``INSTAL_COUNT`` column holding the number of rows per applicant.
    """
    ins = pd.read_csv("../../input/installments_payments.csv",
                      nrows=num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category=nan_as_category)

    # Share of the installment that was paid, and the unpaid remainder.
    ins["PAYMENT_PERC"] = ins["AMT_PAYMENT"] / ins["AMT_INSTALMENT"]
    ins["PAYMENT_DIFF"] = ins["AMT_INSTALMENT"] - ins["AMT_PAYMENT"]

    # Days past due / days before due, floored at zero. Series.where keeps
    # a value only where the condition is true, so negatives AND missing
    # values become 0, exactly like the former row-wise
    # ``x if x > 0 else 0`` lambda, without a Python-level loop.
    days_late = ins["DAYS_ENTRY_PAYMENT"] - ins["DAYS_INSTALMENT"]
    days_early = ins["DAYS_INSTALMENT"] - ins["DAYS_ENTRY_PAYMENT"]
    ins["DPD"] = days_late.where(days_late > 0, 0)
    ins["DBD"] = days_early.where(days_early > 0, 0)

    aggregations = {
        "NUM_INSTALMENT_VERSION": ["nunique"],
        "DPD": ["max", "mean", "sum"],
        "DBD": ["max", "mean", "sum"],
        "PAYMENT_PERC": ["max", "mean", "sum", "var"],
        "PAYMENT_DIFF": ["max", "mean", "sum", "var"],
        "AMT_INSTALMENT": ["max", "mean", "sum"],
        "AMT_PAYMENT": ["min", "max", "mean", "sum"],
        "DAYS_ENTRY_PAYMENT": ["max", "mean", "sum"]
    }
    for cat in cat_cols:
        aggregations[cat] = ["mean"]

    ins_agg = ins.groupby("SK_ID_CURR").agg(aggregations)
    ins_agg.columns = pd.Index(["INSTAL_" + e[0] + "_" + e[1].upper()
                                for e in ins_agg.columns.tolist()])

    # Number of installment rows recorded for each applicant.
    ins_agg["INSTAL_COUNT"] = ins.groupby("SK_ID_CURR").size()
    del ins
    gc.collect()

    return ins_agg

Preprocess credit_card_balance.csv :
 - read and one-hot encode
 - General aggregations
 - Count credit card lines

In [None]:
def credit_card_balance(num_rows=None, nan_as_category=True):
    """Aggregate credit_card_balance.csv per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Forwarded to ``pd.read_csv`` as ``nrows``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. It used to be ignored in favour
        of a hard-coded ``True``; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Grouped by ``SK_ID_CURR``: min/max/mean/sum/var of every remaining
        column (``CC_`` prefix) plus ``CC_COUNT``, the number of rows per
        applicant.
    """
    cc = pd.read_csv("../../input/credit_card_balance.csv", nrows=num_rows)
    cc, _ = one_hot_encoder(cc, nan_as_category=nan_as_category)

    # SK_ID_PREV is dropped so that it is not aggregated as a feature.
    cc.drop(["SK_ID_PREV"], axis=1, inplace=True)
    cc_agg = cc.groupby("SK_ID_CURR").agg(["min", "max", "mean", "sum", "var"])
    cc_agg.columns = pd.Index(["CC_" + e[0] + "_" + e[1].upper()
                               for e in cc_agg.columns.tolist()])

    # Number of credit-card rows recorded for each applicant.
    cc_agg["CC_COUNT"] = cc.groupby("SK_ID_CURR").size()
    del cc
    gc.collect()

    return cc_agg

Preprocess the whole dataset :
 - Divide in training/validation and test data
 - imputation and feature scaling

In [None]:
def split_impute_scale(df_):
    """Split into train/test, then impute and min-max scale the features.

    Rows with a non-null ``TARGET`` form the training set and the others
    the test set. The median imputer and the scaler are fitted on the
    training features only and then applied unchanged to the test
    features, so both sets share one transformation.

    The columns ``TARGET``, ``SK_ID_CURR``, ``SK_ID_BUREAU``,
    ``SK_ID_PREV`` and ``index`` are passed through untouched when present.
    Scaling ``SK_ID_CURR`` would make the ids exported with the
    predictions meaningless.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Merged dataset containing a ``TARGET`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, both with a fresh 0..n-1 index.
        ``test_df`` has no ``TARGET`` column. Feature columns that are
        entirely missing in the training set are absent from both frames,
        because the imputer discards them.
    """
    passthrough = [col for col in ("TARGET", "SK_ID_CURR", "SK_ID_BUREAU",
                                   "SK_ID_PREV", "index")
                   if col in df_.columns]
    feature_cols = [col for col in df_.columns if col not in passthrough]

    has_target = df_["TARGET"].notnull()
    train_part = df_[has_target]
    test_part = df_[~has_target]

    imputer = SimpleImputer(strategy="median")
    scaler = MinMaxScaler(feature_range=(0, 1))

    train_x = train_part[feature_cols].replace([np.inf, -np.inf], np.nan)
    train_values = scaler.fit_transform(imputer.fit_transform(train_x))
    del train_x

    # SimpleImputer drops columns with no observed value at fit time; their
    # statistic is NaN. Keep only the names of the surviving columns so the
    # labels match the transformed array.
    kept_cols = [col for col, stat in zip(feature_cols, imputer.statistics_)
                 if not np.isnan(stat)]

    test_x = test_part[feature_cols].replace([np.inf, -np.inf], np.nan)
    if len(test_x):
        test_values = scaler.transform(imputer.transform(test_x))
    else:
        # Nothing to transform when every row carries a target.
        test_values = np.empty((0, len(kept_cols)))
    del test_x

    train_df = pd.DataFrame(train_values, columns=kept_cols)
    test_df = pd.DataFrame(test_values, columns=kept_cols)
    for col in passthrough:
        train_df[col] = train_part[col].to_numpy()
        if col != "TARGET":
            test_df[col] = test_part[col].to_numpy()

    # Restore the column order of the input frame.
    train_df = train_df[[c for c in df_.columns if c in train_df.columns]]
    test_df = test_df[[c for c in df_.columns if c in test_df.columns]]

    del train_part, test_part, train_values, test_values
    gc.collect()

    return train_df, test_df

<h4>III) Prédictions et représentations graphiques</h4>

Sampling of the training set for faster model hyperparameter tuning :

In [None]:
def sampling_data(train_df_):
    """Return a reproducible 10 % row sample of ``train_df_``.

    The sample is drawn with ``random_state=42`` and the returned frame
    is re-indexed from 0.
    """
    sampling_options = {
        "frac": 0.1,
        "random_state": 42,
        "axis": 0,
        "ignore_index": True,
    }

    return train_df_.sample(**sampling_options)

Tuning models hyperparameters on the sampled training set, for each model :
 - define the function and the score to be optimized,
 - apply bayesian optimization to fine-tune the hyperparameters

In [None]:
def tuning_log(C, train_df_):
    """Return the mean 10-fold ROC AUC of a logistic regression.

    Parameters
    ----------
    C : float
        Value passed to ``LogisticRegression`` as ``C``.
    train_df_ : pandas.DataFrame
        Training data with a ``TARGET`` column; identifier columns are
        excluded from the predictors.
    """
    excluded = ("TARGET", "SK_ID_CURR", "SK_ID_BUREAU", "SK_ID_PREV",
                "index")
    predictors = [name for name in train_df_.columns
                  if name not in excluded]

    model = LogisticRegression(C=C, solver="saga", max_iter=1000,
                               n_jobs=-1, random_state=42)
    fold_scores = cross_val_score(model, train_df_[predictors],
                                  train_df_["TARGET"], scoring="roc_auc",
                                  cv=10, n_jobs=-1)

    return fold_scores.mean()

In [None]:
def optimize_log(train_df_):
    """Search ``C`` for the logistic regression by Bayesian optimisation.

    The objective is ``tuning_log`` evaluated on ``train_df_``. The best
    result is appended to outputs.txt and returned.

    Returns
    -------
    dict
        ``optimizer.max`` as produced by ``BayesianOptimization``.
    """
    def log_crossval(C):
        return tuning_log(C=C, train_df_=train_df_)

    optimizer = BayesianOptimization(
        f=log_crossval,
        pbounds={"C": (0.0001, 10000)},
        random_state=42
    )
    optimizer.maximize()

    best = optimizer.max
    # Context manager so the log handle is released even if writing fails.
    with open("outputs.txt", "a") as file:
        print("Meilleur hyperparamètre :", best, file=file)

    return best

In [None]:
def tuning_rfc(n_estimators, min_samples_split, train_df_):
    """Return the mean 10-fold ROC AUC of a random forest.

    Parameters
    ----------
    n_estimators, min_samples_split : int
        Passed to ``RandomForestClassifier`` under the same names.
    train_df_ : pandas.DataFrame
        Training data with a ``TARGET`` column; identifier columns are
        excluded from the predictors.
    """
    excluded = ("TARGET", "SK_ID_CURR", "SK_ID_BUREAU", "SK_ID_PREV",
                "index")
    predictors = [name for name in train_df_.columns
                  if name not in excluded]

    forest_params = {
        "n_estimators": n_estimators,
        "min_samples_split": min_samples_split,
        "oob_score": True,
        "random_state": 50,
        "n_jobs": -1,
    }
    forest = RandomForestClassifier(**forest_params)

    fold_scores = cross_val_score(forest, train_df_[predictors],
                                  train_df_["TARGET"], scoring="roc_auc",
                                  cv=10, n_jobs=-1)

    return fold_scores.mean()

In [None]:
def optimize_rfc(train_df_):
    """Search random-forest hyperparameters by Bayesian optimisation.

    The optimiser proposes floats; they are truncated to ``int`` before
    being handed to ``tuning_rfc``. The best result is appended to
    outputs.txt and returned.

    Returns
    -------
    dict
        ``optimizer.max`` as produced by ``BayesianOptimization``.
    """
    def rfc_crossval(n_estimators, min_samples_split):
        return tuning_rfc(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            train_df_=train_df_
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (700, 900),
            "min_samples_split": (50, 100)
        },
        random_state=42
    )
    optimizer.maximize()

    best = optimizer.max
    # Context manager so the log handle is released even if writing fails.
    with open("outputs.txt", "a") as file:
        print("Meilleurs hyperparamètres :", best, file=file)

    return best

In [None]:
def tuning_lgbm(n_estimators, learning_rate, num_leaves, colsample_bytree,
                subsample, reg_alpha, reg_lambda, min_split_gain,
                min_child_weight, train_df_):
    """Return the mean 10-fold recall of a SMOTE + LightGBM pipeline.

    SMOTE is the first step of an imblearn pipeline, so resampling is
    performed inside each cross-validation fit.

    Parameters
    ----------
    n_estimators, learning_rate, num_leaves, colsample_bytree, subsample,
    reg_alpha, reg_lambda, min_split_gain, min_child_weight
        Passed to ``LGBMClassifier`` under the same names.
    train_df_ : pandas.DataFrame
        Training data with a ``TARGET`` column; identifier columns are
        excluded from the predictors.
    """
    feats = [f for f in train_df_.columns if f not in ["TARGET", "SK_ID_CURR",
             "SK_ID_BUREAU", "SK_ID_PREV", "index"]]

    estimator = make_pipeline(
        # SMOTE's ``n_jobs`` argument was deprecated in imbalanced-learn
        # 0.10 and later removed. The value used here (1) was the serial
        # default, so dropping it does not change the resampling.
        SMOTE(random_state=42),
        LGBMClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            min_split_gain=min_split_gain,
            min_child_weight=min_child_weight,
            n_jobs=-1
        )
    )

    cval = cross_val_score(estimator, train_df_[feats], train_df_["TARGET"],
                           scoring="recall", cv=10, n_jobs=-1)

    return cval.mean()

In [None]:
def optimize_lgbm(train_df_):
    """Search LightGBM hyperparameters by Bayesian optimisation.

    The optimiser proposes floats; ``n_estimators`` and ``num_leaves`` are
    truncated to ``int`` before being handed to ``tuning_lgbm``. The best
    result is appended to outputs.txt and returned.

    Returns
    -------
    dict
        ``optimizer.max`` as produced by ``BayesianOptimization``.
    """
    def lgbm_crossval(n_estimators, learning_rate, num_leaves,
                      colsample_bytree, subsample, reg_alpha,
                      reg_lambda, min_split_gain, min_child_weight):
        return tuning_lgbm(
            n_estimators=int(n_estimators),
            learning_rate=learning_rate,
            num_leaves=int(num_leaves),
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            min_split_gain=min_split_gain,
            min_child_weight=min_child_weight,
            train_df_=train_df_
        )

    optimizer = BayesianOptimization(
        f=lgbm_crossval,
        pbounds={
            "n_estimators": (100, 1000),
            "learning_rate": (0.001, 0.1),
            "num_leaves": (32, 128),
            "colsample_bytree": (0.01, 1),
            "subsample": (0.01, 1),
            "reg_alpha": (0.0001, 10000),
            "reg_lambda": (0.0001, 10000),
            "min_split_gain": (0, 0.1),
            "min_child_weight": (0.001, 1000)
        },
        random_state=42
    )
    optimizer.maximize()

    best = optimizer.max
    # Context manager so the log handle is released even if writing fails.
    with open("outputs.txt", "a") as file:
        print("Meilleurs hyperparamètres :", best, file=file)

    return best

Oversampling the "TARGET = 1" (minority) class with SMOTE, applied AFTER the cross-validation split, i.e. on the training part of each fold only :

In [None]:
def smote_oversampling(x, y):
    """Oversample the minority class of ``(x, y)`` with SMOTE.

    The class counts before and after resampling are appended to
    outputs.txt.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Class labels.

    Returns
    -------
    tuple
        ``(x_smote, y_smote)`` as returned by ``SMOTE.fit_resample``.
    """
    # Imported locally so this function stays self-contained.
    from sklearn.neighbors import NearestNeighbors

    # SMOTE's own ``n_jobs`` argument was deprecated in imbalanced-learn
    # 0.10 and later removed. Parallelism is configured on the neighbours
    # estimator instead. n_neighbors=6 matches SMOTE's default
    # k_neighbors=5, because each sample is returned as its own nearest
    # neighbour.
    smote = SMOTE(
        sampling_strategy="minority",
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6, n_jobs=-1),
    )
    x_smote, y_smote = smote.fit_resample(x, y)
    with open("outputs.txt", "a") as file:
        print("Original fold shape :", Counter(y), file=file)
        print("Resampled fold shape :", Counter(y_smote), file=file)
    gc.collect()

    return x_smote, y_smote

In [None]:
def display_importances(feature_importance_df_, neg_values=False):
    """Plot the 40 highest-ranked features as horizontal bars.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        One row per (feature, fold) with ``feature`` and ``importance``
        columns, plus ``abs_importance`` when ``neg_values`` is true.
    neg_values : bool, default False
        Rank features by mean ``abs_importance`` instead of mean
        ``importance``. The bars always show the mean ``importance``, with
        its standard deviation as the error bar.
    """
    ranking_col = "abs_importance" if neg_values else "importance"

    mean_by_feature = (feature_importance_df_[["feature", ranking_col]]
                       .groupby("feature")
                       .mean())
    top_features = (mean_by_feature
                    .sort_values(by=ranking_col, ascending=False)[:40]
                    .index)
    selected_rows = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(top_features)
    ]
    ordered = selected_rows.sort_values(by=ranking_col, ascending=True)

    fig = go.Figure()
    colours = cycle(px.colors.sequential.Turbo_r)
    for feature_name in ordered["feature"].unique():
        values = ordered.loc[ordered["feature"] == feature_name,
                             "importance"]
        bar = go.Bar(
            x=[values.mean()],
            y=[feature_name],
            orientation="h",
            error_x=dict(type="constant", value=values.std()),
            marker_color=next(colours)
        )
        fig.add_trace(bar)

    fig.update_layout(
        height=1200,
        width=1200,
        template="simple_white",
        showlegend=False,
        title_text="Feature Importance (avg over folds)"
    )

    fig.show()

In [None]:
def lime_features(train_, labels_, feats_, test_, clf_):
    """Show a LIME explanation for the row at position 5 of ``test_``.

    Parameters
    ----------
    train_ : pandas.DataFrame
        Data handed to the explainer as training data.
    labels_ : pandas.Series
        Training labels; its unique values are used as class names.
    feats_ : list
        Feature names.
    test_ : pandas.DataFrame
        Data from which the explained row is taken.
    clf_ : callable
        Prediction function passed to ``explain_instance``.
    """
    class_labels = list(labels_.unique())
    lime_explainer = LimeTabularExplainer(
        training_data=train_.values,
        mode="classification",
        training_labels=labels_,
        feature_names=feats_,
        class_names=class_labels,
        random_state=42
    )
    explained_row = test_.values[5]
    lime_explainer.explain_instance(explained_row, clf_).show_in_notebook()

In [None]:
def shap_features(clf_, train_):
    """Draw the SHAP summary plot of ``clf_`` computed on ``train_``."""
    tree_explainer = shap.TreeExplainer(clf_)
    shap.summary_plot(tree_explainer.shap_values(train_), train_)

Display classification metrics :

In [None]:
def class_metrics(train_df_, preds_, oof_preds_, model):
    """Display a 2x2 grid of classification diagnostics.

    Panels:
      (1, 1) precision, recall and F-beta of ``preds_`` for beta = 0..15;
      (1, 2) precision-recall curve of ``oof_preds_`` with its average
             precision;
      (2, 1) F-beta of the thresholded ``oof_preds_`` for beta = 0..5 over
             101 thresholds in [0, 1];
      (2, 2) precision, recall and F1 of the thresholded ``oof_preds_``
             over the same thresholds.

    Parameters
    ----------
    train_df_ : pandas.DataFrame
        Frame whose ``TARGET`` column holds the true labels.
    preds_ : array-like
        Predicted class labels.
    oof_preds_ : array-like
        Predicted scores for the positive class.
    model : str
        Label used in the legend of the precision-recall curve.
    """
    y_true = train_df_["TARGET"]
    scores = np.asarray(oof_preds_)

    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=(
            "F-beta score as f(beta)",
            "Precision-recall curve",
            "F-beta score as f(beta, threshold)",
            "Metrics as f(threshold)"
        )
    )

    def add_line(row, col, x, y, group, name, color=None, group_title=None):
        # Build one line trace and place it in the requested subplot.
        # Optional attributes are only set when provided.
        trace_kwargs = dict(x=x, y=y, mode="lines", legendgroup=group,
                            name=name)
        if group_title is not None:
            trace_kwargs["legendgrouptitle_text"] = group_title
        if color is not None:
            trace_kwargs["line"] = dict(color=color)
        fig.append_trace(go.Scatter(**trace_kwargs), row=row, col=col)

    def adjusted_metrics(t):
        # Vectorised thresholding: 1 where the score reaches ``t``, else 0.
        # This replaces a per-element Python loop that ran once for every
        # (beta, threshold) pair.
        return (scores >= t).astype(int)

    # --- Panel (1, 1): metrics of the hard predictions versus beta ------
    betas = list(range(0, 16))
    pre_model = []
    rec_model = []
    fbeta_model = []
    for beta in betas:
        precision, recall, fbeta, _ = precision_recall_fscore_support(
            y_true,
            preds_,
            beta=beta,
            average="binary",
            zero_division=0
        )
        pre_model.append(precision)
        rec_model.append(recall)
        fbeta_model.append(fbeta)

    add_line(1, 1, betas, pre_model, "group1", "Precision", color="blue",
             group_title="F-beta score as f(beta)")
    add_line(1, 1, betas, rec_model, "group1", "Recall", color="red")
    add_line(1, 1, betas, fbeta_model, "group1", "F-beta score",
             color="green")

    # --- Panel (1, 2): precision-recall curve ---------------------------
    prcurve_y, prcurve_x, _ = precision_recall_curve(
        y_true,
        oof_preds_,
        pos_label=1
    )
    ap_score = average_precision_score(y_true, oof_preds_)
    ap_score = float(f"{ap_score:.2f}")
    add_line(1, 2, prcurve_x, prcurve_y, "group2",
             "{} (AP = {})".format(model, ap_score),
             group_title="Precision-recall curve")

    # --- Panel (2, 1): F-beta versus threshold, one line per beta -------
    thresholds = np.linspace(0, 1, 101)
    colors = ["orchid", "royalblue", "limegreen", "gold", "orangered",
              "darkred"]
    for beta, color in enumerate(colors):
        fbeta_range = [
            fbeta_score(y_true, adjusted_metrics(t), beta=beta,
                        zero_division=0)
            for t in thresholds
        ]
        add_line(2, 1, thresholds, fbeta_range, "group3",
                 "beta = %2d" % beta, color=color,
                 group_title="F-beta score as f(beta, threshold)")

    # --- Panel (2, 2): precision / recall / F1 versus threshold ---------
    pre_adj = []
    rec_adj = []
    fbeta_adj = []
    for t in thresholds:
        precision, recall, fbeta, _ = precision_recall_fscore_support(
            y_true,
            adjusted_metrics(t),
            beta=1,
            average="binary",
            zero_division=0
        )
        pre_adj.append(precision)
        rec_adj.append(recall)
        fbeta_adj.append(fbeta)

    add_line(2, 2, thresholds, pre_adj, "group4", "Adjusted Precision",
             color="blue", group_title="Metrics as f(threshold)")
    add_line(2, 2, thresholds, rec_adj, "group4", "Adjusted Recall",
             color="red")
    add_line(2, 2, thresholds, fbeta_adj, "group4", "Adjusted F1 score",
             color="green")

    fig.update_layout(height=800, width=1200, template="simple_white",
                      showlegend=True)

    fig.show()

LightGBM, Random Forests, Logistic Regression and Dummy classifiers, with K-Folds or Stratified K-Folds, SMOTE-balanced or not :
 - print used model
 - Cross validation model
 - Create arrays and dataframes to store results
 - call xs and ys folds,
 - oversampling model,
 - fit and predict
 - create dataframe for features importance
 - print fold and full AUC scores
 - Write submission file and plot feature importance

In [None]:
_KFOLD_MODELS = ("LGBM", "RFC", "LOG", "DUMMY")


def _build_classifier(model):
    """Return a new, unfitted classifier for one of the supported keys."""
    if model == "LGBM":
        return LGBMClassifier(
            objective="binary",
            n_estimators=int(291.10519961044855),
            learning_rate=0.0030378649352844423,
            num_leaves=int(49.45519685188166),
            colsample_bytree=0.710991852018085,
            subsample=0.5295088673159155,
            reg_alpha=1834.0451801938873,
            reg_lambda=3042.4224991711535,
            min_split_gain=0.08324426408004218,
            min_child_weight=969.9098822521422,
            n_jobs=-1
        )
    if model == "RFC":
        return RandomForestClassifier(
            n_estimators=600,
            min_samples_split=25,
            oob_score=True,
            random_state=50,
            n_jobs=-1
        )
    if model == "LOG":
        return LogisticRegression(
            C=0.3807947176588889,
            solver="saga",
            max_iter=1000,
            n_jobs=-1,
            random_state=42
        )
    if model == "DUMMY":
        return DummyClassifier(strategy="uniform", random_state=42)
    raise ValueError("Unknown model {!r}; expected one of {}"
                     .format(model, _KFOLD_MODELS))


def _fold_importances(clf, model, feats, n_fold):
    """Return the per-feature importances of a fitted fold classifier."""
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    if model == "LOG":
        fold_importance_df["importance"] = clf.coef_[0]
        fold_importance_df["abs_importance"] = np.absolute(clf.coef_[0])
    elif model == "DUMMY":
        # The dummy classifier exposes no importances. Use numeric zeros:
        # np.zeros_like on the list of feature names would produce empty
        # strings.
        fold_importance_df["importance"] = np.zeros(len(feats))
    else:
        fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    return fold_importance_df


def kfold_model(train_df, test_df, num_folds, model, filename, over=False,
                strat=False, debug=False):
    """Cross-validate one classifier, log its scores and plot diagnostics.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data with a ``TARGET`` column.
    test_df : pandas.DataFrame
        Data to score. A ``TARGET`` column holding the fold-averaged
        predicted probabilities is added to it in place unless ``debug``
        is true.
    num_folds : int
        Number of cross-validation folds.
    model : {"LGBM", "RFC", "LOG", "DUMMY"}
        Classifier to train.
    filename : str
        Path of the CSV written with ``SK_ID_CURR`` and ``TARGET`` unless
        ``debug`` is true.
    over : bool, default False
        Oversample each training fold with ``smote_oversampling``.
    strat : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool, default False
        Skip writing the prediction file.

    Returns
    -------
    pandas.DataFrame
        Feature importances of every fold, stacked.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported keys.
    """
    # Fail fast instead of silently running folds without any classifier.
    if model not in _KFOLD_MODELS:
        raise ValueError("Unknown model {!r}; expected one of {}"
                         .format(model, _KFOLD_MODELS))

    with open("outputs.txt", "a") as file:
        print("----------", file=file)
        print("Starting {}.".format(model), file=file)

    splitter = StratifiedKFold if strat else KFold
    folds = splitter(n_splits=num_folds, shuffle=True, random_state=1001)

    preds = np.zeros(train_df.shape[0])      # hard labels, out of fold
    oof_preds = np.zeros(train_df.shape[0])  # probabilities, out of fold
    sub_preds = np.zeros(test_df.shape[0])   # fold-averaged test scores
    fold_importances = []
    feats = [f for f in train_df.columns if f not in ["TARGET", "SK_ID_CURR",
             "SK_ID_BUREAU", "SK_ID_PREV", "index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(
        folds.split(train_df[feats], train_df["TARGET"])
    ):
        train_x, train_y = (train_df[feats].iloc[train_idx],
                            train_df["TARGET"].iloc[train_idx])
        valid_x, valid_y = (train_df[feats].iloc[valid_idx],
                            train_df["TARGET"].iloc[valid_idx])

        # Only the training part of the fold is ever resampled.
        if over:
            over_x, over_y = smote_oversampling(train_x, train_y)
        else:
            over_x, over_y = train_x, train_y

        clf = _build_classifier(model)
        if model == "LGBM":
            clf.fit(
                over_x,
                over_y,
                eval_set=[(over_x, over_y), (valid_x, valid_y)],
                eval_metric="auc",
                callbacks=[
                    log_evaluation(period=200),
                    early_stopping(stopping_rounds=200)
                ],
            )
            # Predict with the iteration selected by early stopping.
            predict_kwargs = {"num_iteration": clf.best_iteration_}
        else:
            clf.fit(over_x, over_y)
            predict_kwargs = {}

        preds[valid_idx] = clf.predict(valid_x, **predict_kwargs)
        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, **predict_kwargs
        )[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats], **predict_kwargs
        )[:, 1] / folds.n_splits
        fold_importances.append(_fold_importances(clf, model, feats, n_fold))

        fold_precision, fold_recall, fold_fbetascore, _ = (
            precision_recall_fscore_support(valid_y, preds[valid_idx], beta=1,
                                            average="binary", zero_division=0)
        )
        # One short-lived handle per fold, always closed.
        with open("outputs.txt", "a") as file:
            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(valid_y,
                  oof_preds[valid_idx])), file=file)
            print(" - precision :", fold_precision, file=file)
            print(" - recall :", fold_recall, file=file)
            print(" - F1 score :", fold_fbetascore, file=file)

    with open("outputs.txt", "a") as file:
        print("Full AUC score %.6f" % roc_auc_score(train_df["TARGET"],
              oof_preds), file=file)

    # Concatenate once instead of growing a frame fold by fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    class_metrics(train_df, preds, oof_preds, model)

    if not debug:
        test_df["TARGET"] = sub_preds
        test_df[["SK_ID_CURR", "TARGET"]].to_csv(filename, index=False)

    if model == "LOG":
        display_importances(feature_importance_df, neg_values=True)
    elif model == "RFC" or model == "LGBM":
        display_importances(feature_importance_df, neg_values=False)
        # SHAP and LIME use the classifier and training data of the last
        # fold only.
        shap_features(clf, over_x)
        lime_features(over_x, over_y, feats, test_df[feats], clf.predict_proba)

    return feature_importance_df

<h4>IV) Exécution du code</h4>

In [None]:
def _log_output(*values):
    """Append one line, built from *values* as ``print`` would, to outputs.txt."""
    # The context manager closes the handle even if printing raises, which the
    # previous manual open()/close() pairs did not guarantee.
    with open("outputs.txt", "a") as log_file:
        print(*values, file=log_file)


def main(debug=False, models=("LGBM",)):
    """Build the merged feature table, then train the selected model families.

    The application table is left-joined on ``SK_ID_CURR`` with each
    aggregated auxiliary table, split/imputed/scaled, and finally every
    requested model is run through ``kfold_model`` in four configurations
    (plain or stratified folds, with ``over`` off or on). Progress lines are
    appended to ``outputs.txt``.

    Args:
        debug: When true, ``num_rows=10000`` is passed to every loader instead
            of ``None``; the flag is also forwarded to ``kfold_model``.
        models: Model codes to run, in order. Accepted codes are ``"DUMMY"``,
            ``"LOG"``, ``"RFC"`` and ``"LGBM"``. A single string is treated
            as a one-element sequence. The default runs LightGBM only.

    Raises:
        ValueError: If ``models`` contains a code that is not listed above.
    """
    if isinstance(models, str):
        # Guard against main(models="LGBM") being iterated character-wise.
        models = (models,)

    # Model code -> (label used in the timer title, submission-file prefix).
    model_labels = {
        "DUMMY": ("dummies", "dum"),
        "LOG": ("logistic regression", "log"),
        "RFC": ("Random Forests", "rfc"),
        "LGBM": ("LightGBM", "lgbm"),
    }
    unknown = [code for code in models if code not in model_labels]
    if unknown:
        # Fail before the expensive data preparation starts.
        raise ValueError(
            "Unknown model code(s) {}; expected any of {}".format(
                unknown, sorted(model_labels)
            )
        )

    # (fold description, balance description, file suffix, over, strat)
    cv_variants = (
        ("k-folds", "unbalanced", "kf", False, False),
        ("k-folds", "balanced", "kf_bal", True, False),
        ("stratified k-folds", "unbalanced", "strat", False, True),
        ("stratified k-folds", "balanced", "strat_bal", True, True),
    )

    # (timer title, loader, log label) for each auxiliary table, in join order.
    aggregation_steps = (
        ("Process bureau and bureau_balance",
         bureau_and_balance, "Bureau df shape:"),
        ("Process previous_applications",
         previous_applications, "Previous applications df shape:"),
        ("Process POS-CASH balance",
         pos_cash, "Pos-cash balance df shape:"),
        ("Process installments payments",
         installments_payments, "Installments payments df shape:"),
        ("Process credit card balance",
         credit_card_balance, "Credit card balance df shape:"),
    )

    num_rows = 10000 if debug else None
    df = application_train_test(num_rows)

    for title, loader, label in aggregation_steps:
        with timer(title):
            extra = loader(num_rows)
            _log_output(label, extra.shape)
            df = df.join(extra, how="left", on="SK_ID_CURR")
            # Release the auxiliary table before loading the next one.
            del extra
            gc.collect()

    # Strip every character outside [A-Za-z0-9_] from the column names
    # (presumably because LightGBM rejects special characters in feature
    # names -- TODO confirm).
    df = df.rename(columns=lambda name: re.sub("[^A-Za-z0-9_]+", "", name))

    with timer("Split, impute and scale data"):
        train_df, test_df = split_impute_scale(df)
        _log_output("Train shape: {}, test shape: {}"
                    .format(train_df.shape, test_df.shape))

    # NOTE: the optional tuning steps stay disabled, as before:
    #   sample_df = sampling_data(train_df)
    #   best_c = optimize_log(sample_df)
    #   best_n = optimize_rfc(sample_df)
    #   best_params = optimize_lgbm(train_df)

    for code in models:
        label, prefix = model_labels[code]
        for fold_desc, balance_desc, suffix, over, strat in cv_variants:
            with timer("Run {} with {}, {}".format(label, fold_desc,
                                                   balance_desc)):
                # The returned feature-importance frame is not used here.
                kfold_model(train_df, test_df, 10, code,
                            "{}_{}.csv".format(prefix, suffix),
                            over=over, strat=strat, debug=debug)

In [None]:
if __name__ == "__main__":
    # Run the full pipeline only when this module is the entry point; the
    # surrounding timer appends the total elapsed time to outputs.txt once
    # main() returns.
    with timer("Full model run"):
        main()