<h2>Implémentez un modèle de scoring</h2>

<h3>Première partie : modélisation</h3>

<h4>I) Préparation de l'environnement de travail</h4>

In [None]:
import gc
import re
import time
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
from contextlib import contextmanager

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook"
import plotly.offline as pyo
pyo.init_notebook_mode()
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Start every run from an empty log: opening in "w" mode truncates the file.
with open("outputs.txt", "w"):
    pass

In [None]:
@contextmanager
def timer(title):
    """Append the wall-clock duration of the wrapped block to outputs.txt.

    Parameters
    ----------
    title : str
        Label written in front of the elapsed time.

    Notes
    -----
    The line is only written when the wrapped block finishes without
    raising; an exception propagates to the caller and nothing is logged.
    """
    t0 = time.time()
    yield
    elapsed = time.time() - t0
    # The context manager guarantees the handle is released even if the
    # write itself fails.
    with open("outputs.txt", "a") as file:
        print("{} - done in {:.0f} s".format(title, elapsed), file=file)

<h4>II) Importation et prétraitement des données</h4>

One-hot encoding for categorical columns with get_dummies

In [None]:
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is not modified.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``: when True an extra
        indicator column is created for missing values.

    Returns
    -------
    tuple
        ``(encoded_frame, added_columns)`` where ``added_columns`` lists the
        column names present in the encoded frame but not in the input.
    """
    columns_before = list(df.columns)

    object_columns = []
    for name in columns_before:
        if df[name].dtype == "object":
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns,
                             dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns
                     if name not in columns_before]

    return encoded, added_columns

Preprocess application_train.csv and application_test.csv :
 - Read data and merge
 - Optional: Remove 4 applications with XNA CODE_GENDER (train set)
 - Categorical features with Binary encode (0 or 1; two categories)
 - Categorical features with One-Hot encode
 - NaN values for DAYS_EMPLOYED: the sentinel 365243 -> NaN
 - Some simple new features (percentages)

In [None]:
def application_train_test(num_rows=None, nan_as_category=False):
    """Load, merge and pre-process the application train and test tables.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed as ``nrows`` to both ``read_csv`` calls; None reads every row.
    nan_as_category : bool, default False
        Forwarded to ``one_hot_encoder``.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, one-hot encoded, with five ratio
        features added. ``reset_index()`` keeps the pre-merge row labels in
        an ``index`` column.
    """
    df = pd.read_csv("../../input/application_train.csv", nrows=num_rows)
    test_df = pd.read_csv("../../input/application_test.csv", nrows=num_rows)
    with open("outputs.txt", "a") as file:
        print("Train samples: {}, test samples: {}".format(len(df),
                                                           len(test_df)),
              file=file)

    # DataFrame.append no longer exists in pandas 2.x; concat is the
    # supported way to stack the two frames.
    df = pd.concat([df, test_df]).reset_index()
    del test_df
    gc.collect()

    # Explicit copy so the column assignments below write to an independent
    # frame rather than to a filtered view of the merged data.
    df = df[df["CODE_GENDER"] != "XNA"].copy()

    # Two-level categoricals become integer codes instead of dummy columns.
    for bin_feature in ["CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY"]:
        df[bin_feature], _ = pd.factorize(df[bin_feature])

    df, _ = one_hot_encoder(df, nan_as_category)

    # 365243 is replaced by NaN. The result is assigned back because an
    # in-place replace on a column selection may not reach the parent frame.
    df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace(365243, np.nan)

    df["DAYS_EMPLOYED_PERC"] = df["DAYS_EMPLOYED"] / df["DAYS_BIRTH"]
    df["INCOME_CREDIT_PERC"] = df["AMT_INCOME_TOTAL"] / df["AMT_CREDIT"]
    df["INCOME_PER_PERSON"] = df["AMT_INCOME_TOTAL"] / df["CNT_FAM_MEMBERS"]
    df["ANNUITY_INCOME_PERC"] = df["AMT_ANNUITY"] / df["AMT_INCOME_TOTAL"]
    df["PAYMENT_RATE"] = df["AMT_ANNUITY"] / df["AMT_CREDIT"]

    return df

Preprocess bureau.csv and bureau_balance.csv :
 - read and one-hot encode
 - Bureau balance: Perform aggregations and merge with bureau.csv
 - Bureau and bureau_balance numeric features
 - Bureau and bureau_balance categorical features
 - new dataframe with aggregations ?
 - Bureau: Active credits - using only numerical aggregations
 - Bureau: Closed credits - using only numerical aggregations

In [None]:
def bureau_and_balance(num_rows=None, nan_as_category=True):
    """Aggregate bureau.csv and bureau_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed as ``nrows`` to both ``read_csv`` calls.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder`` for both tables.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR, with ``BURO_`` columns computed over all
        bureau rows and ``ACTIVE_`` / ``CLOSED_`` columns computed over the
        rows flagged ``CREDIT_ACTIVE_Active`` / ``CREDIT_ACTIVE_Closed``.
    """
    def _flat_names(frame, prefix=""):
        # Turn (column, statistic) pairs into PREFIX + COLUMN + "_" + STAT.
        return pd.Index([prefix + column + "_" + stat.upper()
                         for column, stat in frame.columns.tolist()])

    bureau = pd.read_csv("../../input/bureau.csv", nrows=num_rows)
    bb = pd.read_csv("../../input/bureau_balance.csv", nrows=num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Monthly balance rows are first reduced to one row per bureau credit.
    bb_aggregations = {"MONTHS_BALANCE": ["min", "max", "size"]}
    bb_aggregations.update({col: ["mean"] for col in bb_cat})
    bb_agg = bb.groupby("SK_ID_BUREAU").agg(bb_aggregations)
    bb_agg.columns = _flat_names(bb_agg)
    bureau = (bureau.join(bb_agg, how="left", on="SK_ID_BUREAU")
                    .drop(["SK_ID_BUREAU"], axis=1))
    del bb, bb_agg
    gc.collect()

    num_aggregations = {
        "DAYS_CREDIT": ["min", "max", "mean", "var"],
        "DAYS_CREDIT_ENDDATE": ["min", "max", "mean"],
        "DAYS_CREDIT_UPDATE": ["mean"],
        "CREDIT_DAY_OVERDUE": ["max", "mean"],
        "AMT_CREDIT_MAX_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_DEBT": ["max", "mean", "sum"],
        "AMT_CREDIT_SUM_OVERDUE": ["mean"],
        "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
        "AMT_ANNUITY": ["max", "mean"],
        "CNT_CREDIT_PROLONG": ["sum"],
        "MONTHS_BALANCE_MIN": ["min"],
        "MONTHS_BALANCE_MAX": ["max"],
        "MONTHS_BALANCE_SIZE": ["mean", "sum"]
    }

    # Dummy columns from both tables are averaged per client; the balance
    # dummies already carry the "_MEAN" suffix from the first aggregation.
    cat_aggregations = {cat: ["mean"] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ["mean"] for cat in bb_cat})

    per_client_spec = dict(num_aggregations)
    per_client_spec.update(cat_aggregations)
    bureau_agg = bureau.groupby("SK_ID_CURR").agg(per_client_spec)
    bureau_agg.columns = _flat_names(bureau_agg, "BURO_")

    # Same numeric aggregations, restricted to active and then closed credits.
    for flag_column, prefix in (("CREDIT_ACTIVE_Active", "ACTIVE_"),
                                ("CREDIT_ACTIVE_Closed", "CLOSED_")):
        subset = bureau[bureau[flag_column] == 1]
        subset_agg = subset.groupby("SK_ID_CURR").agg(num_aggregations)
        subset_agg.columns = _flat_names(subset_agg, prefix)
        bureau_agg = bureau_agg.join(subset_agg, how="left", on="SK_ID_CURR")
        del subset, subset_agg
        gc.collect()

    del bureau
    gc.collect()

    return bureau_agg

Preprocess previous_application.csv :
 - read and one-hot encode
 - Days columns: the sentinel 365243 -> NaN
 - Add feature: value ask / value received percentage
 - Previous applications numeric features
 - Previous applications categorical features
 - new dataframe with aggregations ?
 - Previous Applications: Approved Applications - only numerical features
 - Previous Applications: Refused Applications - only numerical features

In [None]:
def previous_applications(num_rows=None, nan_as_category=True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed as ``nrows`` to ``read_csv``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. The argument used to be ignored in
        favour of a hard-coded True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR, with ``PREV_`` columns over all previous
        applications and ``APPROVED_`` / ``REFUSED_`` numeric aggregations
        over the rows flagged ``NAME_CONTRACT_STATUS_Approved`` /
        ``NAME_CONTRACT_STATUS_Refused``.
    """
    prev = pd.read_csv("../../input/previous_application.csv", nrows=num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)

    # 365243 is replaced by NaN in the day-offset columns. The result is
    # assigned back because an in-place replace on a column selection may
    # not reach the parent frame.
    day_columns = [
        "DAYS_FIRST_DRAWING",
        "DAYS_FIRST_DUE",
        "DAYS_LAST_DUE_1ST_VERSION",
        "DAYS_LAST_DUE",
        "DAYS_TERMINATION",
    ]
    prev[day_columns] = prev[day_columns].replace(365243, np.nan)

    # Ratio of the amount applied for to the credit amount.
    prev["APP_CREDIT_PERC"] = prev["AMT_APPLICATION"] / prev["AMT_CREDIT"]

    num_aggregations = {
        "AMT_ANNUITY": ["min", "max", "mean"],
        "AMT_APPLICATION": ["min", "max", "mean"],
        "AMT_CREDIT": ["min", "max", "mean"],
        "APP_CREDIT_PERC": ["min", "max", "mean", "var"],
        "AMT_DOWN_PAYMENT": ["min", "max", "mean"],
        "AMT_GOODS_PRICE": ["min", "max", "mean"],
        "HOUR_APPR_PROCESS_START": ["min", "max", "mean"],
        "RATE_DOWN_PAYMENT": ["min", "max", "mean"],
        "DAYS_DECISION": ["min", "max", "mean"],
        "CNT_PAYMENT": ["mean", "sum"],
    }
    cat_aggregations = {cat: ["mean"] for cat in cat_cols}

    prev_agg = prev.groupby("SK_ID_CURR").agg({**num_aggregations,
                                               **cat_aggregations})
    prev_agg.columns = pd.Index(["PREV_" + e[0] + "_" + e[1].upper()
                                 for e in prev_agg.columns.tolist()])

    # Numeric aggregations again, for approved and then refused applications.
    for flag_column, prefix in (("NAME_CONTRACT_STATUS_Approved", "APPROVED_"),
                                ("NAME_CONTRACT_STATUS_Refused", "REFUSED_")):
        subset = prev[prev[flag_column] == 1]
        subset_agg = subset.groupby("SK_ID_CURR").agg(num_aggregations)
        subset_agg.columns = pd.Index([prefix + e[0] + "_" + e[1].upper()
                                       for e in subset_agg.columns.tolist()])
        prev_agg = prev_agg.join(subset_agg, how="left", on="SK_ID_CURR")
        del subset, subset_agg

    del prev
    gc.collect()

    return prev_agg

Preprocess POS_CASH_balance.csv :
 - read and one-hot encode
 - Features
 - new dataframe with aggregations ?
 - Count pos cash accounts

In [None]:
def pos_cash(num_rows=None, nan_as_category=True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed as ``nrows`` to ``read_csv``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. The argument used to be ignored in
        favour of a hard-coded True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR, with ``POS_``-prefixed aggregates and a
        ``POS_COUNT`` column holding the number of rows per client.
    """
    pos = pd.read_csv("../../input/POS_CASH_balance.csv", nrows=num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)

    aggregations = {
        "MONTHS_BALANCE": ["max", "mean", "size"],
        "SK_DPD": ["max", "mean"],
        "SK_DPD_DEF": ["max", "mean"]
    }
    # Each dummy column is averaged per client.
    aggregations.update({cat: ["mean"] for cat in cat_cols})

    by_client = pos.groupby("SK_ID_CURR")
    pos_agg = by_client.agg(aggregations)
    pos_agg.columns = pd.Index(["POS_" + e[0] + "_" + e[1].upper()
                                for e in pos_agg.columns.tolist()])

    pos_agg["POS_COUNT"] = by_client.size()
    del by_client, pos
    gc.collect()

    return pos_agg

Preprocess installments_payments.csv :
 - read and one-hot encode
 - Percentage and difference paid in each installment (amount paid and installment value)
 - Days past due and days before due (no negative values)
 - Features: Perform aggregations
 - new dataframe with aggregations ?
 - Count installments accounts

In [None]:
def installments_payments(num_rows=None, nan_as_category=True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed as ``nrows`` to ``read_csv``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. The argument used to be ignored in
        favour of a hard-coded True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR, with ``INSTAL_``-prefixed aggregates and an
        ``INSTAL_COUNT`` column holding the number of rows per client.
    """
    ins = pd.read_csv("../../input/installments_payments.csv",
                      nrows=num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)

    ins["PAYMENT_PERC"] = ins["AMT_PAYMENT"] / ins["AMT_INSTALMENT"]
    ins["PAYMENT_DIFF"] = ins["AMT_INSTALMENT"] - ins["AMT_PAYMENT"]

    # DPD = days past due, DBD = days before due. Anything that is not
    # strictly positive (including a missing difference) becomes 0, which is
    # what the former row-wise lambda produced, without the Python-level loop.
    days_past_due = ins["DAYS_ENTRY_PAYMENT"] - ins["DAYS_INSTALMENT"]
    days_before_due = ins["DAYS_INSTALMENT"] - ins["DAYS_ENTRY_PAYMENT"]
    ins["DPD"] = days_past_due.where(days_past_due > 0, 0)
    ins["DBD"] = days_before_due.where(days_before_due > 0, 0)

    aggregations = {
        "NUM_INSTALMENT_VERSION": ["nunique"],
        "DPD": ["max", "mean", "sum"],
        "DBD": ["max", "mean", "sum"],
        "PAYMENT_PERC": ["max", "mean", "sum", "var"],
        "PAYMENT_DIFF": ["max", "mean", "sum", "var"],
        "AMT_INSTALMENT": ["max", "mean", "sum"],
        "AMT_PAYMENT": ["min", "max", "mean", "sum"],
        "DAYS_ENTRY_PAYMENT": ["max", "mean", "sum"]
    }
    # Each dummy column (if any) is averaged per client.
    aggregations.update({cat: ["mean"] for cat in cat_cols})

    by_client = ins.groupby("SK_ID_CURR")
    ins_agg = by_client.agg(aggregations)
    ins_agg.columns = pd.Index(["INSTAL_" + e[0] + "_" + e[1].upper()
                                for e in ins_agg.columns.tolist()])

    ins_agg["INSTAL_COUNT"] = by_client.size()
    del by_client, ins
    gc.collect()

    return ins_agg

Preprocess credit_card_balance.csv :
 - read and one-hot encode
 - General aggregations
 - Count credit card lines

In [None]:
def credit_card_balance(num_rows=None, nan_as_category=True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : int or None, default None
        Passed as ``nrows`` to ``read_csv``.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoder``. The argument used to be ignored in
        favour of a hard-coded True; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by SK_ID_CURR, with min/max/mean/sum/var of every remaining
        column under a ``CC_`` prefix, plus a ``CC_COUNT`` column holding the
        number of rows per client.
    """
    cc = pd.read_csv("../../input/credit_card_balance.csv", nrows=num_rows)
    cc, _ = one_hot_encoder(cc, nan_as_category)

    # SK_ID_PREV is an identifier, so it is removed before every remaining
    # column is aggregated.
    cc = cc.drop(["SK_ID_PREV"], axis=1)
    by_client = cc.groupby("SK_ID_CURR")
    cc_agg = by_client.agg(["min", "max", "mean", "sum", "var"])
    cc_agg.columns = pd.Index(["CC_" + e[0] + "_" + e[1].upper()
                               for e in cc_agg.columns.tolist()])

    cc_agg["CC_COUNT"] = by_client.size()
    del by_client, cc
    gc.collect()

    return cc_agg

<h4>III) Prédictions et représentations graphiques</h4>

Display/plot feature importance

In [None]:
def display_importances(feature_importance_df_,
                        title="LightGBM Features (avg over folds)",
                        image_path="lgbm_importances01.png"):
    """Save a bar chart of the 40 features with the highest mean importance.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain ``feature`` and ``importance`` columns, typically one
        row per (feature, fold).
    title : str, optional
        Figure title; the default is the previously hard-coded title.
    image_path : str, optional
        Destination of the PNG; the default is the previously hard-coded
        file name.
    """
    # Average over folds first so each feature gets exactly one bar, as the
    # title announces; plotting the raw per-fold rows would put several bars
    # on the same category.
    mean_importance = (feature_importance_df_[["feature", "importance"]]
                       .groupby("feature")["importance"].mean()
                       .sort_values(ascending=False)
                       .head(40))

    fig = go.Figure()
    # orientation="h" is required for go.Bar to read x as bar length and y
    # as the category axis.
    fig.add_bar(x=mean_importance.values, y=mean_importance.index,
                orientation="h")
    fig.update_layout(
        height=800,
        width=1000,
        template="simple_white",
        showlegend=False,
        title_text=title,
        # Put the most important feature at the top of the chart.
        yaxis={"autorange": "reversed"}
    )
    fig.write_image(image_path, engine="kaleido")

LightGBM, Random Forests, Logistic Regression and Dummy classifiers, with K-Folds or Stratified K-Folds :
 - Divide in training/validation and test data
 - (LOG and RFC only) imputation and feature scaling
 - print data shapes
 - Cross validation model
 - Create arrays and dataframes to store results
 - ranges of searching for models hyperparameters
 - call xs and ys,
 - models hyperparameters found by Grid Search
 - fit and predict
 - create dataframe for features importance
 - print fold and full AUC scores
 - Write submission file and plot feature importance

In [None]:
def _lgbm_search_grid():
    """Return the hyper-parameter grid handed to GridSearchCV for LightGBM."""
    # NOTE(review): the Cartesian product of these ranges is enormous and the
    # search is repeated in every outer fold; an exhaustive grid search over
    # it is unlikely to finish. Consider narrowing the ranges.
    return {
        "nthread": range(1, 6),
        "n_estimators": [10, 100, 1000, 10000, 100000, 1000000],
        "learning_rate": np.logspace(-6, 0, 7),
        "num_leaves": range(10, 55, 5),
        # range(0, 1, 100) only ever produced the single value 0, which is
        # not a usable sampling fraction; search real fractions instead.
        "colsample_bytree": np.linspace(0.1, 1.0, 10),
        "subsample": np.linspace(0.1, 1.0, 10),
        "max_depth": range(5, 11),
        "reg_alpha": np.logspace(-3, 8, 60),
        "reg_lambda": np.logspace(-3, 2, 6),
        "min_split_gain": np.logspace(-3, 0, 50),
        "min_child_weight": np.logspace(1, 2, 50),
    }


def _make_classifier(model, train_x, train_y):
    """Build the unfitted estimator for one fold of ``kfold_model``."""
    if model == "LGBM":
        # NOTE(review): "accuracy" is used for model selection although the
        # folds are reported with AUC - confirm this is intended.
        search = GridSearchCV(
            LGBMClassifier(silent=-1, verbose=-1),
            _lgbm_search_grid(),
            scoring="accuracy",
            cv=10,
            n_jobs=-1
        )
        best_params = search.fit(train_x, train_y).best_params_
        return LGBMClassifier(silent=-1, verbose=-1, **best_params)
    if model == "RFC":
        return RandomForestClassifier(
            n_estimators=100,
            oob_score=True,
            random_state=50,
            verbose=1,
            n_jobs=-1
        )
    if model == "LOG":
        return LogisticRegression(
            C=0.0001,
            solver="saga",
            max_iter=1000,
            n_jobs=-1,
            random_state=42
        )
    return DummyClassifier(strategy="uniform", random_state=42)


def kfold_model(df, num_folds, model, csv_name,
                stratified=False, debug=False):
    """Cross-validate one model family and write its test-set predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged feature table; rows with a non-null ``TARGET`` are used for
        training, rows with a null ``TARGET`` are scored for the submission.
    num_folds : int
        Number of cross-validation folds.
    model : {"LGBM", "RFC", "LOG", "DUMMY"}
        Model family to train.
    csv_name : str
        Path of the submission CSV (written only when ``debug`` is False).
    stratified : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool, default False
        When True the submission file is not written.

    Returns
    -------
    pandas.DataFrame or None
        Per-fold feature importances for "RFC" and "LGBM", None otherwise.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    if model not in ("LGBM", "RFC", "LOG", "DUMMY"):
        raise ValueError("Unsupported model: {!r}".format(model))

    # Boolean-mask selections; nothing below writes into them in place.
    train_df = df[df["TARGET"].notnull()]
    test_df = df[df["TARGET"].isnull()]
    del df
    gc.collect()

    feats = [f for f in train_df.columns if f not in ["TARGET", "SK_ID_CURR",
             "SK_ID_BUREAU", "SK_ID_PREV", "index"]]

    if model == "LOG" or model == "RFC":
        # Only the model features are imputed and scaled: the identifier and
        # the target must keep their original values (the submission needs
        # the real SK_ID_CURR).
        train_values = (train_df[feats].astype("float64")
                        .replace([np.inf, -np.inf], np.nan))
        test_values = (test_df[feats].astype("float64")
                       .replace([np.inf, -np.inf], np.nan))

        # A feature with no observed value in the training rows has no
        # median to impute; drop it so the column list stays aligned with
        # the imputer output.
        has_values = train_values.notna().any(axis=0)
        feats = [f for f in feats if has_values[f]]

        # Fit on the training rows only and reuse the same statistics for
        # the test rows so both sets live in the same feature space.
        imputer = SimpleImputer(strategy="median")
        scaler = MinMaxScaler(feature_range=(0, 1))
        train_scaled = scaler.fit_transform(
            imputer.fit_transform(train_values[feats]))
        test_scaled = scaler.transform(imputer.transform(test_values[feats]))
        del train_values, test_values

        train_df = pd.concat(
            [train_df[["SK_ID_CURR", "TARGET"]],
             pd.DataFrame(train_scaled, columns=feats, index=train_df.index)],
            axis=1)
        test_df = pd.concat(
            [test_df[["SK_ID_CURR"]],
             pd.DataFrame(test_scaled, columns=feats, index=test_df.index)],
            axis=1)
        del train_scaled, test_scaled
        gc.collect()

    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []

    with open("outputs.txt", "a") as file:
        print("Starting {}. Train shape: {}, test shape: {}"
              .format(model, train_df.shape, test_df.shape), file=file)

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(train_df[feats], train_df["TARGET"])):

            train_x, train_y = (train_df[feats].iloc[train_idx],
                                train_df["TARGET"].iloc[train_idx])
            valid_x, valid_y = (train_df[feats].iloc[valid_idx],
                                train_df["TARGET"].iloc[valid_idx])

            clf = _make_classifier(model, train_x, train_y)

            if model == "LGBM":
                # NOTE(review): `silent`, `verbose=` and
                # `early_stopping_rounds=` appear to be accepted only by
                # older LightGBM releases - confirm the pinned version.
                clf.fit(
                    train_x,
                    train_y,
                    eval_set=[
                        (train_x, train_y),
                        (valid_x, valid_y)
                    ],
                    eval_metric="auc",
                    verbose=200,
                    early_stopping_rounds=200
                )
                predict_kwargs = {"num_iteration": clf.best_iteration_}
            else:
                clf.fit(train_x, train_y)
                predict_kwargs = {}

            oof_preds[valid_idx] = clf.predict_proba(
                valid_x, **predict_kwargs)[:, 1]
            # The submission is the average of the per-fold predictions.
            sub_preds += clf.predict_proba(
                test_df[feats], **predict_kwargs)[:, 1] / folds.n_splits

            if model == "RFC" or model == "LGBM":
                importance = clf.feature_importances_
            else:
                # DUMMY and LOG expose no importances: store numeric zeros.
                importance = np.zeros(len(feats))
            fold_importances.append(pd.DataFrame({
                "feature": feats,
                "importance": importance,
                "fold": n_fold + 1
            }))

            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(valid_y,
                  oof_preds[valid_idx])), file=file)
            del clf, train_x, train_y, valid_x, valid_y
            gc.collect()

        print("Full AUC score %.6f" % roc_auc_score(train_df["TARGET"],
              oof_preds), file=file)

    # A single concat instead of growing a frame inside the loop.
    if fold_importances:
        feature_importance_df = pd.concat(fold_importances, axis=0)
    else:
        feature_importance_df = pd.DataFrame()

    if not debug:
        submission = pd.DataFrame({
            "SK_ID_CURR": test_df["SK_ID_CURR"].values,
            "TARGET": sub_preds
        })
        submission.to_csv(csv_name, index=False)

    if model == "RFC" or model == "LGBM":
        display_importances(feature_importance_df)
        return feature_importance_df
    return None

<h4>IV) Exécution du code</h4>

In [None]:
def main(debug=False):
    """Build the merged feature table and run every enabled model.

    Parameters
    ----------
    debug : bool, default False
        When True every CSV is limited to its first 100000 rows and
        ``kfold_model`` is told not to write submission files.
    """
    num_rows = 100000 if debug else None
    df = application_train_test(num_rows)

    # (timer title, loader function, label written in front of the shape)
    auxiliary_tables = [
        ("Process bureau and bureau_balance", bureau_and_balance,
         "Bureau df shape:"),
        ("Process previous_applications", previous_applications,
         "Previous applications df shape:"),
        ("Process POS-CASH balance", pos_cash,
         "Pos-cash balance df shape:"),
        ("Process installments payments", installments_payments,
         "Installments payments df shape:"),
        ("Process credit card balance", credit_card_balance,
         "Credit card balance df shape:"),
    ]
    for title, loader, shape_label in auxiliary_tables:
        with timer(title):
            features = loader(num_rows)
            # Written and closed before the join so the handle cannot leak
            # if the join fails.
            with open("outputs.txt", "a") as file:
                print(shape_label, features.shape, file=file)
            df = df.join(features, how="left", on="SK_ID_CURR")
            del features
            gc.collect()

    # Keep only letters, digits and underscores in the column names.
    df = df.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x))

    # (timer title, model key, submission file, stratified folds?)
    model_runs = [
        ("Run dummies with k-folds", "DUMMY", "dummies_kf.csv", False),
        ("Run dummies with stratified k-folds", "DUMMY",
         "dummies_stratkf.csv", True),
        ("Run logistic regression with k-folds", "LOG",
         "logistic_kf.csv", False),
        ("Run logistic regression with stratified k-folds", "LOG",
         "logistic_stratkf.csv", True),
        ("Run Random Forests with k-folds", "RFC", "rfc_kf.csv", False),
        ("Run Random Forests with stratified k-folds", "RFC",
         "rfc_stratkf.csv", True),
        # LightGBM runs are currently disabled:
        # ("Run LightGBM with k-folds", "LGBM", "lightgbm_kf.csv", False),
        # ("Run LightGBM with stratified k-folds", "LGBM",
        #  "lightgbm_stratkf.csv", True),
    ]
    for title, model, csv_name, stratified in model_runs:
        with timer(title):
            kfold_model(df, 10, model, csv_name,
                        stratified=stratified, debug=debug)

In [None]:
# Entry point: run the whole pipeline and log its total duration through
# timer(), which appends the elapsed time to outputs.txt.
if __name__ == "__main__":
    with timer("Full model run"):
        main()