In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option("display.max_columns", None)

from itertools import groupby

import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from scipy.stats import ks_2samp
import seaborn as sns
        

from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier, RandomTreesEmbedding
from sklearn.model_selection import cross_val_score,StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer, StandardScaler, KBinsDiscretizer
from sklearn.naive_bayes import BernoulliNB

from xgboost import XGBClassifier


# local imports
# from get_rawdata import main

import pdb
import warnings
warnings.filterwarnings("ignore")


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/features-data/features_data.pkl
/kaggle/input/home-credit-default-risk/sample_submission.csv
/kaggle/input/home-credit-default-risk/bureau_balance.csv
/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv
/kaggle/input/home-credit-default-risk/application_train.csv
/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv
/kaggle/input/home-credit-default-risk/application_test.csv
/kaggle/input/home-credit-default-risk/previous_application.csv
/kaggle/input/home-credit-default-risk/credit_card_balance.csv
/kaggle/input/home-credit-default-risk/installments_payments.csv
/kaggle/input/home-credit-default-risk/bureau.csv


In [3]:
def _get_model_name(model):
    """
        Returns a string with the name of a sklearn model
            model: Sklearn stimator class
    """
    if isinstance(model, Pipeline):
        estimator = model.steps[-1][1]
        name = "Pipeline_" + str(estimator)[:str(estimator).find("(")]
    else: 
        name = str(model)[:str(model).find("(")]
    return name 

def plot_cv_score(X, y, models_list, cv = 5, scoring = None, refit = True, verbose = True):
    """ 
        X: numpy_array/pandas dataframe n_rows, m_features
        y: numpy_array/pandas dataframe n_rows
        Plots min, max and avg kfold crosval_score for a list of models
    
    """

    
    
    names, scores, min_score, max_score, mean_score = list(), list(), list(), list(), list()

    for i, model in enumerate(models_list):
        t0 = time.time()
        name = _get_model_name(model)
        names.append(name)

        if refit:
            model.fit(X, y)
        
        score = cross_val_score(model, X, y, cv = cv, scoring = scoring, n_jobs= -1)

        min_score.append(np.min(score))
        max_score.append(np.max(score))
        mean_score.append(np.mean(score))
        scores.append(score)
        t1 = time.time()
        
        if verbose:
            print(f"Iteration: {i} done in {round((t1-t0)/60,2)} minutes")
            print(f"Mean score for model: {names[i]}: {mean_score[i]}")
        
            
    
    frame_summary = pd.DataFrame({'Min':min_score, 'Average': mean_score, 'Max': max_score,}, index = names).sort_values(by = 'Average')

    frame_scores = pd.DataFrame(np.vstack(scores).T, columns = names) 


    fig, ax  = plt.subplots(1,2, figsize = (15,7))

    frame_summary.plot.barh(edgecolor = 'black', ax = ax[0], cmap = 'RdYlBu')
    ax[0].legend(loc = 'best')
    ax[0].set_xlabel("Score")

    frame_scores.boxplot(ax = ax[1])
    ax[1].set_title("Model scores distribution")
    ax[1].set_ylabel("Score")
    ax[1].tick_params(labelrotation=90)
    
    
def plot_importances(estimator, X, y, scoring = None, n_repeats = 5, plot_results = True, n_jobs = -1):
    """
    Computes permutation feature importance for a given model
    """
    pimp = permutation_importance(estimator= estimator, X= X, y = y, n_repeats= n_repeats, n_jobs = n_jobs)
    
    df = pd.DataFrame({"Mean performance decrease":pimp.importances_mean}, index = X.columns).sort_values(by = "Mean performance decrease")
    
    if plot_results:
        fig, ax = plt.subplots(figsize = (10,5))

        df.plot.barh(ax = ax, edgecolor = "black", cmap = "RdYlBu")
        ax.set_title("Importances")
    else:
        return df

In [4]:
def plot_results(X, y, estimator, prefit = True, color_test_set = True):
    if color_test_set:
        color_positive = "darkgreen"
        color_negative = "darkred"
        label = "Test"
    else:
        color_positive = "steelblue"
        color_negative = "orange"
        label = "Train"
    if not prefit:
        estimator.fit(X,y)
    y_pred = estimator.predict_proba(X)[:,1]
    fpr,tpr,_ = roc_curve(y_true = y, y_score = y_pred)
    
    fig, ax = plt.subplots(1,3, figsize = (17,5))
    
    ax[0].hist(y_pred[y == 0], color = color_positive, alpha = .5, edgecolor = color_positive, bins = "auto", label = "positive class")
    ax[0].hist(y_pred[y == 1], color = color_negative, alpha = .5, edgecolor = color_negative, bins = "auto", label = "negative class")
    ax[0].set_title(f"Class distribution on [{label}]")
    ax[0].set_ylabel("Number of samples")
    ax[0].set_xlabel("Model probabilities")
    ax[0].legend()
    
    ax[1].plot(fpr, tpr, color = "darkred", label = "roc")
    ax[1].plot([0,1], [0,1], linestyle = "--", color = "black")
    ax[1].set_title(f"Area under roc curve\nScore: {round(roc_auc_score(y_true = y, y_score = y_pred),3)}")
    ax[1].set_xlabel("false positive rate")
    ax[1].set_ylabel("true positive rate")
    ax[1].legend()
    
    ks, p_value = ks_2samp(y_pred[y == 0], y_pred[y == 1])
    
    sns.distplot(y_pred[y==0], hist = False, kde_kws={"cumulative":True}, rug = False, color = color_positive, ax = ax[2], label = "positive class")
    sns.distplot(y_pred[y==1], hist = False, kde_kws={"cumulative":True}, rug = False, color = color_negative, ax = ax[2], label = "negative class")
    ax[2].set_title(f"Class separation\ks: {round(ks,3)}")
    
    plt.tight_layout()
    

In [5]:
data_dict = pd.read_csv("/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv", encoding = "latin1")

In [6]:
data_dict.Table.value_counts()

application_{train|test}.csv    122
previous_application.csv         38
credit_card_balance.csv          23
bureau.csv                       17
installments_payments.csv         8
POS_CASH_balance.csv              8
bureau_balance.csv                3
Name: Table, dtype: int64

In [14]:
data_dict[data_dict.Table == "application_{train|test}.csv"][["Row","Description"]].style

Unnamed: 0,Row,Description
0,SK_ID_CURR,ID of loan in our sample
1,TARGET,"Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)"
2,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving
3,CODE_GENDER,Gender of the client
4,FLAG_OWN_CAR,Flag if the client owns a car
5,FLAG_OWN_REALTY,Flag if client owns a house or flat
6,CNT_CHILDREN,Number of children the client has
7,AMT_INCOME_TOTAL,Income of the client
8,AMT_CREDIT,Credit amount of the loan
9,AMT_ANNUITY,Loan annuity


In [7]:
data_dict[data_dict.Table == "previous_application.csv"][["Row", "Description"]].style

Unnamed: 0,Row,Description
173,SK_ID_PREV,"ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loan applications in Home Credit, previous application could, but not necessarily have to lead to credit)"
174,SK_ID_CURR,ID of loan in our sample
175,NAME_CONTRACT_TYPE,"Contract product type (Cash loan, consumer loan [POS] ,...) of the previous application"
176,AMT_ANNUITY,Annuity of previous application
177,AMT_APPLICATION,For how much credit did client ask on the previous application
178,AMT_CREDIT,"Final credit amount on the previous application. This differs from AMT_APPLICATION in a way that the AMT_APPLICATION is the amount for which the client initially applied for, but during our approval process he could have received different amount - AMT_CREDIT"
179,AMT_DOWN_PAYMENT,Down payment on the previous application
180,AMT_GOODS_PRICE,Goods price of good that client asked for (if applicable) on the previous application
181,WEEKDAY_APPR_PROCESS_START,On which day of the week did the client apply for previous application
182,HOUR_APPR_PROCESS_START,Approximately at what day hour did the client apply for the previous application


In [8]:
data_dict[data_dict.Table == "POS_CASH_balance.csv"][["Row", "Description"]].style

Unnamed: 0,Row,Description
142,SK_ID_PREV,"ID of previous credit in Home Credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)"
143,SK_ID_CURR,ID of loan in our sample
144,MONTHS_BALANCE,"Month of balance relative to application date (-1 means the information to the freshest monthly snapshot, 0 means the information at application - often it will be the same as -1 as many banks are not updating the information to Credit Bureau regularly )"
145,CNT_INSTALMENT,Term of previous credit (can change over time)
146,CNT_INSTALMENT_FUTURE,Installments left to pay on the previous credit
147,NAME_CONTRACT_STATUS,Contract status during the month
148,SK_DPD,DPD (days past due) during the month of previous credit
149,SK_DPD_DEF,DPD during the month with tolerance (debts with low loan amounts are ignored) of the previous credit


In [15]:
previous_application.sample(5)

NameError: name 'previous_application' is not defined

In [None]:
# tiempo promedio entre cada applicacion
# numero de aplicaciones rechazadas, aprovadas, etc

# monto promedio de las rechazadas vs el monto (menos el actual)
# canal de adquicicion
# name goods category vs actual application
# reason rejection (most frequent)
# tasa de interes promedio en las rechazadas/aprovadas vs tasa actual
# monto requerido vs monto aprobado en las app previas (rechazadas aprobadas)
# al juntar las tablas de pos, credit card y installments con previous application, puedo generar featrures de dpd como lo que se hizo en el burò


In [None]:
from time import time

premium_loans = ["Real estate loan",
                 "Car loan",
                 "Loan for purchase of shares (margin lending)"]

working_capital_loans = ["Loan for working capital replenishment",
                         "Loan for the purchase of equipment",
                         "Loan for business development"]

bank_credits = ["Microloan",
                "Consumer credit",
                "Interbank credit",
                "Consumer credit"]

credit_cards = ["Credit card"]

others = ["Unknown type of loan", "Another type of loan",
       "Cash loan (non-earmarked)",
       "Mobile operator loan", "Interbank credit",
       "Loan for purchase of shares (margin lending)"]




flag_doc_feat = [
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL', 
 'HOUR_APPR_PROCESS_START']

social_demo_feat = [
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'REGION_POPULATION_RELATIVE',
 'CNT_CHILDREN',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'DAYS_LAST_PHONE_CHANGE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_CONTRACT_TYPE',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE', 
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

main_application_feat = ['AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',  
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',    
 "EXT_SOURCE_MEAN",
  "EXT_SOURCE_ENTROPY",
  "LOG_AMT_CREDIT_AMT_INCOME",
  "LOG_AMT_GOODS_PRICE_AMT_INCOME",
  'CREDIT_TO_ANNUITY_RATIO',
  'CREDIT_TO_GOODS_RATIO',
  'INCOME_CREDIT_PERCENTAGE',
  'INCOME_PER_CHILD',
  'INCOME_PER_PERSON', 
  'PAYMENT_RATE']

expendable_feat = ['INCOME_PER_CHILD',
 'INCOME_PER_PERSON', 
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION']


credit_bureau_features = ['sum_debt_active_bank_credits',
 'max_consecutive_non_delinq',
 'times_bucket_5',
 'open_accounts_last6m',
 'open_accounts_last9m',
 'total_annuity',
 'active_credit_cards',
 'max_cred_lim_overdue',
 'times_bucket_2',
 'antiguedad_media_dias',
 'credit_debt_div_lim_cred',
 'mean_amt_credit_limit',
 'closed_other_credits',
 'max_cred_lim_delinq',
 'closed_accounts_last6m',
 'credit_sum_by_debt',
 'closed_prem_credits',
 'credit_lim_div_cred_overdue',
 'closed_accounts_last9m',
 'max_debt_closed_debt_others',
 'total_accounts',
 'active_other_credits',
 'times_bucket_4',
 'sum_prolong_days',
 'max_debt_closed_wk_credits',
 'max_cred_lim_non_delinq',
 'closed_accounts_last2m',
 'open_accounts_last1m',
 'payment_history',
 'sum_debt_active_prem_credits',
 'sum_debt_active_wk_credits',
 'sum_debt_active_credit_cards',
 'max_debt_closed_bank_credits',
 'closed_wk_credits',
 'mean_consecutive_non_delinq',
 'times_bucket_1',
 'mean_amt_overdue',
 'sum_amt_credit_debt',
 'times_bucket_6',
 'trend_debt_active_credits',
 'credit_sum_by_overdue',
 'active_wk_credits',
 'open_accounts_last2m',
 'sum_amt_credit_overdue',
 'max_cred_lim_non_overdue',
 'perc_util_revolving',
 'closed_accounts_last12m',
 'credit_sum_by_limit',
 'max_debt_closed_credit_card',
 'closed_accounts_last3m',
 'closed_bank_credits',
 'open_accounts_last12m',
 'closed_credit_cards',
 'max_amt_overdue',
 'cloased_accounts',
 'n_unique_credit_types',
 'max_debt_closed_premium_loans',
 'open_accounts_last3m',
 'antiguedad_media_dias_closed_accounts',
 'antiguedad_media_dias_open_accounts',
 'times_bucket_3',
 'closed_accounts_last1m',
 'sum_debt_active_other_credits',
 'active_prem_credits',
 'active_bank_credits',
 'active_accounts',
 'mean_amt_credit_debt',
 "cosine_cred_sum",
 "sine_cred_sum",
 "cosine_cred_sum_debt",
 "sine_cred_sum_debt",
 "cosine_cred_sum_limit",
 "sine_cred_sum_limit"]



def train_binary(X ,y, estimator, cv = 3, refit_all = True, verbose = True):
    """
    Creates a feature based on predictions from kfold
    """
    ls = list()
    result = {}
    kf = StratifiedKFold(n_splits = cv)
    k = 0
    for train_idx, test_idx in kf.split(X, y):
        k+=1
        X_train, X_test = X.iloc[train_idx,:], X.iloc[test_idx,:]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model = estimator.fit(X_train, y_train)
        predictions = model.predict_proba(X_test)[:,1]
        if verbose:
            score = roc_auc_score(y_true=y_test, y_score=predictions)
            print(f"Score on test set for fold {k} is :{round(score,3)}")
        
        ls.append(predictions)
        
    feature = np.hstack(ls)
    
    if refit_all:
        model = estimator.fit(X, y)
        
        result["model"] = model
    
    result["feature"] = feature
    
    return result

def get_entropy(series, categorical = True):

    try:
        if categorical:
            data = series.value_counts(True)
        else:
            data = pd.qcut(series,[0, .25, .5, .75, 1.], duplicates= "drop").value_counts(True)

        return entropy(data)
    except:
        return 0

def len_iter(items):
    return sum(1 for _ in items)


def consecutive_values(data, bin_val):
    try:
        return max(len_iter(run) for val, run in groupby(data) if val == bin_val)/len(data)
    except:
        return 0

def get_linear_regression(series):
    result ={}
    try:
        n = len(series)
        X = np.arange(n)
        y = series.cumsum()
        lr = stats.linregress(x, y)
    
        result["trend"] = lr.slope
        result["intercept"] = lr.intercept
    except:
        result["trend"] = -1
        result["intercept"] = -1
    return result


"""
Generates a binary classification model

"""

print(f"Processing bureau balance...")
t0 = time()

train_skid_curr = application_train["SK_ID_CURR"].unique().tolist()
test_skid_curr = application_test["SK_ID_CURR"].unique().tolist()


# For preprocessing
bureau_balance["STATUS_NUMERIC"] = bureau_balance.STATUS.map({"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"X":None,"C":None})
bureau_balance["EVER_DELINQUENT"] = np.where(bureau_balance.STATUS != '0',1,0)

aux_c = bureau_balance[bureau_balance.STATUS == "0"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_0"})
aux_1 = bureau_balance[bureau_balance.STATUS == "1"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_1"})
aux_2 = bureau_balance[bureau_balance.STATUS == "2"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_2"})
aux_3 = bureau_balance[bureau_balance.STATUS == "3"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_3"})
aux_4 = bureau_balance[bureau_balance.STATUS == "4"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_4"})
aux_5 = bureau_balance[bureau_balance.STATUS == "5"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_5"})
aux_6 = bureau_balance[bureau_balance.STATUS == "6"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_6"})


# aux_ever_delinq = bureau_balance.groupby("SK_ID_BUREAU").STATUS_NUMERIC.apply(lambda x: get_ever_delinquent(x)).to_frame().reset_index().rename(columns = {"STATUS_NUMERIC":"EVER_DELINQUENT"})
aux_ever_delinq = bureau_balance.groupby("SK_ID_BUREAU").EVER_DELINQUENT.max().to_frame().reset_index().rename(columns = {"STATUS":"EVER_DELINQUENT"})
aux_ph = bureau_balance.groupby("SK_ID_BUREAU").STATUS_NUMERIC.mean().to_frame().reset_index().rename(columns = {"STATUS_NUMERIC":"PAYMENT_HISTORY"})
aux_consecutive = bureau_balance.groupby("SK_ID_BUREAU").STATUS_NUMERIC.apply(lambda x:consecutive_values(x, 0)).to_frame().reset_index().rename(columns = {"STATUS_NUMERIC":"CONSECUTIVE_NO_DELINQ"})


ldf_aux = [bureau,aux_c, aux_1, aux_2, aux_3, aux_4, aux_5, aux_6, aux_ph, aux_consecutive,aux_ever_delinq]

bureau = reduce(lambda x, y: pd.merge(x, y, on = "SK_ID_BUREAU", how = "left"), ldf_aux)
print(f"Bureau processed on: {round((time() - t0)/60,2)} minutes")

print("Creating extra application features...")




# application features on train
application_train["EXT_SOURCE_MEAN"] = (application_train["EXT_SOURCE_1"] + application_train["EXT_SOURCE_2"] + application_train["EXT_SOURCE_3"])/3
application_train["EXT_SOURCE_ENTROPY"] = -(application_train["EXT_SOURCE_1"] * np.log1p(application_train["EXT_SOURCE_1"]) + application_train["EXT_SOURCE_2"] * np.log1p(application_train["EXT_SOURCE_2"]) + application_train["EXT_SOURCE_3"] * np.log1p(application_train["EXT_SOURCE_3"]))
application_train["LOG_AMT_CREDIT_AMT_INCOME"] = np.log1p(application_train["AMT_CREDIT"]/application_train["AMT_INCOME_TOTAL"])
application_train["LOG_AMT_GOODS_PRICE_AMT_INCOME"] = np.log(application_train["AMT_GOODS_PRICE"]/application_train["AMT_INCOME_TOTAL"])
application_train['CREDIT_TO_ANNUITY_RATIO'] = application_train['AMT_CREDIT'] / application_train['AMT_ANNUITY']
application_train['CREDIT_TO_GOODS_RATIO'] = application_train['AMT_CREDIT'] / application_train['AMT_GOODS_PRICE']
application_train['INCOME_CREDIT_PERCENTAGE'] = application_train['AMT_INCOME_TOTAL'] / application_train['AMT_CREDIT']
application_train['INCOME_PER_CHILD'] = application_train['AMT_INCOME_TOTAL'] / (1 + application_train['CNT_CHILDREN'])
application_train['INCOME_PER_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']
application_train['PAYMENT_RATE'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']

 # application features on test
application_test["EXT_SOURCE_MEAN"] = (application_test["EXT_SOURCE_1"] + application_test["EXT_SOURCE_2"] + application_test["EXT_SOURCE_3"])/3
application_test["EXT_SOURCE_ENTROPY"] = -(application_test["EXT_SOURCE_1"] * np.log1p(application_test["EXT_SOURCE_1"]) + application_test["EXT_SOURCE_2"] * np.log1p(application_test["EXT_SOURCE_2"]) + application_test["EXT_SOURCE_3"] * np.log1p(application_test["EXT_SOURCE_3"]))
application_test["LOG_AMT_CREDIT_AMT_INCOME"] = np.log1p(application_test["AMT_CREDIT"]/application_test["AMT_INCOME_TOTAL"])
application_test["LOG_AMT_GOODS_PRICE_AMT_INCOME"] = np.log(application_test["AMT_GOODS_PRICE"]/application_test["AMT_INCOME_TOTAL"])
application_test['CREDIT_TO_ANNUITY_RATIO'] = application_test['AMT_CREDIT'] / application_test['AMT_ANNUITY']
application_test['CREDIT_TO_GOODS_RATIO'] = application_test['AMT_CREDIT'] / application_test['AMT_GOODS_PRICE']
application_test['INCOME_CREDIT_PERCENTAGE'] = application_test['AMT_INCOME_TOTAL'] / application_test['AMT_CREDIT']
application_test['INCOME_PER_CHILD'] = application_test['AMT_INCOME_TOTAL'] / (1 + application_test['CNT_CHILDREN'])
application_test['INCOME_PER_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
application_test['PAYMENT_RATE'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']




print(f"Training stacked model with app features...")
t0 = time()

cont_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = -1)), ("preprocessing", StandardScaler()), ("transformer",QuantileTransformer(output_distribution = "normal")), ("discretizer", KBinsDiscretizer(n_bins = 10))])
cat_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = "missing")), ("encoding", OneHotEncoder(handle_unknown = "ignore"))])

preprocessing = make_column_transformer((cont_prepro, selector(dtype_exclude = "object")), (cat_prepro,selector(dtype_include = "object")))

dataset = application_train[flag_doc_feat].copy()

estimator_flag = LogisticRegression(max_iter = 1000, class_weight = "balanced")
estimator_social = Pipeline([("preprocessing", preprocessing),("model",LogisticRegression(class_weight= "balanced", max_iter= 1000))])
estimator_main_app = Pipeline([("preprocessing", preprocessing),("model",LogisticRegression(class_weight= "balanced", max_iter= 1000))])
ensembler = GradientBoostingClassifier(random_state =42,n_estimators= 100, max_features= None)

# Train a model using only validation features
X = application_train[flag_doc_feat].copy()
y = application_train["TARGET"].copy()
print("Customer validation model...")
cust_behaiv_model = train_binary(X = X, y =y, estimator= estimator_flag)

application_train["FLAG_MODEL_PREDICTED"] = cust_behaiv_model["model"].predict_proba(application_train[flag_doc_feat])[:,1]
application_test["FLAG_MODEL_PREDICTED"] = cust_behaiv_model["model"].predict_proba(application_test[flag_doc_feat])[:,1]

# Train a model using only socio-demo features
X = application_train[social_demo_feat].copy()
y = application_train["TARGET"].copy()
print("Sociodemograph model...")
social_model = train_binary(X = X, y = y, estimator = estimator_social)

application_train["SOCIODEMO_MODEL_PREDICTED"] = social_model["model"].predict_proba(application_train[social_demo_feat])[:,1]
application_test["SOCIODEMO_MODEL_PREDICTED"] = social_model["model"].predict_proba(application_test[social_demo_feat])[:,1]

# Train a model using most relevant application features
X = application_train[main_application_feat].copy()
y = application_train["TARGET"].copy()
print("Application model...")
application_model = train_binary(X = X, y = y, estimator = estimator_main_app)

application_train["APPLICATION_MODEL_PREDICTED"] = application_model["model"].predict_proba(application_train[main_application_feat])[:,1]
application_test["APPLICATION_MODEL_PREDICTED"] = application_model["model"].predict_proba(application_test[main_application_feat])[:,1]

predictions_feat = ["FLAG_MODEL_PREDICTED", "SOCIODEMO_MODEL_PREDICTED", "APPLICATION_MODEL_PREDICTED"]

# Train a model using the output of the models
X = application_train[predictions_feat].copy()
y = application_train["TARGET"].copy()
print("Application ensemble...")
ensemble_model = train_binary(X = X, y = y, estimator = ensembler)

application_train["APPLICATION_MODEL_STACKED_MODEL_PREDICTED"] = ensemble_model["model"].predict_proba(application_train[predictions_feat])[:,1]
application_test["APPLICATION_MODEL_STACKED_MODEL_PREDICTED"] = ensemble_model["model"].predict_proba(application_test[predictions_feat])[:,1]
    
    
print("Stacked models successfully trained!")


application_test["TARGET"] = None

application_test["SPLIT"] = "test"
application_train["SPLIT"] = "train"

print(f"- application train instances: {application_train.shape[0]}")
print(f"- application test instances: {application_test.shape[0]}")
print(f"- concatenated dataset instances: {application_train.shape[0] + application_test.shape[0]}")

application_data = pd.concat([application_train, application_test], ignore_index  = True)

# application features
application_data["ext_source_mean"] = (application_data["EXT_SOURCE_1"] + application_data["EXT_SOURCE_2"] + application_data["EXT_SOURCE_3"])/3
application_data["entropy_ex_source"] = -(application_data["EXT_SOURCE_1"] * np.log1p(application_data["EXT_SOURCE_1"]) + application_data["EXT_SOURCE_2"] * np.log1p(application_data["EXT_SOURCE_2"]) + application_data["EXT_SOURCE_3"] * np.log1p(application_data["EXT_SOURCE_3"]))
application_data["log_amt_credit_amt_income"] = np.log1p(application_data["AMT_CREDIT"]/application_data["AMT_INCOME_TOTAL"])
application_data["log_amt_goods_price_amt_income"] = np.log(application_data["AMT_GOODS_PRICE"]/application_data["AMT_INCOME_TOTAL"])

application_data['credit_to_annuity_ratio'] = application_data['AMT_CREDIT'] / application_data['AMT_ANNUITY']
application_data['credit_to_goods_ratio'] = application_data['AMT_CREDIT'] / application_data['AMT_GOODS_PRICE']
application_data['days_employed_percentage'] = application_data['DAYS_EMPLOYED'] / application_data['DAYS_BIRTH']
application_data['income_credit_percentage'] = application_data['AMT_INCOME_TOTAL'] / application_data['AMT_CREDIT']
application_data['income_per_child'] = application_data['AMT_INCOME_TOTAL'] / (1 + application_data['CNT_CHILDREN'])
application_data['income_per_person'] = application_data['AMT_INCOME_TOTAL'] / application_data['CNT_FAM_MEMBERS']
application_data['payment_rate'] = application_data['AMT_ANNUITY'] / application_data['AMT_CREDIT']


sk_id = application_data["SK_ID_CURR"].unique().tolist()

data = pd.DataFrame(index = sk_id)

print(f"Model based features on: {round((time() - t0)/60,2)} minutes")
print("Initial dataframe info:")
print(data.info())

print(f"Creating features...")
T0 = time()
t0 = time()
# total accounts
data["total_accounts"] = bureau.groupby("SK_ID_CURR").size()
# Open and closed accounts
data["active_accounts"] = bureau.groupby("SK_ID_CURR").apply(lambda x: (x["CREDIT_ACTIVE"] == "Active").sum())
data["cloased_accounts"] = bureau.groupby("SK_ID_CURR").apply(lambda x: (x["CREDIT_ACTIVE"] == "Closed").sum())

data["max_amt_overdue"] = bureau.groupby("SK_ID_CURR").AMT_CREDIT_MAX_OVERDUE.max()
data["mean_amt_overdue"] = bureau.groupby("SK_ID_CURR").AMT_CREDIT_MAX_OVERDUE.mean()
data["mean_amt_credit_debt"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].mean()
data["max_amt_credit_debt"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].max()
data["sum_amt_credit_sum"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].sum()
data["mean_amt_credit_sum"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].mean()
data["max_amt_credit_sum"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].max()
data["sum_amt_credit_debt"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].sum()
data["mean_amt_credit_limit"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].mean()
data["sum_amt_credit_limit"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].sum()
data["sum_amt_credit_overdue"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].sum()
data["mean_amt_credit_overdue"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].mean()
data["max_amt_credit_overdue"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].max()
print(f"First batch of features on: {round((time() - t0)/60,2)} minutes")

data["credit_sum_by_debt"] = data["sum_amt_credit_debt"]/(data["sum_amt_credit_sum"] + 1)
data["credit_sum_by_limit"] = data["sum_amt_credit_limit"]/data["sum_amt_credit_sum"]
data["credit_sum_by_overdue"] = data["sum_amt_credit_overdue"]/data["sum_amt_credit_sum"]

t0 = time()
# dividir entre AMT_CREDIT_SUM
data["total_annuity"] = bureau.groupby("SK_ID_CURR").AMT_ANNUITY.sum()
# Percentage open accounts
data["active_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
data["active_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
data["active_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
data["active_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
data["active_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
# Percentage closed accounts
data["closed_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
data["closed_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
data["closed_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
data["closed_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
data["closed_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]



# Saldo en $$ en cuentas abiertas
data["mean_debt_active_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.mean()
data["mean_debt_active_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.mean()
data["mean_debt_active_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.mean()
data["mean_debt_active_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.mean()
data["mean_debt_active_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.mean()

data["sum_debt_active_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
data["sum_debt_active_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
data["sum_debt_active_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
data["sum_debt_active_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
data["sum_debt_active_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()

data["sum_debt_active_credits"] = bureau[(bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
# Trend Saldo en $$ en cuentas abiertas
data["trend_debt_active_credits"] = bureau[(bureau["CREDIT_ACTIVE"] == "Active")].sort_values(by = ["SK_ID_CURR", "DAYS_CREDIT"]).groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.fillna(-1).apply(lambda x: get_linear_regression(x)["trend"])
# Max en $$ en cuentas cerradas
data["max_debt_closed_premium_loans"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
data["max_debt_closed_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
data["max_debt_closed_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
data["max_debt_closed_credit_card"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
data["max_debt_closed_debt_others"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
# div
data["sum_amt_credit_limit_credit_cards_open"]=bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].sum()
data["perc_util_revolving"] = data["sum_debt_active_credit_cards"]/(data["sum_amt_credit_limit_credit_cards_open"] + 1)
data["credit_debt_div_lim_cred"] = data["sum_amt_credit_debt"]/(data["sum_amt_credit_limit"] + 1)
data["credit_lim_div_cred_overdue"] = data["sum_amt_credit_limit"]/(data["sum_amt_credit_overdue"] + 1)

data.drop(["sum_amt_credit_sum", "sum_amt_credit_limit", "sum_amt_credit_limit_credit_cards_open"], axis = 1, inplace = True)
# Max cred lim non defaulted
data["max_cred_lim_non_delinq"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_SUM_LIMIT"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 0)]["AMT_CREDIT_SUM_LIMIT"].max())
data["max_cred_lim_non_overdue"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_MAX_OVERDUE"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 0)]["AMT_CREDIT_MAX_OVERDUE"].max())

data["max_cred_lim_delinq"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_SUM_LIMIT"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 1)]["AMT_CREDIT_SUM_LIMIT"].max())
data["max_cred_lim_overdue"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_MAX_OVERDUE"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 1)]["AMT_CREDIT_MAX_OVERDUE"].max())

data["mean_cred_lim_delinq"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_SUM_LIMIT"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 1)]["AMT_CREDIT_SUM_LIMIT"].mean())
data["mean_cred_lim_overdue"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_MAX_OVERDUE"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 1)]["AMT_CREDIT_MAX_OVERDUE"].mean())

# paymet history
data["payment_history"] = bureau.groupby("SK_ID_CURR")["PAYMENT_HISTORY"].mean()
data["mean_consecutive_non_delinq"] = bureau.groupby("SK_ID_CURR")["CONSECUTIVE_NO_DELINQ"].mean()
data["max_consecutive_non_delinq"] = bureau.groupby("SK_ID_CURR")["CONSECUTIVE_NO_DELINQ"].max()
print(f"Second batch of features on: {round((time() - t0)/60,2)} minutes")

t0 = time()
# Number of times in bucket x
data["times_bucket_1"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_1"].sum()
data["times_bucket_2"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_2"].sum()
data["times_bucket_3"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_3"].sum()
data["times_bucket_4"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_4"].sum()
data["times_bucket_5"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_5"].sum()
data["times_bucket_6"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_6"].sum()

data["times_bad_delinquency"] = data["times_bucket_6"] + data["times_bucket_5"] + data["times_bucket_4"] + data["times_bucket_3"]
data["times_no_bad_delinquency"] = data["times_bucket_2"] + data["times_bucket_1"]
# Antiguedad meses
data["antiguedad_media_dias"] = bureau.groupby("SK_ID_CURR")["DAYS_CREDIT"].mean()
data["antiguedad_media_dias_closed_accounts"] = bureau.groupby("SK_ID_CURR")["DAYS_CREDIT_ENDDATE"].mean()
data["antiguedad_media_dias_open_accounts"] = bureau.groupby("SK_ID_CURR")["DAYS_ENDDATE_FACT"].mean()
data["antiguedad_maxima_dias_closed_accounts"] = bureau.groupby("SK_ID_CURR")["DAYS_CREDIT_ENDDATE"].max()
data["antiguedad_maxima_dias_open_accounts"] = bureau.groupby("SK_ID_CURR")["DAYS_ENDDATE_FACT"].max()
data["sum_prolong_days"] = bureau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].sum()
data["mean_prolong_days"] = bureau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].mean()
# Closed accounts last x days
data["closed_accounts_last12m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -365].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["closed_accounts_last9m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -270].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["closed_accounts_last6m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -180].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["closed_accounts_last3m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -90].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["closed_accounts_last2m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -60].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["closed_accounts_last1m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -30].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
# Open credit last x days
data["open_accounts_last12m"] = bureau[bureau["DAYS_CREDIT"] >= -365].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["open_accounts_last9m"] = bureau[bureau["DAYS_CREDIT"] >= -270].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["open_accounts_last6m"] = bureau[bureau["DAYS_CREDIT"] >= -180].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["open_accounts_last3m"] = bureau[bureau["DAYS_CREDIT"] >= -90].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["open_accounts_last2m"] = bureau[bureau["DAYS_CREDIT"] >= -60].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["open_accounts_last1m"] = bureau[bureau["DAYS_CREDIT"] >= -30].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
data["n_unique_credit_types"] = bureau.groupby("SK_ID_CURR")["CREDIT_TYPE"].nunique()
# Trigonometric features
data["cosine_cred_sum"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].apply(lambda x: (2 * np.pi  *np.cos(x)))
data["sine_cred_sum"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].apply(lambda x: (2 * np.pi  *np.sin(x)))
data["cosine_cred_sum_debt"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].apply(lambda x: (2 * np.pi  *np.cos(x)))
data["sine_cred_sum_debt"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].apply(lambda x: (2 * np.pi  *np.sin(x)))
data["cosine_cred_sum_limit"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].apply(lambda x: (2 * np.pi  *np.cos(x)))
data["sine_cred_sum_limit"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].apply(lambda x: (2 * np.pi  *np.sin(x)))


In [None]:
t0 = time()
print("previous application features...")
# previous applications features
data["sum_prev_applications"] = previous_application.groupby("SK_ID_CURR").SK_ID_PREV.size()
data["mean_amt_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_APPLICATION.mean()
data["mean_amt_cred_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_CREDIT.mean()
data["mean_amt_annuity_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_ANNUITY.mean()
data["sum_amt_downpayment_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_DOWN_PAYMENT.sum()
data["mean_amt_goodsprice_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_GOODS_PRICE.mean()
data["mean_days_last_due_prev_applications"] = previous_application.groupby("SK_ID_CURR").DAYS_LAST_DUE.mean()
data["mean_days_first_due_prev_applications"] = previous_application.groupby("SK_ID_CURR").DAYS_FIRST_DUE.mean()
data["mean_term_prev_applications"] = previous_application.groupby("SK_ID_CURR").CNT_PAYMENT.mean()
data["total_previous_revolving_credits"] = previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Revolving loans"].groupby("SK_ID_CURR").SK_ID_PREV.size()


data["mean_amt_credit_previous_revolving_credits"] = previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Revolving loans"].groupby("SK_ID_CURR")["AMT_CREDIT"].mean()
data["mean_amt_downpayment_previous_revolving_credits"] = previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Revolving loans"].groupby("SK_ID_CURR")["AMT_DOWN_PAYMENT"].mean()

data["perc_prev_app_refused"] = previous_application[previous_application.NAME_CONTRACT_STATUS == "Refused"].groupby("SK_ID_CURR").SK_ID_PREV.size()/(data["sum_prev_applications"]+1)
data["perc_prev_app_approved"] = previous_application[previous_application.NAME_CONTRACT_STATUS == "Approved"].groupby("SK_ID_CURR").SK_ID_PREV.size()/(data["sum_prev_applications"]+1)
data["perc_prev_app_other"] = previous_application[previous_application.NAME_CONTRACT_STATUS.isin(["Canceled", "Unused offer"])].groupby("SK_ID_CURR").SK_ID_PREV.size()/(data["sum_prev_applications"]+1)

data["mean_amt_prev_app_refused"] =previous_application[previous_application.NAME_CONTRACT_STATUS == "Refused"].groupby("SK_ID_CURR").AMT_APPLICATION.mean()
data["max_amt_prev_app_refused"] =previous_application[previous_application.NAME_CONTRACT_STATUS == "Approved"].groupby("SK_ID_CURR").AMT_APPLICATION.max()
data["min_amt_prev_app_refused"] =previous_application[previous_application.NAME_CONTRACT_STATUS.isin(["Canceled", "Unused offer"])].groupby("SK_ID_CURR").AMT_APPLICATION.min()

data["channel_aq_last_refused_app"] = previous_application[previous_application.NAME_CONTRACT_STATUS == "Refused"].sort_values(by = ["SK_ID_CURR", "SK_ID_PREV"], ascending = [0,1]).drop_duplicates(subset = "SK_ID_CURR").groupby("SK_ID_CURR").CHANNEL_TYPE.apply(lambda x: x.unique()[0])


data = data.replace({np.inf:np.NaN})


print(f"Third batch of features on: {round((time() - t0)/60,2)} minutes")


print(f"Building a model using credit features...")

In [None]:

pos_history = pOS_CASH_balance.copy()
# pos features
data["cnt_installments_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.sum().groupby("SK_ID_CURR").mean()
data["cnt_installments_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.sum().groupby("SK_ID_CURR").max()
data["cnt_installments_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.sum().groupby("SK_ID_CURR").sum()
data["cnt_installments_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["cnt_installments_fut_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.sum().groupby("SK_ID_CURR").mean()
data["cnt_installments_fut_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.sum().groupby("SK_ID_CURR").max()
data["cnt_installments_fut_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.sum().groupby("SK_ID_CURR").sum()
data["cnt_installments_fut_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
data["cnt_installments_pos_minus_pos_f"] = data["cnt_installments_pos_total"] - data["cnt_installments_fut_pos_total"] 

data["cnt_installments_dpd_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").mean()
data["cnt_installments_dpd_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").max()
data["cnt_installments_dpd_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").sum()
data["cnt_installments_dpd_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
      
pos_history["PAR_X_AT_Y"] = pos_history["SK_DPD"]/pos_history["MONTHS_BALANCE"]

data["cnt_installments_par_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.sum().groupby("SK_ID_CURR").mean()
data["cnt_installments_par_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.sum().groupby("SK_ID_CURR").max()
data["cnt_installments_par_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.sum().groupby("SK_ID_CURR").sum()
data["cnt_installments_par_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

In [None]:
pos_features = ["cnt_installments_pos_mean"
,"cnt_installments_pos_max"
,"cnt_installments_pos_total"
,"cnt_installments_pos_trend"
,"cnt_installments_fut_pos_mean"
,"cnt_installments_fut_pos_max"
,"cnt_installments_fut_pos_total"
,"cnt_installments_fut_pos_trend"
,"cnt_installments_pos_minus_pos_f"
,"cnt_installments_dpd_pos_mean" 
,"cnt_installments_dpd_pos_max"
,"cnt_installments_dpd_pos_total"
,"cnt_installments_dpd_pos_trend"
,"cnt_installments_par_pos_mean"
,"cnt_installments_par_pos_max"
,"cnt_installments_par_pos_total"
,"cnt_installments_par_pos_trend"]

In [None]:
# installmet features
data["amt_installments_max_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").max()
data["amt_installments_min_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").min()
data["amt_installments_total_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").sum()
data["amt_installments_mean_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").mean()
data["amt_installments_trend"] =installments_payments.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV","NUM_INSTALMENT_NUMBER"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["amt_pay_installments_max_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.sum().groupby("SK_ID_CURR").max()
data["amt_pay_installments_min_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.sum().groupby("SK_ID_CURR").min()
data["amt_pay_installments_total_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.sum().groupby("SK_ID_CURR").sum()
data["amt_pay_installments_mean_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").mean()
data["amt_pay_installments_trend"] =installments_payments.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV","NUM_INSTALMENT_NUMBER"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()


data["installments_payment_vs_amt_installment"] = data["amt_pay_installments_total_amt"]/data["amt_installments_total_amt"]
print(f"Seventh batch of features on: {round((time() - t0)/60,2)} minutes")


In [None]:
installment_features = ["amt_installments_max_amt"
,"amt_installments_min_amt"
,"amt_installments_total_amt"
,"amt_installments_mean_amt"
,"amt_installments_trend"
,"amt_pay_installments_max_amt" 
,"amt_pay_installments_min_amt" 
,"amt_pay_installments_total_amt"
,"amt_pay_installments_mean_amt"
,"amt_pay_installments_trend"
,"installments_payment_vs_amt_installment"
    
]

In [None]:
  # credit cards balance
t0 = time()
data["mean_credit_cards_months"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.size().groupby("SK_ID_CURR").mean()
data["max_credit_cards_months"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.size().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.sum().groupby("SK_ID_CURR").sum()
data["max_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.sum().groupby("SK_ID_CURR").max()
data["mean_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_balance"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
print(f"Fifth batch of features on: {round((time() - t0)/60,2)} minutes")


t0 = time()
data["max_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_CREDIT_LIMIT_ACTUAL.sum().groupby("SK_ID_CURR").max()
data["mean_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_CREDIT_LIMIT_ACTUAL.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_balance"] = credit_card_balance.sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_CREDIT_LIMIT_ACTUAL.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
data["max_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_paymentcurrent"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").max()
data["mean_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_paymentcurrent"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_total_recieivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_total_recieivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_total_recieivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_total_recieivable"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_min_regularity"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_min_regularity"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_min_regularity"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_min_regularity"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_payment_total_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_payment_total_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_payment_total_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_payment_total_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_drawings_atm_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_drawings_atm_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_drawings_atm_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_drawings_atm_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_drawings_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
print(f"Sixth batch of features on: {round((time() - t0)/60,2)} minutes")

t0 = time()
data["max_credit_cards_amt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_drawings_other_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_drawings_pos_total_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_amt_recivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_amt_recivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_amt_recivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_amt_recivable"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_cnt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_cnt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_cnt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_cnt_drawings_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_cnt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_cnt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_cnt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_cnt_drawings_other_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_dpd_def"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_dpd_def"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_dpd_def"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_dpd_def"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

data["max_credit_cards_dpd"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").max()
data["total_credit_cards_dpd"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").sum()
data["mean_credit_cards_dpd"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").mean()
data["mean_trend_credit_dpd"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

# interacciones credit card balance

data["amt_paycurr_div_total_amt_balance"] = data["max_credit_cards_amt_paymentcurrent"]/data["total_credit_cards_amt_balance"]
data["amt_total_balnce_div_total_recivable"] = data["total_credit_cards_amt_balance"]/data["total_credit_cards_amt_total_recieivable"]



In [None]:
credit_card_features = ["mean_credit_cards_months"
,"max_credit_cards_months"
,"total_credit_cards_amt_balance"
,"max_credit_cards_amt_balance"
,"mean_credit_cards_amt_balance"
,"mean_trend_credit_cards_amt_balance"
,"max_credit_cards_amt_balance"
,"mean_credit_cards_amt_balance"
,"mean_trend_credit_cards_amt_balance"
,"max_credit_cards_amt_paymentcurrent"
,"total_credit_cards_amt_paymentcurrent"
,"mean_credit_cards_amt_paymentcurrent"
,"mean_trend_credit_cards_amt_paymentcurrent"
,"max_credit_cards_amt_paymentcurrent"
,"mean_credit_cards_amt_paymentcurrent"
,"mean_trend_credit_cards_amt_paymentcurrent"
,"max_credit_cards_amt_total_recieivable"
,"total_credit_cards_amt_total_recieivable"
,"mean_credit_cards_amt_total_recieivable"
,"mean_trend_credit_cards_amt_total_recieivable"
,"max_credit_cards_amt_min_regularity"
,"total_credit_cards_amt_min_regularity"
,"mean_credit_cards_amt_min_regularity"
,"mean_trend_credit_cards_amt_min_regularity"
,"max_credit_cards_amt_payment_total_current" 
,"total_credit_cards_amt_payment_total_current"
,"mean_credit_cards_amt_payment_total_current"
,"mean_trend_credit_cards_amt_payment_total_current"
,"max_credit_cards_amt_drawings_atm_current" 
,"total_credit_cards_amt_drawings_atm_current"
,"mean_credit_cards_amt_drawings_atm_current"
,"mean_trend_credit_cards_amt_drawings_atm_current"
,"max_credit_cards_amt_drawings_current"
,"total_credit_cards_amt_drawings_current"
,"mean_credit_cards_amt_drawings_current"
,"mean_trend_credit_cards_amt_drawings_current"
,"max_credit_cards_amt_drawings_other_current"
,"total_credit_cards_amt_drawings_other_current"
,"mean_credit_cards_amt_drawings_other_current"
,"mean_trend_credit_cards_amt_drawings_other_current"
,"max_credit_cards_amt_drawings_pos_current"
,"total_credit_cards_amt_drawings_pos_current"
,"mean_credit_cards_amt_drawings_pos_current"
,"mean_trend_credit_cards_drawings_pos_total_current"
,"max_credit_cards_amt_recivable"
,"total_credit_cards_amt_recivable"
,"mean_credit_cards_amt_recivable"
,"mean_trend_credit_cards_amt_recivable"
,"max_credit_cards_cnt_atm_drawings_current"
,"total_credit_cards_cnt_atm_drawings_current"
,"mean_credit_cards_cnt_atm_drawings_current"
,"mean_trend_credit_cards_cnt_atm_drawings_current"
,"max_credit_cards_cnt_drawings_current"
,"total_credit_cards_cnt_drawings_current"
,"mean_credit_cards_cnt_drawings_current"
,"mean_trend_credit_cards_cnt_drawings_current"
,"max_credit_cards_cnt_drawings_other_current"
,"total_credit_cards_cnt_drawings_other_current"
,"mean_credit_cards_cnt_drawings_other_current"
,"mean_trend_credit_cards_cnt_drawings_other_current"
,"max_credit_cards_cnt_drawings_pos_current"
,"total_credit_cards_cnt_drawings_pos_current"
,"mean_credit_cards_cnt_drawings_pos_current"
,"mean_trend_credit_cards_cnt_drawings_pos_current"
,"max_credit_cards_cnt_installment_mature_cum" 
,"total_credit_cards_cnt_installment_mature_cum"
,"mean_credit_cards_cnt_installment_mature_cum"
,"mean_trend_credit_cards_cnt_installment_mature_cum"
,"max_credit_cards_dpd_def"
,"total_credit_cards_dpd_def"
,"mean_credit_cards_dpd_def"
,"mean_trend_credit_dpd_def"
,"max_credit_cards_dpd"
,"total_credit_cards_dpd"
,"mean_credit_cards_dpd"
,"mean_trend_credit_dpd"
,"amt_paycurr_div_total_amt_balance"
,"amt_total_balnce_div_total_recivable"
]







In [None]:
credit_bureau_features = ['active_accounts',
 'cloased_accounts',
 'max_amt_overdue',
 'mean_amt_overdue',
 'mean_amt_credit_debt',
 'max_amt_credit_debt',
 'mean_amt_credit_sum',
 'max_amt_credit_sum',
 'sum_amt_credit_debt',
 'mean_amt_credit_limit',
 'sum_amt_credit_overdue',
 'credit_sum_by_debt',
 'credit_sum_by_limit',
 'credit_sum_by_overdue',
 'total_annuity',
 'active_prem_credits',
 'active_bank_credits',
 'active_credit_cards',
 'closed_prem_credits',
 'mean_debt_active_bank_credits',
 'mean_debt_active_credit_cards',
 'sum_debt_active_bank_credits',
 'sum_debt_active_credit_cards',
 'sum_debt_active_credits',
 'max_debt_closed_premium_loans',
 'perc_util_revolving',
 'credit_debt_div_lim_cred',
 'credit_lim_div_cred_overdue',
 'max_cred_lim_non_delinq',
 'max_cred_lim_delinq',
 'max_cred_lim_overdue',
 'mean_cred_lim_overdue',
 'payment_history',
 'mean_consecutive_non_delinq',
 'max_consecutive_non_delinq',
 'times_no_bad_delinquency',
 'times_bad_delinquency',
 'antiguedad_media_dias',
 'antiguedad_media_dias_closed_accounts',
 'antiguedad_maxima_dias_closed_accounts',
 'antiguedad_maxima_dias_open_accounts',
 'closed_accounts_last6m',
 'closed_accounts_last3m',
 'closed_accounts_last2m',
 'open_accounts_last12m',
 'open_accounts_last9m',
 'open_accounts_last6m',
 'open_accounts_last3m',
 'open_accounts_last2m',
 'open_accounts_last1m',
 'sine_cred_sum',
 'cosine_cred_sum_debt',
 'amt_req_credit_bureau_qrt',
 'amt_req_credit_bureau_year',
 'amt_income_total',
 'amt_credit',
 'amt_annuity',
 'amt_goods_price',
 'days_birth',
 'days_employed',
 'ext_source_1',
 'ext_source_2',
 'ext_source_3',
 'ext_source_mean',
 'ext_source_entropy',
 'log_amt_goods_price_amt_income',
 'credit_to_annuity_ratio',
 'credit_to_goods_ratio',
 'income_per_person',
 'payment_rate',
 'application_model_stacked_model_predicted',
 'flag_model_predicted',
 'sociodemo_model_predicted',
 'application_model_predicted']


In [None]:
tmp = data.reset_index().rename(columns = {"index":"SK_ID_CURR"})
tmp = tmp[tmp["SK_ID_CURR"].isin(train_skid_curr)]
    
target = application_train[["SK_ID_CURR", "TARGET"]]

tmp.columns = tmp.columns.str.lower()
target.columns = target.columns.str.lower()

tmp = pd.merge(left = tmp, right =target, on = "sk_id_curr" )

app = application_train[['SK_ID_CURR',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',  
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',    
 "EXT_SOURCE_MEAN",
  "EXT_SOURCE_ENTROPY",
  "LOG_AMT_CREDIT_AMT_INCOME",
  "LOG_AMT_GOODS_PRICE_AMT_INCOME",
  'CREDIT_TO_ANNUITY_RATIO',
  'CREDIT_TO_GOODS_RATIO',
  'INCOME_CREDIT_PERCENTAGE',
  'INCOME_PER_CHILD',
  'INCOME_PER_PERSON', 
  'PAYMENT_RATE','APPLICATION_MODEL_STACKED_MODEL_PREDICTED',
 "FLAG_MODEL_PREDICTED", "SOCIODEMO_MODEL_PREDICTED", "APPLICATION_MODEL_PREDICTED"]].copy()


app.columns = app.columns.str.lower()

tmp = tmp.merge(app, on = "sk_id_curr")

In [None]:
tmp.head()

In [None]:
cont_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = -99999)), ("preprocessing", StandardScaler())])
cat_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = "missing")), ("encoding", OneHotEncoder(handle_unknown = "ignore"))])

preprocessing = make_column_transformer((cont_prepro, selector(dtype_exclude = "object")), (cat_prepro,selector(dtype_include = "object")))

estimator = Pipeline([("preprocessing", preprocessing),("model",XGBClassifier(random_state = 42))])

credit_model = train_binary(X = tmp[credit_bureau_features] ,y = target.target, estimator = estimator, cv = 3, refit_all = True, verbose = True)
credit_model = credit_model["model"]

In [None]:
pd.DataFrame(credit_model["model"].feature_importances_, index = tmp[credit_bureau_features].columns).sort_values(by = 0, ascending = False).tail(25)

In [None]:
# Preparar modelo con previous app data

prev_app_features = [
"sum_prev_applications"
,"mean_amt_prev_applications"
,"mean_amt_cred_prev_applications"
,"mean_amt_annuity_prev_applications"
,"sum_amt_downpayment_prev_applications"
,"mean_amt_goodsprice_prev_applications"
,"mean_days_last_due_prev_applications"
,"mean_days_first_due_prev_applications"
,"mean_term_prev_applications"
,"total_previous_revolving_credits"
,"mean_amt_credit_previous_revolving_credits"
,"mean_amt_downpayment_previous_revolving_credits"
,"perc_prev_app_refused"
,"perc_prev_app_approved"
,"perc_prev_app_other"
,"mean_amt_prev_app_refused" 
,"max_amt_prev_app_refused" 
,"min_amt_prev_app_refused"
,"channel_aq_last_refused_app"]

In [None]:
cont_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = -99999)), ("preprocessing", StandardScaler())])
cat_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = "missing")), ("encoding", OneHotEncoder(handle_unknown = "ignore"))])

preprocessing = make_column_transformer((cont_prepro, selector(dtype_exclude = "object")), (cat_prepro,selector(dtype_include = "object")))

estimator = Pipeline([("preprocessing", preprocessing),("model",XGBClassifier(random_state = 42))])

prev_app_model = train_binary(X = tmp[prev_app_features] ,y = target.target, estimator = estimator, cv = 3, refit_all = True, verbose = True)
prev_app_model = credit_model["model"]

In [None]:
tmp.shape

In [None]:
data.shape

In [None]:
application_train.shape

In [None]:
# agregar una columna con las prediciones del modelo de credito
tmp["credit_bureau_model_stacked_predicted"] = credit_model.predict_proba(tmp[credit_bureau_features])[:,1]

In [None]:
# Preparar modelo con previous app data

prev_app_features = [
"sum_prev_applications"
,"mean_amt_prev_applications"
,"mean_amt_cred_prev_applications"
,"mean_amt_annuity_prev_applications"
,"sum_amt_downpayment_prev_applications"
,"mean_amt_goodsprice_prev_applications"
,"mean_days_last_due_prev_applications"
,"mean_days_first_due_prev_applications"
,"mean_term_prev_applications"
,"total_previous_revolving_credits"
,"mean_amt_credit_previous_revolving_credits"
,"mean_amt_downpayment_previous_revolving_credits"
,"perc_prev_app_refused"
,"perc_prev_app_approved"
,"perc_prev_app_other"
,"mean_amt_prev_app_refused" 
,"max_amt_prev_app_refused" 
,"min_amt_prev_app_refused"
,"channel_aq_last_refused_app"
,"credit_bureau_model_stacked_predicted"]

In [None]:
cont_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = -99999)), ("preprocessing", StandardScaler())])
cat_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = "missing")), ("encoding", OneHotEncoder(handle_unknown = "ignore"))])

preprocessing = make_column_transformer((cont_prepro, selector(dtype_exclude = "object")), (cat_prepro,selector(dtype_include = "object")))

estimator = Pipeline([("preprocessing", preprocessing),("model",XGBClassifier(random_state = 42))])

prev_app_model = train_binary(X = tmp[prev_app_features] ,y = target.target, estimator = estimator, cv = 3, refit_all = True, verbose = True)
prev_app_model = prev_app_model["model"]

In [None]:
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
application_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
bureau = pd.read_csv("../input/home-credit-default-risk/bureau.csv")
bureau_balance = pd.read_csv("../input/home-credit-default-risk/bureau_balance.csv")
pOS_CASH_balance = pd.read_csv("../input/home-credit-default-risk/POS_CASH_balance.csv")
previous_application = pd.read_csv("../input/home-credit-default-risk/previous_application.csv")
homeCredit_columns_description = pd.read_csv("../input/home-credit-default-risk/HomeCredit_columns_description.csv", encoding="latin1")
installments_payments = pd.read_csv("../input/home-credit-default-risk/installments_payments.csv")
credit_card_balance = pd.read_csv("../input/home-credit-default-risk/credit_card_balance.csv")

# features_data= pd.read_pickle("./features.pkl")

![](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

In [None]:
# %% [code] {"_kg_hide-input":false}
import numpy as np
import pandas as pd

from functools import reduce
from itertools import groupby

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from scipy import stats

from scipy.stats import entropy
import pickle
import gzip

import pdb

from time import time




premium_loans = ["Real estate loan",
                 "Car loan",
                 "Loan for purchase of shares (margin lending)"]

working_capital_loans = ["Loan for working capital replenishment",
                         "Loan for the purchase of equipment",
                         "Loan for business development"]

bank_credits = ["Microloan",
                "Consumer credit",
                "Interbank credit",
                "Consumer credit"]

credit_cards = ["Credit card"]

others = ["Unknown type of loan", "Another type of loan",
       "Cash loan (non-earmarked)",
       "Mobile operator loan", "Interbank credit",
       "Loan for purchase of shares (margin lending)"]




flag_doc_feat = [
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL', 
 'HOUR_APPR_PROCESS_START']

social_demo_feat = [
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'REGION_POPULATION_RELATIVE',
 'CNT_CHILDREN',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'DAYS_LAST_PHONE_CHANGE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_CONTRACT_TYPE',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE', 
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

main_application_feat = ['AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',  
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',    
 "EXT_SOURCE_MEAN",
  "EXT_SOURCE_ENTROPY",
  "LOG_AMT_CREDIT_AMT_INCOME",
  "LOG_AMT_GOODS_PRICE_AMT_INCOME",
  'CREDIT_TO_ANNUITY_RATIO',
  'CREDIT_TO_GOODS_RATIO',
  'INCOME_CREDIT_PERCENTAGE',
  'INCOME_PER_CHILD',
  'INCOME_PER_PERSON', 
  'PAYMENT_RATE']

# expendable_feat = ['INCOME_PER_CHILD',
#   'INCOME_PER_PERSON', 
#   'AMT_REQ_CREDIT_BUREAU_DAY',
#   'AMT_REQ_CREDIT_BUREAU_WEEK',
#   'AMT_REQ_CREDIT_BUREAU_MON',
#   'AMT_REQ_CREDIT_BUREAU_QRT',
#   'AMT_REQ_CREDIT_BUREAU_YEAR',
#   'DAYS_BIRTH',
#   'DAYS_EMPLOYED',
#   'DAYS_REGISTRATION']

useless_credit_feat = ['total_accounts',
 'mean_amt_credit_overdue',
 'max_amt_credit_overdue',
 'active_wk_credits',
 'active_other_credits',
 'closed_wk_credits',
 'closed_bank_credits',
 'closed_credit_cards',
 'closed_other_credits',
 'mean_debt_active_prem_credits',
 'mean_debt_active_wk_credits',
 'mean_debt_active_other_credits',
 'sum_debt_active_prem_credits',
 'sum_debt_active_wk_credits',
 'sum_debt_active_other_credits',
 'trend_debt_active_credits',
 'max_debt_closed_wk_credits',
 'max_debt_closed_bank_credits',
 'max_debt_closed_credit_card',
 'max_debt_closed_debt_others',
 'max_cred_lim_non_overdue',
 'mean_cred_lim_delinq',
 'times_bucket_1',
 'times_bucket_2',
 'times_bucket_4',
 'times_bucket_6',
 'antiguedad_media_dias_open_accounts',
 'sum_prolong_days',
 'mean_prolong_days',
 'closed_accounts_last12m',
 'closed_accounts_last9m',
 'closed_accounts_last1m',
 'n_unique_credit_types',
 'cosine_cred_sum',
 'sine_cred_sum_debt',
 'cosine_cred_sum_limit',
 'sine_cred_sum_limit',
 'target',
 'amt_req_credit_bureau_hour',
 'amt_req_credit_bureau_day',
 'amt_req_credit_bureau_week',
 'amt_req_credit_bureau_mon',
 'days_registration',
 'log_amt_credit_amt_income',
 'income_credit_percentage',
 'income_per_child']


credit_bureau_features = ['active_accounts',
 'cloased_accounts',
 'max_amt_overdue',
 'mean_amt_overdue',
 'mean_amt_credit_debt',
 'max_amt_credit_debt',
 'mean_amt_credit_sum',
 'max_amt_credit_sum',
 'sum_amt_credit_debt',
 'mean_amt_credit_limit',
 'sum_amt_credit_overdue',
 'credit_sum_by_debt',
 'credit_sum_by_limit',
 'credit_sum_by_overdue',
 'total_annuity',
 'active_prem_credits',
 'active_bank_credits',
 'active_credit_cards',
 'closed_prem_credits',
 'mean_debt_active_bank_credits',
 'mean_debt_active_credit_cards',
 'sum_debt_active_bank_credits',
 'sum_debt_active_credit_cards',
 'sum_debt_active_credits',
 'max_debt_closed_premium_loans',
 'perc_util_revolving',
 'credit_debt_div_lim_cred',
 'credit_lim_div_cred_overdue',
 'max_cred_lim_non_delinq',
 'max_cred_lim_delinq',
 'max_cred_lim_overdue',
 'mean_cred_lim_overdue',
 'payment_history',
 'mean_consecutive_non_delinq',
 'max_consecutive_non_delinq',
 'times_bucket_3',
 'times_bucket_5',
 'antiguedad_media_dias',
 'antiguedad_media_dias_closed_accounts',
 'antiguedad_maxima_dias_closed_accounts',
 'antiguedad_maxima_dias_open_accounts',
 'closed_accounts_last6m',
 'closed_accounts_last3m',
 'closed_accounts_last2m',
 'open_accounts_last12m',
 'open_accounts_last9m',
 'open_accounts_last6m',
 'open_accounts_last3m',
 'open_accounts_last2m',
 'open_accounts_last1m',
 'sine_cred_sum',
 'cosine_cred_sum_debt',
 'credit_type_entropy',
 'amt_cred_sum_entropy',
 'amt_sum_debt_entropy',
 'amt_cred_limit_entropy',
 'amt_req_credit_bureau_qrt',
 'amt_req_credit_bureau_year',
 'amt_income_total',
 'amt_credit',
 'amt_annuity',
 'amt_goods_price',
 'days_birth',
 'days_employed',
 'ext_source_1',
 'ext_source_2',
 'ext_source_3',
 'ext_source_mean',
 'ext_source_entropy',
 'log_amt_goods_price_amt_income',
 'credit_to_annuity_ratio',
 'credit_to_goods_ratio',
 'income_per_person',
 'payment_rate',
 'application_model_stacked_model_predicted',
 'flag_model_predicted',
 'sociodemo_model_predicted',
 'application_model_predicted']


def train_binary(X ,y, estimator, cv = 3, refit_all = True, verbose = True):
    """
    Creates a feature based on predictions from kfold
    """
    ls = list()
    result = {}
    kf = StratifiedKFold(n_splits = cv)
    k = 0
    for train_idx, test_idx in kf.split(X, y):
        k+=1
        X_train, X_test = X.iloc[train_idx,:], X.iloc[test_idx,:]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model = estimator.fit(X_train, y_train)
        predictions = model.predict_proba(X_test)[:,1]
        if verbose:
            score = roc_auc_score(y_true=y_test, y_score=predictions)
            print(f"Score on test set for fold {k} is :{round(score,3)}")
        
        ls.append(predictions)
        
    feature = np.hstack(ls)
    
    if refit_all:
        model = estimator.fit(X, y)
        
        result["model"] = model
    
    result["feature"] = feature
    
    return result

def get_entropy(series, categorical = True):

    try:
        if categorical:
            data = series.value_counts(True)
        else:
            data = pd.qcut(series,[0, .25, .5, .75, 1.], duplicates= "drop").value_counts(True)

        return entropy(data)
    except:
        return 0

def len_iter(items):
    return sum(1 for _ in items)


def consecutive_values(data, bin_val):
    try:
        return max(len_iter(run) for val, run in groupby(data) if val == bin_val)/len(data)
    except:
        return 0

def get_linear_regression(series):
    result ={}
    try:
        n = len(series)
        # X = np.arange(n).reshape(-1,1)
        X = np.arange(n)
        y = series.cumsum()
        lr = stats.linregress(x, y)
    
        result["trend"] = lr.slope
        result["intercept"] = lr.intercept
    except:
        result["trend"] = -1
        result["intercept"] = -1
    return result


def main(application_train, application_test, previous_application, bureau, bureau_balance, installments_payments, credit_card_balance, pos_history, return_X_y = False, verbose = True):
    """
    Generates a binary classification model

    """

    print(f"Processing bureau balance...")
    t0 = time()

    train_skid_curr = application_train["SK_ID_CURR"].unique().tolist()
    test_skid_curr = application_test["SK_ID_CURR"].unique().tolist()


    # For preprocessing
    bureau_balance["STATUS_NUMERIC"] = bureau_balance.STATUS.map({"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"X":None,"C":None})
    bureau_balance["EVER_DELINQUENT"] = np.where(bureau_balance.STATUS != '0',1,0)

    aux_c = bureau_balance[bureau_balance.STATUS == "0"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_0"})
    aux_1 = bureau_balance[bureau_balance.STATUS == "1"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_1"})
    aux_2 = bureau_balance[bureau_balance.STATUS == "2"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_2"})
    aux_3 = bureau_balance[bureau_balance.STATUS == "3"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_3"})
    aux_4 = bureau_balance[bureau_balance.STATUS == "4"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_4"})
    aux_5 = bureau_balance[bureau_balance.STATUS == "5"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_5"})
    aux_6 = bureau_balance[bureau_balance.STATUS == "6"].groupby("SK_ID_BUREAU").STATUS.size().to_frame().reset_index().rename(columns = {"STATUS":"TIMES_BUCKET_6"})

    
    # aux_ever_delinq = bureau_balance.groupby("SK_ID_BUREAU").STATUS_NUMERIC.apply(lambda x: get_ever_delinquent(x)).to_frame().reset_index().rename(columns = {"STATUS_NUMERIC":"EVER_DELINQUENT"})
    aux_ever_delinq = bureau_balance.groupby("SK_ID_BUREAU").EVER_DELINQUENT.max().to_frame().reset_index().rename(columns = {"STATUS":"EVER_DELINQUENT"})
    aux_ph = bureau_balance.groupby("SK_ID_BUREAU").STATUS_NUMERIC.mean().to_frame().reset_index().rename(columns = {"STATUS_NUMERIC":"PAYMENT_HISTORY"})
    aux_consecutive = bureau_balance.groupby("SK_ID_BUREAU").STATUS_NUMERIC.apply(lambda x:consecutive_values(x, 0)).to_frame().reset_index().rename(columns = {"STATUS_NUMERIC":"CONSECUTIVE_NO_DELINQ"})

    
    ldf_aux = [bureau,aux_c, aux_1, aux_2, aux_3, aux_4, aux_5, aux_6, aux_ph, aux_consecutive,aux_ever_delinq]

    bureau = reduce(lambda x, y: pd.merge(x, y, on = "SK_ID_BUREAU", how = "left"), ldf_aux)
    print(f"Bureau processed on: {round((time() - t0)/60,2)} minutes")
    
    print("Creating extra application features...")
    
    # application features on train
    application_train["EXT_SOURCE_MEAN"] = (application_train["EXT_SOURCE_1"] + application_train["EXT_SOURCE_2"] + application_train["EXT_SOURCE_3"])/3
    application_train["EXT_SOURCE_ENTROPY"] = -(application_train["EXT_SOURCE_1"] * np.log1p(application_train["EXT_SOURCE_1"]) + application_train["EXT_SOURCE_2"] * np.log1p(application_train["EXT_SOURCE_2"]) + application_train["EXT_SOURCE_3"] * np.log1p(application_train["EXT_SOURCE_3"]))
    application_train["LOG_AMT_CREDIT_AMT_INCOME"] = np.log1p(application_train["AMT_CREDIT"]/application_train["AMT_INCOME_TOTAL"])
    application_train["LOG_AMT_GOODS_PRICE_AMT_INCOME"] = np.log(application_train["AMT_GOODS_PRICE"]/application_train["AMT_INCOME_TOTAL"])
    application_train['CREDIT_TO_ANNUITY_RATIO'] = application_train['AMT_CREDIT'] / application_train['AMT_ANNUITY']
    application_train['CREDIT_TO_GOODS_RATIO'] = application_train['AMT_CREDIT'] / application_train['AMT_GOODS_PRICE']
    application_train['INCOME_CREDIT_PERCENTAGE'] = application_train['AMT_INCOME_TOTAL'] / application_train['AMT_CREDIT']
    application_train['INCOME_PER_CHILD'] = application_train['AMT_INCOME_TOTAL'] / (1 + application_train['CNT_CHILDREN'])
    application_train['INCOME_PER_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']
    application_train['PAYMENT_RATE'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']
    
     # application features on test
    application_test["EXT_SOURCE_MEAN"] = (application_test["EXT_SOURCE_1"] + application_test["EXT_SOURCE_2"] + application_test["EXT_SOURCE_3"])/3
    application_test["EXT_SOURCE_ENTROPY"] = -(application_test["EXT_SOURCE_1"] * np.log1p(application_test["EXT_SOURCE_1"]) + application_test["EXT_SOURCE_2"] * np.log1p(application_test["EXT_SOURCE_2"]) + application_test["EXT_SOURCE_3"] * np.log1p(application_test["EXT_SOURCE_3"]))
    application_test["LOG_AMT_CREDIT_AMT_INCOME"] = np.log1p(application_test["AMT_CREDIT"]/application_test["AMT_INCOME_TOTAL"])
    application_test["LOG_AMT_GOODS_PRICE_AMT_INCOME"] = np.log(application_test["AMT_GOODS_PRICE"]/application_test["AMT_INCOME_TOTAL"])
    application_test['CREDIT_TO_ANNUITY_RATIO'] = application_test['AMT_CREDIT'] / application_test['AMT_ANNUITY']
    application_test['CREDIT_TO_GOODS_RATIO'] = application_test['AMT_CREDIT'] / application_test['AMT_GOODS_PRICE']
    application_test['INCOME_CREDIT_PERCENTAGE'] = application_test['AMT_INCOME_TOTAL'] / application_test['AMT_CREDIT']
    application_test['INCOME_PER_CHILD'] = application_test['AMT_INCOME_TOTAL'] / (1 + application_test['CNT_CHILDREN'])
    application_test['INCOME_PER_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
    application_test['PAYMENT_RATE'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']
    



    print(f"Training stacked model with app features...")
    t0 = time()

    cont_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = -1)), ("preprocessing", StandardScaler()), ("transformer",QuantileTransformer(output_distribution = "normal")), ("discretizer", KBinsDiscretizer(n_bins = 10))])
    cat_prepro = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = "missing")), ("encoding", OneHotEncoder(handle_unknown = "ignore"))])

    preprocessing = make_column_transformer((cont_prepro, selector(dtype_exclude = "object")), (cat_prepro,selector(dtype_include = "object")))

    dataset = application_train[flag_doc_feat].copy()

    estimator_flag = LogisticRegression(max_iter = 1000, class_weight = "balanced")
    estimator_social = Pipeline([("preprocessing", preprocessing),("model",LogisticRegression(class_weight= "balanced", max_iter= 1000))])
    estimator_main_app = Pipeline([("preprocessing", preprocessing),("model",LogisticRegression(class_weight= "balanced", max_iter= 1000))])
    ensembler = GradientBoostingClassifier(random_state =42,n_estimators= 100, max_features= None)

    # Train a model using only validation features
    X = application_train[flag_doc_feat].copy()
    y = application_train["TARGET"].copy()
    print("Customer validation model...")
    cust_behaiv_model = train_binary(X = X, y =y, estimator= estimator_flag)
    
    application_train["FLAG_MODEL_PREDICTED"] = cust_behaiv_model["model"].predict_proba(application_train[flag_doc_feat])[:,1]
    application_test["FLAG_MODEL_PREDICTED"] = cust_behaiv_model["model"].predict_proba(application_test[flag_doc_feat])[:,1]

    # Train a model using only socio-demo features
    X = application_train[social_demo_feat].copy()
    y = application_train["TARGET"].copy()
    print("Sociodemograph model...")
    social_model = train_binary(X = X, y = y, estimator = estimator_social)
    
    application_train["SOCIODEMO_MODEL_PREDICTED"] = social_model["model"].predict_proba(application_train[social_demo_feat])[:,1]
    application_test["SOCIODEMO_MODEL_PREDICTED"] = social_model["model"].predict_proba(application_test[social_demo_feat])[:,1]
    
    # Train a model using most relevant application features
    X = application_train[main_application_feat].copy()
    y = application_train["TARGET"].copy()
    print("Application model...")
    application_model = train_binary(X = X, y = y, estimator = estimator_main_app)
    
    application_train["APPLICATION_MODEL_PREDICTED"] = application_model["model"].predict_proba(application_train[main_application_feat])[:,1]
    application_test["APPLICATION_MODEL_PREDICTED"] = application_model["model"].predict_proba(application_test[main_application_feat])[:,1]
    
    predictions_feat = ["FLAG_MODEL_PREDICTED", "SOCIODEMO_MODEL_PREDICTED", "APPLICATION_MODEL_PREDICTED"]
    
    # Train a model using the output of the models
    X = application_train[predictions_feat].copy()
    y = application_train["TARGET"].copy()
    print("Application ensemble...")
    ensemble_model = train_binary(X = X, y = y, estimator = ensembler)
    
    application_train["APPLICATION_MODEL_STACKED_MODEL_PREDICTED"] = ensemble_model["model"].predict_proba(application_train[predictions_feat])[:,1]
    application_test["APPLICATION_MODEL_STACKED_MODEL_PREDICTED"] = ensemble_model["model"].predict_proba(application_test[predictions_feat])[:,1]
    
    
    print("Stacked models successfully trained!")
    
    
    application_test["TARGET"] = None
    
    application_test["SPLIT"] = "test"
    application_train["SPLIT"] = "train"

    print(f"- application train instances: {application_train.shape[0]}")
    print(f"- application test instances: {application_test.shape[0]}")
    print(f"- concatenated dataset instances: {application_train.shape[0] + application_test.shape[0]}")

    application_data = pd.concat([application_train, application_test], ignore_index  = True)
    
    # application features
    application_data["ext_source_mean"] = (application_data["EXT_SOURCE_1"] + application_data["EXT_SOURCE_2"] + application_data["EXT_SOURCE_3"])/3
    application_data["entropy_ex_source"] = -(application_data["EXT_SOURCE_1"] * np.log1p(application_data["EXT_SOURCE_1"]) + application_data["EXT_SOURCE_2"] * np.log1p(application_data["EXT_SOURCE_2"]) + application_data["EXT_SOURCE_3"] * np.log1p(application_data["EXT_SOURCE_3"]))
    application_data["log_amt_credit_amt_income"] = np.log1p(application_data["AMT_CREDIT"]/application_data["AMT_INCOME_TOTAL"])
    application_data["log_amt_goods_price_amt_income"] = np.log(application_data["AMT_GOODS_PRICE"]/application_data["AMT_INCOME_TOTAL"])
    
    application_data['credit_to_annuity_ratio'] = application_data['AMT_CREDIT'] / application_data['AMT_ANNUITY']
    application_data['credit_to_goods_ratio'] = application_data['AMT_CREDIT'] / application_data['AMT_GOODS_PRICE']
    application_data['days_employed_percentage'] = application_data['DAYS_EMPLOYED'] / application_data['DAYS_BIRTH']
    application_data['income_credit_percentage'] = application_data['AMT_INCOME_TOTAL'] / application_data['AMT_CREDIT']
    application_data['income_per_child'] = application_data['AMT_INCOME_TOTAL'] / (1 + application_data['CNT_CHILDREN'])
    application_data['income_per_person'] = application_data['AMT_INCOME_TOTAL'] / application_data['CNT_FAM_MEMBERS']
    application_data['payment_rate'] = application_data['AMT_ANNUITY'] / application_data['AMT_CREDIT']
    

    sk_id = application_data["SK_ID_CURR"].unique().tolist()

    data = pd.DataFrame(index = sk_id)

    print(f"Model based features on: {round((time() - t0)/60,2)} minutes")
    print("Initial dataframe info:")
    print(data.info())

    print(f"Creating features...")
    T0 = time()
    t0 = time()
    # total accounts
    data["total_accounts"] = bureau.groupby("SK_ID_CURR").size()
    # Open and closed accounts
    data["active_accounts"] = bureau.groupby("SK_ID_CURR").apply(lambda x: (x["CREDIT_ACTIVE"] == "Active").sum())
    data["cloased_accounts"] = bureau.groupby("SK_ID_CURR").apply(lambda x: (x["CREDIT_ACTIVE"] == "Closed").sum())
    
    data["max_amt_overdue"] = bureau.groupby("SK_ID_CURR").AMT_CREDIT_MAX_OVERDUE.max()
    data["mean_amt_overdue"] = bureau.groupby("SK_ID_CURR").AMT_CREDIT_MAX_OVERDUE.mean()
    data["mean_amt_credit_debt"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].mean()
    data["sum_amt_credit_sum"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].sum()
    data["sum_amt_credit_debt"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].sum()
    data["mean_amt_credit_limit"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].mean()
    data["sum_amt_credit_limit"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].sum()
    data["sum_amt_credit_overdue"]=bureau.groupby("SK_ID_CURR")["AMT_CREDIT_MAX_OVERDUE"].sum()
    print(f"First batch of features on: {round((time() - t0)/60,2)} minutes")
    
    data["credit_sum_by_debt"] = data["sum_amt_credit_debt"]/data["sum_amt_credit_sum"]
    data["credit_sum_by_limit"] = data["sum_amt_credit_limit"]/data["sum_amt_credit_sum"]
    data["credit_sum_by_overdue"] = data["sum_amt_credit_overdue"]/data["sum_amt_credit_sum"]
    
    t0 = time()
    # dividir entre AMT_CREDIT_SUM
    data["total_annuity"] = bureau.groupby("SK_ID_CURR").AMT_ANNUITY.sum()
    # Percentage open accounts
    data["active_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
    data["active_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
    data["active_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
    data["active_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
    data["active_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["active_accounts"]
    # Percentage closed accounts
    data["closed_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
    data["closed_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
    data["closed_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
    data["closed_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
    data["closed_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.size()/data["cloased_accounts"]
    
    
    
    # Saldo en $$ en cuentas abiertas
    data["sum_debt_active_prem_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
    data["sum_debt_active_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
    data["sum_debt_active_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
    data["sum_debt_active_credit_cards"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
    data["sum_debt_active_other_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.sum()
    # Trend Saldo en $$ en cuentas abiertas
    data["trend_debt_active_credits"] = bureau[(bureau["CREDIT_ACTIVE"] == "Active")].sort_values(by = ["SK_ID_CURR", "DAYS_CREDIT"]).groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.fillna(-1).apply(lambda x: get_linear_regression(x)["trend"])
    # Max en $$ en cuentas cerradas
    data["max_debt_closed_premium_loans"] = bureau[(bureau["CREDIT_TYPE"].isin(premium_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
    data["max_debt_closed_wk_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(working_capital_loans)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
    data["max_debt_closed_bank_credits"] = bureau[(bureau["CREDIT_TYPE"].isin(bank_credits)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
    data["max_debt_closed_credit_card"] = bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
    data["max_debt_closed_debt_others"] = bureau[(bureau["CREDIT_TYPE"].isin(others)) & (bureau["CREDIT_ACTIVE"] == "Closed")].groupby("SK_ID_CURR").AMT_CREDIT_SUM_DEBT.max()
    # div
    data["sum_amt_credit_limit_credit_cards_open"]=bureau[(bureau["CREDIT_TYPE"].isin(credit_cards)) & (bureau["CREDIT_ACTIVE"] == "Active")].groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].sum()
    data["perc_util_revolving"] = data["sum_debt_active_credit_cards"]/(data["sum_amt_credit_limit_credit_cards_open"] + 1)
    data["credit_debt_div_lim_cred"] = data["sum_amt_credit_debt"]/(data["sum_amt_credit_limit"] + 1)
    data["credit_lim_div_cred_overdue"] = data["sum_amt_credit_limit"]/(data["sum_amt_credit_overdue"] + 1)

    data.drop(["sum_amt_credit_sum", "sum_amt_credit_limit", "sum_amt_credit_limit_credit_cards_open"], axis = 1, inplace = True)
    # Max cred lim non defaulted
    data["max_cred_lim_non_delinq"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_SUM_LIMIT"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 0)]["AMT_CREDIT_SUM_LIMIT"].max())
    data["max_cred_lim_non_overdue"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_MAX_OVERDUE"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 0)]["AMT_CREDIT_MAX_OVERDUE"].max())
    
    data["max_cred_lim_delinq"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_SUM_LIMIT"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 1)]["AMT_CREDIT_SUM_LIMIT"].max())
    data["max_cred_lim_overdue"] = bureau.groupby("SK_ID_CURR")[["EVER_DELINQUENT", "AMT_CREDIT_MAX_OVERDUE"]].apply(lambda x: x[(x["EVER_DELINQUENT"] == 1)]["AMT_CREDIT_MAX_OVERDUE"].max())
 
    # paymet history
    data["payment_history"] = bureau.groupby("SK_ID_CURR")["PAYMENT_HISTORY"].mean()
    data["mean_consecutive_non_delinq"] = bureau.groupby("SK_ID_CURR")["CONSECUTIVE_NO_DELINQ"].mean()
    data["max_consecutive_non_delinq"] = bureau.groupby("SK_ID_CURR")["CONSECUTIVE_NO_DELINQ"].max()
    print(f"Second batch of features on: {round((time() - t0)/60,2)} minutes")
    
    t0 = time()
    # Number of times in bucket x
    data["times_bucket_1"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_1"].sum()
    data["times_bucket_2"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_2"].sum()
    data["times_bucket_3"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_3"].sum()
    data["times_bucket_4"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_4"].sum()
    data["times_bucket_5"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_5"].sum()
    data["times_bucket_6"] = bureau.groupby("SK_ID_CURR")["TIMES_BUCKET_6"].sum()
    # Antiguedad meses
    data["antiguedad_media_dias"] = bureau.groupby("SK_ID_CURR")["DAYS_CREDIT"].mean()
    data["antiguedad_media_dias_closed_accounts"] = bureau.groupby("SK_ID_CURR")["DAYS_CREDIT_ENDDATE"].mean()
    data["antiguedad_media_dias_open_accounts"] = bureau.groupby("SK_ID_CURR")["DAYS_ENDDATE_FACT"].mean()
    data["sum_prolong_days"] = bureau.groupby("SK_ID_CURR")["CNT_CREDIT_PROLONG"].sum()
    # Closed accounts last x days
    data["closed_accounts_last12m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -365].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["closed_accounts_last9m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -270].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["closed_accounts_last6m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -180].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["closed_accounts_last3m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -90].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["closed_accounts_last2m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -60].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["closed_accounts_last1m"] = bureau[bureau["DAYS_ENDDATE_FACT"] >= -30].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    # Open credit last x days
    data["open_accounts_last12m"] = bureau[bureau["DAYS_CREDIT"] >= -365].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["open_accounts_last9m"] = bureau[bureau["DAYS_CREDIT"] >= -270].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["open_accounts_last6m"] = bureau[bureau["DAYS_CREDIT"] >= -180].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["open_accounts_last3m"] = bureau[bureau["DAYS_CREDIT"] >= -90].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["open_accounts_last2m"] = bureau[bureau["DAYS_CREDIT"] >= -60].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["open_accounts_last1m"] = bureau[bureau["DAYS_CREDIT"] >= -30].groupby("SK_ID_CURR")["CREDIT_TYPE"].size()
    data["n_unique_credit_types"] = bureau.groupby("SK_ID_CURR")["CREDIT_TYPE"].nunique()
    # Trigonometric features
    data["cosine_cred_sum"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].apply(lambda x: (2 * np.pi  *np.cos(x)))
    data["sine_cred_sum"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM"].apply(lambda x: (2 * np.pi  *np.sin(x)))
    data["cosine_cred_sum_debt"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].apply(lambda x: (2 * np.pi  *np.cos(x)))
    data["sine_cred_sum_debt"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_DEBT"].apply(lambda x: (2 * np.pi  *np.sin(x)))
    data["cosine_cred_sum_limit"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].apply(lambda x: (2 * np.pi  *np.cos(x)))
    data["sine_cred_sum_limit"] = bureau.groupby("SK_ID_CURR")["AMT_CREDIT_SUM_LIMIT"].apply(lambda x: (2 * np.pi  *np.sin(x)))
    

    
    print(f"Third batch of features on: {round((time() - t0)/60,2)} minutes")
    
    
    print(f"Building a model using credit features...")
    
    tmp = data.reset_index().rename(columns = {"index":"SK_ID_CURR"})
    tmp = tmp[tmp["SK_ID_CURR"].isin(train_skid_curr)]
        
    target = application_train[["SK_ID_CURR", "TARGET"]]
    
    tmp.columns = tmp.columns.str.lower()
    target.columns = target.columns.str.lower()
    
    tmp = pd.merge(left = tmp, right =target, on = "sk_id_curr" )
    
    tmp = tmp[credit_bureau_features]
    
    estimator = Pipeline([("preprocessing", preprocessing),("model",XGBClassifier(random_state = 42))])

    credit_model = train_binary(X = tmp ,y = target.target, estimator = estimator, cv = 3, refit_all = True, verbose = True)
    credit_model = credit_model["model"]
    
    del tmp
    
    
    t0 = time()
    # previous applications features
    data["sum_prev_applications"] = previous_application.groupby("SK_ID_CURR").SK_ID_PREV.size()
    data["mean_amt_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_APPLICATION.mean()
    data["mean_amt_cred_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_CREDIT.mean()
    data["mean_amt_annuity_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_ANNUITY.mean()
    data["sum_amt_downpayment_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_DOWN_PAYMENT.sum()
    data["mean_amt_goodsprice_prev_applications"] = previous_application.groupby("SK_ID_CURR").AMT_GOODS_PRICE.mean()
    data["mean_days_last_due_prev_applications"] = previous_application.groupby("SK_ID_CURR").DAYS_LAST_DUE.mean()
    data["mean_days_first_due_prev_applications"] = previous_application.groupby("SK_ID_CURR").DAYS_FIRST_DUE.mean()
    data["mean_term_prev_applications"] = previous_application.groupby("SK_ID_CURR").CNT_PAYMENT.mean()
    data["total_previous_revolving_credits"] = previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Revolving loans"].groupby("SK_ID_CURR").SK_ID_PREV.size()
  
    
    
    print(f"Fourth batch of features on: {round((time() - t0)/60,2)} minutes")
    
    data["mean_amt_credit_previous_revolving_credits"] = previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Revolving loans"].groupby("SK_ID_CURR")["AMT_CREDIT"].mean()
    data["mean_amt_downpayment_previous_revolving_credits"] = previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Revolving loans"].groupby("SK_ID_CURR")["AMT_DOWN_PAYMENT"].mean()
    
    data["perc_prev_app_refused"] = previous_application[previous_application.NAME_CONTRACT_STATUS == "Refused"].groupby("SK_ID_CURR").SK_ID_PREV.size()/data["sum_prev_applications"]
    data["perc_prev_app_approved"] = previous_application[previous_application.NAME_CONTRACT_STATUS == "Approved"].groupby("SK_ID_CURR").SK_ID_PREV.size()/data["sum_prev_applications"]
    data["perc_prev_app_other"] = previous_application[previous_application.NAME_CONTRACT_STATUS.isin(["Canceled", "Unused offer"])].groupby("SK_ID_CURR").SK_ID_PREV.size()/data["sum_prev_applications"]
    
    data["mean_amt_prev_app_refused"] =previous_application[previous_application.NAME_CONTRACT_STATUS == "Refused"].groupby("SK_ID_CURR").AMT_APPLICATION.mean()
    data["max_amt_prev_app_refused"] =previous_application[previous_application.NAME_CONTRACT_STATUS == "Approved"].groupby("SK_ID_CURR").AMT_APPLICATION.max()
    data["min_amt_prev_app_refused"] =previous_application[previous_application.NAME_CONTRACT_STATUS.isin(["Canceled", "Unused offer"])].groupby("SK_ID_CURR").AMT_APPLICATION.min()
    
    data["channel_aq_last_refused_app"] = previous_application[previous_application.NAME_CONTRACT_STATUS == "Refused"].sort_values(by = ["SK_ID_CURR", "SK_ID_PREV"], ascending = [0,1]).drop_duplicates(subset = "SK_ID_CURR").groupby("SK_ID_CURR").CHANNEL_TYPE.apply(lambda x: x.unique()[0])

    # pos features
    data["cnt_installments_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.sum().groupby("SK_ID_CURR").mean()
    data["cnt_installments_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.sum().groupby("SK_ID_CURR").max()
    data["cnt_installments_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.sum().groupby("SK_ID_CURR").sum()
    data["cnt_installments_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    
    data["cnt_installments_fut_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.sum().groupby("SK_ID_CURR").mean()
    data["cnt_installments_fut_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.sum().groupby("SK_ID_CURR").max()
    data["cnt_installments_fut_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.sum().groupby("SK_ID_CURR").sum()
    data["cnt_installments_fut_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_FUTURE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    data["cnt_installments_pos_minus_pos_f"] = data["cnt_installments_pos_total"] - data["cnt_installments_fut_pos_total"] 
    
    data["cnt_installments_dpd_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").mean()
    data["cnt_installments_dpd_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").max()
    data["cnt_installments_dpd_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").sum()
    data["cnt_installments_dpd_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
      
    pos_history["PAR_X_AT_Y"] = pos_history["SK_DPD"]/pos_history["MONTHS_BALANCE"]

    data["cnt_installments_par_pos_mean"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.sum().groupby("SK_ID_CURR").mean()
    data["cnt_installments_par_pos_max"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.sum().groupby("SK_ID_CURR").max()
    data["cnt_installments_par_pos_total"] =pos_history.groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.sum().groupby("SK_ID_CURR").sum()
    data["cnt_installments_par_pos_trend"] =pos_history.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).PAR_X_AT_Y.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
   

    # credit cards balance
    t0 = time()
    data["mean_credit_cards_months"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.size().groupby("SK_ID_CURR").mean()
    data["max_credit_cards_months"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.size().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.sum().groupby("SK_ID_CURR").sum()
    data["max_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.sum().groupby("SK_ID_CURR").max()
    data["mean_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_balance"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_BALANCE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    print(f"Fifth batch of features on: {round((time() - t0)/60,2)} minutes")
    
    
    t0 = time()
    data["max_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_CREDIT_LIMIT_ACTUAL.sum().groupby("SK_ID_CURR").max()
    data["mean_credit_cards_amt_balance"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_CREDIT_LIMIT_ACTUAL.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_balance"] = credit_card_balance.sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_CREDIT_LIMIT_ACTUAL.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    data["max_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_paymentcurrent"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["mean_credit_cards_amt_paymentcurrent"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_paymentcurrent"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_total_recieivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_total_recieivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_total_recieivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_total_recieivable"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_TOTAL_RECEIVABLE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_min_regularity"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_min_regularity"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_min_regularity"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_min_regularity"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INST_MIN_REGULARITY.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_payment_total_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_payment_total_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_payment_total_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_payment_total_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT_TOTAL_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_drawings_atm_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_drawings_atm_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_drawings_atm_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_drawings_atm_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_ATM_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_drawings_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    print(f"Sixth batch of features on: {round((time() - t0)/60,2)} minutes")
    
    t0 = time()
    data["max_credit_cards_amt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_drawings_other_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_OTHER_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_drawings_pos_total_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_DRAWINGS_POS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_amt_recivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_amt_recivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_amt_recivable"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_amt_recivable"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_RECIVABLE.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_cnt_atm_drawings_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_ATM_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_cnt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_cnt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_cnt_drawings_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_cnt_drawings_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_cnt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_cnt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_cnt_drawings_other_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_cnt_drawings_other_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_OTHER_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_cnt_drawings_pos_current"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_DRAWINGS_POS_CURRENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_cards_cnt_installment_mature_cum"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).CNT_INSTALMENT_MATURE_CUM.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    data["max_credit_cards_dpd_def"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_dpd_def"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_dpd_def"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_dpd_def"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD_DEF.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    
    data["max_credit_cards_dpd"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").max()
    data["total_credit_cards_dpd"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").sum()
    data["mean_credit_cards_dpd"] = credit_card_balance.groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.sum().groupby("SK_ID_CURR").mean()
    data["mean_trend_credit_dpd"] = credit_card_balance.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).SK_DPD.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()

    # interacciones credit card balance

    data["amt_paycurr_div_total_amt_balance"] = data["max_credit_cards_amt_paymentcurrent"]/data["total_credit_cards_amt_balance"]
    data["amt_total_balnce_div_total_recivable"] = data["total_credit_cards_amt_balance"]/data["total_credit_cards_amt_total_recieivable"]

    # installmet features
    data["amt_installments_max_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").max()
    data["amt_installments_total_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").sum()
    data["amt_installments_mean_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").mean()
    data["amt_installments_trend"] =installments_payments.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV","NUM_INSTALMENT_NUMBER"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    
    data["amt_pay_installments_max_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.sum().groupby("SK_ID_CURR").max()
    data["amt_pay_installments_total_amtl"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.sum().groupby("SK_ID_CURR").sum()
    data["amt_pay_installments_mean_amt"] =installments_payments.groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_INSTALMENT.sum().groupby("SK_ID_CURR").mean()
    data["amt_pay_installments_trend"] =installments_payments.fillna(0).sort_values(by = ["SK_ID_CURR","SK_ID_PREV","NUM_INSTALMENT_NUMBER"]).groupby(["SK_ID_CURR", "SK_ID_PREV"]).AMT_PAYMENT.apply(lambda x: get_linear_regression(x)["trend"]).groupby("SK_ID_CURR").mean()
    

    data["installments_payment_vs_amt_installment"] = data["amt_pay_installments_total_amtl"]/data["amt_installments_total_amt"]
    print(f"Seventh batch of features on: {round((time() - t0)/60,2)} minutes")
    
    application_data.drop(social_demo_feat, axis = 1, inplace = True)
    application_data.drop(flag_doc_feat, axis = 1, inplace = True)
    
    
    data.reset_index(inplace = True)
    data.rename(columns = {"index":"SK_ID_CURR"}, inplace = True)
    data = data.merge(application_data, on = "SK_ID_CURR", how = "left")

    print(f"Features on: {round((time() - T0)/60,2)} minutes")
    
    data["credit_bureau_stacked_model"] = credit_model.predict_proba(data[credit_bureau_featurest_])[:,1]
    
    data.drop(useless_credit_feat, axis = 1, inplace = True)
    


    print("Successfully done!!!")
    data.columns = data.columns.str.lower()
    print(data.info())
    
    try:
    
        data = data.replace({np.inf:99999})
    except:
        print(f"Could not eliminate infinite values")
        pass
    
    data.to_csv("features.csv")
    data.to_pickle("features.pkl")

    if return_X_y:
    	X = data.drop("target", axis = 1)
    	y = data["target"]

    	return X, y
    else:
    	return data

In [None]:
features_data = main(application_train = application_train
            ,application_test = application_test
            ,previous_application = previous_application
            ,bureau = bureau
            ,bureau_balance = bureau_balance
            ,installments_payments = installments_payments
            ,credit_card_balance = credit_card_balance
            ,pos_history = pOS_CASH_balance)

In [None]:
features_data.iloc[:,:100].info()

In [None]:
features_data.iloc[:,100:200].info()

In [None]:
features_data.iloc[:,200:].info()

In [None]:
features_data.to_csv("./features.csv")

In [None]:
train = features_data[features_data["target"].notnull()]
test = features_data[features_data["target"].isnull()]

In [None]:
Xy_train = train.drop(["sk_id_curr", "split"], axis = 1)


X_train, X_test, y_train, y_test = train_test_split(Xy_train.drop(["target"], axis = 1), Xy_train[["target"]], test_size = .3, random_state = 42)

y_train = y_train.astype("int64")
y_test = y_test.astype("int64")

In [None]:
# Preprocessing

pipe_con = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = -1)), ("scaling", StandardScaler())])
pipe_cat = Pipeline([("imputing", SimpleImputer(strategy = "constant", fill_value = "missing")), ("encoding", OneHotEncoder(handle_unknown = "ignore"))])

preprocessor = make_column_transformer((pipe_con, make_column_selector(dtype_exclude = "object")), (pipe_cat, make_column_selector(dtype_include = "object")))

In [None]:
t0 = time.time()
pipe_linear = Pipeline([("preprocessing", preprocessor), ("modeling", SGDClassifier(loss = "log", class_weight = "balanced"))]).fit(X_train, y_train)
t1 = time.time()
print(f"Linear model trained in {round((t1-t0)/60,2)} minutes")
t0 = time.time()
pipe_xgb = Pipeline([("preprocessing", preprocessor), ("modeling", XGBClassifier(random_state = 42, booster = "dart", sampling_method = "gradient_based", scale_pos_weight = 0.088899))]).fit(X_train, y_train)
t1 = time.time()
print(f"XGBoost model trained in {round((t1-t0)/60,2)} minutes")
t0 = time.time()
pipe_rf = Pipeline([("preprocessing", preprocessor), ("modeling", XGBRFClassifier(random_state = 42, scale_pos_weight = 0.088899))]).fit(X_train, y_train)
t1 = time.time()
print(f"Random Forest model trained in {round((t1-t0)/60,2)} minutes")
t0 = time.time()
pipe_baseline = Pipeline([("preprocessing", preprocessor), ("modeling", DummyClassifier(random_state = 42))]).fit(X_train, y_train)
t1 = time.time()
print(f"Base model trained in {round((t1-t0)/60,2)} minutes")

In [None]:
models = [pipe_linear, pipe_xgb, pipe_rf, pipe_baseline]

plot_cv_score(X = X_train, y = y_train, models_list= models, refit= False, scoring= "roc_auc")

In [None]:
    credit_bureau_features = ["total_accounts"
    ,"active_accounts"
    ,"cloased_accounts"
    ,"max_amt_overdue"
    ,"mean_amt_overdue"
    ,"mean_amt_credit_debt"
    ,"sum_amt_credit_debt"
    ,"mean_amt_credit_limit"
    ,"sum_amt_credit_overdue"
    ,"credit_sum_by_debt"
    ,"credit_sum_by_limit"
    ,"credit_sum_by_overdue"
    ,"total_annuity"
    ,"active_prem_credits"
    ,"active_wk_credits"
    ,"active_bank_credits"
    ,"active_credit_cards"
    ,"active_other_credits"
    ,"closed_prem_credits"
    ,"closed_wk_credits"
    ,"closed_bank_credits"
    ,"closed_credit_cards"
    ,"closed_other_credits"
    ,"sum_debt_active_prem_credits"
    ,"sum_debt_active_wk_credits"
    ,"sum_debt_active_bank_credits"
    ,"sum_debt_active_credit_cards"
    ,"sum_debt_active_other_credits"
    ,"trend_debt_active_credits"
    ,"max_debt_closed_premium_loans"
    ,"max_debt_closed_wk_credits"
    ,"max_debt_closed_bank_credits"
    ,"max_debt_closed_credit_card"
    ,"max_debt_closed_debt_others"
    ,"perc_util_revolving"
    ,"credit_debt_div_lim_cred"
    ,"credit_lim_div_cred_overdue"
    ,"max_cred_lim_non_delinq"
    ,"max_cred_lim_non_overdue"
    ,"max_cred_lim_delinq"
    ,"max_cred_lim_overdue"
    ,"payment_history"
    ,"mean_consecutive_non_delinq"
    ,"max_consecutive_non_delinq"
    ,"times_bucket_1"
    ,"times_bucket_2"
    ,"times_bucket_3"
    ,"times_bucket_4"
    ,"times_bucket_5"
    ,"times_bucket_6"
    ,"antiguedad_media_dias"
    ,"antiguedad_media_dias_closed_accounts"
    ,"antiguedad_media_dias_open_accounts"
    ,"sum_prolong_days"
    ,"closed_accounts_last12m"
    ,"closed_accounts_last9m"
    ,"closed_accounts_last6m"
    ,"closed_accounts_last3m"
    ,"closed_accounts_last2m"
    ,"closed_accounts_last1m"
    ,"open_accounts_last12m"
    ,"open_accounts_last9m"
    ,"open_accounts_last6m"
    ,"open_accounts_last3m"
    ,"open_accounts_last2m"
    ,"open_accounts_last1m"
    ,"n_unique_credit_types"
    ,"cosine_cred_sum"
    ,"sine_cred_sum"
    ,"cosine_cred_sum"
    ,"sine_cred_sum"
    ,"cosine_cred_sum"
    ,"sine_cred_sum"]


In [None]:
credit_bureau_features = ["total_accounts"
    ,"active_accounts"
    ,"cloased_accounts"
    ,"max_amt_overdue"
    ,"mean_amt_overdue"
    ,"mean_amt_credit_debt"
    ,"sum_amt_credit_debt"
    ,"mean_amt_credit_limit"
    ,"sum_amt_credit_overdue"
    ,"credit_sum_by_debt"
    ,"credit_sum_by_limit"
    ,"credit_sum_by_overdue"
    ,"total_annuity"
    ,"active_prem_credits"
    ,"active_wk_credits"
    ,"active_bank_credits"
    ,"active_credit_cards"
    ,"active_other_credits"
    ,"closed_prem_credits"
    ,"closed_wk_credits"
    ,"closed_bank_credits"
    ,"closed_credit_cards"
    ,"closed_other_credits"
    ,"sum_debt_active_prem_credits"
    ,"sum_debt_active_wk_credits"
    ,"sum_debt_active_bank_credits"
    ,"sum_debt_active_credit_cards"
    ,"sum_debt_active_other_credits"
    ,"trend_debt_active_credits"
    ,"max_debt_closed_premium_loans"
    ,"max_debt_closed_wk_credits"
    ,"max_debt_closed_bank_credits"
    ,"max_debt_closed_credit_card"
    ,"max_debt_closed_debt_others"
    ,"perc_util_revolving"
    ,"credit_debt_div_lim_cred"
    ,"credit_lim_div_cred_overdue"
    ,"max_cred_lim_non_delinq"
    ,"max_cred_lim_non_overdue"
    ,"max_cred_lim_delinq"
    ,"max_cred_lim_overdue"
    ,"payment_history"
    ,"mean_consecutive_non_delinq"
    ,"max_consecutive_non_delinq"
    ,"times_bucket_1"
    ,"times_bucket_2"
    ,"times_bucket_3"
    ,"times_bucket_4"
    ,"times_bucket_5"
    ,"times_bucket_6"
    ,"antiguedad_media_dias"
    ,"antiguedad_media_dias_closed_accounts"
    ,"antiguedad_media_dias_open_accounts"
    ,"sum_prolong_days"
    ,"closed_accounts_last12m"
    ,"closed_accounts_last9m"
    ,"closed_accounts_last6m"
    ,"closed_accounts_last3m"
    ,"closed_accounts_last2m"
    ,"closed_accounts_last1m"
    ,"open_accounts_last12m"
    ,"open_accounts_last9m"
    ,"open_accounts_last6m"
    ,"open_accounts_last3m"
    ,"open_accounts_last2m"
    ,"open_accounts_last1m"
    ,"n_unique_credit_types"
    ,"cosine_cred_sum"
    ,"sine_cred_sum"
    ,"cosine_cred_sum"
    ,"sine_cred_sum"
    ,"cosine_cred_sum"
    ,"sine_cred_sum"]


In [None]:
credit_bureau_features = ['sum_debt_active_bank_credits',
 'max_consecutive_non_delinq',
 'times_bucket_5',
 'open_accounts_last6m',
 'open_accounts_last9m',
 'total_annuity',
 'active_credit_cards',
 'max_cred_lim_overdue',
 'times_bucket_2',
 'antiguedad_media_dias',
 'credit_debt_div_lim_cred',
 'mean_amt_credit_limit',
 'closed_other_credits',
 'max_cred_lim_delinq',
 'closed_accounts_last6m',
 'credit_sum_by_debt',
 'closed_prem_credits',
 'credit_lim_div_cred_overdue',
 'closed_accounts_last9m',
 'max_debt_closed_debt_others',
 'total_accounts',
 'active_other_credits',
 'times_bucket_4',
 'sum_prolong_days',
 'max_debt_closed_wk_credits',
 'max_cred_lim_non_delinq',
 'closed_accounts_last2m',
 'open_accounts_last1m',
 'payment_history',
 'sum_debt_active_prem_credits',
 'sum_debt_active_wk_credits',
 'sum_debt_active_credit_cards',
 'max_debt_closed_bank_credits',
 'closed_wk_credits',
 'mean_consecutive_non_delinq',
 'times_bucket_1',
 'mean_amt_overdue',
 'sum_amt_credit_debt',
 'times_bucket_6',
 'trend_debt_active_credits',
 'credit_sum_by_overdue',
 'active_wk_credits',
 'open_accounts_last2m',
 'sum_amt_credit_overdue',
 'max_cred_lim_non_overdue',
 'perc_util_revolving',
 'closed_accounts_last12m',
 'credit_sum_by_limit',
 'max_debt_closed_credit_card',
 'closed_accounts_last3m',
 'closed_bank_credits',
 'open_accounts_last12m',
 'closed_credit_cards',
 'max_amt_overdue',
 'cloased_accounts',
 'n_unique_credit_types',
 'max_debt_closed_premium_loans',
 'open_accounts_last3m',
 'antiguedad_media_dias_closed_accounts',
 'antiguedad_media_dias_open_accounts',
 'times_bucket_3',
 'closed_accounts_last1m',
 'sum_debt_active_other_credits',
 'active_prem_credits',
 'active_bank_credits',
 'active_accounts',
 'mean_amt_credit_debt',
 "cosine_cred_sum",
 "sine_cred_sum",
 "cosine_cred_sum_debt",
 "sine_cred_sum_debt",
 "cosine_cred_sum_limit",
 "sine_cred_sum_limit"]

In [None]:
pd.DataFrame(index = credit_bureau_features).reset_index().value_counts()

In [None]:
list(set(credit_bureau_features))