# Import of package

In [54]:
import os, re, gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import RepeatedStratifiedKFold, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import lightgbm as lgb
import xgboost as xgb
import time

# Setting paths and dataname

In [55]:
basepath = '/kaggle/input/home-credit-default-risk'

# DATASET FILENAMES
cols_info_dataname = 'HomeCredit_columns_description.csv'

# MAIN DATASETS
app_train_dataname = 'application_train.csv'
test_dataname = 'application_test.csv'

# SECONDARY DATASETS
bureau_dataname = 'bureau.csv'
bureau_balance_dataname = 'bureau_balance.csv'
previous_app_dataname = 'previous_application.csv'
pos_cash_bal_dataname = 'POS_CASH_balance.csv'
instal_pay_dataname = 'installments_payments.csv'
credit_card_bal_dataname = 'credit_card_balance.csv'

# Loading main datasets...

In [56]:
app_train = pd.read_csv(os.path.join(basepath, app_train_dataname))        
cols_info = pd.read_csv(os.path.join(basepath, cols_info_dataname), encoding='latin')
app_test = pd.read_csv(os.path.join(basepath, test_dataname))

# See Informations about columns

In [57]:
# Information about columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 10000)
cols_info[['Table', 'Row', 'Description']]

Unnamed: 0,Table,Row,Description
0,application_{train|test}.csv,SK_ID_CURR,ID of loan in our sample
1,application_{train|test}.csv,TARGET,"Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)"
2,application_{train|test}.csv,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving
3,application_{train|test}.csv,CODE_GENDER,Gender of the client
4,application_{train|test}.csv,FLAG_OWN_CAR,Flag if the client owns a car
5,application_{train|test}.csv,FLAG_OWN_REALTY,Flag if client owns a house or flat
6,application_{train|test}.csv,CNT_CHILDREN,Number of children the client has
7,application_{train|test}.csv,AMT_INCOME_TOTAL,Income of the client
8,application_{train|test}.csv,AMT_CREDIT,Credit amount of the loan
9,application_{train|test}.csv,AMT_ANNUITY,Loan annuity


In [58]:
del cols_info
gc.collect()

82

# Load Secondary datasets

## Bureau data

In [59]:
# LOAD BUREAU DATASETS
bureau = pd.read_csv(os.path.join(basepath, bureau_dataname))
bureau_balance = pd.read_csv(os.path.join(basepath, bureau_balance_dataname))

# JOIN TWO DATASET
bureau = bureau.merge(bureau_balance, how='left', on='SK_ID_BUREAU')

# DROP NON RELEVANT COLUMNS
bureau = bureau.drop(labels=['SK_ID_BUREAU'], axis=1)

# WE EXTRACT ACTIVE CREDITS
active_cr = bureau[bureau['CREDIT_ACTIVE'] == "Active"]

# WE EXTRACT CLOSED CREDITS
closed_cr = bureau[bureau['CREDIT_ACTIVE'] == 'Closed']

# WE CREATE NEW FEATURE IN ORDER TO OBTAIN MORE INFORMATION FROM DATA BY AGGREGATION   
f_aggr = {
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
    'AMT_ANNUITY': ['max', 'mean'],
    'CNT_CREDIT_PROLONG': ['sum'],
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'DAYS_CREDIT_UPDATE': ['mean'],
}

# WE AGGREGATE MIXED CREDITS
bureau_agg = bureau.groupby('SK_ID_CURR').agg(func=f_aggr)
bureau_agg.columns = pd.Index([item[0].upper() + '_' + item[1].upper() for item in bureau_agg.columns.tolist()])

# WE AGGREGATE ACTIVE CREDITS
active_cr_agg = active_cr.groupby('SK_ID_CURR').agg(func=f_aggr)
active_cr_agg.columns = pd.Index([item[0].upper() + '_ACTIVE_' + item[1].upper() for item in active_cr_agg.columns.tolist()])

# WE AGGREGATE CLOSED CREDITS
closed_cr_agg = closed_cr.groupby('SK_ID_CURR').agg(func=f_aggr)
closed_cr_agg.columns = pd.Index([item[0].upper() + '_CLOSED_' + item[1].upper() for item in closed_cr_agg.columns.tolist()])

# WE NOW MERGE ACTIVE, CLOSED AND MIXED CREDITS
bureau_mrg = bureau_agg.merge(active_cr_agg, how='left', on="SK_ID_CURR")
bureau_mrg = bureau_mrg.merge(closed_cr_agg, how='left', on="SK_ID_CURR")

# DELETING NO LONGER USEFUL DATA
del bureau_balance, f_aggr, bureau, bureau_agg, active_cr, active_cr_agg, closed_cr, closed_cr_agg
gc.collect()

0

## Previous application data

In [60]:
# LOAD PREVIOUS_APPLICATION DATASET
previous_app = pd.read_csv(os.path.join(basepath, previous_app_dataname))

# GET APPROVED APPLICATION
appr_app = previous_app[previous_app['NAME_CONTRACT_STATUS'] == 'Approved']

# GET REFUSE APPLICATION
refus_app = previous_app[previous_app['NAME_CONTRACT_STATUS'] == 'Refused']

# WE CREATE NEW FEATURE IN ORDER TO OBTAIN MORE INFORMATION FROM DATA BY AGGREGATION   
f_aggr = {
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
    'RATE_INTEREST_PRIMARY' : ['min', 'max', 'mean'],
    'RATE_INTEREST_PRIVILEGED' : ['min', 'max', 'mean'],
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
}

# WE AGGREGATE PREVIOUS APPLICATIONS
previous_app_agg = previous_app.groupby('SK_ID_CURR').agg(func=f_aggr)
previous_app_agg.columns = pd.Index([item[0].upper() + '_' + item[1].upper() for item in previous_app_agg.columns.tolist()])

# WE AGGREGATE APPROVED APPLICATIONS
appr_app_agg = appr_app.groupby('SK_ID_CURR').agg(func=f_aggr)
appr_app_agg.columns = pd.Index([item[0].upper() + '_APPROVED' + item[1].upper() + '_APPROVED' for item in appr_app_agg.columns.tolist()])

# WE AGGREGATE REFUSED APPLICATIONS
refus_app_agg = refus_app.groupby('SK_ID_CURR').agg(func=f_aggr)
refus_app_agg.columns = pd.Index([item[0].upper() + '_' + item[1].upper() + '_REFUSED' for item in refus_app_agg.columns.tolist()])

# WE NOW MARGE MIXED, APPROVED AND REFUSED APPLICATIONS
previous_app_mrg = previous_app_agg.merge(appr_app_agg, how= 'left', on = 'SK_ID_CURR')
previous_app_mrg = previous_app_mrg.merge(refus_app_agg, how= 'left', on = 'SK_ID_CURR')

# DELETING NO LONGER USEFUL DATA
del previous_app, previous_app_agg, appr_app, appr_app_agg, refus_app, refus_app_agg, f_aggr
gc.collect()

0

## Pos cash balance

In [61]:
# LOAD POS_CASH DATASET
pos_cash_bal = pd.read_csv(os.path.join(basepath, pos_cash_bal_dataname))

# WE CREATE NEW FEATURE IN ORDER TO OBTAIN MORE INFORMATION FROM DATA BY AGGREGATION   
f_aggr = {
    'MONTHS_BALANCE': ['max', 'mean', 'size'],
    'SK_DPD': ['max', 'mean'],
    'SK_DPD_DEF': ['max', 'mean']
}

# WE AGGREGATE POS_CASH_BALANCE
pos_cash_bal_agg = pos_cash_bal.groupby('SK_ID_CURR').agg(func=f_aggr)
pos_cash_bal_agg.columns = pd.Index([item[0].upper() + '_' + item[1].upper() for item in pos_cash_bal_agg.columns.tolist()])

del pos_cash_bal, f_aggr
gc.collect()

20

## Installments payments

In [62]:
# LOAD INSTALLMENTS PAYMENTS DATASET
instal_paym = pd.read_csv(os.path.join(basepath, instal_pay_dataname))

# FIX DATA
instal_paym['DAYS_PASSED_FROM_PAYMENT'] = instal_paym['DAYS_ENTRY_PAYMENT'] - instal_paym['DAYS_INSTALMENT']
instal_paym['DAYS_BEFORE_THE_PAYMENT'] = instal_paym['DAYS_INSTALMENT'] - instal_paym['DAYS_ENTRY_PAYMENT']
instal_paym['DAYS_PASSED_THE_PAYMENT'] = instal_paym['DAYS_PASSED_FROM_PAYMENT'].apply(lambda x: x if x > 0 else 0)
instal_paym['DAYS_BEFORE_THE_PAYMENT'] = instal_paym['DAYS_BEFORE_THE_PAYMENT'].apply(lambda x: x if x > 0 else 0)
instal_paym['PAYM_RATE'] = instal_paym['AMT_PAYMENT'] / instal_paym['AMT_INSTALMENT']
instal_paym['PAYM_DIFFERENCE'] = instal_paym['AMT_INSTALMENT'] - instal_paym['AMT_PAYMENT']

# WE CREATE NEW FEATURE IN ORDER TO OBTAIN MORE INFORMATION FROM DATA BY AGGREGATION   
f_aggr = {
    'DAYS_BEFORE_THE_PAYMENT': ['max', 'mean', 'sum'],
    'PAYM_RATE': ['max', 'mean', 'sum', 'var'],
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DAYS_PASSED_FROM_PAYMENT': ['max', 'mean', 'sum'],
    'PAYM_DIFFERENCE': ['max', 'mean', 'sum', 'var'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
}

# AGGREGATE INSTALLMENTS PAYMENTS DATA
instal_paym_agg = instal_paym.groupby('SK_ID_CURR').agg(func=f_aggr)
instal_paym_agg.columns = pd.Index([e[0].upper() + '_' + e[1].upper() +'_INSTAL_PAY' for e in instal_paym_agg.columns.tolist()])

# ADDING FEATURE 
instal_paym_agg['INSTAS_COUNT'] = instal_paym.groupby('SK_ID_CURR').size()

del instal_paym, f_aggr
gc.collect()

0

## Credit card balance

In [63]:
# LOAD CREDIT CARD DATASET
credit_cards = pd.read_csv(os.path.join(basepath, credit_card_bal_dataname))

# DROP NO LONGER USEFUL FEATURE
credit_cards = credit_cards.drop(['SK_ID_PREV'], axis= 1)

# AGGREGATION FUNCTION
f_aggr = [ 'mean', 'sum', 'var','min', 'max']

# WE AGGREGATE ALL FEATURE FROM CREDIT CARD DATASET
credit_cards_agg = credit_cards.groupby('SK_ID_CURR').agg(f_aggr)
credit_cards_agg.columns = pd.Index(['CRED_CARD_'+ e[0].upper() + '_' + e[1].upper() for e in credit_cards_agg.columns.tolist()])

# ADDING COUNT FEATURE
credit_cards_agg['CRED_CARD_CNT'] = credit_cards.groupby('SK_ID_CURR').size()

del credit_cards, f_aggr

# Merge main dataset with secondary datasets

In [64]:
# BUREAU
app_train = app_train.merge(bureau_mrg, how='left', on='SK_ID_CURR')
app_test = app_test.merge(bureau_mrg, how='left', on='SK_ID_CURR')

del bureau_mrg
gc.collect()

# PREVIOUS APPL
app_train = app_train.merge(previous_app_mrg, how='left', on='SK_ID_CURR')
app_test = app_test.merge(previous_app_mrg, how='left', on='SK_ID_CURR')

del previous_app_mrg
gc.collect()

# POS CASH BALANCE
app_train = app_train.merge(pos_cash_bal_agg, how='left', on='SK_ID_CURR')
app_test = app_test.merge(pos_cash_bal_agg, how='left', on='SK_ID_CURR')

del pos_cash_bal_agg
gc.collect()

# INSTALLMENTS PAYMENTS
app_train = app_train.merge(instal_paym_agg, how='left', on='SK_ID_CURR')
app_test = app_test.merge(instal_paym_agg, how='left', on='SK_ID_CURR')

del instal_paym_agg
gc.collect()

# CREDIT CARD
app_train = app_train.merge(credit_cards_agg, how='left', on='SK_ID_CURR')
app_test = app_test.merge(credit_cards_agg, how='left', on='SK_ID_CURR')

del credit_cards_agg
gc.collect()

0

# Handling Missing Values

In [65]:
# Information about missing values
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

# Missing values statistics
#missing_values = missing_values_table(app_train)


# Label and One-hot encoding

In [66]:
# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d label encode done.' % le_count)

# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

del le
gc.collect()

3 label encode done.
Training Features shape:  (307511, 542)
Testing Features shape:  (48744, 538)


0

# Align Training and Testing set 

In [67]:
targets_train = app_train[['TARGET']]

# We keep only columns present in both dataframes, this step drop the target column
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (307511, 538)
Testing Features shape:  (48744, 538)


# Impute missing value

We impute data on train and test and then add the target column back in train set

In [68]:
"""# we create imputer
imputer = SimpleImputer(strategy='mean') 

# And impute missing values
app_train_imputed = imputer.fit_transform(app_train.replace([np.inf, -np.inf], np.nan))
app_test_imputed = imputer.transform(app_test.replace([np.inf, -np.inf], np.nan))

del app_train, app_test, imputer
gc.collect()"""

"# we create imputer\nimputer = SimpleImputer(strategy='mean') \n\n# And impute missing values\napp_train_imputed = imputer.fit_transform(app_train.replace([np.inf, -np.inf], np.nan))\napp_test_imputed = imputer.transform(app_test.replace([np.inf, -np.inf], np.nan))\n\ndel app_train, app_test, imputer\ngc.collect()"

In [69]:
# Select the target column
y = np.array(targets_train['TARGET'])

# Training data
X = np.array(app_train.replace([np.inf, -np.inf], np.nan))

del app_train, targets_train
gc.collect()

60

### Under/Over sampling

In [70]:
over = RandomOverSampler(sampling_strategy=0.1058)
under = RandomUnderSampler(sampling_strategy=0.47)# valore migliore 0.47

# Define the model

In [71]:
classifier = lgb.LGBMClassifier(n_jobs=4, 
                                n_estimators=1000,
                                objective= 'binary', 
                                learning_rate=0.0190, 
                                num_leaves=25,     # riportare a 28
                                colsample_bytree=0.9,
                                subsample=0.9,
                                scale_pos_weight=1,    
                                max_depth=5,        # riportare a 5
                                min_split_gain=0.08,
                                min_child_weight=1,    
                                boosting_type='goss', 
                                random_state=40
                               )

"""classifier = RandomForestClassifier(n_estimators = 300, ###
                                    criterion='entropy', 
                                    random_state = 50, 
                                    verbose = 1,
                                    min_samples_split = 0.00015, ###
                                    min_samples_leaf = 0.00002,
                                    n_jobs=4)"""

"classifier = RandomForestClassifier(n_estimators = 300, ###\n                                    criterion='entropy', \n                                    random_state = 50, \n                                    verbose = 1,\n                                    min_samples_split = 0.00015, ###\n                                    min_samples_leaf = 0.00002,\n                                    n_jobs=4)"

In [72]:
steps = [('o', over), ('u', under), ('m', classifier)]
pipeline = Pipeline(steps=steps)

In [73]:
# Params:
n_folds = 5

cv = KFold(n_splits = n_folds)
scores = cross_val_score(pipeline[1:], X, y, scoring='roc_auc', cv=cv, n_jobs=4) #inserire il [2:]

In [74]:
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.780


In [75]:
# Train the model on training data
classifier.fit(X, y);

In [76]:
predictions = classifier.predict_proba(app_test, num_iteration=classifier.best_iteration_)[:,1]

### Save predictions on csv

In [84]:
# Make the submission dataframe
submit = app_test[['SK_ID_CURR']]
submit.insert(1, 'TARGET', predictions)

# Set timezone
os.environ["TZ"] = "Europe/Rome"
time.tzset()

# Save file by time
output_name = time.strftime("prediction_%H_%M_%S.csv", time.localtime())
submit.to_csv(output_name, index = False)

prediction_11_25_40.csv
