# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
from sklearn.ensemble import VotingClassifier

In [4]:
import lightgbm as lgb

In [5]:
import xgboost as xgb

In [6]:
from imblearn.under_sampling import RandomUnderSampler

In [7]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [8]:
data_path = '../data/dataset/transformed/{}.csv'

# Choose random_state

In [9]:
random_state = 42

# Data reading

Firstly we just load all data into memory, then profile each dataset,

then try to filter features that are most interesting for us at the moment

(continuous, categorical, binary features without na, features with low na percentage etc.).

In [10]:
dataset_names = [
    'application_train',
    'application_test',
    'bureau',
#     'bureau_balance',
#     'credit_card_balance',
#     'installments_payments',
#     'POS_CASH_balance',
    'previous_application'
#     'sample_submission'
]

In [11]:
train_n, test_n, bureau_n, bureau_balance_n, credit_card_balance_n, installments_payments_n, POS_CASH_balance_n, previous_application_n, sample_submission_n = 0, 1, 2, 3, 4, 5, 6, 3, 8

In [12]:
data_dict = dict(zip(dataset_names, list(map(lambda name: pd.read_csv(filepath_or_buffer=data_path.format(name)), tqdm(dataset_names)))))

100%|██████████| 4/4 [00:35<00:00,  8.96s/it]


# Feature description

Here at this picture the general data structure is reflected.

Lots of connections and, as a consequence, lots of hypothetial issues with data.

![Image of data scheme](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

In [13]:
data_dict[dataset_names[train_n]].SK_ID_CURR.nunique()

307507

In [14]:
len(set(data_dict[dataset_names[train_n]].SK_ID_CURR) & set(data_dict[dataset_names[bureau_n]].SK_ID_CURR))

263487

In [15]:
len(set(data_dict[dataset_names[train_n]].SK_ID_CURR) & set(data_dict[dataset_names[previous_application_n]].SK_ID_CURR))

291053

In [16]:
data_dict[dataset_names[test_n]].SK_ID_CURR.nunique()

48744

In [17]:
len(set(data_dict[dataset_names[test_n]].SK_ID_CURR) & set(data_dict[dataset_names[bureau_n]].SK_ID_CURR))

42320

In [18]:
len(set(data_dict[dataset_names[test_n]].SK_ID_CURR) & set(data_dict[dataset_names[previous_application_n]].SK_ID_CURR))

47800

# Preprocessing

In [19]:
common_features = list(set(data_dict[dataset_names[train_n]].columns) & set(data_dict[dataset_names[test_n]].columns))

In [20]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(data_dict[dataset_names[train_n]][common_features], data_dict[dataset_names[train_n]].TARGET)

In [30]:
X_b = pd.DataFrame(X_b, columns=common_features)

In [21]:
data_dict[dataset_names[bureau_n]] = data_dict[dataset_names[bureau_n]].groupby('SK_ID_CURR', as_index = False).aggregate([min, max, sum, len, np.mean, np.median]).reset_index()

In [22]:
data_dict[dataset_names[bureau_n]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305811 entries, 0 to 305810
Columns: 175 entries, (SK_ID_CURR, ) to (SK_ID_BUREAU, median)
dtypes: float64(175)
memory usage: 408.3 MB


In [23]:
# List of column names
columns = ['SK_ID_CURR']

# Iterate through the variables names
for var in data_dict[dataset_names[bureau_n]].columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        
        # Iterate through the stat names
        for stat in data_dict[dataset_names[bureau_n]].columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('bureau_%s_%s' % (var, stat))

In [24]:
data_dict[dataset_names[bureau_n]].columns = columns

In [25]:
data_dict[dataset_names[previous_application_n]] = data_dict[dataset_names[previous_application_n]].groupby('SK_ID_CURR', as_index = False).aggregate([min, max, sum, len, np.mean, np.median]).reset_index()

In [26]:
data_dict[dataset_names[previous_application_n]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338857 entries, 0 to 338856
Columns: 745 entries, (SK_ID_CURR, ) to (SK_ID_PREV, median)
dtypes: float64(745)
memory usage: 1.9 GB


In [27]:
# List of column names
columns = ['SK_ID_CURR']

# Iterate through the variables names
for var in data_dict[dataset_names[previous_application_n]].columns.levels[0]:
    # Skip the id name
    if var != 'SK_ID_CURR':
        
        # Iterate through the stat names
        for stat in data_dict[dataset_names[previous_application_n]].columns.levels[1][:-1]:
            # Make a new column name for the variable and stat
            columns.append('bureau_%s_%s' % (var, stat))

In [28]:
data_dict[dataset_names[previous_application_n]].columns = columns

In [31]:
data_dict[dataset_names[train_n]] = pd.merge(X_b, data_dict[dataset_names[bureau_n]], on='SK_ID_CURR', how='left')

In [32]:
data_dict[dataset_names[train_n]] = pd.merge(data_dict[dataset_names[train_n]], data_dict[dataset_names[previous_application_n]], on='SK_ID_CURR', how='left')

In [33]:
data_dict[dataset_names[test_n]] = pd.merge(data_dict[dataset_names[test_n]], data_dict[dataset_names[bureau_n]], on='SK_ID_CURR', how='left')

In [34]:
data_dict[dataset_names[test_n]] = pd.merge(data_dict[dataset_names[test_n]], data_dict[dataset_names[previous_application_n]], on='SK_ID_CURR', how='left')

In [35]:
data_dict[dataset_names[train_n]].info(verbose=10, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49650 entries, 0 to 49649
Data columns (total 1042 columns):
ORGANIZATION_TYPE_45                          49650 non-null float64
FLAG_PHONE                                    49650 non-null float64
FLAG_DOCUMENT_3                               49650 non-null float64
FLAG_DOCUMENT_11                              49650 non-null float64
ORGANIZATION_TYPE_31                          49650 non-null float64
FLAG_DOCUMENT_8                               49650 non-null float64
NAME_INCOME_TYPE_5                            49650 non-null float64
ORGANIZATION_TYPE_16                          49650 non-null float64
ORGANIZATION_TYPE_51                          49650 non-null float64
FLAG_DOCUMENT_5                               49650 non-null float64
ORGANIZATION_TYPE_21                          49650 non-null float64
ORGANIZATION_TYPE_17                          49650 non-null float64
ORGANIZATION_TYPE_44                          49650 non-null 

In [36]:
data_dict[dataset_names[test_n]].info(verbose=10, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48744 entries, 0 to 48743
Data columns (total 1044 columns):
CODE_GENDER                                   48744 non-null float64
FLAG_CONT_MOBILE                              48744 non-null float64
FLAG_DOCUMENT_11                              48744 non-null float64
FLAG_DOCUMENT_18                              48744 non-null float64
FLAG_DOCUMENT_3                               48744 non-null float64
FLAG_DOCUMENT_4                               48744 non-null float64
FLAG_DOCUMENT_5                               48744 non-null float64
FLAG_DOCUMENT_6                               48744 non-null float64
FLAG_DOCUMENT_7                               48744 non-null float64
FLAG_DOCUMENT_8                               48744 non-null float64
FLAG_DOCUMENT_9                               48744 non-null float64
FLAG_EMAIL                                    48744 non-null float64
FLAG_EMP_PHONE                                48744 non-null 

In [37]:
data_dict[dataset_names[train_n]].fillna(0, inplace=True)

In [38]:
data_dict[dataset_names[test_n]].fillna(0, inplace=True)

In [39]:
data_dict[dataset_names[train_n]].drop('SK_ID_CURR', inplace=True, axis=1)

In [40]:
test_SK_ID_CURR = data_dict[dataset_names[test_n]].SK_ID_CURR
data_dict[dataset_names[test_n]].drop('SK_ID_CURR', inplace=True, axis=1)

In [54]:
common_features = list(set(data_dict[dataset_names[train_n]].columns) & set(data_dict[dataset_names[test_n]].columns))

# LightGBM

In [41]:
lgb_params = {
    'learning_rate': 0.1,
    'max_depth': 7,
    'num_leaves': 40, 
    'objective': 'binary',
    'tree_learner':'voting',
    'metric':'auc',
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'max_bin': 100
}

In [48]:
dtrain_lgb = lgb.Dataset(data_dict[dataset_names[train_n]], Y_b)

In [49]:
cv_result_lgb = lgb.cv(
    lgb_params,
    dtrain_lgb,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    early_stopping_rounds=50,
    verbose_eval=100,
    show_stdv=True
)

[100]	cv_agg's auc: 0.718328 + 0.0034165


In [50]:
num_boost_rounds_lgb = len(cv_result_lgb['auc-mean'])

In [51]:
print('num_boost_rounds_lgb = ' + str(num_boost_rounds_lgb))

num_boost_rounds_lgb = 131


In [52]:
# train model
model_lgb = lgb.train(lgb_params, dtrain_lgb, num_boost_round=num_boost_rounds_lgb)

In [55]:
y_pred = model_lgb.predict(data_dict[dataset_names[test_n]][common_features])

In [56]:
result = pd.DataFrame({
    'SK_ID_CURR': list(map(int, test_SK_ID_CURR)),
    'TARGET': y_pred
})

In [57]:
result.to_csv('../data/dataset/extended_features/lgb_submission.csv', index=False)

# XGBoost

In [58]:
data_dict[dataset_names[train_n]].shape

(49650, 1041)

In [59]:
data_dict[dataset_names[test_n]][common_features].shape

(48744, 1041)

In [60]:
classifier = xgb.XGBClassifier()

In [63]:
classifier.fit(data_dict[dataset_names[train_n]].values, Y_b)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [64]:
xgb_y_est = classifier.predict_proba(data_dict[dataset_names[test_n]][common_features].values)

In [65]:
xgb_y_est[:, 1].shape

(48744,)

In [66]:
result = pd.DataFrame({
    'SK_ID_CURR': list(map(int, test_SK_ID_CURR)),
    'TARGET': xgb_y_est[:, 1]
})

In [67]:
result.to_csv('../data/dataset/extended_features/xgb_submission.csv', index=False)

# Models mixture

In [None]:
voting_classifier = VotingClassifier(
    estimators=[
        ('xgb', xgb.XGBClassifier()),
        ('lgb', lgb.LGBMClassifier())
    ],
    voting='soft'
)

In [None]:
voting_classifier.fit(X_b, Y_b)

In [None]:
res_y = voting_classifier.predict_proba(data_dict[dataset_names[test_n]][common_features].values)

In [None]:
result = pd.DataFrame({
    'SK_ID_CURR': list(map(int, test_SK_ID_CURR)),
    'TARGET': res_y[:, 1]
})

In [None]:
result.to_csv('../data/dataset/extended_features/xgb_lgb_mixture_submission.csv', index=False)