# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [5]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [84]:
from sklearn.ensemble import VotingClassifier

In [85]:
import lightgbm as lgb

In [86]:
import xgboost as xgb

In [71]:
from imblearn.under_sampling import RandomUnderSampler

In [6]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [7]:
data_path = '../data/dataset/transformed/{}.csv'
profiling_path = '../data/dataset/processed/data_profiling/{}.html'
meta_path = '../data/dataset/processed/meta-info/{}.pkl'

In [8]:
QUERY_PATTERN = 'n_missing <= 0 & type == \'{}\''

# Choose random_state

In [73]:
random_state = 42

# Data reading

Firstly we just load all data into memory, then profile each dataset,

then try to filter features that are most interesting for us at the moment

(continuous, categorical, binary features without na, features with low na percentage etc.).

In [9]:
dataset_names = ['application_train', 'application_test', 'bureau', 'bureau_balance', 'credit_card_balance', 'installments_payments', 'POS_CASH_balance', 'previous_application', 'sample_submission']

In [10]:
data_dict = dict(zip(dataset_names, list(map(lambda name: pd.read_csv(filepath_or_buffer=data_path.format(name)), tqdm(dataset_names)))))

100%|██████████| 9/9 [01:25<00:00,  9.46s/it]


In [11]:
columns_description = pd.read_csv(filepath_or_buffer='../data/dataset/original/HomeCredit_columns_description.csv', encoding='ISO-8859-1', index_col=0)

# Data examples

In [12]:
name_number = 0

In [13]:
data_dict[dataset_names[name_number]].head().T # application_train

Unnamed: 0,0,1,2,3,4
CODE_GENDER,0.0,1.0,0.0,1.0,0.0
FLAG_CONT_MOBILE,1.0,1.0,1.0,1.0,1.0
FLAG_DOCUMENT_10,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_11,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_12,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_13,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_14,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_15,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_16,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_17,0.0,0.0,0.0,0.0,0.0


In [14]:
name_number = 1

In [15]:
data_dict[dataset_names[name_number]].head().T # application_test

Unnamed: 0,0,1,2,3,4
CODE_GENDER,1.0,0.0,0.0,1.0,0.0
FLAG_CONT_MOBILE,1.0,1.0,1.0,1.0,1.0
FLAG_DOCUMENT_11,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_18,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_3,1.0,1.0,0.0,1.0,1.0
FLAG_DOCUMENT_4,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_5,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_6,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_7,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_8,0.0,0.0,1.0,0.0,0.0


In [16]:
name_number = 2

In [17]:
data_dict[dataset_names[name_number]].head().T # bureau

Unnamed: 0,0,1,2,3,4
CREDIT_ACTIVE_0,0.0,1.0,1.0,1.0,1.0
CREDIT_ACTIVE_1,0.0,0.0,0.0,0.0,0.0
CREDIT_ACTIVE_2,1.0,0.0,0.0,0.0,0.0
CREDIT_ACTIVE_3,0.0,0.0,0.0,0.0,0.0
CREDIT_CURRENCY_0,1.0,1.0,1.0,1.0,1.0
CREDIT_CURRENCY_1,0.0,0.0,0.0,0.0,0.0
CREDIT_CURRENCY_2,0.0,0.0,0.0,0.0,0.0
CREDIT_CURRENCY_3,0.0,0.0,0.0,0.0,0.0
CREDIT_TYPE_0,0.0,0.0,0.0,0.0,0.0
CREDIT_TYPE_1,0.0,0.0,0.0,0.0,0.0


In [18]:
name_number = 3

In [19]:
data_dict[dataset_names[name_number]].head().T # bureau_balance

Unnamed: 0,0,1,2,3,4
STATUS_0,0.0,0.0,0.0,0.0,0.0
STATUS_1,0.0,0.0,0.0,0.0,0.0
STATUS_2,0.0,0.0,0.0,0.0,0.0
STATUS_3,0.0,0.0,0.0,0.0,0.0
STATUS_4,0.0,0.0,0.0,0.0,0.0
STATUS_5,0.0,0.0,0.0,0.0,0.0
STATUS_6,1.0,1.0,1.0,1.0,1.0
STATUS_7,0.0,0.0,0.0,0.0,0.0
MONTHS_BALANCE,0.0,-1.0,-2.0,-3.0,-4.0
SK_ID_BUREAU,5715448.0,5715448.0,5715448.0,5715448.0,5715448.0


In [20]:
name_number = 4

In [21]:
data_dict[dataset_names[name_number]].head().T # credit_card_balance

Unnamed: 0,0,1,2,3,4
NAME_CONTRACT_STATUS_0,1.0,1.0,1.0,1.0,1.0
NAME_CONTRACT_STATUS_1,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_2,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_3,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_4,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_5,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_6,0.0,0.0,0.0,0.0,0.0
AMT_BALANCE,56.97,63975.555,31815.225,236572.11,453919.455
AMT_CREDIT_LIMIT_ACTUAL,135000.0,45000.0,450000.0,225000.0,450000.0
AMT_DRAWINGS_CURRENT,877.5,2250.0,0.0,2250.0,11547.0


In [22]:
name_number = 5

In [23]:
data_dict[dataset_names[name_number]].head().T # installments_payments

Unnamed: 0,0,1,2,3,4
AMT_INSTALMENT,6948.36,1716.525,25425.0,24350.13,2165.04
DAYS_INSTALMENT,-1180.0,-2156.0,-63.0,-2418.0,-1383.0
NUM_INSTALMENT_NUMBER,6.0,34.0,1.0,3.0,2.0
NUM_INSTALMENT_VERSION,1.0,0.0,2.0,1.0,1.0
SK_ID_CURR,161674.0,151639.0,193053.0,199697.0,167756.0
SK_ID_PREV,1054186.0,1330831.0,2085231.0,2452527.0,2714724.0


In [24]:
name_number = 6

In [25]:
data_dict[dataset_names[name_number]].head().T # POS_CASH_balance

Unnamed: 0,0,1,2,3,4
NAME_CONTRACT_STATUS_0,1.0,1.0,1.0,1.0,1.0
NAME_CONTRACT_STATUS_1,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_2,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_3,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_4,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_5,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_6,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_7,0.0,0.0,0.0,0.0,0.0
NAME_CONTRACT_STATUS_8,0.0,0.0,0.0,0.0,0.0
MONTHS_BALANCE,-31.0,-33.0,-32.0,-35.0,-35.0


In [26]:
name_number = 7

In [27]:
data_dict[dataset_names[name_number]].head().T # previous_application

Unnamed: 0,0,1,2,3,4
FLAG_LAST_APPL_PER_CONTRACT,1.0,1.0,1.0,1.0,1.0
NFLAG_LAST_APPL_IN_DAY,1.0,1.0,1.0,1.0,1.0
CHANNEL_TYPE_0,0.0,0.0,0.0,0.0,0.0
CHANNEL_TYPE_1,0.0,0.0,0.0,0.0,0.0
CHANNEL_TYPE_2,0.0,0.0,0.0,0.0,0.0
CHANNEL_TYPE_3,0.0,1.0,0.0,0.0,0.0
CHANNEL_TYPE_4,1.0,0.0,0.0,0.0,0.0
CHANNEL_TYPE_5,0.0,0.0,1.0,1.0,1.0
CHANNEL_TYPE_6,0.0,0.0,0.0,0.0,0.0
CHANNEL_TYPE_7,0.0,0.0,0.0,0.0,0.0


In [28]:
name_number = 8

In [29]:
data_dict[dataset_names[name_number]].head().T # sample_submission

Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100001.0,100005.0,100013.0,100028.0,100038.0


# Info

In [30]:
name_number = 0

In [31]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # application_train

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307507 entries, 0 to 307506
Data columns (total 138 columns):
CODE_GENDER                     307507 non-null float64
FLAG_CONT_MOBILE                307507 non-null float64
FLAG_DOCUMENT_10                307507 non-null float64
FLAG_DOCUMENT_11                307507 non-null float64
FLAG_DOCUMENT_12                307507 non-null float64
FLAG_DOCUMENT_13                307507 non-null float64
FLAG_DOCUMENT_14                307507 non-null float64
FLAG_DOCUMENT_15                307507 non-null float64
FLAG_DOCUMENT_16                307507 non-null float64
FLAG_DOCUMENT_17                307507 non-null float64
FLAG_DOCUMENT_18                307507 non-null float64
FLAG_DOCUMENT_19                307507 non-null float64
FLAG_DOCUMENT_2                 307507 non-null float64
FLAG_DOCUMENT_20                307507 non-null float64
FLAG_DOCUMENT_21                307507 non-null float64
FLAG_DOCUMENT_3                 307507 non-null 

In [32]:
name_number = 1

In [33]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # application_test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 126 columns):
CODE_GENDER                     48744 non-null float64
FLAG_CONT_MOBILE                48744 non-null float64
FLAG_DOCUMENT_11                48744 non-null float64
FLAG_DOCUMENT_18                48744 non-null float64
FLAG_DOCUMENT_3                 48744 non-null float64
FLAG_DOCUMENT_4                 48744 non-null float64
FLAG_DOCUMENT_5                 48744 non-null float64
FLAG_DOCUMENT_6                 48744 non-null float64
FLAG_DOCUMENT_7                 48744 non-null float64
FLAG_DOCUMENT_8                 48744 non-null float64
FLAG_DOCUMENT_9                 48744 non-null float64
FLAG_EMAIL                      48744 non-null float64
FLAG_EMP_PHONE                  48744 non-null float64
FLAG_MOBIL                      48744 non-null float64
FLAG_OWN_CAR                    48744 non-null float64
FLAG_OWN_REALTY                 48744 non-null float64
FLAG_PHONE

In [34]:
name_number = 2

In [35]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # bureau

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 30 columns):
CREDIT_ACTIVE_0           1716428 non-null float64
CREDIT_ACTIVE_1           1716428 non-null float64
CREDIT_ACTIVE_2           1716428 non-null float64
CREDIT_ACTIVE_3           1716428 non-null float64
CREDIT_CURRENCY_0         1716428 non-null float64
CREDIT_CURRENCY_1         1716428 non-null float64
CREDIT_CURRENCY_2         1716428 non-null float64
CREDIT_CURRENCY_3         1716428 non-null float64
CREDIT_TYPE_0             1716428 non-null float64
CREDIT_TYPE_1             1716428 non-null float64
CREDIT_TYPE_2             1716428 non-null float64
CREDIT_TYPE_3             1716428 non-null float64
CREDIT_TYPE_4             1716428 non-null float64
CREDIT_TYPE_5             1716428 non-null float64
CREDIT_TYPE_6             1716428 non-null float64
CREDIT_TYPE_7             1716428 non-null float64
CREDIT_TYPE_8             1716428 non-null float64
CREDIT_TYPE_9       

In [36]:
name_number = 3

In [37]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # bureau_balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 10 columns):
STATUS_0          27299925 non-null float64
STATUS_1          27299925 non-null float64
STATUS_2          27299925 non-null float64
STATUS_3          27299925 non-null float64
STATUS_4          27299925 non-null float64
STATUS_5          27299925 non-null float64
STATUS_6          27299925 non-null float64
STATUS_7          27299925 non-null float64
MONTHS_BALANCE    27299925 non-null float64
SK_ID_BUREAU      27299925 non-null float64
dtypes: float64(10)
memory usage: 2.0 GB


In [38]:
name_number = 4

In [39]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # credit_card_balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 16 columns):
NAME_CONTRACT_STATUS_0     3840312 non-null float64
NAME_CONTRACT_STATUS_1     3840312 non-null float64
NAME_CONTRACT_STATUS_2     3840312 non-null float64
NAME_CONTRACT_STATUS_3     3840312 non-null float64
NAME_CONTRACT_STATUS_4     3840312 non-null float64
NAME_CONTRACT_STATUS_5     3840312 non-null float64
NAME_CONTRACT_STATUS_6     3840312 non-null float64
AMT_BALANCE                3840312 non-null float64
AMT_CREDIT_LIMIT_ACTUAL    3840312 non-null float64
AMT_DRAWINGS_CURRENT       3840312 non-null float64
CNT_DRAWINGS_CURRENT       3840312 non-null float64
MONTHS_BALANCE             3840312 non-null float64
SK_DPD                     3840312 non-null float64
SK_DPD_DEF                 3840312 non-null float64
SK_ID_CURR                 3840312 non-null float64
SK_ID_PREV                 3840312 non-null float64
dtypes: float64(16)
memory usage: 468.8 MB


In [40]:
name_number = 5

In [41]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # installments_payments

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 6 columns):
AMT_INSTALMENT            13605401 non-null float64
DAYS_INSTALMENT           13605401 non-null float64
NUM_INSTALMENT_NUMBER     13605401 non-null float64
NUM_INSTALMENT_VERSION    13605401 non-null float64
SK_ID_CURR                13605401 non-null float64
SK_ID_PREV                13605401 non-null float64
dtypes: float64(6)
memory usage: 622.8 MB


In [42]:
name_number = 6

In [43]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # POS_CASH_balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 14 columns):
NAME_CONTRACT_STATUS_0    10001358 non-null float64
NAME_CONTRACT_STATUS_1    10001358 non-null float64
NAME_CONTRACT_STATUS_2    10001358 non-null float64
NAME_CONTRACT_STATUS_3    10001358 non-null float64
NAME_CONTRACT_STATUS_4    10001358 non-null float64
NAME_CONTRACT_STATUS_5    10001358 non-null float64
NAME_CONTRACT_STATUS_6    10001358 non-null float64
NAME_CONTRACT_STATUS_7    10001358 non-null float64
NAME_CONTRACT_STATUS_8    10001358 non-null float64
MONTHS_BALANCE            10001358 non-null float64
SK_DPD                    10001358 non-null float64
SK_DPD_DEF                10001358 non-null float64
SK_ID_CURR                10001358 non-null float64
SK_ID_PREV                10001358 non-null float64
dtypes: float64(14)
memory usage: 1.0 GB


In [44]:
name_number = 7

In [45]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # previous_application

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 125 columns):
FLAG_LAST_APPL_PER_CONTRACT     1670214 non-null float64
NFLAG_LAST_APPL_IN_DAY          1670214 non-null float64
CHANNEL_TYPE_0                  1670214 non-null float64
CHANNEL_TYPE_1                  1670214 non-null float64
CHANNEL_TYPE_2                  1670214 non-null float64
CHANNEL_TYPE_3                  1670214 non-null float64
CHANNEL_TYPE_4                  1670214 non-null float64
CHANNEL_TYPE_5                  1670214 non-null float64
CHANNEL_TYPE_6                  1670214 non-null float64
CHANNEL_TYPE_7                  1670214 non-null float64
CODE_REJECT_REASON_0            1670214 non-null float64
CODE_REJECT_REASON_1            1670214 non-null float64
CODE_REJECT_REASON_2            1670214 non-null float64
CODE_REJECT_REASON_3            1670214 non-null float64
CODE_REJECT_REASON_4            1670214 non-null float64
CODE_REJECT_REASON_5           

In [46]:
name_number = 8

In [47]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # sample_submission

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 1 columns):
SK_ID_CURR    48744 non-null float64
dtypes: float64(1)
memory usage: 380.9 KB


# Feature description

Here at this picture the general data structure is reflected.

Lots of connections and, as a consequence, lots of hypothetial issues with data.

![Image of data scheme](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

In [53]:
dataset_names

['application_train',
 'application_test',
 'bureau',
 'bureau_balance',
 'credit_card_balance',
 'installments_payments',
 'POS_CASH_balance',
 'previous_application',
 'sample_submission']

In [52]:
data_dict[dataset_names[1]].SK_ID_CURR.nunique()

48744

In [56]:
pd.merge(data_dict[dataset_names[1]], data_dict[dataset_names[2]], on='SK_ID_CURR', how='inner').SK_ID_CURR.nunique()

42320

In [57]:
pd.merge(data_dict[dataset_names[1]], data_dict[dataset_names[6]], on='SK_ID_CURR', how='inner').SK_ID_CURR.nunique()

47808

In [58]:
pd.merge(data_dict[dataset_names[1]], data_dict[dataset_names[4]], on='SK_ID_CURR', how='inner').SK_ID_CURR.nunique()

16653

# Preprocessing

In [293]:
X = data_dict[dataset_names[0]][list(set(data_dict[dataset_names[0]].columns) & set(data_dict[dataset_names[1]].columns))]

In [294]:
Y = data_dict[dataset_names[0]]['TARGET']

In [295]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307507 entries, 0 to 307506
Columns: 124 entries, ORGANIZATION_TYPE_0 to ORGANIZATION_TYPE_33
dtypes: float64(124)
memory usage: 290.9 MB


In [296]:
Y.value_counts()

0.0    282682
1.0    24825 
Name: TARGET, dtype: int64

In [297]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X, Y)

In [298]:
print(sum(Y_b), len(Y_b) - sum(Y_b))

24825.0 24825.0


In [299]:
X_b = pd.DataFrame(X_b, columns=X.columns)

In [300]:
XY_b = X_b

In [301]:
XY_b['TARGET'] = Y

In [302]:
XY_b_ext = pd.merge(XY_b, data_dict[dataset_names[2]].groupby('SK_ID_CURR').aggregate(np.mean).reset_index(), on='SK_ID_CURR', how='inner')

In [303]:
XY_b_ext = pd.merge(XY_b_ext, data_dict[dataset_names[4]].groupby('SK_ID_CURR').aggregate(np.mean).reset_index(), on='SK_ID_CURR', how='inner')

In [304]:
XY_b_ext = pd.merge(XY_b_ext, data_dict[dataset_names[6]].groupby('SK_ID_CURR').aggregate(np.mean).reset_index(), on='SK_ID_CURR', how='inner')

In [305]:
Y_b_ext = XY_b_ext['TARGET']

In [306]:
X_b_ext = XY_b_ext[list(set(XY_b_ext.columns) - set(['TARGET', 'SK_ID_CURR']))]

In [307]:
X_b_ext.shape

(11950, 180)

In [308]:
X_test_ext = pd.merge(data_dict[dataset_names[1]][list(set(data_dict[dataset_names[0]].columns) & set(data_dict[dataset_names[1]].columns))], data_dict[dataset_names[2]].groupby('SK_ID_CURR').aggregate(np.mean).reset_index(), on='SK_ID_CURR', how='inner')

In [309]:
X_test_ext = pd.merge(X_test_ext, data_dict[dataset_names[4]].groupby('SK_ID_CURR').aggregate(np.mean).reset_index(), on='SK_ID_CURR', how='inner')

In [310]:
X_test_ext = pd.merge(X_test_ext, data_dict[dataset_names[6]].groupby('SK_ID_CURR').aggregate(np.mean).reset_index(), on='SK_ID_CURR', how='inner')

In [311]:
X_test_ext.shape

(14491, 181)

In [312]:
X_test = data_dict[dataset_names[1]][~data_dict[dataset_names[1]]['SK_ID_CURR'].isin(X_test_ext.SK_ID_CURR)]

In [313]:
X_test.shape

(34253, 126)

In [314]:
SK_ID_CURR_exten = X_test_ext.SK_ID_CURR
SK_ID_CURR_usual = X_test.SK_ID_CURR

In [315]:
X_test_ext = X_test_ext[X_b_ext.columns]

In [316]:
X_test_ext.shape

(14491, 180)

In [317]:
X_test.drop(['SK_ID_CURR'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [318]:
X_test.shape

(34253, 125)

# Models mixture

## Extended features

In [319]:
voting_classifier = VotingClassifier(
    estimators=[
        ('xgb', xgb.XGBClassifier()),
        ('lgb', lgb.LGBMClassifier())
    ],
    voting='soft')

In [320]:
voting_classifier.fit(X_b_ext.values, Y_b_ext)

VotingClassifier(estimators=[('xgb', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [321]:
ext_res_y = voting_classifier.predict_proba(X_test_ext.values)

## Usual features

In [322]:
voting_classifier = VotingClassifier(
    estimators=[
        ('xgb', xgb.XGBClassifier()),
        ('lgb', lgb.LGBMClassifier())
    ],
    voting='soft')

In [323]:
voting_classifier.fit(X_b.values, Y_b)

VotingClassifier(estimators=[('xgb', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [324]:
res_y = voting_classifier.predict_proba(X_test.values)

In [330]:
result = pd.DataFrame({
    'SK_ID_CURR': SK_ID_CURR_exten.values.tolist() + SK_ID_CURR_usual.values.tolist(),
    'TARGET': ext_res_y[:, 1].tolist() + res_y[:, 1].tolist()
})

In [331]:
result.head(30)

Unnamed: 0,SK_ID_CURR,TARGET
0,100013.0,0.916831
1,100028.0,0.910972
2,100042.0,0.940664
3,100066.0,0.932648
4,100067.0,0.929342
5,100090.0,0.948203
6,100107.0,0.904428
7,100109.0,0.965619
8,100128.0,0.949731
9,100169.0,0.935491


In [332]:
result.SK_ID_CURR = result.SK_ID_CURR.astype(np.int)

In [333]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 2 columns):
SK_ID_CURR    48744 non-null int64
TARGET        48744 non-null float64
dtypes: float64(1), int64(1)
memory usage: 761.7 KB


In [334]:
result.to_csv('../data/dataset/xgb_lgb_ext_mixture_submission.csv', index=False)