# Все необходимые импорты

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.ItemSelector import ItemSelector
from source.code.MyLabelBinarizer import MyLabelBinarizer

In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

# Выбрать random_state

In [4]:
random_state = 42

# Вспомогательные функции

In [5]:
def read_columns(path):
    with open(path, 'r') as file:
        columns = file.readlines()
        columns = list(map(str.rstrip, columns))
    return columns

In [6]:
def generate_pipeline(column):
    return  (column, Pipeline([
        ('choose', ItemSelector(column)),
        ('binarize', MyLabelBinarizer())
    ]))

# Чтение данных

In [7]:
train = pd.read_csv('../data/dataset/application_train.csv')

In [8]:
train = train[train.CODE_GENDER != 'XNA']

In [9]:
train = train[train.NAME_FAMILY_STATUS != 'Unknown']

In [10]:
train = train[train.NAME_INCOME_TYPE != 'Maternity leave']

In [11]:
test = pd.read_csv('../data/dataset/application_test.csv')

In [12]:
na_columns = read_columns('../data/train_test_na_columns.txt')

In [13]:
highly_correlated_columns = read_columns('../data/train_test_highly_correlated_columns.txt')

In [14]:
categorical_columns = read_columns('../data/train_test_categorical_columns.txt')

In [15]:
binary_columns = read_columns('../data/train_test_binary_columns.txt')

# Предобработка

Пока что попробуем обойтись только этими данными, без хитростей с заполнениями пропусков:

In [16]:
train = train[list(set(train.columns) - (set(na_columns) | set(highly_correlated_columns)))]

In [17]:
test = test[list(set(test.columns) - (set(na_columns) | set(highly_correlated_columns)))]

In [18]:
continuous_columns = list(
    set(test.columns) - (set(binary_columns) | set(categorical_columns) | set(['SK_ID_CURR']))
)

In [19]:
X = train[test.columns]

In [20]:
Y = train['TARGET']

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307500 entries, 0 to 307510
Data columns (total 53 columns):
NAME_CONTRACT_TYPE             307500 non-null object
FLAG_DOCUMENT_10               307500 non-null int64
FLAG_DOCUMENT_13               307500 non-null int64
FLAG_DOCUMENT_9                307500 non-null int64
FLAG_DOCUMENT_2                307500 non-null int64
NAME_FAMILY_STATUS             307500 non-null object
FLAG_DOCUMENT_15               307500 non-null int64
AMT_INCOME_TOTAL               307500 non-null float64
FLAG_DOCUMENT_6                307500 non-null int64
FLAG_DOCUMENT_11               307500 non-null int64
FLAG_DOCUMENT_20               307500 non-null int64
REG_REGION_NOT_LIVE_REGION     307500 non-null int64
HOUR_APPR_PROCESS_START        307500 non-null int64
FLAG_DOCUMENT_8                307500 non-null int64
AMT_CREDIT                     307500 non-null float64
SK_ID_CURR                     307500 non-null int64
FLAG_DOCUMENT_5                3075

In [22]:
Y.value_counts()

0    282677
1     24823
Name: TARGET, dtype: int64

Классы не сбалансированы.

Надо поправить.

Перед этим надо бинаризовать все поля, где есть символьные обозначения:

In [23]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, binary_columns)) + \
        list(map(generate_pipeline, categorical_columns)) + \
        [('all_other_features', Pipeline([('choose', ItemSelector(continuous_columns))]))]
    ))
])

In [24]:
X_tr = pd.DataFrame(pipeline.fit_transform(X))

In [25]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307500 entries, 0 to 307499
Columns: 134 entries, 0 to 133
dtypes: float64(134)
memory usage: 314.4 MB


In [26]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, binary_columns)) + \
        list(map(generate_pipeline, categorical_columns)) + \
        [('all_other_features', Pipeline([('choose', ItemSelector(continuous_columns))]))]
    ))
])

In [27]:
test_tr = pd.DataFrame(pipeline.fit_transform(test))

In [28]:
test_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Columns: 134 entries, 0 to 133
dtypes: float64(134)
memory usage: 49.8 MB


In [29]:
print(len(X_tr), len(Y))

307500 307500


Тперь можно попробовать и сбалансировать:

In [30]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X_tr, Y)

In [31]:
print(sum(Y_b), len(Y_b))

24823 49646


# Baseline-модель

In [32]:
cross_val_score(BaggingClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.57546828, 0.58079526, 0.5786802 ])

In [33]:
cross_val_score(RandomForestClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.57691843, 0.57221416, 0.5754774 ])

In [34]:
cross_val_score(ExtraTreesClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.58114804, 0.56538555, 0.57596084])

In [35]:
baseline_classifier = RandomForestClassifier(random_state=random_state)

In [36]:
baseline_classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [37]:
baseline_y_est = baseline_classifier.predict_proba(test_tr)

In [38]:
baseline_y_est[:, 1].shape

(48744,)

In [39]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': baseline_y_est[:, 1]
})

In [40]:
result.to_csv('../data/dataset/baseline_submission.csv', index=False)

# Потюним гиперпараметры

# ExtraTreesClassifier

In [41]:
ExtraTreesClassifier().get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [42]:
e_t_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [43]:
e_t_grid = GridSearchCV(
    ExtraTreesClassifier(random_state=random_state),
    param_grid=e_t_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [44]:
e_t_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5811480362537764, total=   2.1s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5759608411892676, total=   2.2s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5653855450809765, total=   2.3s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.5809063444108761, total=   3.0s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ....

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.3s


[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.5806139714769157, total=   3.6s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.58404833836858, total=   4.0s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.5817017162194827, total=   4.2s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.587321730722746, total=   3.7s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5905849649504471, total=   4.6s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.5s


[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.59202416918429, total=   5.5s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5881073241479333, total=   5.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.572809667673716, total=   2.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5929909365558913, total=   6.7s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5907662557408749, total=   6.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5925187333816776, total=   6.4s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 ............

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   20.3s


[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.5726976069615664, total=   3.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5837462235649546, total=   3.6s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5794657964708726, total=   3.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5754169688179841, total=   4.1s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.5865256797583082, total=   4.7s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.5842397872854725, total=   4.8s
[CV] bootstrap=False, criterion=entropy, n_estimato

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   29.5s


[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5905135951661632, total=   5.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5818830070099106, total=   6.9s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5866569978245105, total=   6.3s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.5941993957703927, total=   7.1s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.5710574018126887, total=   1.5s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.5852066715010877, total=   7.3s
[CV] bootstrap=True, criterion=gini, n_estimators=15 ..

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   41.7s


[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.5817621464829587, total=   2.3s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.5789823543630651, total=   2.1s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.5891346386270244, total=   7.0s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5855692530819434, total=   2.7s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5831419939577039, total=   3.3s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5836354846507131, total=   3.0s
[CV] bootstrap=True, criterion=gini, n_estimators=30 .................


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.1s


[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5893655589123867, total=   4.5s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5934251873338168, total=   4.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.576797583081571, total=   2.2s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5691926516799614, total=   1.7s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5697969543147208, total=   1.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5893159294174523, total=   4.7s
[CV] bootstrap=True, criterion=entropy, n_estimators=20 ...........

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   59.5s


[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5919637462235648, total=   3.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5888929175731206, total=   3.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5926395939086294, total=   3.7s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.5964954682779456, total=   4.2s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.5937273386511965, total=   3.8s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.5917331399564902, total=   2.8s


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [45]:
e_t_grid.best_params_

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 30}

In [46]:
e_t_grid.best_score_

0.5939854167505942

In [47]:
e_t_classifier = ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)

In [48]:
e_t_classifier.fit(X_b, Y_b)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [49]:
e_t_y_est = e_t_classifier.predict_proba(test_tr)

In [50]:
e_t_y_est[:, 1].shape

(48744,)

In [51]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': e_t_y_est[:, 1]
})

In [52]:
result.to_csv('../data/dataset/e_t_grid_search_cv_submission.csv', index=False)

# RandomForestClassifier

In [53]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [54]:
r_f_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [55]:
r_f_grid = GridSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_grid=r_f_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [56]:
r_f_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5803021148036254, total=   2.2s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5755378293449359, total=   2.1s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5774111675126904, total=   2.6s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.5868277945619335, total=   3.4s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ....

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.2s


[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.5844815083393764, total=   3.5s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.5919033232628399, total=   4.6s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.5938481991781485, total=   4.3s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.5885907662557408, total=   4.1s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5961933534743201, total=   5.3s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.4s


[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5991660623640319, total=   6.1s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5960236886632825, total=   5.3s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.5794561933534743, total=   2.8s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.6010876132930513, total=   6.7s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5989847715736041, total=   6.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.5738457819676094, total=   2.9s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 ......

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   21.5s


[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5964467005076143, total=   6.8s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5891842900302116, total=   4.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5868382886149384, total=   4.2s
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5861735557167029, total=   4.7s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.5930513595166164, total=   5.4s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.5897389412617839, total=   5.4s
[CV] bootstrap=False, criterion=entropy, n_estimators=

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   31.7s


[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.6004833836858006, total=   7.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5948755136572396, total=   7.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5963862702441383, total=   6.8s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.5769184290030211, total=   1.4s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.5722141648537588, total=   1.4s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.6056797583081571, total=   8.1s
[CV] bootstrap=True, criterion=gini, n_estimators=15 ......

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   43.4s


[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.584964950447184, total=   2.3s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.5830311820159536, total=   2.2s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.5948755136572396, total=   9.1s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.596012084592145, total=   2.6s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5914914189025864, total=   2.7s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5882281846748851, total=   2.7s
[CV] bootstrap=True, criterion=gini, n_estimators=30 .................
[C

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.9s


[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5795166163141995, total=   1.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5988519637462235, total=   4.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.6032753202803964, total=   4.2s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5710659898477157, total=   1.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5946942228668116, total=   3.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5755982596084119, total=   2.0s
[CV] bootstrap=True, criterion=entropy, n_estimators=20 ..........

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.1min


[CV]  bootstrap=True, criterion=entropy, n_estimators=20, score=0.5921561518008218, total=   4.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5975226586102719, total=   5.3s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5995286439448876, total=   5.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5972322939328015, total=   5.4s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.6029003021148036, total=   6.3s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.602671017645637, total=   5.6s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.6006768189509306, total=   5.1s


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [57]:
r_f_grid.best_params_

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 30}

In [58]:
r_f_grid.best_score_

0.6020827458405511

In [59]:
r_f_classifier = RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)

In [60]:
r_f_classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [61]:
r_f_y_est = r_f_classifier.predict_proba(test_tr)

In [62]:
r_f_y_est[:, 1].shape

(48744,)

In [63]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': r_f_y_est[:, 1]
})

In [64]:
result.to_csv('../data/dataset/r_f_grid_search_cv_submission.csv', index=False)

# BaggingClassifier

In [65]:
BaggingClassifier().get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [66]:
bagging_params_grid = {
    'bootstrap': [True],
    'bootstrap_features': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'oob_score': [False, True]
}

In [67]:
bagging_grid = GridSearchCV(
    BaggingClassifier(random_state=random_state),
    param_grid=bagging_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [68]:
bagging_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False 
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False 
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False 
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False, score=0.5807952622673436, total=  10.4s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False, score=0.5786802030456852, total=  11.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True, score=0.5754682779456194, total=  11.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False, score=0.5754682779456194, total=  11.4s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True, score=0.5807952622673436, total=   9.7s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False 


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   22.6s


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True, score=0.5786802030456852, total=  10.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False, score=0.5853172205438066, total=  15.7s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False, score=0.5887116267826927, total=  16.0s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False, score=0.5915518491660624, total=  17.0s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True, score=0.5853172205438066, total=  17.3s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False 


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   42.6s
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True, score=0.5915518491660624, total=  17.2s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True, score=0.5887116267826927, total=  18.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False, score=0.5906344410876133, total=  24.4s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False, score=0.5903432438965435, total=  23.8s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False, score=0.5940899202320522, total=  24.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True, score=0.5906344410876133, total=  23.9s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True, score=0.5903432438965435, total=  22.9s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False 


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.6min


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True, score=0.5940899202320522, total=  22.9s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False, score=0.5989728096676737, total=  26.5s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False, score=0.5951776649746193, total=  26.7s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False, score=0.5998307952622673, total=  25.9s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True, score=0.5989728096676737, total=  26.5s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_sco

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.3min


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False, score=0.5999395770392749, total=  31.8s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False, score=0.6012206913222141, total=  31.6s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False, score=0.5989243413101281, total=  33.3s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=True, score=0.5999395770392749, total=  34.9s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=False, score=0.5774622356495468, total=  11.3s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.3min
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=True, score=0.5774622356495468, total=  11.6s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=True, score=0.577834179357022, total=  12.4s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=True, score=0.5707034082668601, total=  11.4s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False, score=0.583987915407855, total=  16.5s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False, score=0.5853275320280397, total=  16.3s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False, score=0.5810974135847232, total=  16.7s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True, score=0.583987915407855, total=  17.2s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True, score=0.5853275320280397, total=  17.9s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True, score=0.5810974135847232, total=  17.6s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True 


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.0min


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False, score=0.587915407854985, total=  22.3s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False, score=0.590705825477399, total=  22.3s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False, score=0.5881073241479333, total=  25.0s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True, score=0.587915407854985, total=  26.1s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True, score=0.590705825477399, total=  27.0s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True, score=0.5881073241479333, total=  27.2s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False, score=0.590453172205438, total=  30.8s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False, score=0.5960841189267585, total=  30.1s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False, score=0.5926395939086294, total=  27.2s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False 
[C

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.5min


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True, score=0.5926395939086294, total=  27.8s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False, score=0.5954078549848942, total=  34.4s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False, score=0.5960841189267585, total=  35.1s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False, score=0.5995890742083635, total=  36.0s
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True, score=0.5954078549848942, total=  37.1s
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True, score=0.5960841189267585, total=  31.0s
[CV]  bootstrap=True, bootstrap_features

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  6.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  6.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'bootstrap_features': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'oob_score': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [69]:
bagging_grid.best_params_

{'bootstrap': True,
 'bootstrap_features': False,
 'n_estimators': 30,
 'oob_score': False}

In [70]:
bagging_grid.best_score_

0.6000281996535471

In [73]:
bagging_classifier = BaggingClassifier(random_state=random_state, **bagging_grid.best_params_)

In [74]:
bagging_classifier.fit(X_b, Y_b)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=30, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False)

In [75]:
bagging_y_est = bagging_classifier.predict_proba(test_tr)

In [76]:
bagging_y_est[:, 1].shape

(48744,)

In [77]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': bagging_y_est[:, 1]
})

In [78]:
result.to_csv('../data/dataset/bagging_grid_search_cv_submission.csv', index=False)

# Models mixture

In [79]:
res_y = (bagging_y_est[:, 1] + r_f_y_est[:, 1] + e_t_y_est[:, 1] + baseline_y_est[:, 1]) / 4

In [80]:
res_y

array([0.39166667, 0.46666667, 0.40833333, ..., 0.425     , 0.45833333,
       0.50833333])

In [81]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': res_y
})

In [82]:
result.to_csv('../data/dataset/mixture_submission.csv', index=False)