# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import load_obj
from source.code.utils import generate_binarized_pipeline
from source.code.utils import generate_encoded_pipeline

from source.code.ItemSelector import ItemSelector

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

In [4]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [5]:
data_path = '../data/dataset/processed/{}.csv'
profiling_path = '../data/dataset/processed/data_profiling/{}.html'
meta_path = '../data/dataset/processed/meta-info/{}.pkl'

# Choose random_state

In [6]:
random_state = 42

# Data reading

## Datasets

In [7]:
dataset_names = ['application_train', 'application_test']

In [8]:
data_dict = dict(zip(dataset_names, list(map(lambda name: pd.read_csv(filepath_or_buffer=data_path.format(name)), tqdm(dataset_names)))))

100%|██████████| 2/2 [00:02<00:00,  1.01s/it]


## Columns needed

In [9]:
datasets_num_features = load_obj(meta_path.format('datasets_num_features'))

In [10]:
datasets_cat_features = load_obj(meta_path.format('datasets_cat_features'))

In [11]:
datasets_bin_features = load_obj(meta_path.format('datasets_bin_features'))

In [12]:
commom_categories = load_obj(meta_path.format('commom_categories'))

# Preprocessing

In [13]:
for category in tqdm(commom_categories):
    data_dict[dataset_names[0]] = data_dict[dataset_names[0]][data_dict[dataset_names[0]][category].isin(commom_categories[category])]

100%|██████████| 6/6 [00:00<00:00, 11.27it/s]


So far we just generate features only from train and test, without NaNs imputing and other tricks:

In [14]:
common_num_features = list(set(datasets_num_features[dataset_names[0]]) & set(datasets_num_features[dataset_names[1]]))
common_cat_features = list(set(datasets_cat_features[dataset_names[0]]) & set(datasets_cat_features[dataset_names[1]]))
common_bin_features = list(set(datasets_bin_features[dataset_names[0]]) & set(datasets_bin_features[dataset_names[1]]))

In [15]:
X = data_dict[dataset_names[0]][common_num_features + common_cat_features + common_bin_features]

In [16]:
Y = data_dict[dataset_names[0]]['TARGET']

In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307500 entries, 0 to 307506
Data columns (total 42 columns):
DAYS_BIRTH                     307500 non-null int64
AMT_INCOME_TOTAL               307500 non-null float64
REGION_POPULATION_RELATIVE     307500 non-null float64
CNT_CHILDREN                   307500 non-null int64
SK_ID_CURR                     307500 non-null int64
DAYS_EMPLOYED                  307500 non-null int64
DAYS_ID_PUBLISH                307500 non-null int64
DAYS_REGISTRATION              307500 non-null float64
HOUR_APPR_PROCESS_START        307500 non-null int64
AMT_CREDIT                     307500 non-null float64
REGION_RATING_CLIENT           307500 non-null int64
NAME_EDUCATION_TYPE            307500 non-null object
WEEKDAY_APPR_PROCESS_START     307500 non-null object
NAME_FAMILY_STATUS             307500 non-null object
NAME_INCOME_TYPE               307500 non-null object
NAME_HOUSING_TYPE              307500 non-null object
ORGANIZATION_TYPE           

In [18]:
Y.value_counts()

0    282677
1    24823 
Name: TARGET, dtype: int64

Classes are unbalanced.

This has to be fixed.

But before that we need to binarize categorical features:

In [19]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_encoded_pipeline, common_cat_features)) +\
        [('all_other_features', Pipeline([('choose', ItemSelector(common_num_features + common_bin_features))]))]
    ))
])

In [20]:
X_tr = pd.DataFrame(pipeline.fit_transform(X))

In [21]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307500 entries, 0 to 307499
Data columns (total 42 columns):
0     307500 non-null float64
1     307500 non-null float64
2     307500 non-null float64
3     307500 non-null float64
4     307500 non-null float64
5     307500 non-null float64
6     307500 non-null float64
7     307500 non-null float64
8     307500 non-null float64
9     307500 non-null float64
10    307500 non-null float64
11    307500 non-null float64
12    307500 non-null float64
13    307500 non-null float64
14    307500 non-null float64
15    307500 non-null float64
16    307500 non-null float64
17    307500 non-null float64
18    307500 non-null float64
19    307500 non-null float64
20    307500 non-null float64
21    307500 non-null float64
22    307500 non-null float64
23    307500 non-null float64
24    307500 non-null float64
25    307500 non-null float64
26    307500 non-null float64
27    307500 non-null float64
28    307500 non-null float64
29    307500 non-nu

In [22]:
test = data_dict[dataset_names[1]][common_num_features + common_cat_features + common_bin_features]

In [23]:
test_tr = pd.DataFrame(pipeline.transform(test))

In [24]:
test_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 42 columns):
0     48744 non-null float64
1     48744 non-null float64
2     48744 non-null float64
3     48744 non-null float64
4     48744 non-null float64
5     48744 non-null float64
6     48744 non-null float64
7     48744 non-null float64
8     48744 non-null float64
9     48744 non-null float64
10    48744 non-null float64
11    48744 non-null float64
12    48744 non-null float64
13    48744 non-null float64
14    48744 non-null float64
15    48744 non-null float64
16    48744 non-null float64
17    48744 non-null float64
18    48744 non-null float64
19    48744 non-null float64
20    48744 non-null float64
21    48744 non-null float64
22    48744 non-null float64
23    48744 non-null float64
24    48744 non-null float64
25    48744 non-null float64
26    48744 non-null float64
27    48744 non-null float64
28    48744 non-null float64
29    48744 non-null float64
30    48744 non-null 

In [25]:
print(len(X_tr), len(Y))

307500 307500


Now we can balance classes.

Here we use the simplest way to do it (because of time & memory issues):

In [26]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X_tr, Y)

In [27]:
print(sum(Y_b), len(Y_b))

24823 49646


# Baseline-модель

In [28]:
cross_val_score(BaggingClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.33359517, 0.33417936, 0.33025139])

In [29]:
cross_val_score(RandomForestClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.33667674, 0.34535896, 0.33460237])

In [30]:
cross_val_score(ExtraTreesClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.37220544, 0.44216824, 0.38343002])

In [31]:
baseline_classifier = RandomForestClassifier(random_state=random_state)

In [32]:
baseline_classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [33]:
baseline_y_est = baseline_classifier.predict_proba(test_tr)

In [34]:
baseline_y_est[:, 1].shape

(48744,)

In [35]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': baseline_y_est[:, 1]
})

In [36]:
result.to_csv('../data/dataset/first_approach/baseline_submission.csv', index=False)

# Let's tune our hyperparameters

# ExtraTreesClassifier

In [37]:
ExtraTreesClassifier().get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [38]:
e_t_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [39]:
e_t_grid = GridSearchCV(
    ExtraTreesClassifier(random_state=random_state),
    param_grid=e_t_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [40]:
e_t_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.3722054380664653, total=   1.2s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.38343002175489493, total=   1.2s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.442168237853517, total=   1.4s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.3406646525679758, total=   1.6s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ....

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.4s


[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.35381919265167994, total=   1.6s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.43817984046410446, total=   2.1s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.35232628398791543, total=   2.6s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.36167512690355336, total=   2.2s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.33673716012084587, total=   2.8s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.2s


[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.4340705825477399, total=   2.8s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.34711143340584966, total=   2.6s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.3752870090634441, total=   1.1s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.34404833836858006, total=   3.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.4471839497220208, total=   1.4s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.3571428571428571, total=   1.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.3s


[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.3543630650229635, total=   3.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.3391540785498489, total=   2.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.3329707517524776, total=   1.9s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.43382886149383615, total=   2.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.35262839879154073, total=   2.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.3486221899927484, total=   2.3s
[CV] bootstrap=False, criterion=entropy, n_estimator

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.2s


[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.339214501510574, total=   2.8s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.43020304568527923, total=   3.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.330190959632584, total=   3.2s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.34640483383685794, total=   3.4s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.3780060422960725, total=   0.7s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.4253686246072033, total=   3.2s
[CV] bootstrap=True, criterion=gini, n_estimators=15 ..

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   22.3s


[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.3416727096930142, total=   3.7s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.3430021754894851, total=   1.0s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.3555287009063445, total=   1.3s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.4467005076142132, total=   1.4s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.3575054387237129, total=   1.3s
[CV] bootstrap=True, criterion=gini, n_estimators=30 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=25, score=0.338429003021148, total=   1.6s
[CV] bootstrap=True, criterion=gini, n_estimators=30 .................
[

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.1s


[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.3691238670694864, total=   0.9s
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.35015105740181274, total=   2.2s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.43080734832003864, total=   2.2s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.4708121827411168, total=   0.8s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.3662678269277254, total=   1.0s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.3411288373217307, total=   2.5s
[CV] bootstrap=True, criterion=entropy, n_estimators=20 ........

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   31.2s


[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.3342598187311179, total=   2.7s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.33375634517766495, total=   2.3s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.41630408508581096, total=   2.3s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.34157099697885196, total=   2.8s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.4262750785593425, total=   2.0s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.3451172347111433, total=   1.6s


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   35.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   35.8s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [41]:
e_t_grid.best_params_

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 10}

In [42]:
e_t_grid.best_score_

0.4051685936429924

In [43]:
e_t_classifier = ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)

In [44]:
e_t_classifier.fit(X_b, Y_b)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [45]:
e_t_y_est = e_t_classifier.predict_proba(test_tr)

In [46]:
e_t_y_est[:, 1].shape

(48744,)

In [47]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': e_t_y_est[:, 1]
})

In [48]:
result.to_csv('../data/dataset/first_approach/e_t_grid_search_cv_submission.csv', index=False)

# RandomForestClassifier

In [49]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [50]:
r_f_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [51]:
r_f_grid = GridSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_grid=r_f_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [52]:
r_f_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.35200628474740153, total=   1.7s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.3329305135951661, total=   1.8s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.33768431230360163, total=   1.9s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.30398791540785497, total=   2.7s
[CV] bootstrap=False, criterion=gini, n_estimators=20 .

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.3s


[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.300700991056321, total=   2.6s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.32132930513595165, total=   4.2s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.3371404399323181, total=   4.0s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.3162919990331158, total=   3.7s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.3037462235649547, total=   4.5s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    9.8s


[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.31725888324873097, total=   3.6s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.2987067923616147, total=   4.4s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.3141389728096677, total=   5.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.332749244712991, total=   2.6s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.3306139714769156, total=   5.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.31067198452985256, total=   5.3s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 ........

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   16.6s


[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.33025138989605995, total=   2.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.30006042296072505, total=   3.9s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.3155064056079284, total=   3.4s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.29804205946337925, total=   3.7s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.316797583081571, total=   4.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.3300700991056321, total=   4.7s
[CV] bootstrap=False, criterion=entropy, n_estima

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   24.9s


[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.2990936555891238, total=   6.0s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.31260575296108295, total=   5.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.29701474498428815, total=   5.8s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.3366767371601208, total=   0.9s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.3110574018126888, total=   7.0s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.34535895576504716, total=   1.0s
[CV] bootstrap=True, criterion=gini, n_estimators=15 ...

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   34.6s


[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.3155064056079284, total=   1.5s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.2996736765772299, total=   1.5s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.3063814358230602, total=   7.4s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.32006042296072507, total=   2.1s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.3338772057046169, total=   2.0s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.3156876963983563, total=   1.9s
[CV] bootstrap=True, criterion=gini, n_estimators=30 .................

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.5s


[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.3109365558912387, total=   3.7s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.3355287009063444, total=   1.8s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.32825719120135366, total=   3.3s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.30855692530819434, total=   3.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.34977036499879144, total=   1.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.33224558859076625, total=   1.8s
[CV] bootstrap=True, criterion=entropy, n_estimators=20 ......

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   48.8s


[CV]  bootstrap=True, criterion=entropy, n_estimators=20, score=0.3134517766497462, total=   2.9s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.29842900302114805, total=   4.0s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.31218274111675126, total=   4.0s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.2932680686487793, total=   4.0s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.31178247734138975, total=   4.5s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.3248126661832245, total=   4.3s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.304205946337926, total=   3.7s


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   55.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   55.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [53]:
r_f_grid.best_params_

{'bootstrap': False, 'criterion': 'gini', 'n_estimators': 10}

In [54]:
r_f_grid.best_score_

0.34087338355557345

In [55]:
r_f_classifier = RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)

In [56]:
r_f_classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [57]:
r_f_y_est = r_f_classifier.predict_proba(test_tr)

In [58]:
r_f_y_est[:, 1].shape

(48744,)

In [59]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': r_f_y_est[:, 1]
})

In [60]:
result.to_csv('../data/dataset/first_approach/r_f_grid_search_cv_submission.csv', index=False)

# BaggingClassifier

In [61]:
BaggingClassifier().get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [62]:
bagging_params_grid = {
    'bootstrap': [True],
    'bootstrap_features': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'oob_score': [False, True]
}

In [63]:
bagging_grid = GridSearchCV(
    BaggingClassifier(random_state=random_state),
    param_grid=bagging_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [64]:
bagging_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False 
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False 
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False 
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False, score=0.33417935702199664, total=   5.5s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False, score=0.333595166163142, total=   6.3s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True, score=0.333595166163142, total=   6.2s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=False, score=0.33025138989605995, total=   6.9s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True, score=0.33417935702199664, total=   6.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.3s


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=10, oob_score=True, score=0.33025138989605995, total=   5.7s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False, score=0.30160744500846026, total=   7.4s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False, score=0.298368580060423, total=   8.4s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=False, score=0.2997945371041818, total=   8.3s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True, score=0.298368580060423, total=   9.0s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False 


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   22.3s
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True, score=0.30160744500846026, total=   8.6s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=15, oob_score=True, score=0.2997945371041818, total=   9.3s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False, score=0.31703927492447126, total=  10.6s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False, score=0.3181653372008702, total=  10.4s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=False, score=0.3149625332366449, total=  11.0s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True, score=0.31703927492447126, total=  10.8s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True, score=0.3181653372008702, total=  11.1s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False 


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   43.9s
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=20, oob_score=True, score=0.3149625332366449, total=  11.3s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False, score=0.29852550157118685, total=  12.8s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False, score=0.2982477341389728, total=  14.2s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=False, score=0.29610829103214886, total=  14.6s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=25, oob_score=True, score=0.2982477341389728, total=  14.7s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_s

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.1min


[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False, score=0.31000725163161713, total=  15.0s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False, score=0.3113595166163142, total=  17.0s
[CV] bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=False, score=0.30759004109257915, total=  17.4s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=False, n_estimators=30, oob_score=True, score=0.3113595166163142, total=  17.8s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=False, score=0.3410876132930514, total=   6.7s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=10, oob_sco

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=True, score=0.3410876132930514, total=   7.5s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=True, score=0.3425791636451535, total=   6.7s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=10, oob_score=True, score=0.3396785109983079, total=   6.4s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False, score=0.3065861027190332, total=   9.6s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False, score=0.30662315687696395, total=   9.2s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=False, score=0.3058979937152526, total=   9.0s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True, score=0.3065861027190332, total=   9.9s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True, score=0.30662315687696395, total=   9.8s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=15, oob_score=True, score=0.3058979937152526, total=  10.1s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True 


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False, score=0.33202416918429, total=  13.7s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False, score=0.3329103214890017, total=  12.5s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=False, score=0.3306139714769156, total=  13.3s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True, score=0.33202416918429, total=  13.1s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True, score=0.3329103214890017, total=  13.6s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False 


  warn("Some inputs do not have OOB scores. "
  predictions.sum(axis=1)[:, np.newaxis])


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=20, oob_score=True, score=0.3306139714769156, total=  13.8s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False, score=0.31045317220543805, total=  15.9s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False, score=0.31067198452985256, total=  16.0s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=False, score=0.3065022963500121, total=  15.6s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True, score=0.31045317220543805, total=  16.0s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=Fals

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.8min


[CV]  bootstrap=True, bootstrap_features=True, n_estimators=25, oob_score=True, score=0.3065022963500121, total=  15.7s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False, score=0.3242900302114804, total=  18.1s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False, score=0.32704858593183467, total=  16.9s
[CV] bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True 
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=False, score=0.323120618805898, total=  19.0s
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True, score=0.3242900302114804, total=  19.1s
[CV]  bootstrap=True, bootstrap_features=True, n_estimators=30, oob_score=True, score=0.32704858593183467, total=  16.4s
[CV]  bootstrap=True, bootstrap_feature

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'bootstrap_features': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'oob_score': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [65]:
bagging_grid.best_params_

{'bootstrap': True,
 'bootstrap_features': True,
 'n_estimators': 10,
 'oob_score': False}

In [66]:
bagging_grid.best_score_

0.34111509487169156

In [67]:
bagging_classifier = BaggingClassifier(random_state=random_state, **bagging_grid.best_params_)

In [68]:
bagging_classifier.fit(X_b, Y_b)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=True, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False)

In [69]:
bagging_y_est = bagging_classifier.predict_proba(test_tr)

In [70]:
bagging_y_est[:, 1].shape

(48744,)

In [71]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': bagging_y_est[:, 1]
})

In [72]:
result.to_csv('../data/dataset/first_approach/bagging_grid_search_cv_submission.csv', index=False)

# Models mixture

In [73]:
voting_classifier = VotingClassifier(
    estimators=[
        ('baseline', RandomForestClassifier(random_state=random_state)),
        ('extra trees', ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)),
        ('random forest', RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)),
        ('bagging', BaggingClassifier(random_state=random_state, **bagging_grid.best_params_))
    ],
    voting='soft')

In [74]:
voting_classifier.fit(X_b, Y_b)

VotingClassifier(estimators=[('baseline', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [75]:
res_y = voting_classifier.predict_proba(test_tr)

In [76]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': res_y[:, 1]
})

In [77]:
result.to_csv('../data/dataset/first_approach/mixture_submission.csv', index=False)