# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import load_obj
from source.code.utils import generate_pipeline

from source.code.ItemSelector import ItemSelector

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

In [4]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [5]:
data_path = '../data/dataset/processed/{}.csv'
profiling_path = '../data/dataset/processed/data_profiling/{}.html'
meta_path = '../data/dataset/processed/meta-info/{}.pkl'

# Choose random_state

In [6]:
random_state = 42

# Data reading

## Datasets

In [7]:
dataset_names = ['application_train', 'application_test']

In [8]:
data_dict = dict(zip(dataset_names, list(map(lambda name: pd.read_csv(filepath_or_buffer=data_path.format(name)), tqdm(dataset_names)))))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.23it/s]


## Columns needed

In [9]:
datasets_num_features = load_obj(meta_path.format('datasets_num_features'))

In [10]:
datasets_cat_features = load_obj(meta_path.format('datasets_cat_features'))

In [11]:
datasets_bin_features = load_obj(meta_path.format('datasets_bin_features'))

# Common train & test categories

In [12]:
commom_categories = load_obj(meta_path.format('commom_categories'))

# Preprocessing

In [13]:
for category in tqdm(commom_categories):
    data_dict[dataset_names[0]] = data_dict[dataset_names[0]][data_dict[dataset_names[0]][category].isin(commom_categories[category])]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 11.79it/s]


So far we just generate features only from train and test, without NaNs imputing and other tricks:

In [14]:
common_num_features = list(set(datasets_num_features[dataset_names[0]]) & set(datasets_num_features[dataset_names[1]]))
common_cat_features = list(set(datasets_cat_features[dataset_names[0]]) & set(datasets_cat_features[dataset_names[1]]))
common_bin_features = list(set(datasets_bin_features[dataset_names[0]]) & set(datasets_bin_features[dataset_names[1]]))

In [15]:
X = data_dict[dataset_names[0]][common_num_features + common_cat_features + common_bin_features]

In [16]:
Y = data_dict[dataset_names[0]]['TARGET']

In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307500 entries, 0 to 307506
Data columns (total 42 columns):
AMT_CREDIT                     307500 non-null float64
AMT_INCOME_TOTAL               307500 non-null float64
REGION_POPULATION_RELATIVE     307500 non-null float64
SK_ID_CURR                     307500 non-null int64
HOUR_APPR_PROCESS_START        307500 non-null int64
CNT_CHILDREN                   307500 non-null int64
DAYS_ID_PUBLISH                307500 non-null int64
DAYS_EMPLOYED                  307500 non-null int64
REGION_RATING_CLIENT           307500 non-null int64
DAYS_REGISTRATION              307500 non-null float64
DAYS_BIRTH                     307500 non-null int64
NAME_FAMILY_STATUS             307500 non-null object
NAME_EDUCATION_TYPE            307500 non-null object
NAME_HOUSING_TYPE              307500 non-null object
ORGANIZATION_TYPE              307500 non-null object
WEEKDAY_APPR_PROCESS_START     307500 non-null object
NAME_INCOME_TYPE            

In [18]:
Y.value_counts()

0    282677
1    24823 
Name: TARGET, dtype: int64

In [19]:
cat_counts = dict(zip(common_cat_features, list(map(lambda cat: data_dict[dataset_names[0]][cat].nunique(), common_cat_features))))

In [20]:
cat_counts

{'NAME_EDUCATION_TYPE': 5,
 'NAME_FAMILY_STATUS': 5,
 'NAME_HOUSING_TYPE': 6,
 'NAME_INCOME_TYPE': 7,
 'ORGANIZATION_TYPE': 58,
 'WEEKDAY_APPR_PROCESS_START': 7}

Classes are unbalanced.

This has to be fixed.

But before that we need to binarize categorical features:

In [21]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, common_cat_features)) + [('all_other_features', Pipeline([('choose', ItemSelector(common_num_features + common_bin_features))]))]
    ))
])

In [22]:
X_tr = pd.DataFrame(pipeline.fit_transform(X))

In [23]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307500 entries, 0 to 307499
Columns: 124 entries, 0 to 123
dtypes: float64(124)
memory usage: 290.9 MB


In [24]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, common_cat_features)) + [('all_other_features', Pipeline([('choose', ItemSelector(common_num_features + common_bin_features))]))]
    ))
])

In [25]:
test = data_dict[dataset_names[1]][common_num_features + common_cat_features + common_bin_features]

In [26]:
test_tr = pd.DataFrame(pipeline.fit_transform(test))

In [27]:
test_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Columns: 124 entries, 0 to 123
dtypes: float64(124)
memory usage: 46.1 MB


In [28]:
print(len(X_tr), len(Y))

307500 307500


Now we can balance classes.

Here we use the simplest way to do it (because of time & memory issues):

In [29]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X_tr, Y)

In [30]:
print(sum(Y_b), len(Y_b))

24823 49646


# Baseline-модель

In [31]:
cross_val_score(BaggingClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([ 0.3326284 ,  0.33303118,  0.3306744 ])

In [32]:
cross_val_score(RandomForestClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([ 0.34688822,  0.35847232,  0.3386512 ])

In [33]:
cross_val_score(ExtraTreesClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([ 0.40084592,  0.48706792,  0.3871767 ])

In [34]:
baseline_classifier = RandomForestClassifier(random_state=random_state)

In [35]:
baseline_classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
baseline_y_est = baseline_classifier.predict_proba(test_tr)

In [37]:
baseline_y_est[:, 1].shape

(48744,)

In [38]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': baseline_y_est[:, 1]
})

In [39]:
result.to_csv('../data/dataset/baseline_submission.csv', index=False)

# Потюним гиперпараметры

# ExtraTreesClassifier

In [40]:
ExtraTreesClassifier().get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [41]:
e_t_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [42]:
e_t_grid = GridSearchCV(
    ExtraTreesClassifier(random_state=random_state),
    param_grid=e_t_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [43]:
e_t_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:   25.7s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   28.5s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [44]:
e_t_grid.best_params_

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 10}

In [45]:
e_t_grid.best_score_

0.42740603472585909

In [46]:
e_t_classifier = ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)

In [47]:
e_t_classifier.fit(X_b, Y_b)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [48]:
e_t_y_est = e_t_classifier.predict_proba(test_tr)

In [49]:
e_t_y_est[:, 1].shape

(48744,)

In [50]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': e_t_y_est[:, 1]
})

In [51]:
result.to_csv('../data/dataset/e_t_grid_search_cv_submission.csv', index=False)

# RandomForestClassifier

In [52]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [53]:
r_f_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [54]:
r_f_grid = GridSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_grid=r_f_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [55]:
r_f_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:   26.2s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   29.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [56]:
r_f_grid.best_params_

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 10}

In [57]:
r_f_grid.best_score_

0.36041171494178786

In [58]:
r_f_classifier = RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)

In [59]:
r_f_classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [60]:
r_f_y_est = r_f_classifier.predict_proba(test_tr)

In [61]:
r_f_y_est[:, 1].shape

(48744,)

In [62]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': r_f_y_est[:, 1]
})

In [63]:
result.to_csv('../data/dataset/r_f_grid_search_cv_submission.csv', index=False)

# BaggingClassifier

In [64]:
BaggingClassifier().get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [65]:
bagging_params_grid = {
    'bootstrap': [True],
    'bootstrap_features': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'oob_score': [False, True]
}

In [66]:
bagging_grid = GridSearchCV(
    BaggingClassifier(random_state=random_state),
    param_grid=bagging_params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [67]:
bagging_grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:  2.7min remaining:   24.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'bootstrap_features': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'oob_score': [False, True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [68]:
bagging_grid.best_params_

{'bootstrap': True,
 'bootstrap_features': True,
 'n_estimators': 10,
 'oob_score': False}

In [69]:
bagging_grid.best_score_

0.36244611851911535

In [70]:
bagging_classifier = BaggingClassifier(random_state=random_state, **bagging_grid.best_params_)

In [71]:
bagging_classifier.fit(X_b, Y_b)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=True, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False)

In [72]:
bagging_y_est = bagging_classifier.predict_proba(test_tr)

In [73]:
bagging_y_est[:, 1].shape

(48744,)

In [74]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': bagging_y_est[:, 1]
})

In [75]:
result.to_csv('../data/dataset/bagging_grid_search_cv_submission.csv', index=False)

# Models mixture

In [76]:
voting_classifier = VotingClassifier(
    estimators=[
        ('baseline', RandomForestClassifier(random_state=random_state)),
        ('extra trees', ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)),
        ('random forest', RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)),
        ('bagging', BaggingClassifier(random_state=random_state, **bagging_grid.best_params_))
    ],
    voting='soft')

In [77]:
voting_classifier.fit(X_b, Y_b)

VotingClassifier(estimators=[('baseline', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [78]:
res_y = voting_classifier.predict_proba(test_tr)

In [79]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': res_y[:, 1]
})

In [80]:
result.to_csv('../data/dataset/mixture_submission.csv', index=False)