# All necessary imports

In [None]:
import sys
# Add the parent directory to the module search path — presumably so the
# project-local `source.code` package imported below can be found.
sys.path.append('..')

In [None]:
from source.code.utils import load_obj
from source.code.utils import generate_pipeline

from source.code.ItemSelector import ItemSelector

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Display settings for inspecting wide frames in the notebook.
# `None` turns off column-width truncation; the older `-1` spelling was
# deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [None]:
# Path templates: the `{}` placeholder is filled in with `.format(name)`
# where a dataset, profiling report or metadata object is addressed.
data_path, profiling_path, meta_path = (
    '../data/dataset/processed/{}.csv',
    '../data/dataset/processed/data_profiling/{}.html',
    '../data/dataset/processed/meta-info/{}.pkl',
)

# Choose random_state

In [None]:
# Single seed passed to the under-sampler and to every estimator below.
random_state = 42

# Data reading

## Datasets

In [None]:
# Order matters: index 0 is used below as the training set (it provides
# TARGET), index 1 as the test set (it provides the submission ids).
dataset_names = ['application_train', 'application_test']

In [None]:
# Read each dataset's CSV into a {name: DataFrame} mapping; tqdm shows progress.
data_dict = {
    name: pd.read_csv(filepath_or_buffer=data_path.format(name))
    for name in tqdm(dataset_names)
}

## Columns needed

In [None]:
# Metadata object indexed by dataset name below; each entry is used as a
# collection of numeric column names.
# NOTE(review): load_obj presumably unpickles the .pkl file — confirm in source.code.utils.
datasets_num_features = load_obj(meta_path.format('datasets_num_features'))

In [None]:
# Same structure, for the categorical columns.
datasets_cat_features = load_obj(meta_path.format('datasets_cat_features'))

In [None]:
# Same structure, for the binary columns.
datasets_bin_features = load_obj(meta_path.format('datasets_bin_features'))

# Common train & test categories

In [None]:
# Iterated below by column name; each value is the collection of category
# values that a training row is allowed to have in that column.
# NOTE(review): presumably the values seen in both train and test — confirm where the .pkl is built.
commom_categories = load_obj(meta_path.format('commom_categories'))

# Preprocessing

In [None]:
# Narrow the training frame one categorical column at a time, keeping only
# rows whose value is in that column's allowed set.
train_name = dataset_names[0]
for category in tqdm(commom_categories):
    train_frame = data_dict[train_name]
    keep_rows = train_frame[category].isin(commom_categories[category])
    data_dict[train_name] = train_frame[keep_rows]

So far we build features only from the train and test tables, without NaN imputation or other tricks:

In [None]:
# For each feature kind, keep the column names listed for both the train
# and the test dataset (set intersection, converted back to a list).
common_num_features, common_cat_features, common_bin_features = (
    list(set(features[dataset_names[0]]) & set(features[dataset_names[1]]))
    for features in (datasets_num_features, datasets_cat_features, datasets_bin_features)
)

In [None]:
# Feature matrix: the shared numeric, categorical and binary columns of the
# (already filtered) training frame, in that order.
X = data_dict[dataset_names[0]][common_num_features + common_cat_features + common_bin_features]

In [None]:
# Labels taken from the same filtered training frame, so rows line up with X.
Y = data_dict[dataset_names[0]]['TARGET']

In [None]:
# Column dtypes and non-null counts of the selected features.
X.info()

In [None]:
# Class distribution of the target.
Y.value_counts()

In [None]:
cat_counts = dict(zip(common_cat_features, list(map(lambda cat: data_dict[dataset_names[0]][cat].nunique(), common_cat_features))))

In [None]:
cat_counts

Classes are unbalanced.

This has to be fixed.

But before that we need to binarize categorical features:

In [None]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, common_cat_features)) + [('all_other_features', Pipeline([('choose', ItemSelector(common_num_features + common_bin_features))]))]
    ))
])

In [None]:
X_tr = pd.DataFrame(pipeline.fit_transform(X))

In [None]:
X_tr.info()

In [None]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, common_cat_features)) + [('all_other_features', Pipeline([('choose', ItemSelector(common_num_features + common_bin_features))]))]
    ))
])

In [None]:
test = data_dict[dataset_names[1]][common_num_features + common_cat_features + common_bin_features]

In [None]:
test_tr = pd.DataFrame(pipeline.fit_transform(test))

In [None]:
test_tr.info()

In [None]:
print(len(X_tr), len(Y))

Now we can balance classes.

Here we use the simplest way to do it (because of time & memory issues):

In [None]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X_tr, Y)

In [None]:
print(sum(Y_b), len(Y_b))

# Baseline model

In [None]:
cross_val_score(BaggingClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

In [None]:
cross_val_score(RandomForestClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

In [None]:
cross_val_score(ExtraTreesClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

In [None]:
# Untuned random forest used for the baseline submission.
baseline_classifier = RandomForestClassifier(random_state=random_state)

In [None]:
# Train on the balanced sample.
baseline_classifier.fit(X_b, Y_b)

In [None]:
# Class probabilities for the transformed test set.
baseline_y_est = baseline_classifier.predict_proba(test_tr)

In [None]:
# Number of test predictions (second probability column).
baseline_y_est[:, 1].shape

In [None]:
# Submission frame: test ids paired with the second probability column
# (the positive class if TARGET is coded 0/1 — confirm).
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': baseline_y_est[:, 1]
})

In [None]:
# Write the submission without the index column.
result.to_csv('../data/dataset/baseline_submission.csv', index=False)

# Hyperparameter tuning

# ExtraTreesClassifier

In [None]:
# Show the estimator's default parameters for reference.
ExtraTreesClassifier().get_params()

In [None]:
# Search space: 2 x 5 x 2 = 20 parameter combinations.
e_t_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [None]:
# Exhaustive grid search over e_t_params_grid, run on all available cores
# with per-fit progress output.
# The built-in 'roc_auc' scorer evaluates predicted scores; a default
# make_scorer(roc_auc_score) would compute the metric from hard class labels.
e_t_grid = GridSearchCV(
    ExtraTreesClassifier(random_state=random_state),
    param_grid=e_t_params_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1
)

In [None]:
# Run the grid search on the balanced sample.
e_t_grid.fit(X_b, Y_b)

In [None]:
# Parameter combination with the best cross-validated score.
e_t_grid.best_params_

In [None]:
# Cross-validated score achieved by that combination.
e_t_grid.best_score_

In [None]:
# Fresh estimator configured with the selected parameters and the shared seed.
e_t_classifier = ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)

In [None]:
# Train on the full balanced sample.
e_t_classifier.fit(X_b, Y_b)

In [None]:
# Class probabilities for the transformed test set.
e_t_y_est = e_t_classifier.predict_proba(test_tr)

In [None]:
e_t_y_est[:, 1].shape

In [None]:
# Submission frame: test ids paired with the second probability column
# (the positive class if TARGET is coded 0/1 — confirm).
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': e_t_y_est[:, 1]
})

In [None]:
# Write the submission without the index column.
result.to_csv('../data/dataset/e_t_grid_search_cv_submission.csv', index=False)

# RandomForestClassifier

In [None]:
# Show the estimator's default parameters for reference.
RandomForestClassifier().get_params()

In [None]:
# Search space: 2 x 5 x 2 = 20 parameter combinations.
r_f_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [None]:
# Exhaustive grid search over r_f_params_grid, run on all available cores
# with per-fit progress output.
# The built-in 'roc_auc' scorer evaluates predicted scores; a default
# make_scorer(roc_auc_score) would compute the metric from hard class labels.
r_f_grid = GridSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_grid=r_f_params_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1
)

In [None]:
# Run the grid search on the balanced sample.
r_f_grid.fit(X_b, Y_b)

In [None]:
# Parameter combination with the best cross-validated score.
r_f_grid.best_params_

In [None]:
# Cross-validated score achieved by that combination.
r_f_grid.best_score_

In [None]:
# Fresh estimator configured with the selected parameters and the shared seed.
r_f_classifier = RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)

In [None]:
# Train on the full balanced sample.
r_f_classifier.fit(X_b, Y_b)

In [None]:
# Class probabilities for the transformed test set.
r_f_y_est = r_f_classifier.predict_proba(test_tr)

In [None]:
r_f_y_est[:, 1].shape

In [None]:
# Submission frame: test ids paired with the second probability column
# (the positive class if TARGET is coded 0/1 — confirm).
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': r_f_y_est[:, 1]
})

In [None]:
# Write the submission without the index column.
result.to_csv('../data/dataset/r_f_grid_search_cv_submission.csv', index=False)

# BaggingClassifier

In [None]:
# Show the estimator's default parameters for reference.
BaggingClassifier().get_params()

In [None]:
# Search space: 1 x 2 x 5 x 2 = 20 parameter combinations.
# bootstrap is pinned to True; oob_score=True is also in the grid.
bagging_params_grid = {
    'bootstrap': [True],
    'bootstrap_features': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'oob_score': [False, True]
}

In [None]:
# Exhaustive grid search over bagging_params_grid, run on all available cores
# with per-fit progress output.
# The built-in 'roc_auc' scorer evaluates predicted scores; a default
# make_scorer(roc_auc_score) would compute the metric from hard class labels.
bagging_grid = GridSearchCV(
    BaggingClassifier(random_state=random_state),
    param_grid=bagging_params_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1
)

In [None]:
# Run the grid search on the balanced sample.
bagging_grid.fit(X_b, Y_b)

In [None]:
# Parameter combination with the best cross-validated score.
bagging_grid.best_params_

In [None]:
# Cross-validated score achieved by that combination.
bagging_grid.best_score_

In [None]:
# Fresh estimator configured with the selected parameters and the shared seed.
bagging_classifier = BaggingClassifier(random_state=random_state, **bagging_grid.best_params_)

In [None]:
# Train on the full balanced sample.
bagging_classifier.fit(X_b, Y_b)

In [None]:
# Class probabilities for the transformed test set.
bagging_y_est = bagging_classifier.predict_proba(test_tr)

In [None]:
bagging_y_est[:, 1].shape

In [None]:
# Submission frame: test ids paired with the second probability column
# (the positive class if TARGET is coded 0/1 — confirm).
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': bagging_y_est[:, 1]
})

In [None]:
# Write the submission without the index column.
result.to_csv('../data/dataset/bagging_grid_search_cv_submission.csv', index=False)

# Models mixture

In [None]:
# Soft-voting ensemble of the untuned random forest and the three models
# configured with the parameters selected by their grid searches.
ensemble_members = [
    ('baseline', RandomForestClassifier(random_state=random_state)),
    ('extra trees', ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)),
    ('random forest', RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)),
    ('bagging', BaggingClassifier(random_state=random_state, **bagging_grid.best_params_)),
]
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Train every member of the ensemble on the balanced sample.
voting_classifier.fit(X_b, Y_b)

In [None]:
# Ensemble class probabilities for the transformed test set.
res_y = voting_classifier.predict_proba(test_tr)

In [None]:
# Submission frame: test ids paired with the second probability column
# (the positive class if TARGET is coded 0/1 — confirm).
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': res_y[:, 1]
})

In [None]:
# Write the submission without the index column.
result.to_csv('../data/dataset/mixture_submission.csv', index=False)