# Все необходимые импорты

In [None]:
import sys
sys.path.append('..')

In [None]:
from source.code.ItemSelector import ItemSelector
from source.code.MyLabelBinarizer import MyLabelBinarizer

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

# Выбрать random_state

In [None]:
random_state = 42

# Вспомогательные функции

In [None]:
def read_columns(path):
    """Read a list of column names from a text file, one name per line.

    Trailing whitespace (including the newline) is stripped from every line.
    Lines that are empty after stripping are skipped, so a stray blank line
    in the file does not turn into an empty column name.

    Args:
        path: Path to the text file to read.

    Returns:
        list of str: The column names, in file order.
    """
    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, 'r', encoding='utf-8') as file:
        stripped = (line.rstrip() for line in file)
        return [name for name in stripped if name]

In [None]:
def generate_pipeline(column):
    """Build a named transformer entry for a single column.

    The pipeline first hands ``column`` to ``ItemSelector`` and then applies
    ``MyLabelBinarizer`` to whatever the selector produces.

    Args:
        column: Column identifier; used both as the entry name and as the
            argument of ``ItemSelector``.

    Returns:
        tuple: ``(column, pipeline)``, the (name, transformer) shape that the
        ``FeatureUnion`` calls in this file consume.
    """
    steps = [
        ('choose', ItemSelector(column)),
        ('binarize', MyLabelBinarizer()),
    ]
    return column, Pipeline(steps)

# Чтение данных

In [None]:
train = pd.read_csv('../data/dataset/application_train.csv')

In [None]:
train = train[train.CODE_GENDER != 'XNA']

In [None]:
train = train[train.NAME_FAMILY_STATUS != 'Unknown']

In [None]:
train = train[train.NAME_INCOME_TYPE != 'Maternity leave']

In [None]:
test = pd.read_csv('../data/dataset/application_test.csv')

In [None]:
na_columns = read_columns('../data/train_test_na_columns.txt')

In [None]:
highly_correlated_columns = read_columns('../data/train_test_highly_correlated_columns.txt')

In [None]:
categorical_columns = read_columns('../data/train_test_categorical_columns.txt')

In [None]:
binary_columns = read_columns('../data/train_test_binary_columns.txt')

# Предобработка

Пока что попробуем обойтись только этими данными, без хитростей с заполнениями пропусков:

In [None]:
# Drop the NA-listed and highly correlated columns while keeping the original
# column order (a plain set difference returns the columns in arbitrary order).
excluded_train_columns = set(na_columns) | set(highly_correlated_columns)
train = train[[column for column in train.columns if column not in excluded_train_columns]]

In [None]:
# Drop the NA-listed and highly correlated columns while keeping the original
# column order (a plain set difference returns the columns in arbitrary order).
excluded_test_columns = set(na_columns) | set(highly_correlated_columns)
test = test[[column for column in test.columns if column not in excluded_test_columns]]

In [None]:
# Continuous features: every test column that is neither binary, categorical,
# nor the SK_ID_CURR identifier. The result is sorted because the iteration
# order of a set of strings can differ between interpreter sessions; sorting
# gives the models the same feature layout on every run.
continuous_columns = sorted(
    set(test.columns) - (set(binary_columns) | set(categorical_columns) | {'SK_ID_CURR'})
)

In [None]:
# Training features: the columns of `test`, selected from `train` in the
# same order as they appear in `test`.
X = train[test.columns]

In [None]:
Y = train['TARGET']

In [None]:
X.info()

In [None]:
Y.value_counts()

Классы не сбалансированы.

Надо поправить.

Перед этим надо бинаризовать все поля, где есть символьные обозначения:

In [None]:
# Feature assembly: one binarizing sub-pipeline per binary column, then one
# per categorical column, followed by a pass-through selector for all
# continuous columns; FeatureUnion concatenates their outputs.
pipeline = Pipeline([
    ('union', FeatureUnion(
        [generate_pipeline(column) for column in binary_columns + categorical_columns]
        + [('all_other_features', Pipeline([('choose', ItemSelector(continuous_columns))]))]
    ))
])

In [None]:
X_tr = pd.DataFrame(pipeline.fit_transform(X))

In [None]:
X_tr.info()

In [None]:
# Rebuild the same feature-assembly pipeline as a fresh, unfitted object:
# binarized binary and categorical columns plus the continuous columns
# selected as they are.
pipeline = Pipeline([
    ('union', FeatureUnion(
        [generate_pipeline(column) for column in binary_columns + categorical_columns]
        + [('all_other_features', Pipeline([('choose', ItemSelector(continuous_columns))]))]
    ))
])

In [None]:
# Fit the encoders on the training features and only *transform* the test
# set. Fitting on test (as before) lets the binarizers learn their classes
# from test data, which can yield a column layout that differs from the
# matrix the models are trained on.
# NOTE(review): relies on MyLabelBinarizer/ItemSelector exposing `transform`
# - confirm against source/code.
pipeline.fit_transform(X)
test_tr = pd.DataFrame(pipeline.transform(test))

In [None]:
test_tr.info()

In [None]:
print(len(X_tr), len(Y))

Теперь можно попробовать и сбалансировать:

In [None]:
# Balance the classes by randomly under-sampling the majority class.
# `fit_sample` was renamed to `fit_resample` in imbalanced-learn and the old
# name was later removed; prefer the new name and fall back on old installs.
sampler = RandomUnderSampler(random_state=random_state)
resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
X_b, Y_b = resample(X_tr, Y)

In [None]:
print(sum(Y_b), len(Y_b))

# Baseline-модель

In [None]:
# Use the built-in 'roc_auc' scorer: it evaluates AUC on predicted
# probabilities, whereas make_scorer(roc_auc_score) scores the hard labels
# returned by predict().
cross_val_score(BaggingClassifier(random_state=random_state), X_b, Y_b, scoring='roc_auc')

In [None]:
# Use the built-in 'roc_auc' scorer: it evaluates AUC on predicted
# probabilities, whereas make_scorer(roc_auc_score) scores the hard labels
# returned by predict().
cross_val_score(RandomForestClassifier(random_state=random_state), X_b, Y_b, scoring='roc_auc')

In [None]:
# Use the built-in 'roc_auc' scorer: it evaluates AUC on predicted
# probabilities, whereas make_scorer(roc_auc_score) scores the hard labels
# returned by predict().
cross_val_score(ExtraTreesClassifier(random_state=random_state), X_b, Y_b, scoring='roc_auc')

In [None]:
baseline_classifier = RandomForestClassifier(random_state=random_state)

In [None]:
baseline_classifier.fit(X_b, Y_b)

In [None]:
baseline_y_est = baseline_classifier.predict_proba(test_tr)

In [None]:
baseline_y_est[:, 1].shape

In [None]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': baseline_y_est[:, 1]
})

In [None]:
result.to_csv('../data/dataset/baseline_submission.csv', index=False)

# Потюним гиперпараметры

# ExtraTreesClassifier

In [None]:
ExtraTreesClassifier().get_params()

In [None]:
e_t_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [None]:
# Grid search over the ExtraTrees hyperparameters. Scored with the built-in
# 'roc_auc' scorer so that AUC is computed from predicted probabilities
# rather than from the hard class labels make_scorer(roc_auc_score) would use.
e_t_grid = GridSearchCV(
    ExtraTreesClassifier(random_state=random_state),
    param_grid=e_t_params_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1
)

In [None]:
e_t_grid.fit(X_b, Y_b)

In [None]:
e_t_grid.best_params_

In [None]:
e_t_grid.best_score_

In [None]:
e_t_classifier = ExtraTreesClassifier(random_state=random_state, **e_t_grid.best_params_)

In [None]:
e_t_classifier.fit(X_b, Y_b)

In [None]:
e_t_y_est = e_t_classifier.predict_proba(test_tr)

In [None]:
e_t_y_est[:, 1].shape

In [None]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': e_t_y_est[:, 1]
})

In [None]:
result.to_csv('../data/dataset/e_t_grid_search_cv_submission.csv', index=False)

# RandomForestClassifier

In [None]:
RandomForestClassifier().get_params()

In [None]:
r_f_params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [None]:
# Grid search over the RandomForest hyperparameters. Scored with the built-in
# 'roc_auc' scorer so that AUC is computed from predicted probabilities
# rather than from the hard class labels make_scorer(roc_auc_score) would use.
r_f_grid = GridSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_grid=r_f_params_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1
)

In [None]:
r_f_grid.fit(X_b, Y_b)

In [None]:
r_f_grid.best_params_

In [None]:
r_f_grid.best_score_

In [None]:
r_f_classifier = RandomForestClassifier(random_state=random_state, **r_f_grid.best_params_)

In [None]:
r_f_classifier.fit(X_b, Y_b)

In [None]:
r_f_y_est = r_f_classifier.predict_proba(test_tr)

In [None]:
r_f_y_est[:, 1].shape

In [None]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': r_f_y_est[:, 1]
})

In [None]:
result.to_csv('../data/dataset/r_f_grid_search_cv_submission.csv', index=False)

# BaggingClassifier

In [None]:
BaggingClassifier().get_params()

In [None]:
# Search space for BaggingClassifier: `bootstrap` is held at True in every
# combination, while feature bootstrapping, ensemble size and out-of-bag
# scoring are varied.
# NOTE(review): presumably `bootstrap` is pinned to True because out-of-bag
# scoring needs bootstrap sampling - confirm.
bagging_params_grid = {
    'bootstrap': [True],
    'bootstrap_features': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'oob_score': [False, True]
}

In [None]:
# Grid search over the Bagging hyperparameters. Scored with the built-in
# 'roc_auc' scorer so that AUC is computed from predicted probabilities
# rather than from the hard class labels make_scorer(roc_auc_score) would use.
bagging_grid = GridSearchCV(
    BaggingClassifier(random_state=random_state),
    param_grid=bagging_params_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1
)

In [None]:
bagging_grid.fit(X_b, Y_b)

In [None]:
bagging_grid.best_params_

In [None]:
bagging_grid.best_score_

In [None]:
bagging_classifier = BaggingClassifier(random_state=random_state, **bagging_grid.best_params_)

In [None]:
bagging_classifier.fit(X_b, Y_b)

In [None]:
bagging_y_est = bagging_classifier.predict_proba(test_tr)

In [None]:
bagging_y_est[:, 1].shape

In [None]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': bagging_y_est[:, 1]
})

In [None]:
result.to_csv('../data/dataset/bagging_grid_search_cv_submission.csv', index=False)

# Models mixture

In [None]:
# Blend the four models: unweighted mean of the second predict_proba column
# (index 1) of the bagging, random forest, extra trees and baseline estimates.
res_y = (bagging_y_est[:, 1] + r_f_y_est[:, 1] + e_t_y_est[:, 1] + baseline_y_est[:, 1]) / 4

In [None]:
res_y

In [None]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': res_y
})

In [None]:
result.to_csv('../data/dataset/mixture_submission.csv', index=False)