# Все необходимые импорты

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.ItemSelector import ItemSelector
from source.code.MyLabelBinarizer import MyLabelBinarizer

In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

# Выбрать random_state

In [4]:
random_state = 42

# Вспомогательные функции

In [5]:
def read_columns(path):
    with open(path, 'r') as file:
        columns = file.readlines()
        columns = list(map(str.rstrip, columns))
    return columns

In [6]:
def generate_pipeline(column):
    return  (column, Pipeline([
        ('choose', ItemSelector(column)),
        ('binarize', MyLabelBinarizer())
    ]))

# Чтение данных

In [7]:
train = pd.read_csv('../data/dataset/application_train.csv')

In [8]:
train = train[train.CODE_GENDER != 'XNA']

In [9]:
train = train[train.NAME_FAMILY_STATUS != 'Unknown']

In [10]:
train = train[train.NAME_INCOME_TYPE != 'Maternity leave']

In [11]:
test = pd.read_csv('../data/dataset/application_test.csv')

In [12]:
na_columns = read_columns('../data/train_test_na_columns.txt')

In [13]:
highly_correlated_columns = read_columns('../data/train_test_highly_correlated_columns.txt')

In [14]:
categorical_columns = read_columns('../data/train_test_categorical_columns.txt')

In [15]:
binary_columns = read_columns('../data/train_test_binary_columns.txt')

# Предобработка

Пока что попробуем обойтись только этими данными, без хитростей с заполнениями пропусков:

In [16]:
train = train[list(set(train.columns) - (set(na_columns) | set(highly_correlated_columns)))]

In [17]:
test = test[list(set(test.columns) - (set(na_columns) | set(highly_correlated_columns)))]

In [18]:
continuous_columns = list(
    set(test.columns) - (set(binary_columns) | set(categorical_columns) | set(['SK_ID_CURR']))
)

In [19]:
X = train[test.columns]

In [20]:
Y = train['TARGET']

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307500 entries, 0 to 307510
Data columns (total 53 columns):
FLAG_PHONE                     307500 non-null int64
FLAG_OWN_CAR                   307500 non-null object
FLAG_DOCUMENT_21               307500 non-null int64
FLAG_DOCUMENT_7                307500 non-null int64
DAYS_EMPLOYED                  307500 non-null int64
FLAG_WORK_PHONE                307500 non-null int64
REGION_RATING_CLIENT           307500 non-null int64
REG_CITY_NOT_WORK_CITY         307500 non-null int64
FLAG_DOCUMENT_6                307500 non-null int64
FLAG_DOCUMENT_10               307500 non-null int64
FLAG_DOCUMENT_5                307500 non-null int64
REG_REGION_NOT_WORK_REGION     307500 non-null int64
AMT_CREDIT                     307500 non-null float64
FLAG_DOCUMENT_16               307500 non-null int64
WEEKDAY_APPR_PROCESS_START     307500 non-null object
CNT_CHILDREN                   307500 non-null int64
FLAG_CONT_MOBILE               307500

In [22]:
Y.value_counts()

0    282677
1     24823
Name: TARGET, dtype: int64

Классы не сбалансированы.

Надо поправить.

Перед этим надо бинаризовать все поля, где есть символьные обозначения:

In [23]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, binary_columns)) + \
        list(map(generate_pipeline, categorical_columns)) + \
        [('all_other_features', Pipeline([('choose', ItemSelector(continuous_columns))]))]
    ))
])

In [24]:
X_tr = pd.DataFrame(pipeline.fit_transform(X))

In [25]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307500 entries, 0 to 307499
Columns: 134 entries, 0 to 133
dtypes: float64(134)
memory usage: 314.4 MB


In [26]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        list(map(generate_pipeline, binary_columns)) + \
        list(map(generate_pipeline, categorical_columns)) + \
        [('all_other_features', Pipeline([('choose', ItemSelector(continuous_columns))]))]
    ))
])

In [27]:
test_tr = pd.DataFrame(pipeline.fit_transform(test))

In [28]:
test_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Columns: 134 entries, 0 to 133
dtypes: float64(134)
memory usage: 49.8 MB


In [29]:
print(len(X_tr), len(Y))

307500 307500


Тперь можно попробовать и сбалансировать:

In [30]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X_tr, Y)

In [31]:
print(sum(Y_b), len(Y_b))

24823 49646


# Baseline-модель

In [32]:
cross_val_score(BaggingClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.58072508, 0.57904278, 0.58103698])

In [33]:
cross_val_score(RandomForestClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.58338369, 0.57704859, 0.58043268])

In [34]:
cross_val_score(ExtraTreesClassifier(random_state=random_state), X_b, Y_b, scoring=make_scorer(roc_auc_score))

array([0.57480363, 0.56550641, 0.56961566])

In [69]:
classifier = RandomForestClassifier(random_state=random_state)

In [70]:
classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [71]:
y_est = classifier.predict_proba(test_tr)

In [72]:
y_est[:, 1].shape

(48744,)

In [73]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': y_est[:, 1]
})

In [74]:
result.to_csv('../data/dataset/baseline_submission.csv', index=False)

# Потюним гиперпараметры

# ExtraTreesClassifier

In [41]:
classifier.get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [42]:
params_grid = {
    'bootstrap': [False, True],
    'n_estimators': [10, 15, 20, 25, 30],
    'criterion': ['gini', 'entropy']
}

In [43]:
grid = GridSearchCV(
    ExtraTreesClassifier(random_state=random_state),
    param_grid=params_grid,
    scoring=make_scorer(roc_auc_score),
    verbose=10,
    n_jobs=-1
)

In [44]:
grid.fit(X_b, Y_b)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=10 ................
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5748036253776435, total=   1.8s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.5655064056079285, total=   1.9s
[CV] bootstrap=False, criterion=gini, n_estimators=15 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=10, score=0.569615663524293, total=   2.1s
[CV] bootstrap=False, criterion=gini, n_estimators=20 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.5827794561933535, total=   3.1s
[CV] bootstrap=False, criterion=gini, n_estimators=20 .....

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.7s


[CV]  bootstrap=False, criterion=gini, n_estimators=15, score=0.581339134638627, total=   3.1s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.590453172205438, total=   3.7s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.5836354846507131, total=   3.6s
[CV] bootstrap=False, criterion=gini, n_estimators=25 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=20, score=0.5868382886149384, total=   3.7s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5911178247734139, total=   4.7s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.3s


[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.583816775441141, total=   4.5s
[CV] bootstrap=False, criterion=gini, n_estimators=30 ................
[CV]  bootstrap=False, criterion=gini, n_estimators=25, score=0.5903432438965434, total=   4.7s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.5682779456193353, total=   2.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5895468277945619, total=   6.1s
[CV] bootstrap=False, criterion=entropy, n_estimators=10 .............
[CV]  bootstrap=False, criterion=gini, n_estimators=30, score=0.5864757070340827, total=   6.1s
[CV]  bootstrap=False, criterion=entropy, n_estimators=10, score=0.5684070582547739, total=   2.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .............
[CV] bootstrap=False, criterion=entropy, n_estimators=15 .......

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   18.7s


[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5772809667673716, total=   3.3s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5803118201595359, total=   2.8s
[CV]  bootstrap=False, criterion=entropy, n_estimators=15, score=0.5720328740633309, total=   3.2s
[CV] bootstrap=False, criterion=entropy, n_estimators=20 .............
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.5838670694864049, total=   4.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.5853275320280397, total=   4.6s
[CV] bootstrap=False, criterion=entropy, n_estimators=25 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=20, score=0.577592458303

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   27.4s


[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5851359516616315, total=   6.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5829103214890017, total=   5.5s
[CV] bootstrap=False, criterion=entropy, n_estimators=30 .............
[CV]  bootstrap=False, criterion=entropy, n_estimators=25, score=0.5891950688905004, total=   5.4s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=10, score=0.5737764350453172, total=   1.3s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.5876132930513596, total=   6.5s
[CV] bootstrap=True, criterion=gini, n_estimators=10 .................
[CV]  bootstrap=False, criterion=entropy, n_estimators=30, score=0.5840584964950447, total=   6.0s
[CV] bootstrap=True, criterion=gini, n_estimators=15 ..

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   37.2s


[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.5855589123867069, total=   2.1s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.576565143824027, total=   2.1s
[CV] bootstrap=True, criterion=gini, n_estimators=20 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=15, score=0.5898598017887359, total=   2.0s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5886404833836858, total=   2.7s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.581399564902103, total=   2.6s
[CV] bootstrap=True, criterion=gini, n_estimators=25 .................
[CV]  bootstrap=True, criterion=gini, n_estimators=20, score=0.5904036741600193, total=   2.5s
[CV] bootstrap=True, criterion=gini, n_estimators=30 .................
[CV]  

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.6s


[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5929909365558913, total=   3.8s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5699093655589124, total=   1.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=10 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5879864636209814, total=   4.1s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5711264201111917, total=   1.5s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=gini, n_estimators=30, score=0.5970510031423738, total=   3.8s
[CV] bootstrap=True, criterion=entropy, n_estimators=15 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=10, score=0.5728184674885183, total=   1.6s
[CV] bootstrap=True, criterion=entropy, n_estimators=20 ..........

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   53.4s


[CV]  bootstrap=True, criterion=entropy, n_estimators=20, score=0.5815808556925308, total=   2.7s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5890030211480363, total=   3.3s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.587382160986222, total=   3.3s
[CV] bootstrap=True, criterion=entropy, n_estimators=30 ..............
[CV]  bootstrap=True, criterion=entropy, n_estimators=25, score=0.5852671017645636, total=   3.4s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.5909969788519638, total=   3.9s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.5877447425670776, total=   3.3s
[CV]  bootstrap=True, criterion=entropy, n_estimators=30, score=0.5868382886149383, total=   2.9s


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   59.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   59.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [False, True], 'n_estimators': [10, 15, 20, 25, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=10)

In [45]:
grid.best_params_

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 30}

In [46]:
grid.best_score_

0.5926761471216211

In [47]:
classifier = ExtraTreesClassifier(random_state=random_state, **grid.best_params_)

In [48]:
classifier.fit(X_b, Y_b)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [49]:
y_est = classifier.predict_proba(test_tr)

In [50]:
y_est[:, 1].shape

(48744,)

In [51]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': y_est[:, 1]
})

In [52]:
result.to_csv('../data/dataset/grid_search_cv_submission.csv', index=False)

# Models mixture

In [53]:
classifier = BaggingClassifier(random_state=random_state)

In [54]:
classifier.fit(X_b, Y_b)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False)

In [55]:
bagg_y_est = classifier.predict_proba(test_tr)

In [56]:
bagg_y_est[:, 1]

array([0.4, 0.4, 0.2, ..., 0.5, 0.3, 0.8])

In [57]:
classifier = RandomForestClassifier(random_state=random_state)

In [58]:
classifier.fit(X_b, Y_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [59]:
r_f_y_est = classifier.predict_proba(test_tr)

In [60]:
r_f_y_est[:, 1]

array([0.5, 0.5, 0.4, ..., 0.4, 0.4, 0.7])

In [61]:
classifier = ExtraTreesClassifier(random_state=random_state)

In [62]:
classifier.fit(X_b, Y_b)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [63]:
e_t_y_est = classifier.predict_proba(test_tr)

In [64]:
e_t_y_est[:, 1]

array([0.8, 0.3, 0.3, ..., 0.5, 0.3, 0.7])

In [65]:
res_y = (bagg_y_est[:, 1] + r_f_y_est[:, 1] + e_t_y_est[:, 1]) / 3

In [66]:
res_y

array([0.56666667, 0.4       , 0.3       , ..., 0.46666667, 0.33333333,
       0.73333333])

In [67]:
result = pd.DataFrame({
    'SK_ID_CURR': test.SK_ID_CURR.values,
    'TARGET': res_y
})

In [68]:
result.to_csv('../data/dataset/mixture_submission.csv', index=False)