# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import load_obj
from source.code.utils import generate_pipeline
from source.code.utils import generate_cat_feature_counts
from source.code.utils import generate_features_names
from source.code.ItemSelector import ItemSelector

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

In [4]:
import lightgbm as lgb

In [5]:
import xgboost as xgb

In [6]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [7]:
data_path = '../data/dataset/processed/{}.csv'
profiling_path = '../data/dataset/processed/data_profiling/{}.html'
meta_path = '../data/dataset/processed/meta-info/{}.pkl'

# Choose random_state

In [8]:
random_state = 42

# Data reading

## Datasets

In [9]:
dataset_names = ['application_train', 'application_test']

In [10]:
data_dict = dict(zip(dataset_names, list(map(lambda name: pd.read_csv(filepath_or_buffer=data_path.format(name)), tqdm(dataset_names)))))

100%|██████████| 2/2 [00:02<00:00,  1.28s/it]


## Columns needed

In [11]:
datasets_num_features = load_obj(meta_path.format('datasets_num_features'))

In [12]:
datasets_cat_features = load_obj(meta_path.format('datasets_cat_features'))

In [13]:
datasets_bin_features = load_obj(meta_path.format('datasets_bin_features'))

# Common train & test categories

In [14]:
commom_categories = load_obj(meta_path.format('commom_categories'))

# Preprocessing

In [15]:
for category in tqdm(commom_categories):
    data_dict[dataset_names[0]] = data_dict[dataset_names[0]][data_dict[dataset_names[0]][category].isin(commom_categories[category])]

100%|██████████| 6/6 [00:00<00:00, 10.85it/s]


So far we just generate features only from train and test, without NaNs imputing and other tricks:

In [16]:
common_num_features = list(set(datasets_num_features[dataset_names[0]]) & set(datasets_num_features[dataset_names[1]]))
common_cat_features = list(set(datasets_cat_features[dataset_names[0]]) & set(datasets_cat_features[dataset_names[1]]))
common_bin_features = list(set(datasets_bin_features[dataset_names[0]]) & set(datasets_bin_features[dataset_names[1]]))

In [17]:
data_dict[dataset_names[0]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307500 entries, 0 to 307506
Data columns (total 54 columns):
AMT_CREDIT                     307500 non-null float64
AMT_INCOME_TOTAL               307500 non-null float64
CNT_CHILDREN                   307500 non-null int64
DAYS_BIRTH                     307500 non-null int64
DAYS_EMPLOYED                  307500 non-null int64
DAYS_ID_PUBLISH                307500 non-null int64
DAYS_REGISTRATION              307500 non-null float64
HOUR_APPR_PROCESS_START        307500 non-null int64
REGION_POPULATION_RELATIVE     307500 non-null float64
REGION_RATING_CLIENT           307500 non-null int64
SK_ID_CURR                     307500 non-null int64
NAME_EDUCATION_TYPE            307500 non-null object
NAME_FAMILY_STATUS             307500 non-null object
NAME_HOUSING_TYPE              307500 non-null object
NAME_INCOME_TYPE               307500 non-null object
ORGANIZATION_TYPE              307500 non-null object
WEEKDAY_APPR_PROCESS_START  

In [18]:
X = data_dict[dataset_names[0]][common_num_features + common_cat_features + common_bin_features]

In [19]:
Y = data_dict[dataset_names[0]]['TARGET']

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 307500 entries, 0 to 307506
Data columns (total 42 columns):
REGION_RATING_CLIENT           307500 non-null int64
HOUR_APPR_PROCESS_START        307500 non-null int64
SK_ID_CURR                     307500 non-null int64
CNT_CHILDREN                   307500 non-null int64
DAYS_EMPLOYED                  307500 non-null int64
AMT_INCOME_TOTAL               307500 non-null float64
AMT_CREDIT                     307500 non-null float64
DAYS_BIRTH                     307500 non-null int64
DAYS_REGISTRATION              307500 non-null float64
DAYS_ID_PUBLISH                307500 non-null int64
REGION_POPULATION_RELATIVE     307500 non-null float64
NAME_FAMILY_STATUS             307500 non-null object
NAME_HOUSING_TYPE              307500 non-null object
ORGANIZATION_TYPE              307500 non-null object
NAME_INCOME_TYPE               307500 non-null object
NAME_EDUCATION_TYPE            307500 non-null object
WEEKDAY_APPR_PROCESS_START  

In [21]:
Y.value_counts()

0    282677
1    24823 
Name: TARGET, dtype: int64

In [22]:
extended_features = generate_features_names(
    common_bin_features,
    generate_cat_feature_counts(X, common_cat_features),
    common_num_features
)

In [23]:
extended_features

['FLAG_DOCUMENT_6',
 'CODE_GENDER',
 'FLAG_DOCUMENT_8',
 'FLAG_EMP_PHONE',
 'NAME_CONTRACT_TYPE',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_9',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'FLAG_DOCUMENT_5',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_OWN_REALTY',
 'FLAG_DOCUMENT_4',
 'LIVE_REGION_NOT_WORK_REGION',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_DOCUMENT_11',
 'FLAG_EMAIL',
 'FLAG_PHONE',
 'FLAG_OWN_CAR',
 'FLAG_MOBIL',
 'REG_CITY_NOT_WORK_CITY',
 'REG_CITY_NOT_LIVE_CITY',
 'NAME_FAMILY_STATUS_0',
 'NAME_FAMILY_STATUS_1',
 'NAME_FAMILY_STATUS_2',
 'NAME_FAMILY_STATUS_3',
 'NAME_FAMILY_STATUS_4',
 'NAME_HOUSING_TYPE_0',
 'NAME_HOUSING_TYPE_1',
 'NAME_HOUSING_TYPE_2',
 'NAME_HOUSING_TYPE_3',
 'NAME_HOUSING_TYPE_4',
 'NAME_HOUSING_TYPE_5',
 'ORGANIZATION_TYPE_0',
 'ORGANIZATION_TYPE_1',
 'ORGANIZATION_TYPE_2',
 'ORGANIZATION_TYPE_3',
 'ORGANIZATION_TYPE_4',
 'ORGANIZATION_TYPE_5',
 'ORGANIZATION_TYPE_6',
 'ORGANIZATION_TYPE_7',


In [24]:
len(extended_features)

124

Classes are unbalanced.

This has to be fixed.

But before that we need to binarize categorical features:

In [25]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        [('bin', Pipeline([('choose', ItemSelector(common_bin_features))]))] + list(map(generate_pipeline, common_cat_features)) + [('num', Pipeline([('choose', ItemSelector(common_num_features))]))]
    ))
])

In [26]:
X_tr = pd.DataFrame(pipeline.fit_transform(X), columns=extended_features)

In [27]:
X_tr.info(verbose=10, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307500 entries, 0 to 307499
Data columns (total 124 columns):
FLAG_DOCUMENT_6                 307500 non-null float64
CODE_GENDER                     307500 non-null float64
FLAG_DOCUMENT_8                 307500 non-null float64
FLAG_EMP_PHONE                  307500 non-null float64
NAME_CONTRACT_TYPE              307500 non-null float64
FLAG_DOCUMENT_3                 307500 non-null float64
FLAG_DOCUMENT_7                 307500 non-null float64
FLAG_DOCUMENT_18                307500 non-null float64
FLAG_DOCUMENT_9                 307500 non-null float64
REG_REGION_NOT_LIVE_REGION      307500 non-null float64
REG_REGION_NOT_WORK_REGION      307500 non-null float64
FLAG_DOCUMENT_5                 307500 non-null float64
LIVE_CITY_NOT_WORK_CITY         307500 non-null float64
FLAG_OWN_REALTY                 307500 non-null float64
FLAG_DOCUMENT_4                 307500 non-null float64
LIVE_REGION_NOT_WORK_REGION     307500 non-null 

In [28]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        [('bin', Pipeline([('choose', ItemSelector(common_bin_features))]))] + list(map(generate_pipeline, common_cat_features)) + [('num', Pipeline([('choose', ItemSelector(common_num_features))]))]
    ))
])

In [29]:
test = data_dict[dataset_names[1]][common_num_features + common_cat_features + common_bin_features]

In [30]:
test_tr = pd.DataFrame(pipeline.fit_transform(test), columns=extended_features)

In [31]:
test_tr.info(verbose=10, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 124 columns):
FLAG_DOCUMENT_6                 48744 non-null float64
CODE_GENDER                     48744 non-null float64
FLAG_DOCUMENT_8                 48744 non-null float64
FLAG_EMP_PHONE                  48744 non-null float64
NAME_CONTRACT_TYPE              48744 non-null float64
FLAG_DOCUMENT_3                 48744 non-null float64
FLAG_DOCUMENT_7                 48744 non-null float64
FLAG_DOCUMENT_18                48744 non-null float64
FLAG_DOCUMENT_9                 48744 non-null float64
REG_REGION_NOT_LIVE_REGION      48744 non-null float64
REG_REGION_NOT_WORK_REGION      48744 non-null float64
FLAG_DOCUMENT_5                 48744 non-null float64
LIVE_CITY_NOT_WORK_CITY         48744 non-null float64
FLAG_OWN_REALTY                 48744 non-null float64
FLAG_DOCUMENT_4                 48744 non-null float64
LIVE_REGION_NOT_WORK_REGION     48744 non-null float64
FLAG_WORK_

In [32]:
print(len(X_tr), len(Y))

307500 307500


Now we can balance classes.

Here we use the simplest way to do it (because of time & memory issues):

In [33]:
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_sample(X_tr, Y)

In [34]:
print(sum(Y_b), len(Y_b))

24823 49646


# XGBOOST

In [35]:
xgb_classifier = xgb.XGBClassifier()

In [36]:
xgb_classifier.fit(X_b, Y_b)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [37]:
xgb_y_est = xgb_classifier.predict_proba(test_tr.values)

In [38]:
xgb_y_est[:, 1].shape

(48744,)

In [39]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': xgb_y_est[:, 1]
})

In [40]:
result.to_csv('../data/dataset/xgb_submission.csv', index=False)

# LIGHTGBM

In [41]:
lgb_classifier = lgb.LGBMClassifier()

In [42]:
lgb_classifier.fit(X_b, Y_b)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [43]:
lgb_y_est = lgb_classifier.predict_proba(test_tr)

In [44]:
lgb_y_est[:, 1].shape

(48744,)

In [45]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': lgb_y_est[:, 1]
})

In [46]:
result.to_csv('../data/dataset/lgb_submission.csv', index=False)

# Models mixture

In [47]:
voting_classifier = VotingClassifier(
    estimators=[
        ('xgb', xgb.XGBClassifier()),
        ('lgb', lgb.LGBMClassifier())
    ],
    voting='soft')

In [48]:
voting_classifier.fit(X_b, Y_b)

VotingClassifier(estimators=[('xgb', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random...0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [49]:
res_y = voting_classifier.predict_proba(test_tr.values)

In [50]:
result = pd.DataFrame({
    'SK_ID_CURR': data_dict[dataset_names[1]].SK_ID_CURR.values,
    'TARGET': res_y[:, 1]
})

In [51]:
result.to_csv('../data/dataset/xgb_lgb_mixture_submission.csv', index=False)