# All necessary imports

In [None]:
import sys
# Add the parent directory to the module search path; the `source.code`
# imports used by this notebook presumably rely on it.
sys.path.append('..')

In [None]:
from source.code.utils import load_obj
from source.code.utils import generate_binarized_pipeline
from source.code.utils import generate_cat_feature_counts
from source.code.utils import generate_features_names
from source.code.ItemSelector import ItemSelector

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from imblearn.under_sampling import RandomUnderSampler

In [None]:
import lightgbm as lgb

In [None]:
# Widen the pandas display limits so large frames are shown untruncated.
# `None` means "no column-width limit"; the former sentinel -1 was deprecated
# in pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [None]:
# Path templates; the `{}` placeholder is filled in with `str.format`.
# `data_path` takes a dataset name and `meta_path` takes a stored-object name.
# `profiling_path` is not used in the visible part of this notebook.
data_path = '../data/dataset/processed/{}.csv'
profiling_path = '../data/dataset/processed/data_profiling/{}.html'
meta_path = '../data/dataset/processed/meta-info/{}.pkl'

# Choose random_state

In [None]:
# Seed handed to the randomized steps of this notebook (e.g. the under-sampler).
random_state = 42

# Data reading

## Datasets

In [None]:
# Dataset names; index 0 is used as the training table and index 1 as the
# test table throughout the notebook.
dataset_names = ['application_train', 'application_test']

In [None]:
# Read every dataset's CSV into a DataFrame, keyed by dataset name
# (tqdm shows progress over the names).
data_dict = {
    name: pd.read_csv(data_path.format(name))
    for name in tqdm(dataset_names)
}

## Columns needed

In [None]:
# Load the stored object 'datasets_num_features'; it is indexed by dataset
# name further down (presumably the numeric columns per dataset).
datasets_num_features = load_obj(meta_path.format('datasets_num_features'))

In [None]:
# Load the stored object 'datasets_cat_features'; it is indexed by dataset
# name further down (presumably the categorical columns per dataset).
datasets_cat_features = load_obj(meta_path.format('datasets_cat_features'))

In [None]:
# Load the stored object 'datasets_bin_features'; it is indexed by dataset
# name further down (presumably the binary columns per dataset).
datasets_bin_features = load_obj(meta_path.format('datasets_bin_features'))

# Common train & test categories

In [None]:
# Load the stored object 'commom_categories'; the preprocessing loop iterates
# over it and indexes it by column name to get the allowed values per column.
commom_categories = load_obj(meta_path.format('commom_categories'))

# Preprocessing

In [None]:
# Drop training rows whose value in a listed column is not among the values
# stored for that column in `commom_categories`. The filter is applied one
# column at a time, each pass narrowing the training frame further.
train_name = dataset_names[0]
for category in tqdm(commom_categories):
    train_frame = data_dict[train_name]
    row_is_kept = train_frame[category].isin(commom_categories[category])
    data_dict[train_name] = train_frame[row_is_kept]

So far we generate features only from the train and test tables, without imputing NaNs or applying other tricks:

In [None]:
# Keep, for each feature kind, only the features present in both the train and
# the test dataset. The intersection is sorted because plain set iteration
# order for strings can change between interpreter sessions (hash
# randomization), which would make the column order - and anything that
# depends on it - non-reproducible.
common_num_features = sorted(set(datasets_num_features[dataset_names[0]]) & set(datasets_num_features[dataset_names[1]]))
common_cat_features = sorted(set(datasets_cat_features[dataset_names[0]]) & set(datasets_cat_features[dataset_names[1]]))
common_bin_features = sorted(set(datasets_bin_features[dataset_names[0]]) & set(datasets_bin_features[dataset_names[1]]))

In [None]:
# Print a summary of the filtered training frame.
data_dict[dataset_names[0]].info()

In [None]:
# Training feature matrix: the shared numeric, categorical and binary columns,
# in that order.
selected_columns = common_num_features + common_cat_features + common_bin_features
X = data_dict[dataset_names[0]][selected_columns]

In [None]:
# Target column, taken from the same filtered training frame as X.
Y = data_dict[dataset_names[0]]['TARGET']

In [None]:
# Print a summary of the selected training features.
X.info()

In [None]:
# Count how many training rows carry each target value.
Y.value_counts()

In [None]:
# Build the column names used to label the pipeline output below.
# NOTE(review): presumably one name per binarized category value plus the
# binary and numeric names - confirm against the helpers in source.code.utils.
cat_feature_counts = generate_cat_feature_counts(X, common_cat_features)
extended_features = generate_features_names(
    common_bin_features,
    cat_feature_counts,
    common_num_features,
)

In [None]:
# Display the generated feature names.
extended_features

In [None]:
# Number of generated feature names.
len(extended_features)

The classes are imbalanced.

This has to be fixed.

But before that we need to binarize categorical features:

In [None]:
# Assemble the feature pipeline: one union whose branches are, in order, the
# binary columns, one branch per categorical feature (built by
# `generate_binarized_pipeline`), and the numeric columns.
bin_branch = ('bin', Pipeline([('choose', ItemSelector(common_bin_features))]))
cat_branches = [generate_binarized_pipeline(feature) for feature in common_cat_features]
num_branch = ('num', Pipeline([('choose', ItemSelector(common_num_features))]))

pipeline = Pipeline([
    ('union', FeatureUnion([bin_branch, *cat_branches, num_branch])),
])

In [None]:
# Fit the pipeline on the training features and wrap the transformed output
# in a DataFrame labelled with the generated feature names.
X_tr = pd.DataFrame(pipeline.fit_transform(X), columns=extended_features)

In [None]:
# Print a per-column summary including non-null counts.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and later removed; `verbose` is documented as a boolean flag.
X_tr.info(verbose=True, show_counts=True)

In [None]:
# Test feature matrix: the shared numeric, categorical and binary columns,
# in that order.
selected_columns = common_num_features + common_cat_features + common_bin_features
test = data_dict[dataset_names[1]][selected_columns]

In [None]:
# Apply the already-fitted pipeline to the test features (transform only, no
# refitting) and label the columns with the generated feature names.
test_tr = pd.DataFrame(pipeline.transform(test), columns=extended_features)

In [None]:
# Print a per-column summary including non-null counts.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and later removed; `verbose` is documented as a boolean flag.
test_tr.info(verbose=True, show_counts=True)

In [None]:
# Print the row counts of the transformed features and of the target so they
# can be compared.
print(len(X_tr), len(Y))

Now we can balance classes.

Here we use the simplest approach, random under-sampling, because of time and memory constraints:

In [None]:
# Balance the classes by random under-sampling, seeded for reproducibility.
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and has been removed from recent releases.
X_b, Y_b = RandomUnderSampler(random_state=random_state).fit_resample(X_tr, Y)

In [None]:
# Print the sum of the resampled labels and their total count
# (for 0/1 labels the sum is the number of positive rows).
print(sum(Y_b), len(Y_b))

# LIGHTGBM

In [None]:
# LightGBM classifier with default hyper-parameters; the notebook-wide seed is
# passed so the model is seeded the same way as the under-sampler.
classifier = lgb.LGBMClassifier(random_state=random_state)

In [None]:
# Train the classifier on the class-balanced training data.
classifier.fit(X_b, Y_b)

In [None]:
# Predict class probabilities for the transformed test set.
lgb_y_est = classifier.predict_proba(test_tr)

In [None]:
# Shape of the column-1 slice of the predictions (the values that are
# submitted as TARGET).
lgb_y_est[:, 1].shape

In [None]:
# Submission table: test-set identifiers paired with column 1 of the
# predict_proba output (presumably the probability of TARGET == 1).
test_ids = data_dict[dataset_names[1]]['SK_ID_CURR'].values
target_scores = lgb_y_est[:, 1]
result = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': target_scores})

In [None]:
# Write the submission to CSV without the index column.
result.to_csv('../data/dataset/lgb_submission.csv', index=False)