# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import save_obj
from source.code.utils import load_obj

In [3]:
from source.code.utils import generate_pipeline
from source.code.utils import generate_cat_feature_counts
from source.code.utils import generate_features_names
from source.code.ItemSelector import ItemSelector

In [4]:
from imblearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

In [5]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pandas_profiling

In [6]:
# Display options so that wide / long frames are shown without truncation.
# `None` disables truncation of cell contents. The former value, -1, has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [7]:
# Path templates, relative to the notebook's directory. Each one has a single
# `{}` placeholder that is filled in with `str.format(name)`.
_processed_dir = '../data/dataset/processed'
data_path = _processed_dir + '/{}.csv'
transformed_data_path = '../data/dataset/transformed/{}.csv'
profiling_path = _processed_dir + '/data_profiling/{}.html'
meta_path = _processed_dir + '/meta-info/{}.pkl'

In [8]:
# Query-string template: the `{}` placeholder receives the value compared
# against `type`. Not used by any cell visible in this notebook section.
QUERY_PATTERN = "n_missing <= 0 & type == '{}'"

# Data reading

First, we load all of the data into memory and profile each dataset.

Then we filter down to the features that are most interesting for us at the moment

(continuous, categorical and binary features without missing values, features with a low percentage of missing values, etc.).

In [9]:
# Names of the CSV files (without extension) processed by this notebook.
# The order matters: later cells build per-dataset lists in this same order.
dataset_names = [
    'application_train',
    'application_test',
    'bureau',
    'bureau_balance',
    'credit_card_balance',
    'installments_payments',
    'POS_CASH_balance',
    'previous_application',
    'sample_submission',
]

In [10]:
# Read every processed CSV into memory, keyed by dataset name.
data_dict = {
    name: pd.read_csv(filepath_or_buffer=data_path.format(name))
    for name in tqdm(dataset_names)
}

100%|██████████| 9/9 [00:25<00:00,  2.86s/it]


In [11]:
# Load the columns-description CSV shipped with the original data.
# The file is read as ISO-8859-1 and its first column becomes the index.
columns_description = pd.read_csv(
    '../data/dataset/original/HomeCredit_columns_description.csv',
    encoding='ISO-8859-1',
    index_col=0,
)

# Feature description

The picture below shows the overall structure of the data.

There are lots of connections between the tables and, as a consequence, lots of potential issues with the data.

![Image of data scheme](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

## Columns needed

In [12]:
# Numeric feature names per dataset, restored from the saved meta-info file.
# Later cells index this by dataset name and call `.tolist()` on each entry.
_num_features_file = meta_path.format('datasets_num_features')
datasets_num_features = load_obj(_num_features_file)

In [13]:
# Categorical feature names per dataset, restored from the saved meta-info file.
# Later cells index this by dataset name and call `.tolist()` on each entry.
_cat_features_file = meta_path.format('datasets_cat_features')
datasets_cat_features = load_obj(_cat_features_file)

In [14]:
# Binary feature names per dataset, restored from the saved meta-info file.
# Later cells index this by dataset name and call `.tolist()` on each entry.
_bin_features_file = meta_path.format('datasets_bin_features')
datasets_bin_features = load_obj(_bin_features_file)

In [15]:
# One list of output feature names per dataset, in `dataset_names` order.
# These lists are later used as the column labels of the transformed frames.
# Arguments are passed in the order binary, categorical (with counts), numeric.
extended_features_list = [
    generate_features_names(
        datasets_bin_features[name].tolist(),
        generate_cat_feature_counts(data_dict[name], datasets_cat_features[name].tolist()),
        datasets_num_features[name].tolist(),
    )
    for name in tqdm(dataset_names)
]

100%|██████████| 9/9 [00:01<00:00,  5.38it/s]


In [16]:
def _build_dataset_pipeline(name):
    """Build the feature-extraction pipeline for the dataset called `name`.

    The pipeline is a single FeatureUnion step whose branches are, in order:
    a selector for the binary columns, one branch per categorical column
    (produced by `generate_pipeline`), and a selector for the numeric columns.
    """
    bin_branch = ('bin', Pipeline([('choose', ItemSelector(datasets_bin_features[name].tolist()))]))
    # NOTE(review): generate_pipeline presumably returns a (label, transformer)
    # pair as FeatureUnion expects - confirm in source.code.utils.
    cat_branches = [generate_pipeline(column) for column in datasets_cat_features[name].tolist()]
    num_branch = ('num', Pipeline([('choose', ItemSelector(datasets_num_features[name].tolist()))]))
    branches = [bin_branch] + cat_branches + [num_branch]
    return Pipeline([('union', FeatureUnion(branches))])


# One pipeline per dataset, in `dataset_names` order.
pipelines = [_build_dataset_pipeline(name) for name in tqdm(dataset_names)]

100%|██████████| 9/9 [00:00<00:00, 1277.97it/s]


In [17]:
def _transform_dataset(position):
    """Fit and apply the pipeline at `position` to its dataset.

    `position` indexes `dataset_names`, `pipelines` and
    `extended_features_list`, which all share the same ordering. Returns a
    DataFrame whose columns are the generated feature names.
    """
    source_frame = data_dict[dataset_names[position]]
    transformed = pipelines[position].fit_transform(source_frame)
    return pd.DataFrame(transformed, columns=extended_features_list[position])


# Transformed frames, one per dataset, in `dataset_names` order.
datasets_tr = [_transform_dataset(position) for position in tqdm(range(len(dataset_names)))]

100%|██████████| 9/9 [04:51<00:00, 32.40s/it]


In [18]:
# Map each dataset name to its transformed frame (both sequences share order).
tr_data_dict = {name: frame for name, frame in zip(dataset_names, datasets_tr)}

In [19]:
# Write every transformed dataset to its CSV file, without the index column.
# The output directory is created first so that the export cannot fail on a
# missing folder after the long-running transform above.
os.makedirs(os.path.dirname(transformed_data_path), exist_ok=True)

# A plain loop is used because the work is a pure side effect; this also avoids
# binding `_`, which IPython uses for the last cell output.
for dataset_name in tqdm(dataset_names):
    tr_data_dict[dataset_name].to_csv(transformed_data_path.format(dataset_name), index=False)

100%|██████████| 9/9 [09:34<00:00, 63.84s/it]
