# All necessary imports

In [None]:
import sys

# Put the parent directory on the module search path so that the
# project-local `source.code` package can be imported from this notebook.
sys.path.extend(['..'])

In [None]:
from source.code.utils import save_obj
from source.code.utils import load_obj

In [None]:
import pandas as pd
import numpy as np
import os
import pandas_profiling
from tqdm import tqdm

In [None]:
# Show cell contents, rows and columns without truncation when frames are displayed.
# `None` is the supported "no limit" value for max_colwidth: the old `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [None]:
# Path templates; the `{}` placeholder is filled in with `str.format`.
data_path, profiling_path, meta_path = (
    '../data/dataset/processed/{}.csv',
    '../data/dataset/processed/data_profiling/{}.html',
    '../data/dataset/processed/meta-info/{}.pkl',
)

In [None]:
# Names of the dataset CSV files (without extension). The order matters:
# the `*_n` index constants refer to positions in this list.
dataset_names = [
    'application_train',
    'application_test',
    'bureau',
    'bureau_balance',
    'credit_card_balance',
    'installments_payments',
    'POS_CASH_balance',
    'previous_application',
    'sample_submission',
]

In [None]:
# Positional indices (0..8) used to pick a name out of `dataset_names`.
(
    train_n,
    test_n,
    bureau_n,
    bureau_balance_n,
    credit_card_balance_n,
    installments_payments_n,
    POS_CASH_balance_n,
    previous_application_n,
    sample_submission_n,
) = range(9)

# Data reading

## Datasets

In [None]:
# Read every dataset CSV into a DataFrame, keyed by dataset name.
# Wrapping the names in tqdm shows a progress bar while the files load.
data_dict = {
    dataset_name: pd.read_csv(filepath_or_buffer=data_path.format(dataset_name))
    for dataset_name in tqdm(dataset_names)
}

In [None]:
# Column description table: decoded as ISO-8859-1, with the first CSV
# column used as the index.
columns_description = pd.read_csv(
    filepath_or_buffer='../data/dataset/processed/HomeCredit_columns_description.csv',
    index_col=0,
    encoding='ISO-8859-1',
)

## Columns needed

In [None]:
datasets_num_features = load_obj(meta_path.format('datasets_num_features'))

In [None]:
datasets_cat_features = load_obj(meta_path.format('datasets_cat_features'))

In [None]:
datasets_bin_features = load_obj(meta_path.format('datasets_bin_features'))

In [None]:
commom_categories = load_obj(meta_path.format('commom_categories'))

# Data examples

## Numeric columns

In [None]:
data_dict[dataset_names[train_n]][datasets_num_features[dataset_names[train_n]]].head().T # application_train

In [None]:
data_dict[dataset_names[test_n]][datasets_num_features[dataset_names[test_n]]].head().T # application_test

In [None]:
data_dict[dataset_names[bureau_n]][datasets_num_features[dataset_names[bureau_n]]].head().T # bureau

In [None]:
data_dict[dataset_names[bureau_balance_n]][datasets_num_features[dataset_names[bureau_balance_n]]].head().T # bureau_balance

In [None]:
data_dict[dataset_names[credit_card_balance_n]][datasets_num_features[dataset_names[credit_card_balance_n]]].head().T # credit_card_balance

In [None]:
data_dict[dataset_names[installments_payments_n]][datasets_num_features[dataset_names[installments_payments_n]]].head().T # installments_payments

In [None]:
data_dict[dataset_names[POS_CASH_balance_n]][datasets_num_features[dataset_names[POS_CASH_balance_n]]].head().T # POS_CASH_balance

In [None]:
data_dict[dataset_names[previous_application_n]][datasets_num_features[dataset_names[previous_application_n]]].head().T # previous_application

In [None]:
data_dict[dataset_names[sample_submission_n]][datasets_num_features[dataset_names[sample_submission_n]]].head().T # sample_submission

## Categorical columns

In [None]:
data_dict[dataset_names[train_n]][datasets_cat_features[dataset_names[train_n]]].head().T # application_train

In [None]:
data_dict[dataset_names[test_n]][datasets_cat_features[dataset_names[test_n]]].head().T # application_test

In [None]:
data_dict[dataset_names[bureau_n]][datasets_cat_features[dataset_names[bureau_n]]].head().T # bureau

In [None]:
data_dict[dataset_names[bureau_balance_n]][datasets_cat_features[dataset_names[bureau_balance_n]]].head().T # bureau_balance

In [None]:
data_dict[dataset_names[credit_card_balance_n]][datasets_cat_features[dataset_names[credit_card_balance_n]]].head().T # credit_card_balance

In [None]:
data_dict[dataset_names[installments_payments_n]][datasets_cat_features[dataset_names[installments_payments_n]]].head().T # installments_payments

In [None]:
data_dict[dataset_names[POS_CASH_balance_n]][datasets_cat_features[dataset_names[POS_CASH_balance_n]]].head().T # POS_CASH_balance

In [None]:
data_dict[dataset_names[previous_application_n]][datasets_cat_features[dataset_names[previous_application_n]]].head().T # previous_application

In [None]:
data_dict[dataset_names[sample_submission_n]][datasets_cat_features[dataset_names[sample_submission_n]]].head().T # sample_submission

## Binary columns

In [None]:
data_dict[dataset_names[train_n]][datasets_bin_features[dataset_names[train_n]]].head().T # application_train

In [None]:
data_dict[dataset_names[test_n]][datasets_bin_features[dataset_names[test_n]]].head().T # application_test

In [None]:
data_dict[dataset_names[bureau_n]][datasets_bin_features[dataset_names[bureau_n]]].head().T # bureau

In [None]:
data_dict[dataset_names[bureau_balance_n]][datasets_bin_features[dataset_names[bureau_balance_n]]].head().T # bureau_balance

In [None]:
data_dict[dataset_names[credit_card_balance_n]][datasets_bin_features[dataset_names[credit_card_balance_n]]].head().T # credit_card_balance

In [None]:
data_dict[dataset_names[installments_payments_n]][datasets_bin_features[dataset_names[installments_payments_n]]].head().T # installments_payments

In [None]:
data_dict[dataset_names[POS_CASH_balance_n]][datasets_bin_features[dataset_names[POS_CASH_balance_n]]].head().T # POS_CASH_balance

In [None]:
data_dict[dataset_names[previous_application_n]][datasets_bin_features[dataset_names[previous_application_n]]].head().T # previous_application

In [None]:
data_dict[dataset_names[sample_submission_n]][datasets_bin_features[dataset_names[sample_submission_n]]].head().T # sample_submission

# Info

## Numeric columns

In [None]:
data_dict[dataset_names[train_n]][datasets_num_features[dataset_names[train_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[test_n]][datasets_num_features[dataset_names[test_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[bureau_n]][datasets_num_features[dataset_names[bureau_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[bureau_balance_n]][datasets_num_features[dataset_names[bureau_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[credit_card_balance_n]][datasets_num_features[dataset_names[credit_card_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[installments_payments_n]][datasets_num_features[dataset_names[installments_payments_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[POS_CASH_balance_n]][datasets_num_features[dataset_names[POS_CASH_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[previous_application_n]][datasets_num_features[dataset_names[previous_application_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[sample_submission_n]][datasets_num_features[dataset_names[sample_submission_n]]].info(verbose=10, null_counts=True)

## Categorical columns

In [None]:
data_dict[dataset_names[train_n]][datasets_cat_features[dataset_names[train_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[test_n]][datasets_cat_features[dataset_names[test_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[bureau_n]][datasets_cat_features[dataset_names[bureau_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[bureau_balance_n]][datasets_cat_features[dataset_names[bureau_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[credit_card_balance_n]][datasets_cat_features[dataset_names[credit_card_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[installments_payments_n]][datasets_cat_features[dataset_names[installments_payments_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[POS_CASH_balance_n]][datasets_cat_features[dataset_names[POS_CASH_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[previous_application_n]][datasets_cat_features[dataset_names[previous_application_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[sample_submission_n]][datasets_cat_features[dataset_names[sample_submission_n]]].info(verbose=10, null_counts=True)

## Binary columns

In [None]:
data_dict[dataset_names[train_n]][datasets_bin_features[dataset_names[train_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[test_n]][datasets_bin_features[dataset_names[test_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[bureau_n]][datasets_bin_features[dataset_names[bureau_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[bureau_balance_n]][datasets_bin_features[dataset_names[bureau_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[credit_card_balance_n]][datasets_bin_features[dataset_names[credit_card_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[installments_payments_n]][datasets_bin_features[dataset_names[installments_payments_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[POS_CASH_balance_n]][datasets_bin_features[dataset_names[POS_CASH_balance_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[previous_application_n]][datasets_bin_features[dataset_names[previous_application_n]]].info(verbose=10, null_counts=True)

In [None]:
data_dict[dataset_names[sample_submission_n]][datasets_bin_features[dataset_names[sample_submission_n]]].info(verbose=10, null_counts=True)