# All necessary imports

In [1]:
import sys
sys.path.append('..')

In [2]:
from source.code.utils import save_obj
from source.code.utils import load_obj

In [3]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pandas_profiling

In [4]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 30000)
pd.set_option('display.max_columns', 30000)

In [5]:
data_path = '../data/dataset/original/{}.csv'
processed_data_path = '../data/dataset/processed/{}.csv'
profiling_path = '../data/dataset/processed/data_profiling/{}.html'
meta_path = '../data/dataset/processed/meta-info/{}.pkl'

In [6]:
QUERY_PATTERN = 'n_missing <= 0 & type == \'{}\''

# Data reading

Firstly we just load all data into memory, then profile each dataset,

then try to filter features that are most interesting for us at the moment

(continuous, categorical, binary features without na, features with low na percentage etc.).

In [7]:
dataset_names = ['application_train', 'application_test', 'bureau', 'bureau_balance', 'credit_card_balance', 'installments_payments', 'POS_CASH_balance', 'previous_application', 'sample_submission']

In [8]:
data_dict = dict(zip(dataset_names, list(map(lambda name: pd.read_csv(filepath_or_buffer=data_path.format(name)), tqdm(dataset_names)))))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:53<00:00,  5.89s/it]


In [9]:
columns_description = pd.read_csv(filepath_or_buffer='../data/dataset/original/HomeCredit_columns_description.csv', encoding='ISO-8859-1', index_col=0)

# Feature description

Here at this picture the general data structure is reflected.

Lots of connections and, as a consequence, lots of hypothetial issues with data.

![Image of data scheme](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

# General data description

- **application_{train|test}.csv**

This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).
Static data for all applications. One row represents one loan in our data sample.

- **bureau.csv**

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).
For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

- **bureau_balance.csv**

Monthly balances of previous credits in Credit Bureau.
This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

- **POS_CASH_balance.csv**

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

- **credit_card_balance.csv**

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

- **previous_application.csv**

All previous applications for Home Credit loans of clients who have loans in our sample.
There is one row for each previous application related to loans in our data sample.

- **installments_payments.csv**

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
There is a) one row for every payment that was made plus b) one row each for missed payment.
One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

- **HomeCredit_columns_description.csv**

This file contains descriptions for the columns in the various data files.

# Data examples

In [10]:
name_number = 0

In [11]:
data_dict[dataset_names[name_number]].head().T # application_train

Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100002,100003,100004,100006,100007
TARGET,1,0,0,0,0
NAME_CONTRACT_TYPE,Cash loans,Cash loans,Revolving loans,Cash loans,Cash loans
CODE_GENDER,M,F,M,F,M
FLAG_OWN_CAR,N,N,Y,N,N
FLAG_OWN_REALTY,Y,N,Y,Y,Y
CNT_CHILDREN,0,0,0,0,0
AMT_INCOME_TOTAL,202500,270000,67500,135000,121500
AMT_CREDIT,406598,1.2935e+06,135000,312682,513000
AMT_ANNUITY,24700.5,35698.5,6750,29686.5,21865.5


In [12]:
name_number = 1

In [13]:
data_dict[dataset_names[name_number]].head().T # application_test

Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100001,100005,100013,100028,100038
NAME_CONTRACT_TYPE,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans
CODE_GENDER,F,M,M,F,M
FLAG_OWN_CAR,N,N,Y,N,Y
FLAG_OWN_REALTY,Y,Y,Y,Y,N
CNT_CHILDREN,0,0,0,2,1
AMT_INCOME_TOTAL,135000,99000,202500,315000,180000
AMT_CREDIT,568800,222768,663264,1.575e+06,625500
AMT_ANNUITY,20560.5,17370,69777,49018.5,32067
AMT_GOODS_PRICE,450000,180000,630000,1.575e+06,625500


In [14]:
name_number = 2

In [15]:
data_dict[dataset_names[name_number]].head().T # bureau

Unnamed: 0,0,1,2,3,4
SK_ID_CURR,215354,215354,215354,215354,215354
SK_ID_BUREAU,5714462,5714463,5714464,5714465,5714466
CREDIT_ACTIVE,Closed,Active,Active,Active,Active
CREDIT_CURRENCY,currency 1,currency 1,currency 1,currency 1,currency 1
DAYS_CREDIT,-497,-208,-203,-203,-629
CREDIT_DAY_OVERDUE,0,0,0,0,0
DAYS_CREDIT_ENDDATE,-153,1075,528,,1197
DAYS_ENDDATE_FACT,-153,,,,
AMT_CREDIT_MAX_OVERDUE,,,,,77674.5
CNT_CREDIT_PROLONG,0,0,0,0,0


In [16]:
name_number = 3

In [17]:
data_dict[dataset_names[name_number]].head().T # bureau_balance

Unnamed: 0,0,1,2,3,4
SK_ID_BUREAU,5715448,5715448,5715448,5715448,5715448
MONTHS_BALANCE,0,-1,-2,-3,-4
STATUS,C,C,C,C,C


In [18]:
name_number = 4

In [19]:
data_dict[dataset_names[name_number]].head().T # credit_card_balance

Unnamed: 0,0,1,2,3,4
SK_ID_PREV,2562384,2582071,1740877,1389973,1891521
SK_ID_CURR,378907,363914,371185,337855,126868
MONTHS_BALANCE,-6,-1,-7,-4,-1
AMT_BALANCE,56.97,63975.6,31815.2,236572,453919
AMT_CREDIT_LIMIT_ACTUAL,135000,45000,450000,225000,450000
AMT_DRAWINGS_ATM_CURRENT,0,2250,0,2250,0
AMT_DRAWINGS_CURRENT,877.5,2250,0,2250,11547
AMT_DRAWINGS_OTHER_CURRENT,0,0,0,0,0
AMT_DRAWINGS_POS_CURRENT,877.5,0,0,0,11547
AMT_INST_MIN_REGULARITY,1700.33,2250,2250,11795.8,22924.9


In [20]:
name_number = 5

In [21]:
data_dict[dataset_names[name_number]].head().T # installments_payments

Unnamed: 0,0,1,2,3,4
SK_ID_PREV,1054186.0,1330831.0,2085231.0,2452527.0,2714724.0
SK_ID_CURR,161674.0,151639.0,193053.0,199697.0,167756.0
NUM_INSTALMENT_VERSION,1.0,0.0,2.0,1.0,1.0
NUM_INSTALMENT_NUMBER,6.0,34.0,1.0,3.0,2.0
DAYS_INSTALMENT,-1180.0,-2156.0,-63.0,-2418.0,-1383.0
DAYS_ENTRY_PAYMENT,-1187.0,-2156.0,-63.0,-2426.0,-1366.0
AMT_INSTALMENT,6948.36,1716.525,25425.0,24350.13,2165.04
AMT_PAYMENT,6948.36,1716.525,25425.0,24350.13,2160.585


In [22]:
name_number = 6

In [23]:
data_dict[dataset_names[name_number]].head().T # POS_CASH_balance

Unnamed: 0,0,1,2,3,4
SK_ID_PREV,1803195,1715348,1784872,1903291,2341044
SK_ID_CURR,182943,367990,397406,269225,334279
MONTHS_BALANCE,-31,-33,-32,-35,-35
CNT_INSTALMENT,48,36,12,48,36
CNT_INSTALMENT_FUTURE,45,35,9,42,35
NAME_CONTRACT_STATUS,Active,Active,Active,Active,Active
SK_DPD,0,0,0,0,0
SK_DPD_DEF,0,0,0,0,0


In [24]:
name_number = 7

In [25]:
data_dict[dataset_names[name_number]].head().T # previous_application

Unnamed: 0,0,1,2,3,4
SK_ID_PREV,2030495,2802425,2523466,2819243,1784265
SK_ID_CURR,271877,108129,122040,176158,202054
NAME_CONTRACT_TYPE,Consumer loans,Cash loans,Cash loans,Cash loans,Cash loans
AMT_ANNUITY,1730.43,25188.6,15060.7,47041.3,31924.4
AMT_APPLICATION,17145,607500,112500,450000,337500
AMT_CREDIT,17145,679671,136444,470790,404055
AMT_DOWN_PAYMENT,0,,,,
AMT_GOODS_PRICE,17145,607500,112500,450000,337500
WEEKDAY_APPR_PROCESS_START,SATURDAY,THURSDAY,TUESDAY,MONDAY,THURSDAY
HOUR_APPR_PROCESS_START,15,11,11,7,9


In [26]:
name_number = 8

In [27]:
data_dict[dataset_names[name_number]].head().T # sample_submission

Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100001.0,100005.0,100013.0,100028.0,100038.0
TARGET,0.5,0.5,0.5,0.5,0.5


# Info

In [28]:
name_number = 0

In [29]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # application_train

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 122 columns):
SK_ID_CURR                      307511 non-null int64
TARGET                          307511 non-null int64
NAME_CONTRACT_TYPE              307511 non-null object
CODE_GENDER                     307511 non-null object
FLAG_OWN_CAR                    307511 non-null object
FLAG_OWN_REALTY                 307511 non-null object
CNT_CHILDREN                    307511 non-null int64
AMT_INCOME_TOTAL                307511 non-null float64
AMT_CREDIT                      307511 non-null float64
AMT_ANNUITY                     307499 non-null float64
AMT_GOODS_PRICE                 307233 non-null float64
NAME_TYPE_SUITE                 306219 non-null object
NAME_INCOME_TYPE                307511 non-null object
NAME_EDUCATION_TYPE             307511 non-null object
NAME_FAMILY_STATUS              307511 non-null object
NAME_HOUSING_TYPE               307511 non-null object
REGION_

In [30]:
name_number = 1

In [31]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # application_test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 121 columns):
SK_ID_CURR                      48744 non-null int64
NAME_CONTRACT_TYPE              48744 non-null object
CODE_GENDER                     48744 non-null object
FLAG_OWN_CAR                    48744 non-null object
FLAG_OWN_REALTY                 48744 non-null object
CNT_CHILDREN                    48744 non-null int64
AMT_INCOME_TOTAL                48744 non-null float64
AMT_CREDIT                      48744 non-null float64
AMT_ANNUITY                     48720 non-null float64
AMT_GOODS_PRICE                 48744 non-null float64
NAME_TYPE_SUITE                 47833 non-null object
NAME_INCOME_TYPE                48744 non-null object
NAME_EDUCATION_TYPE             48744 non-null object
NAME_FAMILY_STATUS              48744 non-null object
NAME_HOUSING_TYPE               48744 non-null object
REGION_POPULATION_RELATIVE      48744 non-null float64
DAYS_BIRTH             

In [32]:
name_number = 2

In [33]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # bureau

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 17 columns):
SK_ID_CURR                1716428 non-null int64
SK_ID_BUREAU              1716428 non-null int64
CREDIT_ACTIVE             1716428 non-null object
CREDIT_CURRENCY           1716428 non-null object
DAYS_CREDIT               1716428 non-null int64
CREDIT_DAY_OVERDUE        1716428 non-null int64
DAYS_CREDIT_ENDDATE       1610875 non-null float64
DAYS_ENDDATE_FACT         1082775 non-null float64
AMT_CREDIT_MAX_OVERDUE    591940 non-null float64
CNT_CREDIT_PROLONG        1716428 non-null int64
AMT_CREDIT_SUM            1716415 non-null float64
AMT_CREDIT_SUM_DEBT       1458759 non-null float64
AMT_CREDIT_SUM_LIMIT      1124648 non-null float64
AMT_CREDIT_SUM_OVERDUE    1716428 non-null float64
CREDIT_TYPE               1716428 non-null object
DAYS_CREDIT_UPDATE        1716428 non-null int64
AMT_ANNUITY               489637 non-null float64
dtypes: float64(8), int64(6), object(

In [34]:
name_number = 3

In [35]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # bureau_balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
SK_ID_BUREAU      27299925 non-null int64
MONTHS_BALANCE    27299925 non-null int64
STATUS            27299925 non-null object
dtypes: int64(2), object(1)
memory usage: 624.8+ MB


In [36]:
name_number = 4

In [37]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # credit_card_balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
SK_ID_PREV                    3840312 non-null int64
SK_ID_CURR                    3840312 non-null int64
MONTHS_BALANCE                3840312 non-null int64
AMT_BALANCE                   3840312 non-null float64
AMT_CREDIT_LIMIT_ACTUAL       3840312 non-null int64
AMT_DRAWINGS_ATM_CURRENT      3090496 non-null float64
AMT_DRAWINGS_CURRENT          3840312 non-null float64
AMT_DRAWINGS_OTHER_CURRENT    3090496 non-null float64
AMT_DRAWINGS_POS_CURRENT      3090496 non-null float64
AMT_INST_MIN_REGULARITY       3535076 non-null float64
AMT_PAYMENT_CURRENT           3072324 non-null float64
AMT_PAYMENT_TOTAL_CURRENT     3840312 non-null float64
AMT_RECEIVABLE_PRINCIPAL      3840312 non-null float64
AMT_RECIVABLE                 3840312 non-null float64
AMT_TOTAL_RECEIVABLE          3840312 non-null float64
CNT_DRAWINGS_ATM_CURRENT      3090496 non-null float64
CNT_DRAWINGS_CU

In [38]:
name_number = 5

In [39]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # installments_payments

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
SK_ID_PREV                13605401 non-null int64
SK_ID_CURR                13605401 non-null int64
NUM_INSTALMENT_VERSION    13605401 non-null float64
NUM_INSTALMENT_NUMBER     13605401 non-null int64
DAYS_INSTALMENT           13605401 non-null float64
DAYS_ENTRY_PAYMENT        13602496 non-null float64
AMT_INSTALMENT            13605401 non-null float64
AMT_PAYMENT               13602496 non-null float64
dtypes: float64(5), int64(3)
memory usage: 830.4 MB


In [40]:
name_number = 6

In [41]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # POS_CASH_balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
SK_ID_PREV               10001358 non-null int64
SK_ID_CURR               10001358 non-null int64
MONTHS_BALANCE           10001358 non-null int64
CNT_INSTALMENT           9975287 non-null float64
CNT_INSTALMENT_FUTURE    9975271 non-null float64
NAME_CONTRACT_STATUS     10001358 non-null object
SK_DPD                   10001358 non-null int64
SK_DPD_DEF               10001358 non-null int64
dtypes: float64(2), int64(5), object(1)
memory usage: 610.4+ MB


In [42]:
name_number = 7

In [43]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # previous_application

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
SK_ID_PREV                     1670214 non-null int64
SK_ID_CURR                     1670214 non-null int64
NAME_CONTRACT_TYPE             1670214 non-null object
AMT_ANNUITY                    1297979 non-null float64
AMT_APPLICATION                1670214 non-null float64
AMT_CREDIT                     1670213 non-null float64
AMT_DOWN_PAYMENT               774370 non-null float64
AMT_GOODS_PRICE                1284699 non-null float64
WEEKDAY_APPR_PROCESS_START     1670214 non-null object
HOUR_APPR_PROCESS_START        1670214 non-null int64
FLAG_LAST_APPL_PER_CONTRACT    1670214 non-null object
NFLAG_LAST_APPL_IN_DAY         1670214 non-null int64
RATE_DOWN_PAYMENT              774370 non-null float64
RATE_INTEREST_PRIMARY          5951 non-null float64
RATE_INTEREST_PRIVILEGED       5951 non-null float64
NAME_CASH_LOAN_PURPOSE         1670214 non-null object
NAME_CONTRA

In [44]:
name_number = 8

In [45]:
data_dict[dataset_names[name_number]].info(verbose=10, null_counts=True) # sample_submission

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 2 columns):
SK_ID_CURR    48744 non-null int64
TARGET        48744 non-null float64
dtypes: float64(1), int64(1)
memory usage: 761.7 KB


# Replace 'Y' and 'N' with 1 and 0

Lots of NaNs, lots of features and besides that there are several features that supposed to be binary (and have 0 and 1 values) but they have 'Y' and 'N' values instead.

It's better to transforme it into 0 and 1 because some algorithms can work incorrectly with non numeric values:

In [46]:
data_dict[dataset_names[0]] = data_dict[dataset_names[0]].replace({'Y': 1, 'N': 0})
data_dict[dataset_names[1]] = data_dict[dataset_names[1]].replace({'Y': 1, 'N': 0})
data_dict[dataset_names[2]] = data_dict[dataset_names[2]].replace({'Y': 1, 'N': 0})
data_dict[dataset_names[3]] = data_dict[dataset_names[3]].replace({'Y': 1, 'N': 0})
data_dict[dataset_names[4]] = data_dict[dataset_names[4]].replace({'Y': 1, 'N': 0})
data_dict[dataset_names[6]] = data_dict[dataset_names[6]].replace({'Y': 1, 'N': 0})
data_dict[dataset_names[7]] = data_dict[dataset_names[7]].replace({'Y': 1, 'N': 0})

# Pandas profiling

Usually this reports are being displayed in notebooks but since train and test have so many variables

it is more convnient to work with report as a Pandas DataFrame which contains meta-information about dataset columns

(number of continuous, categorical, binary columns, number of highly correlated columns etc.).

In [47]:
profiles_dict = {k: pandas_profiling.ProfileReport(v) for k, v in tqdm(data_dict.items())}

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [14:37<00:00, 97.48s/it]


Now to simplify the process we will firstly take those features that do not contain any NaN values and are not highly correlated with other features:

In [48]:
datasets_num_features = dict(zip(dataset_names, list(map(lambda name: profiles_dict[name].description_set['variables'].query(QUERY_PATTERN.format('NUM'))['type'].index.values, dataset_names))))

In [49]:
datasets_cat_features = dict(zip(dataset_names, list(map(lambda name: profiles_dict[name].description_set['variables'].query(QUERY_PATTERN.format('CAT'))['type'].index.values, dataset_names))))

In [50]:
datasets_bin_features = dict(zip(dataset_names, list(map(lambda name: profiles_dict[name].description_set['variables'].query(QUERY_PATTERN.format('BOOL'))['type'].index.values, dataset_names))))

# Unique values counts of categorical features in train & test

As it was seen from the previous cells there are a lot of caterical features in train/test parts.

Apparently we will have to binarize them.

But it is unclear whether all categories for particular feature exist both in train & test.

If no then it can cause problem because the number of binarized features would be different in train & tets in this case,

so we have to check it:

In [51]:
profiles_dict[dataset_names[0]].description_set['variables'][profiles_dict[dataset_names[0]].description_set['variables'].index.isin(datasets_cat_features[dataset_names[0]])]['distinct_count']

CODE_GENDER                   3 
NAME_CONTRACT_TYPE            2 
NAME_EDUCATION_TYPE           5 
NAME_FAMILY_STATUS            6 
NAME_HOUSING_TYPE             6 
NAME_INCOME_TYPE              8 
ORGANIZATION_TYPE             58
WEEKDAY_APPR_PROCESS_START    7 
Name: distinct_count, dtype: object

In [52]:
profiles_dict[dataset_names[1]].description_set['variables'][profiles_dict[dataset_names[1]].description_set['variables'].index.isin(datasets_cat_features[dataset_names[1]])]['distinct_count']

CODE_GENDER                   2 
NAME_CONTRACT_TYPE            2 
NAME_EDUCATION_TYPE           5 
NAME_FAMILY_STATUS            5 
NAME_HOUSING_TYPE             6 
NAME_INCOME_TYPE              7 
ORGANIZATION_TYPE             58
WEEKDAY_APPR_PROCESS_START    7 
Name: distinct_count, dtype: object

As you can see CODE_GENDER, NAME_FAMILY_STATUS and NAME_INCOME_TYPE have different number of distinct values.

We save intersections distinct values sets for these features to be sure that after binarization train & test will have equal number of features.

And by the way CODE_GENDER and NAME_CONTRACT_TYPE apparently should by binary variable but it has 3 distinct values in train part.

Let's look at those values:

In [53]:
data_dict[dataset_names[0]]['CODE_GENDER'].value_counts()

F      202448
M      105059
XNA    4     
Name: CODE_GENDER, dtype: int64

In [54]:
data_dict[dataset_names[1]]['CODE_GENDER'].value_counts()

F    32678
M    16066
Name: CODE_GENDER, dtype: int64

In [55]:
data_dict[dataset_names[0]]['NAME_CONTRACT_TYPE'].value_counts()

Cash loans         278232
Revolving loans    29279 
Name: NAME_CONTRACT_TYPE, dtype: int64

In [56]:
data_dict[dataset_names[1]]['NAME_CONTRACT_TYPE'].value_counts()

Cash loans         48305
Revolving loans    439  
Name: NAME_CONTRACT_TYPE, dtype: int64

Only 4 objects for 'XNA' value?

No, this is not worth it.

In [57]:
data_dict[dataset_names[0]] = data_dict[dataset_names[0]][data_dict[dataset_names[0]].CODE_GENDER.isin(['M', 'F'])]

In [58]:
data_dict[dataset_names[0]]['CODE_GENDER'].value_counts()

F    202448
M    105059
Name: CODE_GENDER, dtype: int64

In [59]:
data_dict[dataset_names[0]] = data_dict[dataset_names[0]].replace({'F': 1, 'M': 0})
data_dict[dataset_names[1]] = data_dict[dataset_names[1]].replace({'F': 1, 'M': 0})

In [60]:
data_dict[dataset_names[0]] = data_dict[dataset_names[0]].replace({'Cash loans': 1, 'Revolving loans': 0})
data_dict[dataset_names[1]] = data_dict[dataset_names[1]].replace({'Cash loans': 1, 'Revolving loans': 0})

Ok, now we can re-profile train & test part:

In [61]:
len(data_dict[dataset_names[0]])

307507

In [62]:
profiles_dict[dataset_names[0]] = pandas_profiling.ProfileReport(data_dict[dataset_names[0]])

In [63]:
profiles_dict[dataset_names[1]] = pandas_profiling.ProfileReport(data_dict[dataset_names[1]])

In [64]:
datasets_num_features = dict(zip(dataset_names, list(map(lambda name: profiles_dict[name].description_set['variables'].query(QUERY_PATTERN.format('NUM'))['type'].index.values, dataset_names))))

In [65]:
datasets_cat_features = dict(zip(dataset_names, list(map(lambda name: profiles_dict[name].description_set['variables'].query(QUERY_PATTERN.format('CAT'))['type'].index.values, dataset_names))))

In [66]:
datasets_bin_features = dict(zip(dataset_names, list(map(lambda name: profiles_dict[name].description_set['variables'].query(QUERY_PATTERN.format('BOOL'))['type'].index.values, dataset_names))))

In [67]:
datasets_num_features

{'POS_CASH_balance': array(['MONTHS_BALANCE', 'SK_DPD', 'SK_DPD_DEF', 'SK_ID_CURR', 'SK_ID_PREV'], dtype=object),
 'application_test': array(['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
        'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
        'DAYS_LAST_PHONE_CHANGE', 'DAYS_REGISTRATION',
        'HOUR_APPR_PROCESS_START', 'REGION_POPULATION_RELATIVE',
        'REGION_RATING_CLIENT', 'SK_ID_CURR'], dtype=object),
 'application_train': array(['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'CNT_CHILDREN', 'DAYS_BIRTH',
        'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION',
        'HOUR_APPR_PROCESS_START', 'REGION_POPULATION_RELATIVE',
        'REGION_RATING_CLIENT', 'index'], dtype=object),
 'bureau': array(['AMT_CREDIT_SUM_OVERDUE', 'CNT_CREDIT_PROLONG',
        'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT', 'DAYS_CREDIT_UPDATE',
        'SK_ID_BUREAU', 'SK_ID_CURR'], dtype=object),
 'bureau_balance': array(['MONTHS_BALANCE', 'SK_ID_BUREAU'], dtype=object),
 'cred

In [68]:
datasets_cat_features

{'POS_CASH_balance': array(['NAME_CONTRACT_STATUS'], dtype=object),
 'application_test': array(['NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
        'NAME_INCOME_TYPE', 'ORGANIZATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START'], dtype=object),
 'application_train': array(['NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
        'NAME_INCOME_TYPE', 'ORGANIZATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START'], dtype=object),
 'bureau': array(['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE'], dtype=object),
 'bureau_balance': array(['STATUS'], dtype=object),
 'credit_card_balance': array(['NAME_CONTRACT_STATUS'], dtype=object),
 'installments_payments': array([], dtype=object),
 'previous_application': array(['CHANNEL_TYPE', 'CODE_REJECT_REASON', 'NAME_CASH_LOAN_PURPOSE',
        'NAME_CLIENT_TYPE', 'NAME_CONTRACT_STATUS', 'NAME_CONTRACT_TYPE',
        'NAME_GOODS_CATEGORY', 'NAME_PAYMENT_TYPE', 'NAME_PORTFOLIO',
        'NAME_PRODUCT_TYPE', 'NAME_SEL

In [69]:
datasets_bin_features

{'POS_CASH_balance': array([], dtype=object),
 'application_test': array(['CODE_GENDER', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_11',
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4',
        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
        'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_EMAIL',
        'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
        'FLAG_PHONE', 'FLAG_WORK_PHONE', 'LIVE_CITY_NOT_WORK_CITY',
        'LIVE_REGION_NOT_WORK_REGION', 'NAME_CONTRACT_TYPE',
        'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
        'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION'], dtype=object),
 'application_train': array(['CODE_GENDER', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
        'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_20

In [70]:
datasets_num_features[dataset_names[0]] = datasets_num_features[dataset_names[0]][0:-1]

In [71]:
datasets_num_features[dataset_names[0]] = np.append(datasets_num_features[dataset_names[0]], 'SK_ID_CURR')

In [72]:
datasets_num_features

{'POS_CASH_balance': array(['MONTHS_BALANCE', 'SK_DPD', 'SK_DPD_DEF', 'SK_ID_CURR', 'SK_ID_PREV'], dtype=object),
 'application_test': array(['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
        'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH',
        'DAYS_LAST_PHONE_CHANGE', 'DAYS_REGISTRATION',
        'HOUR_APPR_PROCESS_START', 'REGION_POPULATION_RELATIVE',
        'REGION_RATING_CLIENT', 'SK_ID_CURR'], dtype=object),
 'application_train': array(['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'CNT_CHILDREN', 'DAYS_BIRTH',
        'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION',
        'HOUR_APPR_PROCESS_START', 'REGION_POPULATION_RELATIVE',
        'REGION_RATING_CLIENT', 'SK_ID_CURR'], dtype=object),
 'bureau': array(['AMT_CREDIT_SUM_OVERDUE', 'CNT_CREDIT_PROLONG',
        'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT', 'DAYS_CREDIT_UPDATE',
        'SK_ID_BUREAU', 'SK_ID_CURR'], dtype=object),
 'bureau_balance': array(['MONTHS_BALANCE', 'SK_ID_BUREAU'], dtype=object),
 

In [73]:
save_obj(datasets_num_features, meta_path.format('datasets_num_features'))

In [74]:
save_obj(datasets_cat_features, meta_path.format('datasets_cat_features'))

In [75]:
save_obj(datasets_bin_features, meta_path.format('datasets_bin_features'))

In [76]:
commom_categories = {}
for feature in tqdm(datasets_cat_features[dataset_names[0]]):
    commom_categories[feature] = list(set(data_dict[dataset_names[0]][feature].unique()) & set(data_dict[dataset_names[1]][feature].unique()))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 63.64it/s]


In [77]:
save_obj(commom_categories, meta_path.format('commom_categories'))

# Store the output

We will store all these reports in case.

Also it would be reasonable to store all meta-information to be able not to recalculate it each time.

# Store visual (html) reports

In [79]:
datasets_bin_features

{'POS_CASH_balance': array([], dtype=object),
 'application_test': array(['CODE_GENDER', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_11',
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4',
        'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
        'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_EMAIL',
        'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
        'FLAG_PHONE', 'FLAG_WORK_PHONE', 'LIVE_CITY_NOT_WORK_CITY',
        'LIVE_REGION_NOT_WORK_REGION', 'NAME_CONTRACT_TYPE',
        'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
        'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION'], dtype=object),
 'application_train': array(['CODE_GENDER', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_10',
        'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
        'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
        'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
        'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_20

In [78]:
_ = list(map(lambda name: profiles_dict[name].to_file(outputfile=profiling_path.format(name)), tqdm(dataset_names)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 195.05it/s]


# Store reports with feature meta-information

In [80]:
_ = list(map(lambda name: save_obj(profiles_dict[name].description_set['variables'], meta_path.format(name)), tqdm(dataset_names)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 345.26it/s]


# Explore variables

Here we just take a look at keys of profiling report dictionary for train (yes, dictionaries, dictionaries and, once again, dictionaries)))

In [81]:
profiles_dict[dataset_names[0]].description_set.keys()

dict_keys(['table', 'variables', 'freq', 'correlations'])

In [82]:
pd.DataFrame(data=profiles_dict[dataset_names[0]].description_set['variables'].columns.values, columns=['PROFILING_VARIABLES'])

Unnamed: 0,PROFILING_VARIABLES
0,type
1,correlation_var
2,correlation
3,count
4,distinct_count
5,is_unique
6,memorysize
7,mode
8,n_infinite
9,n_missing


As you can see the report contains a value with the 'variables' key.

The value is a Pandas DataFrame which contains informations about each column of the dataset.

Each column is described with features displayed above.

Let's take a closer look.

# TRAIN

In [83]:
name = 0

In [84]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM     39
BOOL    37
CORR    35
CAT     12
Name: type, dtype: int64

In [85]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

88

In [86]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.000000    54
0.135018    6 
0.003320    3 
0.487813    1 
0.683864    1 
0.507501    1 
0.585164    1 
0.698722    1 
0.000003    1 
0.532963    1 
0.473986    1 
0.503491    1 
0.563812    1 
0.497611    1 
0.678489    1 
0.501764    1 
0.313453    1 
0.593769    1 
0.694332    1 
0.659910    1 
0.508411    1 
0.664980    1 
0.000007    1 
0.002146    1 
0.198256    1 
0.004202    1 
0.551792    1 
0.000039    1 
Name: p_missing, dtype: int64

In [87]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

123

You have probably guessed that the whole count of columns is the sum of count of columns with 'CORR' type and count of columns with some number of missing values.

# TEST

In [88]:
name = 1

In [89]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM      39
CORR     34
BOOL     25
CAT      12
CONST    11
Name: type, dtype: int64

In [90]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

87

In [91]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.000000    55
0.124097    6 
0.000595    3 
0.652757    1 
0.579641    1 
0.490050    1 
0.687161    1 
0.516761    1 
0.455625    1 
0.483731    1 
0.421221    1 
0.000164    1 
0.666051    1 
0.018689    1 
0.662892    1 
0.684125    1 
0.535122    1 
0.468899    1 
0.490173    1 
0.567065    1 
0.177827    1 
0.478438    1 
0.672842    1 
0.484552    1 
0.320142    1 
0.000492    1 
Name: p_missing, dtype: int64

In [92]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

121

In [93]:
set(profiles_dict[dataset_names[0]].description_set['variables'].index) - set(profiles_dict[dataset_names[1]].description_set['variables'].index)

{'TARGET', 'index'}

Again, this time the whole count of columns is 1 columns less because test dataset does not contain TARGET column.

# BUREAU

In [94]:
name = 2

In [95]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM    14
CAT    3 
Name: type, dtype: int64

In [96]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

17

In [97]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.000000    10
0.369170    1 
0.061496    1 
0.344774    1 
0.150119    1 
0.000008    1 
0.655133    1 
0.714735    1 
Name: p_missing, dtype: int64

In [98]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

17

# BUREAU BALANCE

In [99]:
name = 3

In [100]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM    2
CAT    1
Name: type, dtype: int64

In [101]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

3

In [102]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.0    3
Name: p_missing, dtype: int64

In [103]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

3

# CREDIT CARD BALANCE

In [104]:
name = 4

In [105]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM     17
CORR    5 
CAT     1 
Name: type, dtype: int64

In [106]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

18

In [107]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.000000    10
0.195249    5 
0.079482    2 
0.199981    1 
Name: p_missing, dtype: int64

In [108]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

23

# INSTALLMENTS PAYMENTS

In [109]:
name = 5

In [110]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM     6
CORR    2
Name: type, dtype: int64

In [111]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

6

In [112]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.0    6
Name: p_missing, dtype: int64

In [113]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

8

# POS CASH BALANCE

In [114]:
name = 6

In [115]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM    7
CAT    1
Name: type, dtype: int64

In [116]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

8

In [117]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.000000    6
0.002608    1
0.002607    1
Name: p_missing, dtype: int64

In [118]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

8

# PREVIOUS APPLICATIONS

In [119]:
name = 7

In [120]:
profiles_dict[dataset_names[name]].description_set['variables']['type'].value_counts()

NUM     17
CAT     15
CORR    3 
BOOL    2 
Name: type, dtype: int64

In [121]:
profiles_dict[dataset_names[name]].description_set['variables']['n_missing'].value_counts().sum()

34

In [122]:
profiles_dict[dataset_names[name]].description_set['variables']['p_missing'].value_counts()

0.000000    21
0.402981    5 
0.996437    2 
0.536365    2 
0.000207    1 
0.491198    1 
0.222864    1 
0.222867    1 
Name: p_missing, dtype: int64

In [123]:
len(profiles_dict[dataset_names[name]].description_set['variables'])

37

So far we can say that every dataset has some problems with data quality.

It is possble to highlight two major issues at the moment:
- NAs;
- high corelation.

Perhaps (even very likely) there are other issues but that is not clear so far.

# And finally (but not at all)))

Let's save datasets without NaNs to make data a bit more compact:

In [124]:
_ = list(
    map(
        lambda name: data_dict[name][
            datasets_num_features[name].tolist() + datasets_cat_features[name].tolist() + datasets_bin_features[name].tolist()
        ].to_csv(processed_data_path.format(name), index=False),
        tqdm(dataset_names)
    )
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [03:15<00:00, 21.68s/it]
