# Feature Engineering

### We start off with necessary imports of python libraries

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

### We can now read all the necessary data files, which are to be analysed
### We assume, that some elementary actions of data cleaning have already been applied to the train data

In [2]:
path = '../../Data/'

main_data = pd.read_csv(f'{path}train.csv')
bureau = pd.read_csv(f'{path}bureau.csv')
bureau_balance = pd.read_csv(f'{path}bureau_balance.csv')
credit_card_balance = pd.read_csv(f'{path}credit_card_balance.csv')
intallments_payments = pd.read_csv(f'{path}installments_payments.csv')
POS_CASH_balance = pd.read_csv(f'{path}POS_CASH_balance.csv')
previous_application = pd.read_csv(f'{path}previous_application.csv')

### Now, we can import auxilliary functions from additional .py files, which were made for convenience and clarity
### They will be much needed in the next steps of our feature engineering

In [3]:
from Functions.DataPreperation import *
from Functions.FeatureEngineering import *

### We can now take a quick look at the main data, to see its parameters such as shape or top 5 values, so that we will be able to compare the data after applying feature engineering techniques to the original data frame

In [4]:
main_data.shape

(246008, 244)

In [5]:
main_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246008 entries, 0 to 246007
Columns: 244 entries, Unnamed: 0 to EMERGENCYSTATE_MODE_Yes
dtypes: float64(66), int64(178)
memory usage: 458.0 MB


In [6]:
main_data.head(5)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,178942,307359,0,0,0,1,0,180000.0,746896.5,31774.5,...,0,0,0,0,0,1,0,0,1,0
1,17607,120529,0,0,0,1,0,63000.0,814500.0,23944.5,...,0,0,0,0,0,0,0,0,0,0
2,84845,198439,1,0,0,1,0,225000.0,450000.0,30573.0,...,0,0,0,0,0,0,0,0,0,0
3,176784,304860,0,0,0,1,0,157500.0,1256400.0,44644.5,...,0,0,0,0,0,1,0,0,1,0
4,2543,102965,0,0,0,0,0,90000.0,454500.0,14661.0,...,0,0,0,0,0,0,0,0,0,0


### To try to find even more interesting features, which could help in analysis, we can create following columns

In [7]:
copy_of_main = main_data.copy()

main_data['DAYS_EMPLOYED_PERC'] = copy_of_main['DAYS_EMPLOYED'] / copy_of_main['DAYS_BIRTH']
main_data['INCOME_CREDIT_PERC'] = copy_of_main['AMT_INCOME_TOTAL'] / copy_of_main['AMT_CREDIT']
main_data['INCOME_PER_PERSON'] = copy_of_main['AMT_INCOME_TOTAL'] / copy_of_main['CNT_FAM_MEMBERS']
main_data['ANNUITY_INCOME_PERC'] = copy_of_main['AMT_ANNUITY'] / copy_of_main['AMT_INCOME_TOTAL']
main_data['PAYMENT_RATE'] = copy_of_main['AMT_ANNUITY'] /copy_of_main['AMT_CREDIT']

### Since the "Unnamed: 0" column is not needed, we will drop it

In [8]:
main_data = main_data.drop(columns = ['Unnamed: 0'])

### Let's see, whether the changes applied

In [9]:
main_data.head(1)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,DAYS_EMPLOYED_PERC,INCOME_CREDIT_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,PAYMENT_RATE
0,307359,0,0,0,1,0,180000.0,746896.5,31774.5,594000.0,...,1,0,0,1,0,0.227628,0.240997,180000.0,0.176525,0.042542


### As we can see, the unneeded column was succesfully removed
### However, it's not the only column, that we should drop
### We want our models to be not only accurate, but also fast, and applying feature engineering to 243 columns will take a lot of time
### Thus, we need to remove some of the columns

### To choose, which columns need to be removed, we can take a look at the correlations table
### The columns, that will be of high positive or negative correlation in regards to the 'TARGET' table, can be kept, while those of low correlation should probably be dropped, since they may have almost none impact on the final predictions made by our models

In [10]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

pd.DataFrame(correlations['TARGET'])

Unnamed: 0,TARGET
TARGET,1.000000
REGION_RATING_CLIENT_W_CITY,0.061792
REGION_RATING_CLIENT,0.060025
NAME_INCOME_TYPE_Working,0.057901
DAYS_LAST_PHONE_CHANGE,0.054899
...,...
DAYS_BIRTH,-0.078967
EXT_SOURCE_1,-0.100051
EXT_SOURCE_3,-0.157119
EXT_SOURCE_2,-0.161329


### The correlations are not very high
### We should probably keep all the tables with values, which have an absolute value of correlation equal to around 0.04, and we can treat that values as a certain correlation threshold
### To find such function, we will use auxilliary function remove_target_correlated_cols() from .py files
### It returns the data frame only left with columns of correlation's value bigger than the given threshold

In [11]:
main_data = remove_target_correlated_cols(main_data, special_id = 'SK_ID_CURR', threshold = 0.04)

### The data after applying changes presents as follows

In [12]:
main_data.shape

(246008, 28)

In [13]:
main_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246008 entries, 0 to 246007
Data columns (total 28 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   SK_ID_CURR                                         246008 non-null  int64  
 1   TARGET                                             246008 non-null  int64  
 2   AMT_GOODS_PRICE                                    246008 non-null  float64
 3   DAYS_BIRTH                                         246008 non-null  int64  
 4   DAYS_EMPLOYED                                      246008 non-null  float64
 5   DAYS_REGISTRATION                                  246008 non-null  float64
 6   DAYS_ID_PUBLISH                                    246008 non-null  int64  
 7   FLAG_EMP_PHONE                                     246008 non-null  int64  
 8   REGION_RATING_CLIENT                               246008 non-null  int64 

In [14]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,...,CODE_GENDER_M,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Secondary / secondary special,OCCUPATION_TYPE_Laborers,ORGANIZATION_TYPE_XNA,HOUSETYPE_MODE_block of flats,EMERGENCYSTATE_MODE_No,DAYS_EMPLOYED_PERC
0,307359,0,594000.0,16628,3785.0,5108.0,181,1,2,2,...,0,0,0,0,1,0,0,1,1,0.227628
1,120529,0,814500.0,21944,1648.0,1403.0,4501,0,3,3,...,0,1,0,0,1,0,1,0,0,0.0751
2,198439,1,450000.0,16831,806.0,438.0,386,1,3,3,...,1,0,1,0,1,1,0,0,0,0.047888
3,304860,0,900000.0,15851,189.0,1922.0,1922,1,3,1,...,1,0,0,0,1,1,0,1,1,0.011924
4,102965,0,454500.0,15978,501.0,6147.0,3907,1,2,2,...,0,0,1,0,1,0,0,0,0,0.031356


### Now, we can apply simple numerical feature engineering techniques
### We can start with adding normalization to the columns
### We will use auxilliary normalization() function from .py files
### It adds another feature in a form of a column with normalized values
### Since we want to apply it to bigger numbers, we can apply it to columns with mean values equal to, for example, as least 100

In [15]:
main_data = normalization(main_data, min_mean_value = 100, groupby_id = 'SK_ID_CURR')

### The data presents now in a following way

In [16]:
main_data.shape

(246008, 33)

In [17]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,...,OCCUPATION_TYPE_Laborers,ORGANIZATION_TYPE_XNA,HOUSETYPE_MODE_block of flats,EMERGENCYSTATE_MODE_No,DAYS_EMPLOYED_PERC,AMT_GOODS_PRICE_norm,DAYS_BIRTH_norm,DAYS_EMPLOYED_norm,DAYS_REGISTRATION_norm,DAYS_ID_PUBLISH_norm
0,307359,0,594000.0,16628,3785.0,5108.0,181,1,2,2,...,0,0,1,1,0.227628,0.138047,0.515163,0.211311,0.207036,0.027629
1,120529,0,814500.0,21944,1648.0,1403.0,4501,0,3,3,...,0,1,0,0,0.0751,0.193042,0.814825,0.092005,0.056866,0.687071
2,198439,1,450000.0,16831,806.0,438.0,386,1,3,3,...,1,0,0,0,0.047888,0.102132,0.526607,0.044998,0.017753,0.058922
3,304860,0,900000.0,15851,189.0,1922.0,1922,1,3,1,...,1,0,1,1,0.011924,0.214366,0.471364,0.010552,0.077902,0.29339
4,102965,0,454500.0,15978,501.0,6147.0,3907,1,2,2,...,0,0,0,0,0.031356,0.103255,0.478523,0.02797,0.249149,0.596397


### That way, we added 5 new features

### We can also try to apply logarithm transformation to big numerical values
### We will, one again, use auxilliary function log_transform() from .py files

In [18]:
main_data = log_transform(main_data, min_mean_value = 1000, groupby_id = 'SK_ID_CURR')

### Data after transformations looks in the following way

In [19]:
main_data.shape

(246008, 38)

In [20]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,...,AMT_GOODS_PRICE_norm,DAYS_BIRTH_norm,DAYS_EMPLOYED_norm,DAYS_REGISTRATION_norm,DAYS_ID_PUBLISH_norm,AMT_GOODS_PRICE_log,DAYS_BIRTH_log,DAYS_EMPLOYED_log,DAYS_REGISTRATION_log,DAYS_ID_PUBLISH_log
0,307359,0,594000.0,16628,3785.0,5108.0,181,1,2,2,...,0.138047,0.515163,0.211311,0.207036,0.027629,13.294636,9.718903,8.239065,8.538759,5.204007
1,120529,0,814500.0,21944,1648.0,1403.0,4501,0,3,3,...,0.193042,0.814825,0.092005,0.056866,0.687071,13.610331,9.996295,7.407924,7.247081,8.412277
2,198439,1,450000.0,16831,806.0,438.0,386,1,3,3,...,0.102132,0.526607,0.044998,0.017753,0.058922,13.017005,9.731037,6.693324,6.084499,5.958425
3,304860,0,900000.0,15851,189.0,1922.0,1922,1,3,1,...,0.214366,0.471364,0.010552,0.077902,0.29339,13.710151,9.671051,5.247024,7.561642,7.561642
4,102965,0,454500.0,15978,501.0,6147.0,3907,1,2,2,...,0.103255,0.478523,0.02797,0.249149,0.596397,13.026955,9.679031,6.2186,8.723882,8.270781


### From that point, we can start analyzing additional files, which can be helpful in creating powerful and fast models

# Bureau files

### Let's start with simple bureau files analysis

In [21]:
bureau.shape

(1716428, 17)

In [22]:
bureau.head(5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


### We have a lot of NaN values, which need to be changed to either 0, or other values, such as mean of the column
### The columns can also be dropped, which we can do in that case, with help of auxilliary function
### It takes the data frame, threshold for number of missing values, and information about possibility of printing the information to the screen, as parameters

In [23]:
new_bureau = drop_missing_columns(bureau, threshold = 70, print_info = True)

There are 7 with greater than 70 missing values
Incomplete columns: 
['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_ANNUITY']


### We can now fill any other missing values, if there are any
### We will fill them with 0

In [24]:
new_bureau = fill_missing_values(new_bureau, mean = False)

### Let's see, what the changes did to the file

In [25]:
new_bureau.shape

(1716428, 10)

In [26]:
new_bureau.head(5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE
0,215354,5714462,Closed,currency 1,-497,0,0,0.0,Consumer credit,-131
1,215354,5714463,Active,currency 1,-208,0,0,0.0,Credit card,-20
2,215354,5714464,Active,currency 1,-203,0,0,0.0,Consumer credit,-16
3,215354,5714465,Active,currency 1,-203,0,0,0.0,Credit card,-16
4,215354,5714466,Active,currency 1,-629,0,0,0.0,Consumer credit,-21


### Let's apply the same procedures to bureau_balance

In [27]:
bureau_balance.shape

(27299925, 3)

In [28]:
bureau_balance.head(5)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [29]:
bureau_balance = drop_missing_columns(bureau_balance)

There are 0 with greater than 70 missing values
No columns will be deleted


In [30]:
bureau_balance = fill_missing_values(bureau_balance, mean = False)

### Both files can have object as well as numerical grouping applied to them
### That way, we can get additional features
### Bureau data frame should be grouped by SK_ID_CURR, while bureau_balance data frame should by grouped by SK_ID_BUREA
### In both cases, we will make use of auxilliary functions from .py file:
* group_numeric_values(data_frame,
                         data_frame_name,
                         groupby_id = 'SK_ID_CURR', 
                         grouping_statistics = ['count', 'min', 'max', 'mean', 'median', 'sum'])
* group_object_values(data_frame,
                        data_frame_name,
                        groupby_id = 'SK_ID_CURR',
                        grouping_statistics = ['count', 'mean', 'sum'])
                        
### They apply the grouping feature engineering technique, in which we get the data frame with additional variables in return

### We will apply those techniques to these two bureau files and see, what kind of data we get

* Numerical grouping for bureau

In [31]:
bureau_group_num = group_numeric_values(new_bureau,
                         'bureau',
                         groupby_id = 'SK_ID_CURR')

In [32]:
bureau_group_num.head(5)

Unnamed: 0_level_0,bureau_DAYS_CREDIT_count,bureau_DAYS_CREDIT_min,bureau_DAYS_CREDIT_max,bureau_DAYS_CREDIT_mean,bureau_DAYS_CREDIT_median,bureau_DAYS_CREDIT_sum,bureau_CREDIT_DAY_OVERDUE_count,bureau_CREDIT_DAY_OVERDUE_min,bureau_CREDIT_DAY_OVERDUE_max,bureau_CREDIT_DAY_OVERDUE_mean,...,bureau_AMT_CREDIT_SUM_OVERDUE_max,bureau_AMT_CREDIT_SUM_OVERDUE_mean,bureau_AMT_CREDIT_SUM_OVERDUE_median,bureau_AMT_CREDIT_SUM_OVERDUE_sum,bureau_DAYS_CREDIT_UPDATE_count,bureau_DAYS_CREDIT_UPDATE_min,bureau_DAYS_CREDIT_UPDATE_max,bureau_DAYS_CREDIT_UPDATE_mean,bureau_DAYS_CREDIT_UPDATE_median,bureau_DAYS_CREDIT_UPDATE_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,0.0,...,0.0,0.0,0.0,0.0,7,-155,-6,-93.142857,-155.0,-652
100002,8,-1437,-103,-874.0,-1042.5,-6992,8,0,0,0.0,...,0.0,0.0,0.0,0.0,8,-1185,-7,-499.875,-402.5,-3999
100003,4,-2586,-606,-1400.75,-1205.5,-5603,4,0,0,0.0,...,0.0,0.0,0.0,0.0,4,-2131,-43,-816.0,-545.0,-3264
100004,2,-1326,-408,-867.0,-867.0,-1734,2,0,0,0.0,...,0.0,0.0,0.0,0.0,2,-682,-382,-532.0,-532.0,-1064
100005,3,-373,-62,-190.666667,-137.0,-572,3,0,0,0.0,...,0.0,0.0,0.0,0.0,3,-121,-11,-54.333333,-31.0,-163


* Numerical grouping for bureau_balance

In [33]:
bureau_balance_group_num = group_numeric_values(bureau_balance,
                         'bureau_balance',
                         groupby_id = 'SK_ID_BUREAU')

In [34]:
bureau_balance_group_num.head(5)

Unnamed: 0_level_0,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_median,bureau_balance_MONTHS_BALANCE_sum
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5001709,97,-96,0,-48.0,-48.0,-4656
5001710,83,-82,0,-41.0,-41.0,-3403
5001711,4,-3,0,-1.5,-1.5,-6
5001712,19,-18,0,-9.0,-9.0,-171
5001713,22,-21,0,-10.5,-10.5,-231


* Object grouping for bureau

In [35]:
bureau_group_object = group_object_values(new_bureau, 'bureau')

In [36]:
bureau_group_object.head(5)

Unnamed: 0_level_0,bureau_CREDIT_ACTIVE_Active_count,bureau_CREDIT_ACTIVE_Active_mean,bureau_CREDIT_ACTIVE_Active_sum,bureau_CREDIT_ACTIVE_Bad debt_count,bureau_CREDIT_ACTIVE_Bad debt_mean,bureau_CREDIT_ACTIVE_Bad debt_sum,bureau_CREDIT_ACTIVE_Closed_count,bureau_CREDIT_ACTIVE_Closed_mean,bureau_CREDIT_ACTIVE_Closed_sum,bureau_CREDIT_ACTIVE_Sold_count,...,bureau_CREDIT_TYPE_Mobile operator loan_sum,bureau_CREDIT_TYPE_Mortgage_count,bureau_CREDIT_TYPE_Mortgage_mean,bureau_CREDIT_TYPE_Mortgage_sum,bureau_CREDIT_TYPE_Real estate loan_count,bureau_CREDIT_TYPE_Real estate loan_mean,bureau_CREDIT_TYPE_Real estate loan_sum,bureau_CREDIT_TYPE_Unknown type of loan_count,bureau_CREDIT_TYPE_Unknown type of loan_mean,bureau_CREDIT_TYPE_Unknown type of loan_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,0.428571,3,7,0.0,0,7,0.571429,4,7,...,0,7,0.0,0,7,0.0,0,7,0.0,0
100002,8,0.25,2,8,0.0,0,8,0.75,6,8,...,0,8,0.0,0,8,0.0,0,8,0.0,0
100003,4,0.25,1,4,0.0,0,4,0.75,3,4,...,0,4,0.0,0,4,0.0,0,4,0.0,0
100004,2,0.0,0,2,0.0,0,2,1.0,2,2,...,0,2,0.0,0,2,0.0,0,2,0.0,0
100005,3,0.666667,2,3,0.0,0,3,0.333333,1,3,...,0,3,0.0,0,3,0.0,0,3,0.0,0


* Object grouping for bureau_balance

In [37]:
bureau_balance_object_group = group_object_values(bureau_balance, 'bureau', groupby_id = 'SK_ID_BUREAU')

In [38]:
bureau_balance_object_group.head(5)

Unnamed: 0_level_0,bureau_STATUS_0_count,bureau_STATUS_0_mean,bureau_STATUS_0_sum,bureau_STATUS_1_count,bureau_STATUS_1_mean,bureau_STATUS_1_sum,bureau_STATUS_2_count,bureau_STATUS_2_mean,bureau_STATUS_2_sum,bureau_STATUS_3_count,...,bureau_STATUS_4_sum,bureau_STATUS_5_count,bureau_STATUS_5_mean,bureau_STATUS_5_sum,bureau_STATUS_C_count,bureau_STATUS_C_mean,bureau_STATUS_C_sum,bureau_STATUS_X_count,bureau_STATUS_X_mean,bureau_STATUS_X_sum
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5001709,97,0.0,0,97,0.0,0,97,0.0,0,97,...,0,97,0.0,0,97,0.886598,86,97,0.113402,11
5001710,83,0.060241,5,83,0.0,0,83,0.0,0,83,...,0,83,0.0,0,83,0.578313,48,83,0.361446,30
5001711,4,0.75,3,4,0.0,0,4,0.0,0,4,...,0,4,0.0,0,4,0.0,0,4,0.25,1
5001712,19,0.526316,10,19,0.0,0,19,0.0,0,19,...,0,19,0.0,0,19,0.473684,9,19,0.0,0
5001713,22,0.0,0,22,0.0,0,22,0.0,0,22,...,0,22,0.0,0,22,0.0,0,22,1.0,22


### Now, we have the 4 grouped data frames, on which we can work

### We can start of by grouping bureau and bureau_balance frames with each other

In [39]:
bureau_total = bureau_group_num.merge(bureau_group_object, on = 'SK_ID_CURR', how = 'left')

In [40]:
bureau_total.shape

(305811, 99)

In [41]:
bureau_total.head(5)

Unnamed: 0_level_0,bureau_DAYS_CREDIT_count,bureau_DAYS_CREDIT_min,bureau_DAYS_CREDIT_max,bureau_DAYS_CREDIT_mean,bureau_DAYS_CREDIT_median,bureau_DAYS_CREDIT_sum,bureau_CREDIT_DAY_OVERDUE_count,bureau_CREDIT_DAY_OVERDUE_min,bureau_CREDIT_DAY_OVERDUE_max,bureau_CREDIT_DAY_OVERDUE_mean,...,bureau_CREDIT_TYPE_Mobile operator loan_sum,bureau_CREDIT_TYPE_Mortgage_count,bureau_CREDIT_TYPE_Mortgage_mean,bureau_CREDIT_TYPE_Mortgage_sum,bureau_CREDIT_TYPE_Real estate loan_count,bureau_CREDIT_TYPE_Real estate loan_mean,bureau_CREDIT_TYPE_Real estate loan_sum,bureau_CREDIT_TYPE_Unknown type of loan_count,bureau_CREDIT_TYPE_Unknown type of loan_mean,bureau_CREDIT_TYPE_Unknown type of loan_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,0.0,...,0,7,0.0,0,7,0.0,0,7,0.0,0
100002,8,-1437,-103,-874.0,-1042.5,-6992,8,0,0,0.0,...,0,8,0.0,0,8,0.0,0,8,0.0,0
100003,4,-2586,-606,-1400.75,-1205.5,-5603,4,0,0,0.0,...,0,4,0.0,0,4,0.0,0,4,0.0,0
100004,2,-1326,-408,-867.0,-867.0,-1734,2,0,0,0.0,...,0,2,0.0,0,2,0.0,0,2,0.0,0
100005,3,-373,-62,-190.666667,-137.0,-572,3,0,0,0.0,...,0,3,0.0,0,3,0.0,0,3,0.0,0


In [42]:
bureau_balance_total = bureau_balance_group_num.merge(bureau_balance_object_group, on = 'SK_ID_BUREAU', how = 'left')

In [43]:
bureau_balance_total.shape

(817395, 30)

In [44]:
bureau_balance_total.head(5)

Unnamed: 0_level_0,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_median,bureau_balance_MONTHS_BALANCE_sum,bureau_STATUS_0_count,bureau_STATUS_0_mean,bureau_STATUS_0_sum,bureau_STATUS_1_count,...,bureau_STATUS_4_sum,bureau_STATUS_5_count,bureau_STATUS_5_mean,bureau_STATUS_5_sum,bureau_STATUS_C_count,bureau_STATUS_C_mean,bureau_STATUS_C_sum,bureau_STATUS_X_count,bureau_STATUS_X_mean,bureau_STATUS_X_sum
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5001709,97,-96,0,-48.0,-48.0,-4656,97,0.0,0,97,...,0,97,0.0,0,97,0.886598,86,97,0.113402,11
5001710,83,-82,0,-41.0,-41.0,-3403,83,0.060241,5,83,...,0,83,0.0,0,83,0.578313,48,83,0.361446,30
5001711,4,-3,0,-1.5,-1.5,-6,4,0.75,3,4,...,0,4,0.0,0,4,0.0,0,4,0.25,1
5001712,19,-18,0,-9.0,-9.0,-171,19,0.526316,10,19,...,0,19,0.0,0,19,0.473684,9,19,0.0,0
5001713,22,-21,0,-10.5,-10.5,-231,22,0.0,0,22,...,0,22,0.0,0,22,0.0,0,22,1.0,22


### Now, it would be convenient to somehow join these two tables together, so that we can maybe apply other techniques, such as log transformation, and merge that table into the main data frame

### Let's take a quick look at the original bureau file

In [45]:
new_bureau.head(3)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE
0,215354,5714462,Closed,currency 1,-497,0,0,0.0,Consumer credit,-131
1,215354,5714463,Active,currency 1,-208,0,0,0.0,Credit card,-20
2,215354,5714464,Active,currency 1,-203,0,0,0.0,Consumer credit,-16


### Unique identifier in that case is SK_ID_BUREAU, we can make use of the two ID columns, and merge them into bureau_balance_total file, and after that another grouping can be done

In [46]:
bureau_balance_total = bureau_balance_total.merge(new_bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], 
                                                  right_index = False, 
                                                  on = 'SK_ID_BUREAU', 
                                                  how = 'outer')

In [47]:
bureau_balance_total.head(5)

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_median,bureau_balance_MONTHS_BALANCE_sum,bureau_STATUS_0_count,bureau_STATUS_0_mean,bureau_STATUS_0_sum,...,bureau_STATUS_5_count,bureau_STATUS_5_mean,bureau_STATUS_5_sum,bureau_STATUS_C_count,bureau_STATUS_C_mean,bureau_STATUS_C_sum,bureau_STATUS_X_count,bureau_STATUS_X_mean,bureau_STATUS_X_sum,SK_ID_CURR
0,5001709,97.0,-96.0,0.0,-48.0,-48.0,-4656.0,97.0,0.0,0.0,...,97.0,0.0,0.0,97.0,0.886598,86.0,97.0,0.113402,11.0,
1,5001710,83.0,-82.0,0.0,-41.0,-41.0,-3403.0,83.0,0.060241,5.0,...,83.0,0.0,0.0,83.0,0.578313,48.0,83.0,0.361446,30.0,162368.0
2,5001711,4.0,-3.0,0.0,-1.5,-1.5,-6.0,4.0,0.75,3.0,...,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.25,1.0,162368.0
3,5001712,19.0,-18.0,0.0,-9.0,-9.0,-171.0,19.0,0.526316,10.0,...,19.0,0.0,0.0,19.0,0.473684,9.0,19.0,0.0,0.0,162368.0
4,5001713,22.0,-21.0,0.0,-10.5,-10.5,-231.0,22.0,0.0,0.0,...,22.0,0.0,0.0,22.0,0.0,0.0,22.0,1.0,22.0,150635.0


### Seems like ID_CURR in some rows can be missing, thus we will just pull out all the rows values, of which SK_ID_CURR filed is not NaN

In [48]:
bureau_balance_total = bureau_balance_total[bureau_balance_total['SK_ID_CURR'] > 1000]

In [49]:
bureau_balance_total.head(5)

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_median,bureau_balance_MONTHS_BALANCE_sum,bureau_STATUS_0_count,bureau_STATUS_0_mean,bureau_STATUS_0_sum,...,bureau_STATUS_5_count,bureau_STATUS_5_mean,bureau_STATUS_5_sum,bureau_STATUS_C_count,bureau_STATUS_C_mean,bureau_STATUS_C_sum,bureau_STATUS_X_count,bureau_STATUS_X_mean,bureau_STATUS_X_sum,SK_ID_CURR
1,5001710,83.0,-82.0,0.0,-41.0,-41.0,-3403.0,83.0,0.060241,5.0,...,83.0,0.0,0.0,83.0,0.578313,48.0,83.0,0.361446,30.0,162368.0
2,5001711,4.0,-3.0,0.0,-1.5,-1.5,-6.0,4.0,0.75,3.0,...,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.25,1.0,162368.0
3,5001712,19.0,-18.0,0.0,-9.0,-9.0,-171.0,19.0,0.526316,10.0,...,19.0,0.0,0.0,19.0,0.473684,9.0,19.0,0.0,0.0,162368.0
4,5001713,22.0,-21.0,0.0,-10.5,-10.5,-231.0,22.0,0.0,0.0,...,22.0,0.0,0.0,22.0,0.0,0.0,22.0,1.0,22.0,150635.0
5,5001714,15.0,-14.0,0.0,-7.0,-7.0,-105.0,15.0,0.0,0.0,...,15.0,0.0,0.0,15.0,0.0,0.0,15.0,1.0,15.0,150635.0


### Now, we can merge these to together

In [50]:
bureau_total = bureau_total.merge(bureau_balance_total.drop(columns = ['SK_ID_BUREAU']), 
                                  on = 'SK_ID_CURR', 
                                  how = 'left')

In [51]:
bureau_total.head(5)

Unnamed: 0,SK_ID_CURR,bureau_DAYS_CREDIT_count,bureau_DAYS_CREDIT_min,bureau_DAYS_CREDIT_max,bureau_DAYS_CREDIT_mean,bureau_DAYS_CREDIT_median,bureau_DAYS_CREDIT_sum,bureau_CREDIT_DAY_OVERDUE_count,bureau_CREDIT_DAY_OVERDUE_min,bureau_CREDIT_DAY_OVERDUE_max,...,bureau_STATUS_4_sum,bureau_STATUS_5_count,bureau_STATUS_5_mean,bureau_STATUS_5_sum,bureau_STATUS_C_count,bureau_STATUS_C_mean,bureau_STATUS_C_sum,bureau_STATUS_X_count,bureau_STATUS_X_mean,bureau_STATUS_X_sum
0,100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,...,0.0,29.0,0.0,0.0,29.0,0.655172,19.0,29.0,0.241379,7.0
1,100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,...,0.0,30.0,0.0,0.0,30.0,0.966667,29.0,30.0,0.0,0.0
2,100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,...,0.0,29.0,0.0,0.0,29.0,0.62069,18.0,29.0,0.310345,9.0
3,100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,...,0.0,52.0,0.0,0.0,52.0,0.846154,44.0,52.0,0.134615,7.0
4,100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,...,0.0,19.0,0.0,0.0,19.0,0.0,0.0,19.0,0.315789,6.0


### Now, we are going to be having a lot to many columns, if we group it the standard way
### We can, however, have only mean as a gropuing statistic

In [52]:
bureau_total_grouped = group_numeric_values(bureau_total,
                                           'BureauMin',
                                            groupby_id = 'SK_ID_CURR', 
                                            grouping_statistics = ['mean'])

In [53]:
bureau_total_grouped.shape

(305811, 129)

In [54]:
bureau_total_grouped.head(5)

Unnamed: 0_level_0,BureauMin_bureau_DAYS_CREDIT_count_mean,BureauMin_bureau_DAYS_CREDIT_min_mean,BureauMin_bureau_DAYS_CREDIT_max_mean,BureauMin_bureau_DAYS_CREDIT_mean_mean,BureauMin_bureau_DAYS_CREDIT_median_mean,BureauMin_bureau_DAYS_CREDIT_sum_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_count_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_min_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_max_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_mean_mean,...,BureauMin_bureau_STATUS_4_sum_mean,BureauMin_bureau_STATUS_5_count_mean,BureauMin_bureau_STATUS_5_mean_mean,BureauMin_bureau_STATUS_5_sum_mean,BureauMin_bureau_STATUS_C_count_mean,BureauMin_bureau_STATUS_C_mean_mean,BureauMin_bureau_STATUS_C_sum_mean,BureauMin_bureau_STATUS_X_count_mean,BureauMin_bureau_STATUS_X_mean_mean,BureauMin_bureau_STATUS_X_sum_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,0.0,...,0.0,24.571429,0.0,0.0,24.571429,0.44124,15.714286,24.571429,0.21459,4.285714
100002,8,-1437,-103,-874.0,-1042.5,-6992,8,0,0,0.0,...,0.0,13.75,0.0,0.0,13.75,0.175426,2.875,13.75,0.161932,1.875
100003,4,-2586,-606,-1400.75,-1205.5,-5603,4,0,0,0.0,...,,,,,,,,,,
100004,2,-1326,-408,-867.0,-867.0,-1734,2,0,0,0.0,...,,,,,,,,,,
100005,3,-373,-62,-190.666667,-137.0,-572,3,0,0,0.0,...,0.0,7.0,0.0,0.0,7.0,0.128205,1.666667,7.0,0.136752,0.666667


### We need to fill the NaN values in the columns
### Filling them with 0 could create some outliers from the look of data, therefore we will fill them with mean of the given columns

In [55]:
bureau_total_grouped = fill_missing_values(bureau_total_grouped, mean = True)

In [56]:
bureau_total_grouped.head(5)

Unnamed: 0_level_0,BureauMin_bureau_DAYS_CREDIT_count_mean,BureauMin_bureau_DAYS_CREDIT_min_mean,BureauMin_bureau_DAYS_CREDIT_max_mean,BureauMin_bureau_DAYS_CREDIT_mean_mean,BureauMin_bureau_DAYS_CREDIT_median_mean,BureauMin_bureau_DAYS_CREDIT_sum_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_count_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_min_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_max_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_mean_mean,...,BureauMin_bureau_STATUS_4_sum_mean,BureauMin_bureau_STATUS_5_count_mean,BureauMin_bureau_STATUS_5_mean_mean,BureauMin_bureau_STATUS_5_sum_mean,BureauMin_bureau_STATUS_C_count_mean,BureauMin_bureau_STATUS_C_mean_mean,BureauMin_bureau_STATUS_C_sum_mean,BureauMin_bureau_STATUS_X_count_mean,BureauMin_bureau_STATUS_X_mean_mean,BureauMin_bureau_STATUS_X_sum_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,0.0,...,0.0,24.571429,0.0,0.0,24.571429,0.44124,15.714286,24.571429,0.21459,4.285714
100002,8,-1437,-103,-874.0,-1042.5,-6992,8,0,0,0.0,...,0.0,13.75,0.0,0.0,13.75,0.175426,2.875,13.75,0.161932,1.875
100003,4,-2586,-606,-1400.75,-1205.5,-5603,4,0,0,0.0,...,0.009197,30.253224,0.001817,0.093624,30.253224,0.334922,14.047435,30.253224,0.212361,6.381404
100004,2,-1326,-408,-867.0,-867.0,-1734,2,0,0,0.0,...,0.009197,30.253224,0.001817,0.093624,30.253224,0.334922,14.047435,30.253224,0.212361,6.381404
100005,3,-373,-62,-190.666667,-137.0,-572,3,0,0,0.0,...,0.0,7.0,0.0,0.0,7.0,0.128205,1.666667,7.0,0.136752,0.666667


### It seems like there are no extremely big values (> 10000), therefore we can apply normalization to make another features, which may possibly be better for our model

In [57]:
bureau_total_grouped = normalization(bureau_total_grouped, 
                                     min_mean_value = 10, 
                                     groupby_id = 'SK_ID_CURR')

In [58]:
bureau_total_grouped.head(5)

Unnamed: 0_level_0,BureauMin_bureau_DAYS_CREDIT_count_mean,BureauMin_bureau_DAYS_CREDIT_min_mean,BureauMin_bureau_DAYS_CREDIT_max_mean,BureauMin_bureau_DAYS_CREDIT_mean_mean,BureauMin_bureau_DAYS_CREDIT_median_mean,BureauMin_bureau_DAYS_CREDIT_sum_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_count_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_min_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_max_mean,BureauMin_bureau_CREDIT_DAY_OVERDUE_mean_mean,...,BureauMin_bureau_balance_MONTHS_BALANCE_count_mean_norm,BureauMin_bureau_STATUS_0_count_mean_norm,BureauMin_bureau_STATUS_1_count_mean_norm,BureauMin_bureau_STATUS_2_count_mean_norm,BureauMin_bureau_STATUS_3_count_mean_norm,BureauMin_bureau_STATUS_4_count_mean_norm,BureauMin_bureau_STATUS_5_count_mean_norm,BureauMin_bureau_STATUS_C_count_mean_norm,BureauMin_bureau_STATUS_C_sum_mean_norm,BureauMin_bureau_STATUS_X_count_mean_norm
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,-1572,-49,-735.0,-857.0,-5145,7,0,0,0.0,...,0.245536,0.245536,0.245536,0.245536,0.245536,0.245536,0.245536,0.245536,0.16369,0.245536
100002,8,-1437,-103,-874.0,-1042.5,-6992,8,0,0,0.0,...,0.132812,0.132812,0.132812,0.132812,0.132812,0.132812,0.132812,0.132812,0.029948,0.132812
100003,4,-2586,-606,-1400.75,-1205.5,-5603,4,0,0,0.0,...,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.146327,0.304721
100004,2,-1326,-408,-867.0,-867.0,-1734,2,0,0,0.0,...,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.146327,0.304721
100005,3,-373,-62,-190.666667,-137.0,-572,3,0,0,0.0,...,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.017361,0.0625


### We can now join the new data frame to the main data

In [59]:
main_data = main_data.merge(bureau_total_grouped,
                            on = 'SK_ID_CURR',
                            how = 'left')

In [60]:
main_data.shape

(246008, 182)

In [61]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,...,BureauMin_bureau_balance_MONTHS_BALANCE_count_mean_norm,BureauMin_bureau_STATUS_0_count_mean_norm,BureauMin_bureau_STATUS_1_count_mean_norm,BureauMin_bureau_STATUS_2_count_mean_norm,BureauMin_bureau_STATUS_3_count_mean_norm,BureauMin_bureau_STATUS_4_count_mean_norm,BureauMin_bureau_STATUS_5_count_mean_norm,BureauMin_bureau_STATUS_C_count_mean_norm,BureauMin_bureau_STATUS_C_sum_mean_norm,BureauMin_bureau_STATUS_X_count_mean_norm
0,307359,0,594000.0,16628,3785.0,5108.0,181,1,2,2,...,0.158144,0.158144,0.158144,0.158144,0.158144,0.158144,0.158144,0.158144,0.058712,0.158144
1,120529,0,814500.0,21944,1648.0,1403.0,4501,0,3,3,...,0.08125,0.08125,0.08125,0.08125,0.08125,0.08125,0.08125,0.08125,0.0,0.08125
2,198439,1,450000.0,16831,806.0,438.0,386,1,3,3,...,,,,,,,,,,
3,304860,0,900000.0,15851,189.0,1922.0,1922,1,3,1,...,0.236111,0.236111,0.236111,0.236111,0.236111,0.236111,0.236111,0.236111,0.069444,0.236111
4,102965,0,454500.0,15978,501.0,6147.0,3907,1,2,2,...,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.146327,0.304721


### We will, one again, fill the NaN values with the column means

In [62]:
main_data = fill_missing_values(main_data, mean = True)

In [63]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,...,BureauMin_bureau_balance_MONTHS_BALANCE_count_mean_norm,BureauMin_bureau_STATUS_0_count_mean_norm,BureauMin_bureau_STATUS_1_count_mean_norm,BureauMin_bureau_STATUS_2_count_mean_norm,BureauMin_bureau_STATUS_3_count_mean_norm,BureauMin_bureau_STATUS_4_count_mean_norm,BureauMin_bureau_STATUS_5_count_mean_norm,BureauMin_bureau_STATUS_C_count_mean_norm,BureauMin_bureau_STATUS_C_sum_mean_norm,BureauMin_bureau_STATUS_X_count_mean_norm
0,307359,0,594000.0,16628,3785.0,5108.0,181,1,2,2,...,0.158144,0.158144,0.158144,0.158144,0.158144,0.158144,0.158144,0.158144,0.058712,0.158144
1,120529,0,814500.0,21944,1648.0,1403.0,4501,0,3,3,...,0.08125,0.08125,0.08125,0.08125,0.08125,0.08125,0.08125,0.08125,0.0,0.08125
2,198439,1,450000.0,16831,806.0,438.0,386,1,3,3,...,0.294499,0.294499,0.294499,0.294499,0.294499,0.294499,0.294499,0.294499,0.142365,0.294499
3,304860,0,900000.0,15851,189.0,1922.0,1922,1,3,1,...,0.236111,0.236111,0.236111,0.236111,0.236111,0.236111,0.236111,0.236111,0.069444,0.236111
4,102965,0,454500.0,15978,501.0,6147.0,3907,1,2,2,...,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.304721,0.146327,0.304721


### Now, we need to look at the correlations in regards to the 'TARGET' column, since we still don't want to have to many features

In [64]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [65]:
new_cors.head(10)

Unnamed: 0,TARGET
TARGET,1.0
BureauMin_bureau_DAYS_CREDIT_mean_mean,0.082914
BureauMin_bureau_DAYS_CREDIT_median_mean,0.078744
BureauMin_bureau_CREDIT_ACTIVE_Active_mean_mean,0.071271
BureauMin_bureau_DAYS_CREDIT_min_mean,0.068983
BureauMin_bureau_DAYS_CREDIT_UPDATE_mean_mean,0.063956
BureauMin_bureau_DAYS_CREDIT_UPDATE_median_mean,0.06217
BureauMin_bureau_CREDIT_ACTIVE_Active_sum_mean,0.062168
REGION_RATING_CLIENT_W_CITY,0.061792
REGION_RATING_CLIENT,0.060025


In [66]:
new_cors.tail(10)

Unnamed: 0,TARGET
DAYS_EMPLOYED,-0.063216
DAYS_EMPLOYED_norm,-0.063216
DAYS_EMPLOYED_log,-0.07288
BureauMin_bureau_CREDIT_ACTIVE_Closed_mean_mean,-0.07301
DAYS_BIRTH_norm,-0.078967
DAYS_BIRTH,-0.078967
DAYS_BIRTH_log,-0.079284
EXT_SOURCE_1,-0.100051
EXT_SOURCE_3,-0.157119
EXT_SOURCE_2,-0.161329


### Many new features showed up
### We will, once again, delete some potentailly useless columns

In [67]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

In [68]:
main_data.shape

(246008, 68)

# Previous applications

### Let's analyze the data

In [69]:
previous_application.shape

(1670214, 37)

In [70]:
previous_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   SK_ID_PREV                   1670214 non-null  int64  
 1   SK_ID_CURR                   1670214 non-null  int64  
 2   NAME_CONTRACT_TYPE           1670214 non-null  object 
 3   AMT_ANNUITY                  1297979 non-null  float64
 4   AMT_APPLICATION              1670214 non-null  float64
 5   AMT_CREDIT                   1670213 non-null  float64
 6   AMT_DOWN_PAYMENT             774370 non-null   float64
 7   AMT_GOODS_PRICE              1284699 non-null  float64
 8   WEEKDAY_APPR_PROCESS_START   1670214 non-null  object 
 9   HOUR_APPR_PROCESS_START      1670214 non-null  int64  
 10  FLAG_LAST_APPL_PER_CONTRACT  1670214 non-null  object 
 11  NFLAG_LAST_APPL_IN_DAY       1670214 non-null  int64  
 12  RATE_DOWN_PAYMENT            774370 non-nu

In [71]:
previous_application.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


### Let's take a quick look, whether there are many missing values in the newly created data frames

In [72]:
previous_application.isnull().sum()

SK_ID_PREV                           0
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE                     0
NAME_GOODS_CATEGORY                  0
NAME_PORTFOLIO                       0
NAME_PRODUCT_TYPE                    0
CHANNEL_TYPE             

### Let's drop the columns with at least 75% of values missing

In [73]:
previous_application = drop_missing_columns(previous_application, threshold = 75)

There are 15 with greater than 75 missing values
10 exemplary incomplete columns to be deleted: 
['AMT_ANNUITY', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'NAME_TYPE_SUITE', 'CNT_PAYMENT', 'PRODUCT_COMBINATION', 'DAYS_FIRST_DRAWING']


### We can apply numerical and object groping to the frame

In [74]:
prev_app_num = group_numeric_values(previous_application.drop(columns = ['SK_ID_PREV']), 'prev_app')

In [75]:
prev_app_num.shape

(338857, 36)

In [76]:
prev_app_num.head(5)

Unnamed: 0_level_0,prev_app_AMT_APPLICATION_count,prev_app_AMT_APPLICATION_min,prev_app_AMT_APPLICATION_max,prev_app_AMT_APPLICATION_mean,prev_app_AMT_APPLICATION_median,prev_app_AMT_APPLICATION_sum,prev_app_AMT_CREDIT_count,prev_app_AMT_CREDIT_min,prev_app_AMT_CREDIT_max,prev_app_AMT_CREDIT_mean,...,prev_app_DAYS_DECISION_max,prev_app_DAYS_DECISION_mean,prev_app_DAYS_DECISION_median,prev_app_DAYS_DECISION_sum,prev_app_SELLERPLACE_AREA_count,prev_app_SELLERPLACE_AREA_min,prev_app_SELLERPLACE_AREA_max,prev_app_SELLERPLACE_AREA_mean,prev_app_SELLERPLACE_AREA_median,prev_app_SELLERPLACE_AREA_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1,24835.5,24835.5,24835.5,24835.5,24835.5,1,23787.0,23787.0,23787.0,...,-1740,-1740.0,-1740.0,-1740,1,23,23,23.0,23.0,23
100002,1,179055.0,179055.0,179055.0,179055.0,179055.0,1,179055.0,179055.0,179055.0,...,-606,-606.0,-606.0,-606,1,500,500,500.0,500.0,500
100003,3,68809.5,900000.0,435436.5,337500.0,1306309.5,3,68053.5,1035882.0,484191.0,...,-746,-1305.0,-828.0,-3915,3,-1,1400,533.0,200.0,1599
100004,1,24282.0,24282.0,24282.0,24282.0,24282.0,1,20106.0,20106.0,20106.0,...,-815,-815.0,-815.0,-815,1,30,30,30.0,30.0,30
100005,2,0.0,44617.5,22308.75,22308.75,44617.5,2,0.0,40153.5,20076.75,...,-315,-536.0,-536.0,-1072,2,-1,37,18.0,18.0,36


In [77]:
prev_app_obj = group_object_values(previous_application.drop(columns = ['SK_ID_PREV']), 'prev_app')

In [78]:
prev_app_obj.shape

(338857, 357)

In [79]:
prev_app_obj.head(5)

Unnamed: 0_level_0,prev_app_NAME_CONTRACT_TYPE_Cash loans_count,prev_app_NAME_CONTRACT_TYPE_Cash loans_mean,prev_app_NAME_CONTRACT_TYPE_Cash loans_sum,prev_app_NAME_CONTRACT_TYPE_Consumer loans_count,prev_app_NAME_CONTRACT_TYPE_Consumer loans_mean,prev_app_NAME_CONTRACT_TYPE_Consumer loans_sum,prev_app_NAME_CONTRACT_TYPE_Revolving loans_count,prev_app_NAME_CONTRACT_TYPE_Revolving loans_mean,prev_app_NAME_CONTRACT_TYPE_Revolving loans_sum,prev_app_NAME_CONTRACT_TYPE_XNA_count,...,prev_app_NAME_YIELD_GROUP_high_sum,prev_app_NAME_YIELD_GROUP_low_action_count,prev_app_NAME_YIELD_GROUP_low_action_mean,prev_app_NAME_YIELD_GROUP_low_action_sum,prev_app_NAME_YIELD_GROUP_low_normal_count,prev_app_NAME_YIELD_GROUP_low_normal_mean,prev_app_NAME_YIELD_GROUP_low_normal_sum,prev_app_NAME_YIELD_GROUP_middle_count,prev_app_NAME_YIELD_GROUP_middle_mean,prev_app_NAME_YIELD_GROUP_middle_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1,0.0,0,1,1.0,1,1,0.0,0,1,...,1,1,0.0,0,1,0.0,0,1,0.0,0
100002,1,0.0,0,1,1.0,1,1,0.0,0,1,...,0,1,0.0,0,1,1.0,1,1,0.0,0
100003,3,0.333333,1,3,0.666667,2,3,0.0,0,3,...,0,3,0.0,0,3,0.333333,1,3,0.666667,2
100004,1,0.0,0,1,1.0,1,1,0.0,0,1,...,0,1,0.0,0,1,0.0,0,1,1.0,1
100005,2,0.5,1,2,0.5,1,2,0.0,0,2,...,1,2,0.0,0,2,0.0,0,2,0.0,0


### Let's also deal with any possible leftover missing values in the pre_app_num frame
### We can run the function for that, and it will take care of that, if there are any problems of that nature

In [80]:
prev_app_num = fill_missing_values(prev_app_num, mean = True)

### Let's use normalization and log transformation techniques for new feature creation, specifically for prev_app_num frame, since the values seem to be extremely big

In [81]:
prev_app_num = normalization(prev_app_num)

In [82]:
prev_app_num = log_transform(prev_app_num)

### And now, let's take a quick look at the new features, as well as at the shape of the frame

In [83]:
prev_app_num.shape

(338857, 62)

In [84]:
prev_app_num.head(5)

Unnamed: 0_level_0,prev_app_AMT_APPLICATION_count,prev_app_AMT_APPLICATION_min,prev_app_AMT_APPLICATION_max,prev_app_AMT_APPLICATION_mean,prev_app_AMT_APPLICATION_median,prev_app_AMT_APPLICATION_sum,prev_app_AMT_CREDIT_count,prev_app_AMT_CREDIT_min,prev_app_AMT_CREDIT_max,prev_app_AMT_CREDIT_mean,...,prev_app_AMT_APPLICATION_max_log,prev_app_AMT_APPLICATION_mean_log,prev_app_AMT_APPLICATION_median_log,prev_app_AMT_APPLICATION_sum_log,prev_app_AMT_CREDIT_min_log,prev_app_AMT_CREDIT_max_log,prev_app_AMT_CREDIT_mean_log,prev_app_AMT_CREDIT_median_log,prev_app_AMT_CREDIT_sum_log,prev_app_SELLERPLACE_AREA_sum_log
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1,24835.5,24835.5,24835.5,24835.5,24835.5,1,23787.0,23787.0,23787.0,...,10.12007,10.12007,10.12007,10.12007,10.076937,10.076937,10.076937,10.076937,10.076937,3.178054
100002,1,179055.0,179055.0,179055.0,179055.0,179055.0,1,179055.0,179055.0,179055.0,...,12.095454,12.095454,12.095454,12.095454,12.095454,12.095454,12.095454,12.095454,12.095454,6.216606
100003,3,68809.5,900000.0,435436.5,337500.0,1306309.5,3,68053.5,1035882.0,484191.0,...,13.710151,12.984107,12.729324,14.082717,11.128064,13.850765,13.090237,12.761791,14.188848,7.377759
100004,1,24282.0,24282.0,24282.0,24282.0,24282.0,1,20106.0,20106.0,20106.0,...,10.097532,10.097532,10.097532,10.097532,9.908823,9.908823,9.908823,9.908823,9.908823,3.433987
100005,2,0.0,44617.5,22308.75,22308.75,44617.5,2,0.0,40153.5,20076.75,...,10.705904,10.012779,10.012779,10.705904,0.0,10.60049,9.907368,9.907368,10.60049,3.610918


### Finally, let's merge the frames into the data set, and then let's look at the correlation

In [85]:
main_data = main_data.merge(prev_app_num, on = 'SK_ID_CURR', how = 'left')

In [86]:
main_data = main_data.merge(prev_app_obj, on = 'SK_ID_CURR', how = 'left')

In [87]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [88]:
new_cors.head(10)

Unnamed: 0,TARGET
TARGET,1.0
BureauMin_bureau_DAYS_CREDIT_mean_mean,0.082914
BureauMin_bureau_DAYS_CREDIT_median_mean,0.078744
prev_app_NAME_CONTRACT_STATUS_Refused_mean,0.078716
BureauMin_bureau_CREDIT_ACTIVE_Active_mean_mean,0.071271
BureauMin_bureau_DAYS_CREDIT_min_mean,0.068983
prev_app_NAME_CONTRACT_STATUS_Refused_sum,0.064268
BureauMin_bureau_DAYS_CREDIT_UPDATE_mean_mean,0.063956
BureauMin_bureau_DAYS_CREDIT_UPDATE_median_mean,0.06217
BureauMin_bureau_CREDIT_ACTIVE_Active_sum_mean,0.062168


In [89]:
new_cors.tail(10)

Unnamed: 0,TARGET
BureauMin_bureau_CREDIT_ACTIVE_Closed_mean_mean,-0.07301
prev_app_CODE_REJECT_REASON_XAP_mean,-0.074903
DAYS_BIRTH_norm,-0.078967
DAYS_BIRTH,-0.078967
DAYS_BIRTH_log,-0.079284
EXT_SOURCE_1,-0.100051
EXT_SOURCE_3,-0.157119
EXT_SOURCE_2,-0.161329
prev_app_NAME_GOODS_CATEGORY_House Construction_mean,
prev_app_NAME_GOODS_CATEGORY_House Construction_sum,


### After that, we will use the function used for droping columns with low correlation

In [90]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

### Let's look at the shape of the modified data frame

In [91]:
main_data.shape

(246008, 89)

In [92]:
main_data.to_csv('./featureData.csv')

# Credit Card Balance

In [93]:
credit_card_balance.shape

(3840312, 23)

In [94]:
credit_card_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     int64  
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
 10  AMT_PAYMENT_CURRENT         float64
 11  AMT_PAYMENT_TOTAL_CURRENT   float64
 12  AMT_RECEIVABLE_PRINCIPAL    float64
 13  AMT_RECIVABLE               float64
 14  AMT_TOTAL_RECEIVABLE        float64
 15  CNT_DRAWINGS_ATM_CURRENT    float64
 16  CNT_DRAWINGS_CURRENT        int64  
 17  CNT_DRAWINGS_OTHER_CURRENT  float64
 18  CNT_DRAWINGS_POS_CURRENT    float64
 19  CNT_INSTALMENT_MATURE

In [95]:
credit_card_balance.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
5,2646502,380010,-7,82903.815,270000,0.0,0.0,0.0,0.0,4449.105,...,82773.315,82773.315,0.0,0,0.0,0.0,2.0,Active,7,0
6,1079071,171320,-6,353451.645,585000,67500.0,67500.0,0.0,0.0,14684.175,...,351881.145,351881.145,1.0,1,0.0,0.0,6.0,Active,0,0
7,2095912,118650,-7,47962.125,45000,45000.0,45000.0,0.0,0.0,0.0,...,47962.125,47962.125,1.0,1,0.0,0.0,51.0,Active,0,0
8,2181852,367360,-4,291543.075,292500,90000.0,289339.425,0.0,199339.425,130.5,...,286831.575,286831.575,3.0,8,0.0,5.0,3.0,Active,0,0
9,1235299,203885,-5,201261.195,225000,76500.0,111026.7,0.0,34526.7,6338.34,...,197224.695,197224.695,3.0,9,0.0,6.0,38.0,Active,0,0


### Check for missing values

In [96]:
credit_card_balance.isnull().sum()

SK_ID_PREV                         0
SK_ID_CURR                         0
MONTHS_BALANCE                     0
AMT_BALANCE                        0
AMT_CREDIT_LIMIT_ACTUAL            0
AMT_DRAWINGS_ATM_CURRENT      749816
AMT_DRAWINGS_CURRENT               0
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
AMT_INST_MIN_REGULARITY       305236
AMT_PAYMENT_CURRENT           767988
AMT_PAYMENT_TOTAL_CURRENT          0
AMT_RECEIVABLE_PRINCIPAL           0
AMT_RECIVABLE                      0
AMT_TOTAL_RECEIVABLE               0
CNT_DRAWINGS_ATM_CURRENT      749816
CNT_DRAWINGS_CURRENT               0
CNT_DRAWINGS_OTHER_CURRENT    749816
CNT_DRAWINGS_POS_CURRENT      749816
CNT_INSTALMENT_MATURE_CUM     305236
NAME_CONTRACT_STATUS               0
SK_DPD                             0
SK_DPD_DEF                         0
dtype: int64

In [97]:
credit_card_balance = fill_missing_values(credit_card_balance, mean = False)

### Let's apply grouping

In [98]:
card_bal_obj_grp = group_object_values(credit_card_balance,
                                      'card_balance',
                                      groupby_id = 'SK_ID_CURR', 
                                      grouping_statistics = ['count', 'min', 'max', 'mean', 'median', 'sum'])

In [99]:
card_bal_obj_numr = group_numeric_values(credit_card_balance,
                                      'card_balance',
                                      groupby_id = 'SK_ID_CURR', 
                                      grouping_statistics = ['count', 'min', 'max', 'mean', 'median', 'sum'])

### Let's analyze the files

In [100]:
card_bal_obj_grp.shape

(103558, 42)

In [101]:
card_bal_obj_grp.head(5)

Unnamed: 0_level_0,card_balance_NAME_CONTRACT_STATUS_Active_count,card_balance_NAME_CONTRACT_STATUS_Active_min,card_balance_NAME_CONTRACT_STATUS_Active_max,card_balance_NAME_CONTRACT_STATUS_Active_mean,card_balance_NAME_CONTRACT_STATUS_Active_median,card_balance_NAME_CONTRACT_STATUS_Active_sum,card_balance_NAME_CONTRACT_STATUS_Approved_count,card_balance_NAME_CONTRACT_STATUS_Approved_min,card_balance_NAME_CONTRACT_STATUS_Approved_max,card_balance_NAME_CONTRACT_STATUS_Approved_mean,...,card_balance_NAME_CONTRACT_STATUS_Sent proposal_max,card_balance_NAME_CONTRACT_STATUS_Sent proposal_mean,card_balance_NAME_CONTRACT_STATUS_Sent proposal_median,card_balance_NAME_CONTRACT_STATUS_Sent proposal_sum,card_balance_NAME_CONTRACT_STATUS_Signed_count,card_balance_NAME_CONTRACT_STATUS_Signed_min,card_balance_NAME_CONTRACT_STATUS_Signed_max,card_balance_NAME_CONTRACT_STATUS_Signed_mean,card_balance_NAME_CONTRACT_STATUS_Signed_median,card_balance_NAME_CONTRACT_STATUS_Signed_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,6,1,1,1.0,1.0,6,6,0,0,0.0,...,0,0.0,0,0,6,0,0,0.0,0.0,0
100011,74,1,1,1.0,1.0,74,74,0,0,0.0,...,0,0.0,0,0,74,0,0,0.0,0.0,0
100013,96,1,1,1.0,1.0,96,96,0,0,0.0,...,0,0.0,0,0,96,0,0,0.0,0.0,0
100021,17,0,1,0.411765,0.0,7,17,0,0,0.0,...,0,0.0,0,0,17,0,0,0.0,0.0,0
100023,8,1,1,1.0,1.0,8,8,0,0,0.0,...,0,0.0,0,0,8,0,0,0.0,0.0,0


In [102]:
card_bal_obj_numr.shape

(103558, 120)

In [103]:
card_bal_obj_numr.head(5)

Unnamed: 0_level_0,card_balance_MONTHS_BALANCE_count,card_balance_MONTHS_BALANCE_min,card_balance_MONTHS_BALANCE_max,card_balance_MONTHS_BALANCE_mean,card_balance_MONTHS_BALANCE_median,card_balance_MONTHS_BALANCE_sum,card_balance_AMT_BALANCE_count,card_balance_AMT_BALANCE_min,card_balance_AMT_BALANCE_max,card_balance_AMT_BALANCE_mean,...,card_balance_SK_DPD_max,card_balance_SK_DPD_mean,card_balance_SK_DPD_median,card_balance_SK_DPD_sum,card_balance_SK_DPD_DEF_count,card_balance_SK_DPD_DEF_min,card_balance_SK_DPD_DEF_max,card_balance_SK_DPD_DEF_mean,card_balance_SK_DPD_DEF_median,card_balance_SK_DPD_DEF_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,6,-6,-1,-3.5,-3.5,-21,6,0.0,0.0,0.0,...,0,0.0,0.0,0,6,0,0,0.0,0.0,0
100011,74,-75,-2,-38.5,-38.5,-2849,74,0.0,189000.0,54482.111149,...,0,0.0,0.0,0,74,0,0,0.0,0.0,0
100013,96,-96,-1,-48.5,-48.5,-4656,96,0.0,161420.22,18159.919219,...,1,0.010417,0.0,1,96,0,1,0.010417,0.0,1
100021,17,-18,-2,-10.0,-10.0,-170,17,0.0,0.0,0.0,...,0,0.0,0.0,0,17,0,0,0.0,0.0,0
100023,8,-11,-4,-7.5,-7.5,-60,8,0.0,0.0,0.0,...,0,0.0,0.0,0,8,0,0,0.0,0.0,0


### Let's apply the normalization only, since the values are not that big for log transformation

In [104]:
card_bal_obj_numr = normalization(card_bal_obj_numr, min_mean_value = 10, groupby_id = 'SK_ID_CURR')

In [105]:
card_bal_obj_grp = normalization(card_bal_obj_grp, min_mean_value = 10, groupby_id = 'SK_ID_CURR')

### Merge the main data with the grouped frames

In [106]:
main_data = main_data.merge(card_bal_obj_grp, on = 'SK_ID_CURR', how = 'left')

In [107]:
main_data = main_data.merge(card_bal_obj_numr, on = 'SK_ID_CURR', how = 'left')

### Remove the values, which are not under the threshold

In [108]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

In [109]:
main_data.shape

(246008, 211)

In [110]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [111]:
new_cors.head(20)

Unnamed: 0,TARGET
TARGET,1.0
card_balance_CNT_DRAWINGS_ATM_CURRENT_mean,0.119016
card_balance_CNT_DRAWINGS_CURRENT_max,0.102471
card_balance_CNT_DRAWINGS_ATM_CURRENT_max,0.08658
card_balance_CNT_DRAWINGS_CURRENT_mean,0.086063
BureauMin_bureau_DAYS_CREDIT_mean_mean,0.082914
card_balance_CNT_DRAWINGS_POS_CURRENT_max,0.082129
card_balance_AMT_BALANCE_mean_norm,0.080376
card_balance_AMT_BALANCE_mean,0.080376
card_balance_AMT_TOTAL_RECEIVABLE_mean_norm,0.079645


In [112]:
new_cors.tail(20)

Unnamed: 0,TARGET
card_balance_NAME_CONTRACT_STATUS_Demand_count_norm,-0.058815
card_balance_AMT_CREDIT_LIMIT_ACTUAL_count_norm,-0.058815
card_balance_SK_DPD_DEF_count_norm,-0.058815
card_balance_AMT_DRAWINGS_CURRENT_count_norm,-0.058815
card_balance_NAME_CONTRACT_STATUS_Completed_count_norm,-0.058815
card_balance_NAME_CONTRACT_STATUS_Active_count_norm,-0.058815
card_balance_NAME_CONTRACT_STATUS_Approved_count_norm,-0.058815
card_balance_AMT_DRAWINGS_ATM_CURRENT_count_norm,-0.058815
DAYS_EMPLOYED,-0.063216
DAYS_EMPLOYED_norm,-0.063216


# Installments payments

In [115]:
intallments_payments = pd.read_csv(f'{path}installments_payments.csv')

In [116]:
intallments_payments.shape

(13605401, 8)

In [117]:
intallments_payments.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [118]:
intallments_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_PREV              int64  
 1   SK_ID_CURR              int64  
 2   NUM_INSTALMENT_VERSION  float64
 3   NUM_INSTALMENT_NUMBER   int64  
 4   DAYS_INSTALMENT         float64
 5   DAYS_ENTRY_PAYMENT      float64
 6   AMT_INSTALMENT          float64
 7   AMT_PAYMENT             float64
dtypes: float64(5), int64(3)
memory usage: 830.4 MB


In [119]:
intallments_payments.isnull().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

### Let's fill the missing values with mean of the columns

In [120]:
intallments_payments = fill_missing_values(intallments_payments, mean = True)

In [121]:
intallments_payments.isnull().sum()

SK_ID_PREV                0
SK_ID_CURR                0
NUM_INSTALMENT_VERSION    0
NUM_INSTALMENT_NUMBER     0
DAYS_INSTALMENT           0
DAYS_ENTRY_PAYMENT        0
AMT_INSTALMENT            0
AMT_PAYMENT               0
dtype: int64

### Let's now apply numerical grouping

In [123]:
install_num = group_numeric_values(intallments_payments, 'install_pay')

### Let's take a look at the data frame

In [124]:
install_num.shape

(339587, 36)

In [126]:
install_num.head(5)

Unnamed: 0_level_0,install_pay_NUM_INSTALMENT_VERSION_count,install_pay_NUM_INSTALMENT_VERSION_min,install_pay_NUM_INSTALMENT_VERSION_max,install_pay_NUM_INSTALMENT_VERSION_mean,install_pay_NUM_INSTALMENT_VERSION_median,install_pay_NUM_INSTALMENT_VERSION_sum,install_pay_NUM_INSTALMENT_NUMBER_count,install_pay_NUM_INSTALMENT_NUMBER_min,install_pay_NUM_INSTALMENT_NUMBER_max,install_pay_NUM_INSTALMENT_NUMBER_mean,...,install_pay_AMT_INSTALMENT_max,install_pay_AMT_INSTALMENT_mean,install_pay_AMT_INSTALMENT_median,install_pay_AMT_INSTALMENT_sum,install_pay_AMT_PAYMENT_count,install_pay_AMT_PAYMENT_min,install_pay_AMT_PAYMENT_max,install_pay_AMT_PAYMENT_mean,install_pay_AMT_PAYMENT_median,install_pay_AMT_PAYMENT_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,1.0,2.0,1.142857,1.0,8.0,7,1,4,2.714286,...,17397.9,5885.132143,3980.925,41195.925,7,3951.0,17397.9,5885.132143,3980.925,41195.925
100002,19,1.0,2.0,1.052632,1.0,20.0,19,1,19,10.0,...,53093.745,11559.247105,9251.775,219625.695,19,9251.775,53093.745,11559.247105,9251.775,219625.695
100003,25,1.0,2.0,1.04,1.0,26.0,25,1,12,5.08,...,560835.36,64754.586,64275.615,1618864.65,25,6662.97,560835.36,64754.586,64275.615,1618864.65
100004,3,1.0,2.0,1.333333,1.0,4.0,3,1,3,2.0,...,10573.965,7096.155,5357.25,21288.465,3,5357.25,10573.965,7096.155,5357.25,21288.465
100005,9,1.0,2.0,1.111111,1.0,10.0,9,1,9,5.0,...,17656.245,6240.205,4813.2,56161.845,9,4813.2,17656.245,6240.205,4813.2,56161.845


### We can apply normalization and log transformation

In [128]:
install_num = normalization(install_num)

In [129]:
install_num = log_transform(install_num)

In [130]:
install_num.shape

(339587, 57)

In [131]:
install_num.head(5)

Unnamed: 0_level_0,install_pay_NUM_INSTALMENT_VERSION_count,install_pay_NUM_INSTALMENT_VERSION_min,install_pay_NUM_INSTALMENT_VERSION_max,install_pay_NUM_INSTALMENT_VERSION_mean,install_pay_NUM_INSTALMENT_VERSION_median,install_pay_NUM_INSTALMENT_VERSION_sum,install_pay_NUM_INSTALMENT_NUMBER_count,install_pay_NUM_INSTALMENT_NUMBER_min,install_pay_NUM_INSTALMENT_NUMBER_max,install_pay_NUM_INSTALMENT_NUMBER_mean,...,install_pay_AMT_INSTALMENT_min_log,install_pay_AMT_INSTALMENT_max_log,install_pay_AMT_INSTALMENT_mean_log,install_pay_AMT_INSTALMENT_median_log,install_pay_AMT_INSTALMENT_sum_log,install_pay_AMT_PAYMENT_min_log,install_pay_AMT_PAYMENT_max_log,install_pay_AMT_PAYMENT_mean_log,install_pay_AMT_PAYMENT_median_log,install_pay_AMT_PAYMENT_sum_log
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,1.0,2.0,1.142857,1.0,8.0,7,1,4,2.714286,...,8.281977,9.764162,8.680354,8.289521,10.626119,8.281977,9.764162,8.680354,8.289521,10.626119
100002,19,1.0,2.0,1.052632,1.0,20.0,19,1,19,10.0,...,9.132679,10.879833,9.355328,9.132679,12.299685,9.132679,10.879833,9.355328,9.132679,12.299685
100003,25,1.0,2.0,1.04,1.0,26.0,25,1,12,5.08,...,8.804471,13.237184,11.078375,11.070951,14.297236,8.804471,13.237184,11.078375,11.070951,14.297236
100004,3,1.0,2.0,1.333333,1.0,4.0,3,1,3,2.0,...,8.586393,9.266245,8.867449,8.586393,9.965968,8.586393,9.266245,8.867449,8.586393,9.965968
100005,9,1.0,2.0,1.111111,1.0,10.0,9,1,9,5.0,...,8.479325,9.778901,8.738929,8.479325,10.936011,8.479325,9.778901,8.738929,8.479325,10.936011


### We can now simply join the file into the main data set

In [132]:
main_data = main_data.merge(install_num, on = 'SK_ID_CURR', how = 'left')

### Get the values, which fall fall over the threshold

In [133]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

In [134]:
main_data.shape

(246008, 223)

# POS Cash Balance

In [135]:
POS_CASH_balance = pd.read_csv(f'{path}POS_CASH_balance.csv')

### Analyse the data

In [136]:
POS_CASH_balance.shape

(10001358, 8)

In [137]:
POS_CASH_balance.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [138]:
POS_CASH_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   SK_ID_CURR             int64  
 2   MONTHS_BALANCE         int64  
 3   CNT_INSTALMENT         float64
 4   CNT_INSTALMENT_FUTURE  float64
 5   NAME_CONTRACT_STATUS   object 
 6   SK_DPD                 int64  
 7   SK_DPD_DEF             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 610.4+ MB


In [139]:
POS_CASH_balance.isnull().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
MONTHS_BALANCE               0
CNT_INSTALMENT           26071
CNT_INSTALMENT_FUTURE    26087
NAME_CONTRACT_STATUS         0
SK_DPD                       0
SK_DPD_DEF                   0
dtype: int64

### Let's try to drop the collumns

In [140]:
POS_CASH_balance = drop_missing_columns(POS_CASH_balance)

There are 2 with greater than 70 missing values
Incomplete columns: 
['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']


In [141]:
POS_CASH_balance.isnull().sum()

SK_ID_PREV              0
SK_ID_CURR              0
MONTHS_BALANCE          0
NAME_CONTRACT_STATUS    0
SK_DPD                  0
SK_DPD_DEF              0
dtype: int64

### Apply the grouping to the data set

In [143]:
pcb_num = group_numeric_values(POS_CASH_balance, 'pos_cash_balance')

In [144]:
pcb_obj = group_object_values(POS_CASH_balance, 'pos_cash_balance')

### Take a look at the data

In [145]:
pcb_num.shape

(337252, 18)

In [146]:
pcb_num.head(5)

Unnamed: 0_level_0,pos_cash_balance_MONTHS_BALANCE_count,pos_cash_balance_MONTHS_BALANCE_min,pos_cash_balance_MONTHS_BALANCE_max,pos_cash_balance_MONTHS_BALANCE_mean,pos_cash_balance_MONTHS_BALANCE_median,pos_cash_balance_MONTHS_BALANCE_sum,pos_cash_balance_SK_DPD_count,pos_cash_balance_SK_DPD_min,pos_cash_balance_SK_DPD_max,pos_cash_balance_SK_DPD_mean,pos_cash_balance_SK_DPD_median,pos_cash_balance_SK_DPD_sum,pos_cash_balance_SK_DPD_DEF_count,pos_cash_balance_SK_DPD_DEF_min,pos_cash_balance_SK_DPD_DEF_max,pos_cash_balance_SK_DPD_DEF_mean,pos_cash_balance_SK_DPD_DEF_median,pos_cash_balance_SK_DPD_DEF_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
100001,9,-96,-53,-72.555556,-57.0,-653,9,0,7,0.777778,0.0,7,9,0,7,0.777778,0.0,7
100002,19,-19,-1,-10.0,-10.0,-190,19,0,0,0.0,0.0,0,19,0,0,0.0,0.0,0
100003,28,-77,-18,-43.785714,-26.5,-1226,28,0,0,0.0,0.0,0,28,0,0,0.0,0.0,0
100004,4,-27,-24,-25.5,-25.5,-102,4,0,0,0.0,0.0,0,4,0,0,0.0,0.0,0
100005,11,-25,-15,-20.0,-20.0,-220,11,0,0,0.0,0.0,0,11,0,0,0.0,0.0,0


In [147]:
pcb_obj.shape

(337252, 27)

In [148]:
pcb_obj.head(5)

Unnamed: 0_level_0,pos_cash_balance_NAME_CONTRACT_STATUS_Active_count,pos_cash_balance_NAME_CONTRACT_STATUS_Active_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Active_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Amortized debt_count,pos_cash_balance_NAME_CONTRACT_STATUS_Amortized debt_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Amortized debt_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Approved_count,pos_cash_balance_NAME_CONTRACT_STATUS_Approved_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Approved_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Canceled_count,...,pos_cash_balance_NAME_CONTRACT_STATUS_Demand_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Returned to the store_count,pos_cash_balance_NAME_CONTRACT_STATUS_Returned to the store_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Returned to the store_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Signed_count,pos_cash_balance_NAME_CONTRACT_STATUS_Signed_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Signed_sum,pos_cash_balance_NAME_CONTRACT_STATUS_XNA_count,pos_cash_balance_NAME_CONTRACT_STATUS_XNA_mean,pos_cash_balance_NAME_CONTRACT_STATUS_XNA_sum
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,9,0.777778,7.0,9,0.0,0,9,0.0,0,9,...,0,9,0.0,0,9,0.0,0,9,0.0,0
100002,19,1.0,19.0,19,0.0,0,19,0.0,0,19,...,0,19,0.0,0,19,0.0,0,19,0.0,0
100003,28,0.928571,26.0,28,0.0,0,28,0.0,0,28,...,0,28,0.0,0,28,0.0,0,28,0.0,0
100004,4,0.75,3.0,4,0.0,0,4,0.0,0,4,...,0,4,0.0,0,4,0.0,0,4,0.0,0
100005,11,0.818182,9.0,11,0.0,0,11,0.0,0,11,...,0,11,0.0,0,11,0.090909,1,11,0.0,0


### Normalisation could be applied

In [149]:
pcb_obj = normalization(pcb_obj)

In [150]:
pcb_num = normalization(pcb_num)

### Merge the data

In [151]:
main_data = main_data.merge(pcb_obj, on = 'SK_ID_CURR', how = 'left')

In [152]:
main_data = main_data.merge(pcb_num, on = 'SK_ID_CURR', how = 'left')

### Let's look at the correlations

In [153]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [154]:
new_cors.head(25)

Unnamed: 0,TARGET
TARGET,1.0
card_balance_CNT_DRAWINGS_ATM_CURRENT_mean,0.119016
card_balance_CNT_DRAWINGS_CURRENT_max,0.102471
card_balance_CNT_DRAWINGS_ATM_CURRENT_max,0.08658
card_balance_CNT_DRAWINGS_CURRENT_mean,0.086063
BureauMin_bureau_DAYS_CREDIT_mean_mean,0.082914
card_balance_CNT_DRAWINGS_POS_CURRENT_max,0.082129
card_balance_AMT_BALANCE_mean_norm,0.080376
card_balance_AMT_BALANCE_mean,0.080376
card_balance_AMT_TOTAL_RECEIVABLE_mean_norm,0.079645


In [155]:
new_cors.tail(25)

Unnamed: 0,TARGET
card_balance_AMT_INST_MIN_REGULARITY_count_norm,-0.058815
card_balance_AMT_DRAWINGS_POS_CURRENT_count_norm,-0.058815
card_balance_CNT_DRAWINGS_ATM_CURRENT_count_norm,-0.058815
card_balance_AMT_BALANCE_count_norm,-0.058815
card_balance_AMT_DRAWINGS_OTHER_CURRENT_count_norm,-0.058815
card_balance_CNT_DRAWINGS_OTHER_CURRENT_count_norm,-0.058815
card_balance_CNT_DRAWINGS_CURRENT_count_norm,-0.058815
card_balance_CNT_INSTALMENT_MATURE_CUM_count_norm,-0.058815
card_balance_SK_DPD_count_norm,-0.058815
card_balance_SK_DPD_DEF_count_norm,-0.058815


In [156]:
main_data =  remove_target_correlated_cols(main_data, 
                                          special_id = 'SK_ID_CURR', 
                                          threshold = 0.05)

In [157]:
main_data.shape

(246008, 160)

In [None]:
main_data.to_csv('/featureData.csv')