# Feature Engineering

### We start off with necessary imports of python libraries

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

### We can now read all the necessary data files, which are to be analysed

### We assume, that some elementary actions of data cleaning have already been applied to the train data

In [2]:
path = '../../Data/'

main_data = pd.read_csv('./initial_data.csv')
bureau = pd.read_csv(f'{path}bureau.csv')
bureau_balance = pd.read_csv(f'{path}bureau_balance.csv')
credit_card_balance = pd.read_csv(f'{path}credit_card_balance.csv')
intallments_payments = pd.read_csv(f'{path}installments_payments.csv')
POS_CASH_balance = pd.read_csv(f'{path}POS_CASH_balance.csv')
previous_application = pd.read_csv(f'{path}previous_application.csv')

In [3]:
from Functions.DataPreperation import *
from Functions.FeatureEngineering import *

### We can now take a quick look at the main data, to see its parameters such as shape or top 5 values, so that we will be able to compare the data after applying feature engineering techniques to the original data frame

In [4]:
main_data.shape

(307511, 244)

In [5]:
main_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 244 entries, Unnamed: 0 to EMERGENCYSTATE_MODE_Yes
dtypes: float64(66), int64(178)
memory usage: 572.5 MB


In [6]:
main_data.head(5)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,0,100002,1,0,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0,0,1,0,1,0
1,1,100003,0,0,0,0,0,270000.0,1293502.5,35698.5,...,0,1,0,0,0,0,0,0,1,0
2,2,100004,0,1,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0,0,0,0,0,0
3,3,100006,0,0,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,0,0,0,0,0,0
4,4,100007,0,0,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0,0,0,0,0,0


### To try to find even more interesting features, which could help in analysis, we can create following columns

In [7]:
copy_of_main = main_data.copy()

main_data['DAYS_EMPLOYED_PERC'] = copy_of_main['DAYS_EMPLOYED'] / copy_of_main['DAYS_BIRTH']
main_data['INCOME_CREDIT_PERC'] = copy_of_main['AMT_INCOME_TOTAL'] / copy_of_main['AMT_CREDIT']
main_data['INCOME_PER_PERSON'] = copy_of_main['AMT_INCOME_TOTAL'] / copy_of_main['CNT_FAM_MEMBERS']
main_data['ANNUITY_INCOME_PERC'] = copy_of_main['AMT_ANNUITY'] / copy_of_main['AMT_INCOME_TOTAL']
main_data['PAYMENT_RATE'] = copy_of_main['AMT_ANNUITY'] /copy_of_main['AMT_CREDIT']

### Since the "Unnamed: 0" column is not needed, we will drop it

In [8]:
main_data = main_data.drop(columns = ['Unnamed: 0'])

### Let's see, whether the changes applied

In [9]:
main_data.head(1)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,DAYS_EMPLOYED_PERC,INCOME_CREDIT_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,PAYMENT_RATE
0,100002,1,0,0,1,0,202500.0,406597.5,24700.5,351000.0,...,0,1,0,1,0,0.067329,0.498036,202500.0,0.121978,0.060749


### As we can see, the unneeded column was succesfully removed
### However, it's not the only column, that we should drop
### We want our models to be not only accurate, but also fast, and applying feature engineering to 243 columns will take a lot of time
### Thus, we need to remove some of the columns

### To choose, which columns need to be removed, we can take a look at the correlations table
### The columns, that will be of high positive or negative correlation in regards to the 'TARGET' table, can be kept, while those of low correlation should probably be dropped, since they may have almost none impact on the final predictions made by our models

In [10]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

pd.DataFrame(correlations['TARGET'])

Unnamed: 0,TARGET
TARGET,1.000000
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899
NAME_INCOME_TYPE_Working,0.057481
DAYS_LAST_PHONE_CHANGE,0.055218
...,...
DAYS_EMPLOYED,-0.063368
DAYS_BIRTH,-0.078239
EXT_SOURCE_1,-0.098887
EXT_SOURCE_3,-0.155892


### The correlations are not very high
### We should probably keep all the tables with values, which have an absolute value of correlation equal to around 0.04, and we can treat that values as a certain correlation threshold
### To find such function, we will use auxilliary function remove_target_correlated_cols() from .py files
### It returns the data frame only left with columns of correlation's value bigger than the given threshold

In [11]:
main_data = remove_target_correlated_cols(main_data, special_id = 'SK_ID_CURR', threshold = 0.04)

### The data after applying changes presents as follows

In [12]:
main_data.shape

(307511, 27)

In [13]:
main_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 27 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   SK_ID_CURR                                         307511 non-null  int64  
 1   TARGET                                             307511 non-null  int64  
 2   DAYS_BIRTH                                         307511 non-null  int64  
 3   DAYS_EMPLOYED                                      307511 non-null  float64
 4   DAYS_REGISTRATION                                  307511 non-null  float64
 5   DAYS_ID_PUBLISH                                    307511 non-null  int64  
 6   FLAG_EMP_PHONE                                     307511 non-null  int64  
 7   REGION_RATING_CLIENT                               307511 non-null  int64  
 8   REGION_RATING_CLIENT_W_CITY                        307511 non-null  int64 

In [14]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,CODE_GENDER_M,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Secondary / secondary special,OCCUPATION_TYPE_Laborers,ORGANIZATION_TYPE_XNA,HOUSETYPE_MODE_block of flats,EMERGENCYSTATE_MODE_No,DAYS_EMPLOYED_PERC
0,100002,1,9461,637.0,3648.0,2120,1,2,2,0,...,1,0,1,0,1,1,0,1,1,0.067329
1,100003,0,16765,1188.0,1186.0,291,1,1,1,0,...,0,0,0,1,0,0,0,1,1,0.070862
2,100004,0,19046,225.0,4260.0,2531,1,2,2,0,...,1,0,1,0,1,1,0,0,0,0.011814
3,100006,0,19005,3039.0,9833.0,2437,1,2,2,0,...,0,0,1,0,1,1,0,0,0,0.159905
4,100007,0,19932,3038.0,4311.0,3458,1,2,2,0,...,1,0,1,0,1,0,0,0,0,0.152418


### Now, we can apply simple numerical feature engineering techniques
### We can start with adding normalization to the columns
### We will use auxilliary normalization() function from .py files
### It adds another feature in a form of a column with normalized values
### Since we want to apply it to bigger numbers, we can apply it to columns with mean values equal to, for example, as least 100

In [15]:
main_data = normalization(main_data, min_mean_value = 100, groupby_id = 'SK_ID_CURR')

### The data presents now in a following way

In [16]:
main_data.shape

(307511, 31)

In [17]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,NAME_EDUCATION_TYPE_Secondary / secondary special,OCCUPATION_TYPE_Laborers,ORGANIZATION_TYPE_XNA,HOUSETYPE_MODE_block of flats,EMERGENCYSTATE_MODE_No,DAYS_EMPLOYED_PERC,DAYS_BIRTH_norm,DAYS_EMPLOYED_norm,DAYS_REGISTRATION_norm,DAYS_ID_PUBLISH_norm
0,100002,1,9461,637.0,3648.0,2120,1,2,2,0,...,1,1,0,1,1,0.067329,0.111161,0.035563,0.14786,0.294567
1,100003,0,16765,1188.0,1186.0,291,1,1,1,0,...,0,0,0,1,1,0.070862,0.522886,0.066324,0.048071,0.040434
2,100004,0,19046,225.0,4260.0,2531,1,2,2,0,...,1,1,0,0,0,0.011814,0.651466,0.012561,0.172665,0.351674
3,100006,0,19005,3039.0,9833.0,2437,1,2,2,0,...,1,1,0,0,0,0.159905,0.649154,0.169663,0.398549,0.338613
4,100007,0,19932,3038.0,4311.0,3458,1,2,2,0,...,1,0,0,0,0,0.152418,0.701409,0.169607,0.174732,0.480478


### That way, we added 5 new features

### We can also try to apply logarithm transformation to big numerical values
### We will, one again, use auxilliary function log_transform() from .py files

In [18]:
main_data = log_transform(main_data, min_mean_value = 1000, groupby_id = 'SK_ID_CURR')

### Data after transformations looks in the following way

In [19]:
main_data.shape

(307511, 35)

In [20]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,EMERGENCYSTATE_MODE_No,DAYS_EMPLOYED_PERC,DAYS_BIRTH_norm,DAYS_EMPLOYED_norm,DAYS_REGISTRATION_norm,DAYS_ID_PUBLISH_norm,DAYS_BIRTH_log,DAYS_EMPLOYED_log,DAYS_REGISTRATION_log,DAYS_ID_PUBLISH_log
0,100002,1,9461,637.0,3648.0,2120,1,2,2,0,...,1,0.067329,0.111161,0.035563,0.14786,0.294567,9.155039,6.458338,8.202208,7.659643
1,100003,0,16765,1188.0,1186.0,291,1,1,1,0,...,1,0.070862,0.522886,0.066324,0.048071,0.040434,9.727108,7.080868,7.079184,5.676754
2,100004,0,19046,225.0,4260.0,2531,1,2,2,0,...,0,0.011814,0.651466,0.012561,0.172665,0.351674,9.854665,5.420535,8.357259,7.836765
3,100006,0,19005,3039.0,9833.0,2437,1,2,2,0,...,0,0.159905,0.649154,0.169663,0.398549,0.338613,9.85251,8.019613,9.193601,7.798933
4,100007,0,19932,3038.0,4311.0,3458,1,2,2,0,...,0,0.152418,0.701409,0.169607,0.174732,0.480478,9.900132,8.019284,8.369157,8.148735


### From that point, we can start analyzing additional files, which can be helpful in creating powerful and fast models

### But before that, let's write the current set into .csv file, as we want to compare results, when applying feature engineering to only single file, and then when adding other files as well

In [21]:
main_data.to_csv('./featureData1.csv')

# Bureau files

### Let's start with simple bureau files analysis

In [22]:
bureau.shape

(1716428, 17)

In [23]:
bureau.head(5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


### We have a lot of NaN values, which need to be changed to either 0, or other values, such as mean of the column
### The columns can also be dropped, which we can do in that case, with help of auxilliary function
### It takes the data frame, threshold for number of missing values, and information about possibility of printing the information to the screen, as parameters

In [24]:
new_bureau = drop_missing_columns(bureau, threshold = 70, print_info = True)

There are 7 with greater than 70 missing values
Incomplete columns: 
['DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_ANNUITY']


### Let's take a look the the data now

In [25]:
new_bureau.head(5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE
0,215354,5714462,Closed,currency 1,-497,0,0,0.0,Consumer credit,-131
1,215354,5714463,Active,currency 1,-208,0,0,0.0,Credit card,-20
2,215354,5714464,Active,currency 1,-203,0,0,0.0,Consumer credit,-16
3,215354,5714465,Active,currency 1,-203,0,0,0.0,Credit card,-16
4,215354,5714466,Active,currency 1,-629,0,0,0.0,Consumer credit,-21


In [26]:
new_bureau.isnull().sum()

SK_ID_CURR                0
SK_ID_BUREAU              0
CREDIT_ACTIVE             0
CREDIT_CURRENCY           0
DAYS_CREDIT               0
CREDIT_DAY_OVERDUE        0
CNT_CREDIT_PROLONG        0
AMT_CREDIT_SUM_OVERDUE    0
CREDIT_TYPE               0
DAYS_CREDIT_UPDATE        0
dtype: int64

### There are no NaN values in our frame now

### Let's see, how the frame presents now

In [27]:
new_bureau.shape

(1716428, 10)

### Let's apply the very same procedures, such as general analysis and data cleaning, to bureau_balance

In [28]:
bureau_balance.shape

(27299925, 3)

In [29]:
bureau_balance.head(5)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [30]:
bureau_balance.isnull().sum()

SK_ID_BUREAU      0
MONTHS_BALANCE    0
STATUS            0
dtype: int64

### No data cleaning has to be applied to the file, since there are no values missing

### Both files can have object as well as numerical grouping applied to them
### That way, we can get additional features
### Bureau data frame should be grouped by SK_ID_CURR, while bureau_balance data frame should by grouped by SK_ID_BUREA
### In both cases, we will make use of auxilliary functions from .py file:
* group_numeric_values(data_frame,
                         data_frame_name,
                         groupby_id = 'SK_ID_CURR', 
                         grouping_statistics = ['count', 'mean', 'median'])
* group_object_values(data_frame,
                        data_frame_name,
                        groupby_id = 'SK_ID_CURR',
                        grouping_statistics = ['count'])
                        
### They apply the grouping feature engineering technique, in which we get the data frame with additional variables in return

### We will apply those techniques to these two bureau files and see, what kind of data we get

* Numerical grouping for bureau

In [31]:
bureau_group_num = group_numeric_values(new_bureau,
                         'bureau',
                         groupby_id = 'SK_ID_CURR')

In [32]:
bureau_group_num.head(5)

Unnamed: 0_level_0,bureau_DAYS_CREDIT_count,bureau_DAYS_CREDIT_mean,bureau_DAYS_CREDIT_median,bureau_CREDIT_DAY_OVERDUE_count,bureau_CREDIT_DAY_OVERDUE_mean,bureau_CREDIT_DAY_OVERDUE_median,bureau_CNT_CREDIT_PROLONG_count,bureau_CNT_CREDIT_PROLONG_mean,bureau_CNT_CREDIT_PROLONG_median,bureau_AMT_CREDIT_SUM_OVERDUE_count,bureau_AMT_CREDIT_SUM_OVERDUE_mean,bureau_AMT_CREDIT_SUM_OVERDUE_median,bureau_DAYS_CREDIT_UPDATE_count,bureau_DAYS_CREDIT_UPDATE_mean,bureau_DAYS_CREDIT_UPDATE_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
100001,7,-735.0,-857.0,7,0.0,0.0,7,0.0,0.0,7,0.0,0.0,7,-93.142857,-155.0
100002,8,-874.0,-1042.5,8,0.0,0.0,8,0.0,0.0,8,0.0,0.0,8,-499.875,-402.5
100003,4,-1400.75,-1205.5,4,0.0,0.0,4,0.0,0.0,4,0.0,0.0,4,-816.0,-545.0
100004,2,-867.0,-867.0,2,0.0,0.0,2,0.0,0.0,2,0.0,0.0,2,-532.0,-532.0
100005,3,-190.666667,-137.0,3,0.0,0.0,3,0.0,0.0,3,0.0,0.0,3,-54.333333,-31.0


* Numerical grouping for bureau_balance

In [33]:
bureau_balance_group_num = group_numeric_values(bureau_balance,
                         'bureau_balance',
                         groupby_id = 'SK_ID_BUREAU')

In [34]:
bureau_balance_group_num.head(5)

Unnamed: 0_level_0,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_median
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5001709,97,-48.0,-48.0
5001710,83,-41.0,-41.0
5001711,4,-1.5,-1.5
5001712,19,-9.0,-9.0
5001713,22,-10.5,-10.5


### We can drop median value, since its values is the same as the ones from the mean column, and we don't necessarily want to duplicate column just for sake of creating another features

In [35]:
bureau_balance_group_num = bureau_balance_group_num.drop(columns = ['bureau_balance_MONTHS_BALANCE_median'])

In [36]:
bureau_balance_group_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817395 entries, 5001709 to 6842888
Data columns (total 2 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   bureau_balance_MONTHS_BALANCE_count  817395 non-null  int64  
 1   bureau_balance_MONTHS_BALANCE_mean   817395 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 18.7 MB


* Object grouping for bureau

In [37]:
bureau_group_object = group_object_values(new_bureau, 'bureau', groupby_id = 'SK_ID_CURR', 
                         grouping_statistics = ['sum', 'mean'])

In [38]:
bureau_group_object.head(5)

Unnamed: 0_level_0,bureau_CREDIT_ACTIVE_Active_sum,bureau_CREDIT_ACTIVE_Active_mean,bureau_CREDIT_ACTIVE_Bad debt_sum,bureau_CREDIT_ACTIVE_Bad debt_mean,bureau_CREDIT_ACTIVE_Closed_sum,bureau_CREDIT_ACTIVE_Closed_mean,bureau_CREDIT_ACTIVE_Sold_sum,bureau_CREDIT_ACTIVE_Sold_mean,bureau_CREDIT_CURRENCY_currency 1_sum,bureau_CREDIT_CURRENCY_currency 1_mean,...,bureau_CREDIT_TYPE_Microloan_sum,bureau_CREDIT_TYPE_Microloan_mean,bureau_CREDIT_TYPE_Mobile operator loan_sum,bureau_CREDIT_TYPE_Mobile operator loan_mean,bureau_CREDIT_TYPE_Mortgage_sum,bureau_CREDIT_TYPE_Mortgage_mean,bureau_CREDIT_TYPE_Real estate loan_sum,bureau_CREDIT_TYPE_Real estate loan_mean,bureau_CREDIT_TYPE_Unknown type of loan_sum,bureau_CREDIT_TYPE_Unknown type of loan_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,3,0.428571,0,0.0,4,0.571429,0,0.0,7,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100002,2,0.25,0,0.0,6,0.75,0,0.0,8,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100003,1,0.25,0,0.0,3,0.75,0,0.0,4,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100004,0,0.0,0,0.0,2,1.0,0,0.0,2,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100005,2,0.666667,0,0.0,1,0.333333,0,0.0,3,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


* Object grouping for bureau_balance

In [39]:
bureau_balance_object_group = group_object_values(bureau_balance, 'bureau', groupby_id = 'SK_ID_BUREAU', grouping_statistics = ['sum', 'mean'])

In [40]:
bureau_balance_object_group.head(5)

Unnamed: 0_level_0,bureau_STATUS_0_sum,bureau_STATUS_0_mean,bureau_STATUS_1_sum,bureau_STATUS_1_mean,bureau_STATUS_2_sum,bureau_STATUS_2_mean,bureau_STATUS_3_sum,bureau_STATUS_3_mean,bureau_STATUS_4_sum,bureau_STATUS_4_mean,bureau_STATUS_5_sum,bureau_STATUS_5_mean,bureau_STATUS_C_sum,bureau_STATUS_C_mean,bureau_STATUS_X_sum,bureau_STATUS_X_mean
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5001709,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402
5001710,5,0.060241,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446
5001711,3,0.75,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25
5001712,10,0.526316,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.0
5001713,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,22,1.0


### Now, we have the 4 grouped data frames, on which we can work

### We can start of by grouping bureau and bureau_balance frames

* Making one, big data frame of two grouped bureau frames

In [41]:
bureau_total = bureau_group_num.merge(bureau_group_object, on = 'SK_ID_CURR', how = 'left')

In [42]:
bureau_total.shape

(305811, 61)

In [43]:
bureau_total.head(5)

Unnamed: 0_level_0,bureau_DAYS_CREDIT_count,bureau_DAYS_CREDIT_mean,bureau_DAYS_CREDIT_median,bureau_CREDIT_DAY_OVERDUE_count,bureau_CREDIT_DAY_OVERDUE_mean,bureau_CREDIT_DAY_OVERDUE_median,bureau_CNT_CREDIT_PROLONG_count,bureau_CNT_CREDIT_PROLONG_mean,bureau_CNT_CREDIT_PROLONG_median,bureau_AMT_CREDIT_SUM_OVERDUE_count,...,bureau_CREDIT_TYPE_Microloan_sum,bureau_CREDIT_TYPE_Microloan_mean,bureau_CREDIT_TYPE_Mobile operator loan_sum,bureau_CREDIT_TYPE_Mobile operator loan_mean,bureau_CREDIT_TYPE_Mortgage_sum,bureau_CREDIT_TYPE_Mortgage_mean,bureau_CREDIT_TYPE_Real estate loan_sum,bureau_CREDIT_TYPE_Real estate loan_mean,bureau_CREDIT_TYPE_Unknown type of loan_sum,bureau_CREDIT_TYPE_Unknown type of loan_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,-735.0,-857.0,7,0.0,0.0,7,0.0,0.0,7,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100002,8,-874.0,-1042.5,8,0.0,0.0,8,0.0,0.0,8,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100003,4,-1400.75,-1205.5,4,0.0,0.0,4,0.0,0.0,4,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100004,2,-867.0,-867.0,2,0.0,0.0,2,0.0,0.0,2,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100005,3,-190.666667,-137.0,3,0.0,0.0,3,0.0,0.0,3,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


* Making one, big data frame of two grouped bureau_balance frames

In [44]:
bureau_balance_total = bureau_balance_group_num.merge(bureau_balance_object_group, on = 'SK_ID_BUREAU', how = 'left')

In [45]:
bureau_balance_total.shape

(817395, 18)

In [46]:
bureau_balance_total.head(5)

Unnamed: 0_level_0,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean,bureau_STATUS_0_sum,bureau_STATUS_0_mean,bureau_STATUS_1_sum,bureau_STATUS_1_mean,bureau_STATUS_2_sum,bureau_STATUS_2_mean,bureau_STATUS_3_sum,bureau_STATUS_3_mean,bureau_STATUS_4_sum,bureau_STATUS_4_mean,bureau_STATUS_5_sum,bureau_STATUS_5_mean,bureau_STATUS_C_sum,bureau_STATUS_C_mean,bureau_STATUS_X_sum,bureau_STATUS_X_mean
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5001709,97,-48.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402
5001710,83,-41.0,5,0.060241,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446
5001711,4,-1.5,3,0.75,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25
5001712,19,-9.0,10,0.526316,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.0
5001713,22,-10.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,22,1.0


### Since the bureau_total frame is already in state of being able to be merged into main data frame, we can apply some kind of additional feature engineering techniques

### By taking a quick glance, it looks like the values are not very big, therefore normalisation may be enough for that data frame

In [47]:
bureau_balance_total = normalization(bureau_balance_total, min_mean_value = 10)

### We can now just straight up join the merged bureau data frame into the main data frame
### Bureau_balance files will still need some modifications

In [48]:
main_data = main_data.merge(bureau_total, on = 'SK_ID_CURR', how = 'left')

In [49]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,bureau_CREDIT_TYPE_Microloan_sum,bureau_CREDIT_TYPE_Microloan_mean,bureau_CREDIT_TYPE_Mobile operator loan_sum,bureau_CREDIT_TYPE_Mobile operator loan_mean,bureau_CREDIT_TYPE_Mortgage_sum,bureau_CREDIT_TYPE_Mortgage_mean,bureau_CREDIT_TYPE_Real estate loan_sum,bureau_CREDIT_TYPE_Real estate loan_mean,bureau_CREDIT_TYPE_Unknown type of loan_sum,bureau_CREDIT_TYPE_Unknown type of loan_mean
0,100002,1,9461,637.0,3648.0,2120,1,2,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,0,16765,1188.0,1186.0,291,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,19046,225.0,4260.0,2531,1,2,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,19005,3039.0,9833.0,2437,1,2,2,0,...,,,,,,,,,,
4,100007,0,19932,3038.0,4311.0,3458,1,2,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Just from looking at the data, we can see that there are NaN values, which we are going to have to deal with in the future

### Now, we need to take care of bureau_balance files

### Let's take a quick look at the original bureau file

In [50]:
new_bureau.head(5)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE
0,215354,5714462,Closed,currency 1,-497,0,0,0.0,Consumer credit,-131
1,215354,5714463,Active,currency 1,-208,0,0,0.0,Credit card,-20
2,215354,5714464,Active,currency 1,-203,0,0,0.0,Consumer credit,-16
3,215354,5714465,Active,currency 1,-203,0,0,0.0,Credit card,-16
4,215354,5714466,Active,currency 1,-629,0,0,0.0,Consumer credit,-21


### Unique identifier in that case is SK_ID_BUREAU, we can make use of the two ID columns, and merge them into bureau_balance_total file, and after that another grouping can be done

In [51]:
bureau_balance_total = bureau_balance_total.merge(new_bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], 
                                                  right_index = False, 
                                                  on = 'SK_ID_BUREAU', 
                                                  how = 'outer')

In [52]:
bureau_balance_total.head(5)

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean,bureau_STATUS_0_sum,bureau_STATUS_0_mean,bureau_STATUS_1_sum,bureau_STATUS_1_mean,bureau_STATUS_2_sum,bureau_STATUS_2_mean,bureau_STATUS_3_sum,...,bureau_STATUS_4_mean,bureau_STATUS_5_sum,bureau_STATUS_5_mean,bureau_STATUS_C_sum,bureau_STATUS_C_mean,bureau_STATUS_X_sum,bureau_STATUS_X_mean,bureau_balance_MONTHS_BALANCE_count_norm,bureau_STATUS_C_sum_norm,SK_ID_CURR
0,5001709,97.0,-48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,86.0,0.886598,11.0,0.113402,1.0,0.886598,
1,5001710,83.0,-41.0,5.0,0.060241,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,48.0,0.578313,30.0,0.361446,0.854167,0.494845,162368.0
2,5001711,4.0,-1.5,3.0,0.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.03125,0.0,162368.0
3,5001712,19.0,-9.0,10.0,0.526316,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,0.473684,0.0,0.0,0.1875,0.092784,162368.0
4,5001713,22.0,-10.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,22.0,1.0,0.21875,0.0,150635.0


### Seems like SK_ID_CURR in some rows can be missing, thus we will just pull out all the rows values, of which SK_ID_CURR filed is not NaN

In [53]:
bureau_balance_total.shape

(1759469, 22)

In [54]:
bureau_balance_total = bureau_balance_total[bureau_balance_total['SK_ID_CURR'] > 1000]

In [55]:
bureau_balance_total.shape

(1716428, 22)

### That file can now be numerically aggregated by SK_ID_CURR

In [56]:
bureau_balance_total = group_numeric_values(bureau_balance_total, 'bureau_balance')

### We can now take a look at the data frame

In [57]:
bureau_balance_total.head()

Unnamed: 0_level_0,bureau_balance_bureau_balance_MONTHS_BALANCE_count_count,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median,bureau_balance_bureau_balance_MONTHS_BALANCE_mean_count,bureau_balance_bureau_balance_MONTHS_BALANCE_mean_mean,bureau_balance_bureau_balance_MONTHS_BALANCE_mean_median,bureau_balance_bureau_STATUS_0_sum_count,bureau_balance_bureau_STATUS_0_sum_mean,bureau_balance_bureau_STATUS_0_sum_median,bureau_balance_bureau_STATUS_0_mean_count,...,bureau_balance_bureau_STATUS_X_sum_median,bureau_balance_bureau_STATUS_X_mean_count,bureau_balance_bureau_STATUS_X_mean_mean,bureau_balance_bureau_STATUS_X_mean_median,bureau_balance_bureau_balance_MONTHS_BALANCE_count_norm_count,bureau_balance_bureau_balance_MONTHS_BALANCE_count_norm_mean,bureau_balance_bureau_balance_MONTHS_BALANCE_count_norm_median,bureau_balance_bureau_STATUS_C_sum_norm_count,bureau_balance_bureau_STATUS_C_sum_norm_mean,bureau_balance_bureau_STATUS_C_sum_norm_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001.0,7,24.571429,29.0,7,-11.785714,-14.0,7,4.428571,2.0,7,...,6.0,7,0.21459,0.241379,7,0.245536,0.291667,7,0.162003,0.185567
100002.0,8,13.75,16.0,8,-21.875,-26.0,8,5.625,5.0,8,...,2.5,8,0.161932,0.1875,8,0.132812,0.15625,8,0.029639,0.020619
100003.0,0,,,0,,,0,,,0,...,,0,,,0,,,0,,
100004.0,0,,,0,,,0,,,0,...,,0,,,0,,,0,,
100005.0,3,7.0,5.0,3,-3.0,-2.0,3,4.666667,5.0,3,...,1.0,3,0.136752,0.076923,3,0.0625,0.041667,3,0.017182,0.0


### There are some NaN values, which can be treated by filling them with mean of the given column
### We will do that, and then we may apply normalisation and log transformation

In [58]:
bureau_balance_total = fill_missing_values(bureau_balance_total, mean = True)

In [59]:
bureau_balance_total = normalization(bureau_balance_total, min_mean_value = 10)

In [60]:
bureau_balance_total = log_transform(bureau_balance_total, min_mean_value = 10)

In [61]:
bureau_balance_total.shape

(305811, 68)

In [62]:
bureau_balance_total.head()

Unnamed: 0_level_0,bureau_balance_bureau_balance_MONTHS_BALANCE_count_count,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median,bureau_balance_bureau_balance_MONTHS_BALANCE_mean_count,bureau_balance_bureau_balance_MONTHS_BALANCE_mean_mean,bureau_balance_bureau_balance_MONTHS_BALANCE_mean_median,bureau_balance_bureau_STATUS_0_sum_count,bureau_balance_bureau_STATUS_0_sum_mean,bureau_balance_bureau_STATUS_0_sum_median,bureau_balance_bureau_STATUS_0_mean_count,...,bureau_balance_bureau_STATUS_C_sum_norm_mean,bureau_balance_bureau_STATUS_C_sum_norm_median,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_norm,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_norm,bureau_balance_bureau_STATUS_C_sum_mean_norm,bureau_balance_bureau_STATUS_C_sum_median_norm,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_log,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_log,bureau_balance_bureau_STATUS_C_sum_mean_log,bureau_balance_bureau_STATUS_C_sum_median_log
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001.0,7,24.571429,29.0,7,-11.785714,-14.0,7,4.428571,2.0,7,...,0.162003,0.185567,0.245536,0.291667,0.16369,0.1875,3.241476,3.401197,2.816264,2.944439
100002.0,8,13.75,16.0,8,-21.875,-26.0,8,5.625,5.0,8,...,0.029639,0.020619,0.132812,0.15625,0.029948,0.020833,2.691243,2.833213,1.354546,1.098612
100003.0,0,30.253224,29.021447,0,-19.946528,-18.466574,0,9.351876,8.372508,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807
100004.0,0,30.253224,29.021447,0,-19.946528,-18.466574,0,9.351876,8.372508,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807
100005.0,3,7.0,5.0,3,-3.0,-2.0,3,4.666667,5.0,3,...,0.017182,0.0,0.0625,0.041667,0.017361,0.0,2.079442,1.791759,0.980829,0.0


### Now, we can merge the file into the main data frame

In [63]:
main_data = main_data.merge(bureau_balance_total, 
                                  on = 'SK_ID_CURR', 
                                  how = 'left')

In [64]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,bureau_balance_bureau_STATUS_C_sum_norm_mean,bureau_balance_bureau_STATUS_C_sum_norm_median,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_norm,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_norm,bureau_balance_bureau_STATUS_C_sum_mean_norm,bureau_balance_bureau_STATUS_C_sum_median_norm,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_log,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_log,bureau_balance_bureau_STATUS_C_sum_mean_log,bureau_balance_bureau_STATUS_C_sum_median_log
0,100002,1,9461,637.0,3648.0,2120,1,2,2,0,...,0.029639,0.020619,0.132812,0.15625,0.029948,0.020833,2.691243,2.833213,1.354546,1.098612
1,100003,0,16765,1188.0,1186.0,291,1,1,1,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807
2,100004,0,19046,225.0,4260.0,2531,1,2,2,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807
3,100006,0,19005,3039.0,9833.0,2437,1,2,2,0,...,,,,,,,,,,
4,100007,0,19932,3038.0,4311.0,3458,1,2,2,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807


### Now, we have to deal with the missing values

In [65]:
main_data.isnull().sum()

SK_ID_CURR                                                           0
TARGET                                                               0
DAYS_BIRTH                                                           0
DAYS_EMPLOYED                                                        0
DAYS_REGISTRATION                                                    0
                                                                 ...  
bureau_balance_bureau_STATUS_C_sum_median_norm                   44020
bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_log      44020
bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_log    44020
bureau_balance_bureau_STATUS_C_sum_mean_log                      44020
bureau_balance_bureau_STATUS_C_sum_median_log                    44020
Length: 164, dtype: int64

### The missing values are a result of joining smaller set into a bigger one

### Nevertheless, for simplicity sake, we may fill the missing values with zeros, since other values already seem to be small

In [66]:
main_data = fill_missing_values(main_data, mean = False)

In [67]:
main_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,...,bureau_balance_bureau_STATUS_C_sum_norm_mean,bureau_balance_bureau_STATUS_C_sum_norm_median,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_norm,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_norm,bureau_balance_bureau_STATUS_C_sum_mean_norm,bureau_balance_bureau_STATUS_C_sum_median_norm,bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_log,bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_log,bureau_balance_bureau_STATUS_C_sum_mean_log,bureau_balance_bureau_STATUS_C_sum_median_log
0,100002,1,9461,637.0,3648.0,2120,1,2,2,0,...,0.029639,0.020619,0.132812,0.15625,0.029948,0.020833,2.691243,2.833213,1.354546,1.098612
1,100003,0,16765,1188.0,1186.0,291,1,1,1,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807
2,100004,0,19046,225.0,4260.0,2531,1,2,2,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807
3,100006,0,19005,3039.0,9833.0,2437,1,2,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0,19932,3038.0,4311.0,3458,1,2,2,0,...,0.144819,0.118052,0.304721,0.29189,0.146327,0.119282,3.442123,3.401912,2.711208,2.521807


### Now, we need to look at the correlations in regards to the 'TARGET' column, since we still don't want to have to many features

In [68]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [69]:
new_cors.head(10)

Unnamed: 0,TARGET
TARGET,1.0
bureau_DAYS_CREDIT_mean,0.08396
bureau_DAYS_CREDIT_median,0.081943
bureau_DAYS_CREDIT_UPDATE_mean,0.069687
bureau_DAYS_CREDIT_UPDATE_median,0.068019
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899
NAME_INCOME_TYPE_Working,0.057481
DAYS_LAST_PHONE_CHANGE,0.055218
CODE_GENDER_M,0.054713


In [70]:
new_cors.tail(10)

Unnamed: 0,TARGET
DAYS_EMPLOYED,-0.063368
DAYS_EMPLOYED_norm,-0.063368
DAYS_EMPLOYED_log,-0.072317
bureau_CREDIT_ACTIVE_Closed_mean,-0.076501
DAYS_BIRTH_norm,-0.078239
DAYS_BIRTH,-0.078239
DAYS_BIRTH_log,-0.078504
EXT_SOURCE_1,-0.098887
EXT_SOURCE_3,-0.155892
EXT_SOURCE_2,-0.160295


### Many new features showed up
### We will, once again, delete some potentailly useless columns

In [71]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

In [72]:
main_data.shape

(307511, 61)

# Previous applications

### Let's analyze the data

In [73]:
previous_application.shape

(1670214, 37)

In [74]:
previous_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   SK_ID_PREV                   1670214 non-null  int64  
 1   SK_ID_CURR                   1670214 non-null  int64  
 2   NAME_CONTRACT_TYPE           1670214 non-null  object 
 3   AMT_ANNUITY                  1297979 non-null  float64
 4   AMT_APPLICATION              1670214 non-null  float64
 5   AMT_CREDIT                   1670213 non-null  float64
 6   AMT_DOWN_PAYMENT             774370 non-null   float64
 7   AMT_GOODS_PRICE              1284699 non-null  float64
 8   WEEKDAY_APPR_PROCESS_START   1670214 non-null  object 
 9   HOUR_APPR_PROCESS_START      1670214 non-null  int64  
 10  FLAG_LAST_APPL_PER_CONTRACT  1670214 non-null  object 
 11  NFLAG_LAST_APPL_IN_DAY       1670214 non-null  int64  
 12  RATE_DOWN_PAYMENT            774370 non-nu

In [75]:
previous_application.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


### Let's take a quick look, whether there are many missing values in the newly created data frames

In [76]:
previous_application.isnull().sum()

SK_ID_PREV                           0
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE                     0
NAME_GOODS_CATEGORY                  0
NAME_PORTFOLIO                       0
NAME_PRODUCT_TYPE                    0
CHANNEL_TYPE             

### Let's drop the columns with at least 70% of values missing

In [77]:
previous_application = drop_missing_columns(previous_application, threshold = 75)

There are 15 with greater than 75 missing values
10 exemplary incomplete columns to be deleted: 
['AMT_ANNUITY', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'NAME_TYPE_SUITE', 'CNT_PAYMENT', 'PRODUCT_COMBINATION', 'DAYS_FIRST_DRAWING']


### We will simply ignore the column SK_ID_PREV in that frame as well as any future ones
### We can apply numerical and object groping to the frame

In [78]:
prev_app_num = group_numeric_values(previous_application.drop(columns = ['SK_ID_PREV']), 'prev_app')

In [79]:
prev_app_num.shape

(338857, 18)

In [80]:
prev_app_num.head(5)

Unnamed: 0_level_0,prev_app_AMT_APPLICATION_count,prev_app_AMT_APPLICATION_mean,prev_app_AMT_APPLICATION_median,prev_app_AMT_CREDIT_count,prev_app_AMT_CREDIT_mean,prev_app_AMT_CREDIT_median,prev_app_HOUR_APPR_PROCESS_START_count,prev_app_HOUR_APPR_PROCESS_START_mean,prev_app_HOUR_APPR_PROCESS_START_median,prev_app_NFLAG_LAST_APPL_IN_DAY_count,prev_app_NFLAG_LAST_APPL_IN_DAY_mean,prev_app_NFLAG_LAST_APPL_IN_DAY_median,prev_app_DAYS_DECISION_count,prev_app_DAYS_DECISION_mean,prev_app_DAYS_DECISION_median,prev_app_SELLERPLACE_AREA_count,prev_app_SELLERPLACE_AREA_mean,prev_app_SELLERPLACE_AREA_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
100001,1,24835.5,24835.5,1,23787.0,23787.0,1,13.0,13.0,1,1.0,1.0,1,-1740.0,-1740.0,1,23.0,23.0
100002,1,179055.0,179055.0,1,179055.0,179055.0,1,9.0,9.0,1,1.0,1.0,1,-606.0,-606.0,1,500.0,500.0
100003,3,435436.5,337500.0,3,484191.0,348637.5,3,14.666667,15.0,3,1.0,1.0,3,-1305.0,-828.0,3,533.0,200.0
100004,1,24282.0,24282.0,1,20106.0,20106.0,1,5.0,5.0,1,1.0,1.0,1,-815.0,-815.0,1,30.0,30.0
100005,2,22308.75,22308.75,2,20076.75,20076.75,2,10.5,10.5,2,1.0,1.0,2,-536.0,-536.0,2,18.0,18.0


In [81]:
prev_app_obj = group_object_values(previous_application.drop(columns = ['SK_ID_PREV']), 'prev_app', grouping_statistics = ['sum', 'mean'])

In [82]:
prev_app_obj.shape

(338857, 238)

In [83]:
prev_app_obj.head(5)

Unnamed: 0_level_0,prev_app_NAME_CONTRACT_TYPE_Cash loans_sum,prev_app_NAME_CONTRACT_TYPE_Cash loans_mean,prev_app_NAME_CONTRACT_TYPE_Consumer loans_sum,prev_app_NAME_CONTRACT_TYPE_Consumer loans_mean,prev_app_NAME_CONTRACT_TYPE_Revolving loans_sum,prev_app_NAME_CONTRACT_TYPE_Revolving loans_mean,prev_app_NAME_CONTRACT_TYPE_XNA_sum,prev_app_NAME_CONTRACT_TYPE_XNA_mean,prev_app_WEEKDAY_APPR_PROCESS_START_FRIDAY_sum,prev_app_WEEKDAY_APPR_PROCESS_START_FRIDAY_mean,...,prev_app_NAME_YIELD_GROUP_XNA_sum,prev_app_NAME_YIELD_GROUP_XNA_mean,prev_app_NAME_YIELD_GROUP_high_sum,prev_app_NAME_YIELD_GROUP_high_mean,prev_app_NAME_YIELD_GROUP_low_action_sum,prev_app_NAME_YIELD_GROUP_low_action_mean,prev_app_NAME_YIELD_GROUP_low_normal_sum,prev_app_NAME_YIELD_GROUP_low_normal_mean,prev_app_NAME_YIELD_GROUP_middle_sum,prev_app_NAME_YIELD_GROUP_middle_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0,0.0,1,1.0,0,0.0,0,0.0,1,1.0,...,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0
100002,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0
100003,1,0.333333,2,0.666667,0,0.0,0,0.0,1,0.333333,...,0,0.0,0,0.0,0,0.0,1,0.333333,2,0.666667
100004,0,0.0,1,1.0,0,0.0,0,0.0,1,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0
100005,1,0.5,1,0.5,0,0.0,0,0.0,1,0.5,...,1,0.5,1,0.5,0,0.0,0,0.0,0,0.0


### Let's see, whether there are any NaN values in those frames after grouping

In [84]:
prev_app_obj.isnull().sum()

prev_app_NAME_CONTRACT_TYPE_Cash loans_sum         0
prev_app_NAME_CONTRACT_TYPE_Cash loans_mean        0
prev_app_NAME_CONTRACT_TYPE_Consumer loans_sum     0
prev_app_NAME_CONTRACT_TYPE_Consumer loans_mean    0
prev_app_NAME_CONTRACT_TYPE_Revolving loans_sum    0
                                                  ..
prev_app_NAME_YIELD_GROUP_low_action_mean          0
prev_app_NAME_YIELD_GROUP_low_normal_sum           0
prev_app_NAME_YIELD_GROUP_low_normal_mean          0
prev_app_NAME_YIELD_GROUP_middle_sum               0
prev_app_NAME_YIELD_GROUP_middle_mean              0
Length: 238, dtype: int64

In [85]:
prev_app_num.isnull().sum()

prev_app_AMT_APPLICATION_count             0
prev_app_AMT_APPLICATION_mean              0
prev_app_AMT_APPLICATION_median            0
prev_app_AMT_CREDIT_count                  0
prev_app_AMT_CREDIT_mean                   0
prev_app_AMT_CREDIT_median                 0
prev_app_HOUR_APPR_PROCESS_START_count     0
prev_app_HOUR_APPR_PROCESS_START_mean      0
prev_app_HOUR_APPR_PROCESS_START_median    0
prev_app_NFLAG_LAST_APPL_IN_DAY_count      0
prev_app_NFLAG_LAST_APPL_IN_DAY_mean       0
prev_app_NFLAG_LAST_APPL_IN_DAY_median     0
prev_app_DAYS_DECISION_count               0
prev_app_DAYS_DECISION_mean                0
prev_app_DAYS_DECISION_median              0
prev_app_SELLERPLACE_AREA_count            0
prev_app_SELLERPLACE_AREA_mean             0
prev_app_SELLERPLACE_AREA_median           0
dtype: int64

### As we can see, there are no missing values

### Let's now use normalization and log transformation techniques for new feature creation, specifically for prev_app_num frame, since the values seem to be extremely big

In [86]:
prev_app_num = normalization(prev_app_num)

In [87]:
prev_app_num = log_transform(prev_app_num)

### And now, let's take a quick look at the new features, as well as at the shape of the frame

In [88]:
prev_app_num.shape

(338857, 28)

In [89]:
prev_app_num.head(5)

Unnamed: 0_level_0,prev_app_AMT_APPLICATION_count,prev_app_AMT_APPLICATION_mean,prev_app_AMT_APPLICATION_median,prev_app_AMT_CREDIT_count,prev_app_AMT_CREDIT_mean,prev_app_AMT_CREDIT_median,prev_app_HOUR_APPR_PROCESS_START_count,prev_app_HOUR_APPR_PROCESS_START_mean,prev_app_HOUR_APPR_PROCESS_START_median,prev_app_NFLAG_LAST_APPL_IN_DAY_count,...,prev_app_AMT_APPLICATION_mean_norm,prev_app_AMT_APPLICATION_median_norm,prev_app_AMT_CREDIT_mean_norm,prev_app_AMT_CREDIT_median_norm,prev_app_SELLERPLACE_AREA_mean_norm,prev_app_SELLERPLACE_AREA_median_norm,prev_app_AMT_APPLICATION_mean_log,prev_app_AMT_APPLICATION_median_log,prev_app_AMT_CREDIT_mean_log,prev_app_AMT_CREDIT_median_log
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1,24835.5,24835.5,1,23787.0,23787.0,1,13.0,13.0,1,...,0.006132,0.006132,0.005873,0.005873,6e-06,6e-06,10.12007,10.12007,10.076937,10.076937
100002,1,179055.0,179055.0,1,179055.0,179055.0,1,9.0,9.0,1,...,0.044211,0.044211,0.044211,0.044211,0.000125,0.000125,12.095454,12.095454,12.095454,12.095454
100003,3,435436.5,337500.0,3,484191.0,348637.5,3,14.666667,15.0,3,...,0.107515,0.083333,0.119553,0.086083,0.000133,5e-05,12.984107,12.729324,13.090237,12.761791
100004,1,24282.0,24282.0,1,20106.0,20106.0,1,5.0,5.0,1,...,0.005996,0.005996,0.004964,0.004964,8e-06,8e-06,10.097532,10.097532,9.908823,9.908823
100005,2,22308.75,22308.75,2,20076.75,20076.75,2,10.5,10.5,2,...,0.005508,0.005508,0.004957,0.004957,5e-06,5e-06,10.012779,10.012779,9.907368,9.907368


### Finally, let's merge the frames into the data set

In [90]:
main_data = main_data.merge(prev_app_num, on = 'SK_ID_CURR', how = 'left')

In [91]:
main_data = main_data.merge(prev_app_obj, on = 'SK_ID_CURR', how = 'left')

In [92]:
main_data .isnull().sum()

SK_ID_CURR                                       0
TARGET                                           0
DAYS_BIRTH                                       0
DAYS_EMPLOYED                                    0
DAYS_REGISTRATION                                0
                                             ...  
prev_app_NAME_YIELD_GROUP_low_action_mean    16454
prev_app_NAME_YIELD_GROUP_low_normal_sum     16454
prev_app_NAME_YIELD_GROUP_low_normal_mean    16454
prev_app_NAME_YIELD_GROUP_middle_sum         16454
prev_app_NAME_YIELD_GROUP_middle_mean        16454
Length: 327, dtype: int64

### Once again, we are dealing with missing values
### In that case, and in every future one, we will just fill those values with zeros, as it will make further analysis easier, in theoretically, in that case 0's are a bit of representation of NaN, since they come not from lack of information or computer error, but from themselves being non-existant after joining them

In [93]:
main_data = fill_missing_values(main_data, mean = False)

### Now, we can take a look at the correlation

In [94]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [95]:
new_cors.head(10)

Unnamed: 0,TARGET
TARGET,1.0
bureau_DAYS_CREDIT_mean,0.08396
bureau_DAYS_CREDIT_median,0.081943
prev_app_NAME_CONTRACT_STATUS_Refused_mean,0.077894
bureau_DAYS_CREDIT_UPDATE_mean,0.069687
bureau_DAYS_CREDIT_UPDATE_median,0.068019
prev_app_NAME_CONTRACT_STATUS_Refused_sum,0.064756
prev_app_NAME_PRODUCT_TYPE_walk-in_sum,0.062785
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899


In [96]:
new_cors.tail(10)

Unnamed: 0,TARGET
DAYS_EMPLOYED_log,-0.072317
bureau_CREDIT_ACTIVE_Closed_mean,-0.076501
DAYS_BIRTH_norm,-0.078239
DAYS_BIRTH,-0.078239
DAYS_BIRTH_log,-0.078504
EXT_SOURCE_1,-0.098887
EXT_SOURCE_3,-0.155892
EXT_SOURCE_2,-0.160295
prev_app_NAME_GOODS_CATEGORY_House Construction_sum,
prev_app_NAME_GOODS_CATEGORY_House Construction_mean,


### New correlations seem to have shown up as some important ones!

### After that, we will use the function used for droping columns with low correlation

In [97]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

### Let's look at the shape of the modified data frame

In [98]:
main_data.shape

(307511, 79)

# Credit Card Balance

### Let's start with simple analysis

In [99]:
credit_card_balance.shape

(3840312, 23)

In [100]:
credit_card_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     int64  
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
 10  AMT_PAYMENT_CURRENT         float64
 11  AMT_PAYMENT_TOTAL_CURRENT   float64
 12  AMT_RECEIVABLE_PRINCIPAL    float64
 13  AMT_RECIVABLE               float64
 14  AMT_TOTAL_RECEIVABLE        float64
 15  CNT_DRAWINGS_ATM_CURRENT    float64
 16  CNT_DRAWINGS_CURRENT        int64  
 17  CNT_DRAWINGS_OTHER_CURRENT  float64
 18  CNT_DRAWINGS_POS_CURRENT    float64
 19  CNT_INSTALMENT_MATURE

In [101]:
credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


### Check for missing values

In [102]:
credit_card_balance.isnull().sum()

SK_ID_PREV                         0
SK_ID_CURR                         0
MONTHS_BALANCE                     0
AMT_BALANCE                        0
AMT_CREDIT_LIMIT_ACTUAL            0
AMT_DRAWINGS_ATM_CURRENT      749816
AMT_DRAWINGS_CURRENT               0
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
AMT_INST_MIN_REGULARITY       305236
AMT_PAYMENT_CURRENT           767988
AMT_PAYMENT_TOTAL_CURRENT          0
AMT_RECEIVABLE_PRINCIPAL           0
AMT_RECIVABLE                      0
AMT_TOTAL_RECEIVABLE               0
CNT_DRAWINGS_ATM_CURRENT      749816
CNT_DRAWINGS_CURRENT               0
CNT_DRAWINGS_OTHER_CURRENT    749816
CNT_DRAWINGS_POS_CURRENT      749816
CNT_INSTALMENT_MATURE_CUM     305236
NAME_CONTRACT_STATUS               0
SK_DPD                             0
SK_DPD_DEF                         0
dtype: int64

### This time, for experimentation sake, we were trying fill those missing values with mean values
### However, due to some problems with kernel, we decided to stick with just removing the columns

In [103]:
credit_card_balance = drop_missing_columns(credit_card_balance)

There are 9 with greater than 70 missing values
Incomplete columns: 
['AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM']


### Let's apply grouping to our data frame

In [104]:
card_bal_obj_grp = group_object_values(credit_card_balance,
                                      'card_balance',
                                      groupby_id = 'SK_ID_CURR')

In [105]:
card_bal_obj_numr = group_numeric_values(credit_card_balance,
                                      'card_balance',
                                      groupby_id = 'SK_ID_CURR')

### Let's analyze the files

In [106]:
card_bal_obj_grp.shape

(103558, 14)

In [107]:
card_bal_obj_grp.head(5)

Unnamed: 0_level_0,card_balance_NAME_CONTRACT_STATUS_Active_sum,card_balance_NAME_CONTRACT_STATUS_Active_mean,card_balance_NAME_CONTRACT_STATUS_Approved_sum,card_balance_NAME_CONTRACT_STATUS_Approved_mean,card_balance_NAME_CONTRACT_STATUS_Completed_sum,card_balance_NAME_CONTRACT_STATUS_Completed_mean,card_balance_NAME_CONTRACT_STATUS_Demand_sum,card_balance_NAME_CONTRACT_STATUS_Demand_mean,card_balance_NAME_CONTRACT_STATUS_Refused_sum,card_balance_NAME_CONTRACT_STATUS_Refused_mean,card_balance_NAME_CONTRACT_STATUS_Sent proposal_sum,card_balance_NAME_CONTRACT_STATUS_Sent proposal_mean,card_balance_NAME_CONTRACT_STATUS_Signed_sum,card_balance_NAME_CONTRACT_STATUS_Signed_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100006,6,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100011,74,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100013,96,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100021,7,0.411765,0,0.0,10,0.588235,0,0.0,0,0.0,0,0.0,0,0.0
100023,8,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [108]:
card_bal_obj_numr.shape

(103558, 33)

In [109]:
card_bal_obj_numr.head(5)

Unnamed: 0_level_0,card_balance_MONTHS_BALANCE_count,card_balance_MONTHS_BALANCE_mean,card_balance_MONTHS_BALANCE_median,card_balance_AMT_BALANCE_count,card_balance_AMT_BALANCE_mean,card_balance_AMT_BALANCE_median,card_balance_AMT_CREDIT_LIMIT_ACTUAL_count,card_balance_AMT_CREDIT_LIMIT_ACTUAL_mean,card_balance_AMT_CREDIT_LIMIT_ACTUAL_median,card_balance_AMT_DRAWINGS_CURRENT_count,...,card_balance_AMT_TOTAL_RECEIVABLE_median,card_balance_CNT_DRAWINGS_CURRENT_count,card_balance_CNT_DRAWINGS_CURRENT_mean,card_balance_CNT_DRAWINGS_CURRENT_median,card_balance_SK_DPD_count,card_balance_SK_DPD_mean,card_balance_SK_DPD_median,card_balance_SK_DPD_DEF_count,card_balance_SK_DPD_DEF_mean,card_balance_SK_DPD_DEF_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,6,-3.5,-3.5,6,0.0,0.0,6,270000.0,270000,6,...,0.0,6,0.0,0.0,6,0.0,0.0,6,0.0,0.0
100011,74,-38.5,-38.5,74,54482.111149,0.0,74,164189.189189,180000,74,...,0.0,74,0.054054,0.0,74,0.0,0.0,74,0.0,0.0
100013,96,-48.5,-48.5,96,18159.919219,0.0,96,131718.75,157500,96,...,0.0,96,0.239583,0.0,96,0.010417,0.0,96,0.010417,0.0
100021,17,-10.0,-10.0,17,0.0,0.0,17,675000.0,675000,17,...,0.0,17,0.0,0.0,17,0.0,0.0,17,0.0,0.0
100023,8,-7.5,-7.5,8,0.0,0.0,8,135000.0,135000,8,...,0.0,8,0.0,0.0,8,0.0,0.0,8,0.0,0.0


### Let's apply the normalization only, since the values are not that big for log transformation

In [110]:
card_bal_obj_numr = normalization(card_bal_obj_numr, min_mean_value = 10, groupby_id = 'SK_ID_CURR')

In [111]:
card_bal_obj_grp = normalization(card_bal_obj_grp, min_mean_value = 10, groupby_id = 'SK_ID_CURR')

### Merge the main data with the grouped frames

In [112]:
main_data = main_data.merge(card_bal_obj_grp, on = 'SK_ID_CURR', how = 'left')

In [113]:
main_data = main_data.merge(card_bal_obj_numr, on = 'SK_ID_CURR', how = 'left')

### Let's replace the missing values with 0's

In [114]:
main_data = fill_missing_values(main_data, mean = False)

### Remove the values, which are not under the threshold

In [115]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

In [116]:
main_data.shape

(307511, 96)

In [117]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [118]:
new_cors.head(20)

Unnamed: 0,TARGET
TARGET,1.0
bureau_DAYS_CREDIT_mean,0.08396
bureau_DAYS_CREDIT_median,0.081943
prev_app_NAME_CONTRACT_STATUS_Refused_mean,0.077894
bureau_DAYS_CREDIT_UPDATE_mean,0.069687
bureau_DAYS_CREDIT_UPDATE_median,0.068019
prev_app_NAME_CONTRACT_STATUS_Refused_sum,0.064756
prev_app_NAME_PRODUCT_TYPE_walk-in_sum,0.062785
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899


In [119]:
new_cors.tail(20)

Unnamed: 0,TARGET
DAYS_ID_PUBLISH,-0.051457
DAYS_ID_PUBLISH_norm,-0.051457
bureau_balance_bureau_balance_MONTHS_BALANCE_count_median,-0.052083
bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_norm,-0.052218
bureau_balance_bureau_balance_MONTHS_BALANCE_count_norm_median,-0.052218
bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean,-0.053076
bureau_balance_bureau_balance_MONTHS_BALANCE_count_norm_mean,-0.053288
bureau_balance_bureau_balance_MONTHS_BALANCE_count_mean_norm,-0.053288
CODE_GENDER_F,-0.054704
NAME_EDUCATION_TYPE_Higher education,-0.056593


# Installments payments

In [120]:
intallments_payments.shape

(13605401, 8)

In [121]:
intallments_payments.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [122]:
intallments_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_PREV              int64  
 1   SK_ID_CURR              int64  
 2   NUM_INSTALMENT_VERSION  float64
 3   NUM_INSTALMENT_NUMBER   int64  
 4   DAYS_INSTALMENT         float64
 5   DAYS_ENTRY_PAYMENT      float64
 6   AMT_INSTALMENT          float64
 7   AMT_PAYMENT             float64
dtypes: float64(5), int64(3)
memory usage: 830.4 MB


In [123]:
intallments_payments.isnull().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

### Let's remove the columns with missing values below the certain threshold

In [124]:
intallments_payments = fill_missing_values(intallments_payments, mean = True)

In [125]:
intallments_payments.isnull().sum()

SK_ID_PREV                0
SK_ID_CURR                0
NUM_INSTALMENT_VERSION    0
NUM_INSTALMENT_NUMBER     0
DAYS_INSTALMENT           0
DAYS_ENTRY_PAYMENT        0
AMT_INSTALMENT            0
AMT_PAYMENT               0
dtype: int64

### Let's now apply numerical grouping

In [126]:
install_num = group_numeric_values(intallments_payments, 'install_pay')

### Let's take a look at the data frame

In [127]:
install_num.shape

(339587, 18)

In [128]:
install_num.head(5)

Unnamed: 0_level_0,install_pay_NUM_INSTALMENT_VERSION_count,install_pay_NUM_INSTALMENT_VERSION_mean,install_pay_NUM_INSTALMENT_VERSION_median,install_pay_NUM_INSTALMENT_NUMBER_count,install_pay_NUM_INSTALMENT_NUMBER_mean,install_pay_NUM_INSTALMENT_NUMBER_median,install_pay_DAYS_INSTALMENT_count,install_pay_DAYS_INSTALMENT_mean,install_pay_DAYS_INSTALMENT_median,install_pay_DAYS_ENTRY_PAYMENT_count,install_pay_DAYS_ENTRY_PAYMENT_mean,install_pay_DAYS_ENTRY_PAYMENT_median,install_pay_AMT_INSTALMENT_count,install_pay_AMT_INSTALMENT_mean,install_pay_AMT_INSTALMENT_median,install_pay_AMT_PAYMENT_count,install_pay_AMT_PAYMENT_mean,install_pay_AMT_PAYMENT_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
100001,7,1.142857,1.0,7,2.714286,3.0,7,-2187.714286,-1709.0,7,-2195.0,-1715.0,7,5885.132143,3980.925,7,5885.132143,3980.925
100002,19,1.052632,1.0,19,10.0,10.0,19,-295.0,-295.0,19,-315.421053,-312.0,19,11559.247105,9251.775,19,11559.247105,9251.775
100003,25,1.04,1.0,25,5.08,5.0,25,-1378.16,-797.0,25,-1385.32,-806.0,25,64754.586,64275.615,25,64754.586,64275.615
100004,3,1.333333,1.0,3,2.0,2.0,3,-754.0,-754.0,3,-761.666667,-763.0,3,7096.155,5357.25,3,7096.155,5357.25
100005,9,1.111111,1.0,9,5.0,5.0,9,-586.0,-586.0,9,-609.555556,-585.0,9,6240.205,4813.2,9,6240.205,4813.2


### We can apply normalization and log transformation

In [129]:
install_num = normalization(install_num)

In [130]:
install_num = log_transform(install_num)

In [131]:
install_num.shape

(339587, 26)

In [132]:
install_num.head(5)

Unnamed: 0_level_0,install_pay_NUM_INSTALMENT_VERSION_count,install_pay_NUM_INSTALMENT_VERSION_mean,install_pay_NUM_INSTALMENT_VERSION_median,install_pay_NUM_INSTALMENT_NUMBER_count,install_pay_NUM_INSTALMENT_NUMBER_mean,install_pay_NUM_INSTALMENT_NUMBER_median,install_pay_DAYS_INSTALMENT_count,install_pay_DAYS_INSTALMENT_mean,install_pay_DAYS_INSTALMENT_median,install_pay_DAYS_ENTRY_PAYMENT_count,...,install_pay_AMT_PAYMENT_mean,install_pay_AMT_PAYMENT_median,install_pay_AMT_INSTALMENT_mean_norm,install_pay_AMT_INSTALMENT_median_norm,install_pay_AMT_PAYMENT_mean_norm,install_pay_AMT_PAYMENT_median_norm,install_pay_AMT_INSTALMENT_mean_log,install_pay_AMT_INSTALMENT_median_log,install_pay_AMT_PAYMENT_mean_log,install_pay_AMT_PAYMENT_median_log
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,7,1.142857,1.0,7,2.714286,3.0,7,-2187.714286,-1709.0,7,...,5885.132143,3980.925,0.00235,0.001589,0.00235,0.001589,8.680354,8.289521,8.680354,8.289521
100002,19,1.052632,1.0,19,10.0,10.0,19,-295.0,-295.0,19,...,11559.247105,9251.775,0.004615,0.003694,0.004615,0.003694,9.355328,9.132679,9.355328,9.132679
100003,25,1.04,1.0,25,5.08,5.0,25,-1378.16,-797.0,25,...,64754.586,64275.615,0.025854,0.025663,0.025854,0.025663,11.078375,11.070951,11.078375,11.070951
100004,3,1.333333,1.0,3,2.0,2.0,3,-754.0,-754.0,3,...,7096.155,5357.25,0.002833,0.002139,0.002833,0.002139,8.867449,8.586393,8.867449,8.586393
100005,9,1.111111,1.0,9,5.0,5.0,9,-586.0,-586.0,9,...,6240.205,4813.2,0.002492,0.001922,0.002491,0.001922,8.738929,8.479325,8.738929,8.479325


### We can now simply join the file into the main data set

In [133]:
main_data = main_data.merge(install_num, on = 'SK_ID_CURR', how = 'left')

### Replace missing values with 0's

In [134]:
main_data = fill_missing_values(main_data, mean = False)

### Get the values, which fall fall over the threshold

In [135]:
main_data = remove_target_correlated_cols(main_data, 
                                         special_id = 'SK_ID_CURR', 
                                         threshold = 0.04)

In [136]:
main_data.shape

(307511, 96)

# POS Cash Balance

In [137]:
POS_CASH_balance = pd.read_csv(f'{path}POS_CASH_balance.csv')

### Analyse the data

In [138]:
POS_CASH_balance.shape

(10001358, 8)

In [139]:
POS_CASH_balance.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [140]:
POS_CASH_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   SK_ID_CURR             int64  
 2   MONTHS_BALANCE         int64  
 3   CNT_INSTALMENT         float64
 4   CNT_INSTALMENT_FUTURE  float64
 5   NAME_CONTRACT_STATUS   object 
 6   SK_DPD                 int64  
 7   SK_DPD_DEF             int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 610.4+ MB


In [141]:
POS_CASH_balance.isnull().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
MONTHS_BALANCE               0
CNT_INSTALMENT           26071
CNT_INSTALMENT_FUTURE    26087
NAME_CONTRACT_STATUS         0
SK_DPD                       0
SK_DPD_DEF                   0
dtype: int64

### Let's try to drop the collumns

In [142]:
POS_CASH_balance = drop_missing_columns(POS_CASH_balance)

There are 2 with greater than 70 missing values
Incomplete columns: 
['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']


In [143]:
POS_CASH_balance.isnull().sum()

SK_ID_PREV              0
SK_ID_CURR              0
MONTHS_BALANCE          0
NAME_CONTRACT_STATUS    0
SK_DPD                  0
SK_DPD_DEF              0
dtype: int64

### Apply the grouping to the data set

In [144]:
pcb_num = group_numeric_values(POS_CASH_balance, 'pos_cash_balance')

In [145]:
pcb_obj = group_object_values(POS_CASH_balance, 'pos_cash_balance')

### Take a look at the data

In [146]:
pcb_num.shape

(337252, 9)

In [147]:
pcb_num.head(5)

Unnamed: 0_level_0,pos_cash_balance_MONTHS_BALANCE_count,pos_cash_balance_MONTHS_BALANCE_mean,pos_cash_balance_MONTHS_BALANCE_median,pos_cash_balance_SK_DPD_count,pos_cash_balance_SK_DPD_mean,pos_cash_balance_SK_DPD_median,pos_cash_balance_SK_DPD_DEF_count,pos_cash_balance_SK_DPD_DEF_mean,pos_cash_balance_SK_DPD_DEF_median
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100001,9,-72.555556,-57.0,9,0.777778,0.0,9,0.777778,0.0
100002,19,-10.0,-10.0,19,0.0,0.0,19,0.0,0.0
100003,28,-43.785714,-26.5,28,0.0,0.0,28,0.0,0.0
100004,4,-25.5,-25.5,4,0.0,0.0,4,0.0,0.0
100005,11,-20.0,-20.0,11,0.0,0.0,11,0.0,0.0


In [148]:
pcb_obj.shape

(337252, 18)

In [149]:
pcb_obj.head(5)

Unnamed: 0_level_0,pos_cash_balance_NAME_CONTRACT_STATUS_Active_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Active_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Amortized debt_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Amortized debt_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Approved_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Approved_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Canceled_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Canceled_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Completed_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Completed_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Demand_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Demand_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Returned to the store_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Returned to the store_mean,pos_cash_balance_NAME_CONTRACT_STATUS_Signed_sum,pos_cash_balance_NAME_CONTRACT_STATUS_Signed_mean,pos_cash_balance_NAME_CONTRACT_STATUS_XNA_sum,pos_cash_balance_NAME_CONTRACT_STATUS_XNA_mean
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
100001,7.0,0.777778,0,0.0,0,0.0,0,0.0,2,0.222222,0,0.0,0,0.0,0,0.0,0,0.0
100002,19.0,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
100003,26.0,0.928571,0,0.0,0,0.0,0,0.0,2,0.071429,0,0.0,0,0.0,0,0.0,0,0.0
100004,3.0,0.75,0,0.0,0,0.0,0,0.0,1,0.25,0,0.0,0,0.0,0,0.0,0,0.0
100005,9.0,0.818182,0,0.0,0,0.0,0,0.0,1,0.090909,0,0.0,0,0.0,1,0.090909,0,0.0


### Normalisation could be applied

In [150]:
pcb_obj = normalization(pcb_obj)

In [151]:
pcb_num = normalization(pcb_num)

### Merge the data

In [152]:
main_data = main_data.merge(pcb_obj, on = 'SK_ID_CURR', how = 'left')

In [153]:
main_data = main_data.merge(pcb_num, on = 'SK_ID_CURR', how = 'left')

### Remove missing values

In [154]:
main_data = fill_missing_values(main_data, mean = False)

### Let's look at the correlations

In [155]:
correlations = main_data.corr()
correlations = correlations.sort_values('TARGET', ascending = False)

new_cors = pd.DataFrame(correlations['TARGET'])

In [156]:
new_cors.head(25)

Unnamed: 0,TARGET
TARGET,1.0
bureau_DAYS_CREDIT_mean,0.08396
bureau_DAYS_CREDIT_median,0.081943
prev_app_NAME_CONTRACT_STATUS_Refused_mean,0.077894
bureau_DAYS_CREDIT_UPDATE_mean,0.069687
bureau_DAYS_CREDIT_UPDATE_median,0.068019
prev_app_NAME_CONTRACT_STATUS_Refused_sum,0.064756
prev_app_NAME_PRODUCT_TYPE_walk-in_sum,0.062785
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899


In [157]:
new_cors.tail(25)

Unnamed: 0,TARGET
NAME_INCOME_TYPE_Pensioner,-0.046209
bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_log,-0.046244
bureau_balance_bureau_STATUS_C_sum_median_log,-0.047419
bureau_balance_bureau_STATUS_C_sum_mean_log,-0.04877
DAYS_EMPLOYED_PERC,-0.049603
DAYS_ID_PUBLISH,-0.051457
DAYS_ID_PUBLISH_norm,-0.051457
bureau_balance_bureau_balance_MONTHS_BALANCE_count_median,-0.052083
bureau_balance_bureau_balance_MONTHS_BALANCE_count_norm_median,-0.052218
bureau_balance_bureau_balance_MONTHS_BALANCE_count_median_norm,-0.052218


In [158]:
main_data =  remove_target_correlated_cols(main_data, 
                                          special_id = 'SK_ID_CURR', 
                                          threshold = 0.04)

In [159]:
main_data.shape

(307511, 96)

### Now we can export the data to second file for comparison

In [160]:
main_data.to_csv('./featureData2.csv')