# Feature Engineering

* We start off with necessary imports:

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

* Read data

In [2]:
# Load the prepared dataset and discard the 'Unnamed: 0' column
# (presumably a row index that was saved with the CSV — confirm).
main_data = pd.read_csv('./initial_data.csv').drop(columns=['Unnamed: 0'])
main_data

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,1,0,0,1,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,...,0,1,0,0,0,0,0,0,1,0
2,100004,0,1,1,1,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,0,0,1,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,0,0,1,0,1,0
307507,456252,0,0,0,1,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,0,0,1,0,1,0
307508,456253,0,0,0,1,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,0,1,0,0,1,0
307509,456254,1,0,0,1,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0,0,1,0,1,0


* Inspect the already prepared data (number of rows and columns, dtypes, memory usage)

In [3]:
# Summarize the loaded frame: index range, column count, dtypes and memory usage.
main_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 243 entries, SK_ID_CURR to EMERGENCYSTATE_MODE_Yes
dtypes: float64(66), int64(177)
memory usage: 570.1 MB


In [4]:
from Functions.DataPreperation import drop_missing_columns

# Snapshot the frame before the drop step so every ratio below is computed
# from the pre-drop columns.
copy_of_main = main_data.copy()

# NOTE(review): the return value is ignored — presumably the helper drops
# columns from main_data in place (threshold 70); confirm against its source.
drop_missing_columns(main_data, 70, print_info=True)

# New feature name -> (numerator column, denominator column).
ratio_features = {
    'DAYS_EMPLOYED_PERC': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
    'INCOME_CREDIT_PERC': ('AMT_INCOME_TOTAL', 'AMT_CREDIT'),
    'INCOME_PER_PERSON': ('AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
    'ANNUITY_INCOME_PERC': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    'PAYMENT_RATE': ('AMT_ANNUITY', 'AMT_CREDIT'),
}
for feature, (numerator, denominator) in ratio_features.items():
    main_data[feature] = copy_of_main[numerator] / copy_of_main[denominator]



There are 0 with greater than 70 missing values
Incomplete columns: 
[]


* Import necessary functions for feature engineering

In [5]:
# Import only the helpers this notebook calls, rather than a wildcard import,
# so the origin of each name is visible and nothing else leaks into the namespace.
from Functions.FeatureEngineering import (
    log_transform,
    normalization,
    remove_target_correlated_cols,
)

* Test correlations before applying feature engineering

In [6]:
# Pairwise correlations of all columns, with rows ordered from the most
# positive to the most negative correlation with TARGET.
corrs = main_data.corr().sort_values(by='TARGET', ascending=False)

# One-column frame holding just the correlations with TARGET.
tmp_data = corrs['TARGET'].to_frame()

In [7]:
# Show the correlation matrix; rows are sorted by descending correlation with TARGET.
corrs

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,DAYS_EMPLOYED_PERC,INCOME_CREDIT_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,PAYMENT_RATE
TARGET,-0.002108,1.000000,-0.030896,-0.021851,-0.006148,0.019187,-0.003982,-0.030369,-0.012815,-0.039623,...,-0.033119,-0.012657,0.007946,-0.042201,0.004829,-0.049603,-0.001817,-0.006573,0.014268,0.012698
REGION_RATING_CLIENT_W_CITY,-0.001138,0.060893,-0.023116,-0.021405,0.001884,0.024781,-0.091735,-0.110915,-0.141674,-0.112185,...,-0.110948,-0.056365,0.018482,-0.169696,0.012061,0.008822,-0.034876,-0.129859,0.083544,-0.000445
REGION_RATING_CLIENT,-0.001075,0.058899,-0.021593,-0.022668,0.001289,0.025423,-0.085465,-0.101776,-0.128516,-0.103759,...,-0.073936,-0.056535,0.025218,-0.141678,0.017752,0.010842,-0.033004,-0.121298,0.080810,0.000426
NAME_INCOME_TYPE_Working,-0.002610,0.057481,0.020789,0.067329,-0.035097,0.136604,-0.024522,-0.056512,-0.043905,-0.057441,...,-0.029961,-0.010263,0.007134,-0.050594,0.011262,0.106421,0.007158,-0.067929,-0.003819,0.030869
DAYS_LAST_PHONE_CHANGE,-0.000859,0.055218,0.061830,-0.039089,0.027020,-0.005866,-0.018585,-0.073702,-0.063746,-0.076294,...,-0.026312,-0.010363,0.001463,-0.031942,0.003972,-0.120919,0.017154,-0.009552,-0.012503,0.035841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DAYS_EMPLOYED,-0.000105,-0.063368,-0.026510,-0.000682,0.018602,-0.026122,0.021221,0.091447,0.062342,0.093049,...,0.017641,0.015346,-0.006086,0.026668,-0.003591,0.949090,-0.023252,0.018648,0.010169,-0.065328
DAYS_BIRTH,0.001500,-0.078239,-0.086364,-0.129879,0.119146,-0.330938,-0.027261,0.055436,-0.009443,0.053510,...,0.013597,0.010964,-0.008758,0.019852,-0.005434,-0.057987,-0.064730,0.033887,0.081488,-0.092093
EXT_SOURCE_1,0.000060,-0.098887,-0.009776,-0.039876,0.054510,-0.098389,0.024389,0.113983,0.080747,0.119068,...,0.036299,0.027897,-0.008289,0.064012,-0.009032,0.045623,-0.035826,0.043670,0.034351,-0.076442
EXT_SOURCE_3,0.000184,-0.155892,-0.004323,-0.016272,0.036962,-0.039543,-0.030737,0.036640,0.026738,0.040440,...,0.008611,0.005191,-0.004200,0.011052,-0.003394,0.051630,-0.038304,-0.037367,0.094253,0.002249


* Before doing any kind of feature engineering, we should first decide which features to keep and which to drop, since feature engineering techniques will most likely create additional features.
* For that reason, we will keep exactly 10 columns with the biggest and smallest correlation with respect to the Target column.

In [8]:
# Reduce the feature set before engineering new columns.
# NOTE(review): the selection rule is defined outside this notebook —
# presumably it filters columns by their correlation with TARGET; confirm
# against the helper's source.
new_train = remove_target_correlated_cols(main_data)

In [9]:
#train_data = train_data.drop(columns = cols_to_remove)
#train_data.head(5)


* Apply feature engineering

In [10]:
# Run the project's log_transform helper; the frame displayed in the next cell
# carries additional *_log columns.
# new_train is rebound to the helper's result, so re-running this cell alone
# feeds already-transformed data back in — use Restart & Run All to reproduce.
new_train = log_transform(new_train)

In [11]:
# Inspect the frame returned by log_transform.
new_train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,DAYS_EMPLOYED_PERC,ANNUITY_INCOME_PERC,PAYMENT_RATE,AMT_CREDIT_log,AMT_ANNUITY_log,AMT_GOODS_PRICE_log,DAYS_BIRTH_log,DAYS_EMPLOYED_log,DAYS_REGISTRATION_log,DAYS_ID_PUBLISH_log
0,100002,1,0,0,0,406597.5,24700.5,351000.0,0.018801,9461,...,0.067329,0.121978,0.060749,12.915581,10.114619,12.768544,9.155039,6.458338,8.202208,7.659643
1,100003,0,0,0,0,1293502.5,35698.5,1129500.0,0.003541,16765,...,0.070862,0.132217,0.027598,14.072865,10.482892,13.937287,9.727108,7.080868,7.079184,5.676754
2,100004,0,1,1,0,135000.0,6750.0,135000.0,0.010032,19046,...,0.011814,0.100000,0.050000,11.813037,8.817446,11.813037,9.854665,5.420535,8.357259,7.836765
3,100006,0,0,0,0,312682.5,29686.5,297000.0,0.008019,19005,...,0.159905,0.219900,0.094941,12.652947,10.298481,12.601491,9.852510,8.019613,9.193601,7.798933
4,100007,0,0,0,0,513000.0,21865.5,513000.0,0.028663,19932,...,0.152418,0.179963,0.042623,13.148033,9.992711,13.148033,9.900132,8.019284,8.369157,8.148735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,254700.0,27558.0,225000.0,0.032561,9327,...,0.025303,0.174971,0.108198,12.447846,10.224084,12.323860,9.140776,5.468060,9.042750,7.592366
307507,456252,0,0,0,0,269550.0,12001.5,225000.0,0.025164,20775,...,0.079326,0.166687,0.044524,12.504513,9.392870,12.323860,9.941554,7.407924,8.386857,8.316545
307508,456253,0,0,0,0,677664.0,29979.0,585000.0,0.005002,14966,...,0.529266,0.195941,0.044239,13.426408,10.308286,13.279369,9.613603,8.977399,8.815518,8.546946
307509,456254,1,0,0,0,370107.0,20205.0,319500.0,0.005313,11961,...,0.400134,0.118158,0.054592,12.821550,9.913735,12.674516,9.389490,8.473659,7.848934,6.837333


In [12]:
# Run the project's normalization helper; the frame displayed in the next cell
# carries additional *_norm columns.
# new_train is rebound to the helper's result, so re-running this cell alone
# feeds already-normalized data back in — use Restart & Run All to reproduce.
new_train = normalization(new_train)

In [13]:
# Preview the first five rows after normalization (head() defaults to 5 rows).
new_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,AMT_GOODS_PRICE_norm,DAYS_BIRTH_norm,DAYS_EMPLOYED_norm,DAYS_REGISTRATION_norm,DAYS_ID_PUBLISH_norm,OWN_CAR_AGE_norm,HOUR_APPR_PROCESS_START_norm,AMT_CREDIT_log_norm,AMT_ANNUITY_log_norm,AMT_GOODS_PRICE_log_norm
0,100002,1,0,0,0,406597.5,24700.5,351000.0,0.018801,9461,...,0.077441,0.111161,0.035563,0.14786,0.294567,0.098901,0.434783,0.489166,0.537494,0.468924
1,100003,0,0,0,0,1293502.5,35698.5,1129500.0,0.003541,16765,...,0.271605,0.522886,0.066324,0.048071,0.040434,0.098901,0.478261,0.746352,0.610092,0.722714
2,100004,0,1,1,0,135000.0,6750.0,135000.0,0.010032,19046,...,0.023569,0.651466,0.012561,0.172665,0.351674,0.285714,0.391304,0.244144,0.281783,0.261437
3,100006,0,0,0,0,312682.5,29686.5,297000.0,0.008019,19005,...,0.063973,0.649154,0.169663,0.398549,0.338613,0.098901,0.73913,0.4308,0.573739,0.432648
4,100007,0,0,0,0,513000.0,21865.5,513000.0,0.028663,19932,...,0.117845,0.701409,0.169607,0.174732,0.480478,0.098901,0.478261,0.540824,0.513463,0.551329


In [14]:
# Recompute the correlation matrix on the engineered frame, with rows ordered
# by descending correlation with TARGET.
corrs = new_train.corr().sort_values(by='TARGET', ascending=False)

# Display only the correlations with TARGET as a one-column frame.
corrs['TARGET'].to_frame()

Unnamed: 0,TARGET
TARGET,1.000000
REGION_RATING_CLIENT_W_CITY,0.060893
REGION_RATING_CLIENT,0.058899
NAME_INCOME_TYPE_Working,0.057481
DAYS_LAST_PHONE_CHANGE,0.055218
...,...
DAYS_BIRTH,-0.078239
DAYS_BIRTH_log,-0.078504
EXT_SOURCE_1,-0.098887
EXT_SOURCE_3,-0.155892


In [15]:
#train_numeric.head(10)

In [16]:
#corrs = train_numeric.corr()
#corrs = corrs.sort_values('TARGET', ascending = False)

#pd.DataFrame(corrs['TARGET'].head(25))

In [17]:
# Write the engineered frame to disk. pandas' default index=True also writes
# the row index as an unnamed first column of the CSV.
new_train.to_csv('./featureData.csv')

In [18]:
# Final look at the frame that was written to './featureData.csv'.
new_train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,AMT_GOODS_PRICE_norm,DAYS_BIRTH_norm,DAYS_EMPLOYED_norm,DAYS_REGISTRATION_norm,DAYS_ID_PUBLISH_norm,OWN_CAR_AGE_norm,HOUR_APPR_PROCESS_START_norm,AMT_CREDIT_log_norm,AMT_ANNUITY_log_norm,AMT_GOODS_PRICE_log_norm
0,100002,1,0,0,0,406597.5,24700.5,351000.0,0.018801,9461,...,0.077441,0.111161,0.035563,0.147860,0.294567,0.098901,0.434783,0.489166,0.537494,0.468924
1,100003,0,0,0,0,1293502.5,35698.5,1129500.0,0.003541,16765,...,0.271605,0.522886,0.066324,0.048071,0.040434,0.098901,0.478261,0.746352,0.610092,0.722714
2,100004,0,1,1,0,135000.0,6750.0,135000.0,0.010032,19046,...,0.023569,0.651466,0.012561,0.172665,0.351674,0.285714,0.391304,0.244144,0.281783,0.261437
3,100006,0,0,0,0,312682.5,29686.5,297000.0,0.008019,19005,...,0.063973,0.649154,0.169663,0.398549,0.338613,0.098901,0.739130,0.430800,0.573739,0.432648
4,100007,0,0,0,0,513000.0,21865.5,513000.0,0.028663,19932,...,0.117845,0.701409,0.169607,0.174732,0.480478,0.098901,0.478261,0.540824,0.513463,0.551329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,0,0,0,254700.0,27558.0,225000.0,0.032561,9327,...,0.046016,0.103608,0.013176,0.342737,0.275393,0.098901,0.652174,0.385219,0.559073,0.372361
307507,456252,0,0,0,0,269550.0,12001.5,225000.0,0.025164,20775,...,0.046016,0.748929,0.092005,0.177853,0.568292,0.098901,0.347826,0.397813,0.395216,0.372361
307508,456253,0,0,0,0,677664.0,29979.0,585000.0,0.005002,14966,...,0.135802,0.421477,0.442218,0.273063,0.715576,0.098901,0.391304,0.602688,0.575672,0.579849
307509,456254,1,0,0,0,370107.0,20205.0,319500.0,0.005313,11961,...,0.069585,0.252086,0.267195,0.103842,0.129359,0.098901,0.391304,0.468269,0.497894,0.448506
