# Feature Engineering

* We start off with necessary imports:

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

* Read data

In [2]:
train_data = pd.read_csv('../train.csv')
train_data = train_data.drop(columns = ['Unnamed: 0'])
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,307359,0,0,0,1,0,180000.0,746896.5,31774.5,594000.0,...,0,0,0,0,0,1,0,0,1,0
1,120529,0,0,0,1,0,63000.0,814500.0,23944.5,814500.0,...,0,0,0,0,0,0,0,0,0,0
2,198439,1,0,0,1,0,225000.0,450000.0,30573.0,450000.0,...,0,0,0,0,0,0,0,0,0,0
3,304860,0,0,0,1,0,157500.0,1256400.0,44644.5,900000.0,...,0,0,0,0,0,1,0,0,1,0
4,102965,0,0,0,0,0,90000.0,454500.0,14661.0,454500.0,...,0,0,0,0,0,0,0,0,0,0


* Read already prepared data

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246008 entries, 0 to 246007
Columns: 243 entries, SK_ID_CURR to EMERGENCYSTATE_MODE_Yes
dtypes: float64(66), int64(177)
memory usage: 456.1 MB


In [4]:
from Functions.DataPreperation import drop_missing_columns

drop_missing_columns(train_data, 70, print_info = True)

There are 0 with greater than 70 missing values
Incomplete columns: 
[]


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,307359,0,0,0,1,0,180000.0,746896.5,31774.5,594000.0,...,0,0,0,0,0,1,0,0,1,0
1,120529,0,0,0,1,0,63000.0,814500.0,23944.5,814500.0,...,0,0,0,0,0,0,0,0,0,0
2,198439,1,0,0,1,0,225000.0,450000.0,30573.0,450000.0,...,0,0,0,0,0,0,0,0,0,0
3,304860,0,0,0,1,0,157500.0,1256400.0,44644.5,900000.0,...,0,0,0,0,0,1,0,0,1,0
4,102965,0,0,0,0,0,90000.0,454500.0,14661.0,454500.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246003,398683,0,0,0,1,2,112500.0,541323.0,29362.5,405000.0,...,0,0,0,0,0,0,1,0,1,0
246004,443860,0,1,0,1,0,107100.0,315000.0,15750.0,315000.0,...,0,0,0,0,0,0,0,0,0,0
246005,103075,0,0,0,1,0,166500.0,682875.0,27940.5,589500.0,...,0,0,0,0,0,0,1,0,1,0
246006,116718,0,0,0,1,0,81000.0,808650.0,26217.0,675000.0,...,0,0,0,0,0,0,1,0,1,0


* Import necessary functions for feature engineering

In [5]:
from Functions.FeatureEngineering import *

* Test correlations before applying feature engineering

In [6]:
corrs = train_data.corr()
corrs = corrs.sort_values('TARGET', ascending = False)

tmp_data = pd.DataFrame(corrs['TARGET'])

* To do any kind of feature engineering, we should first decide, what features to keep, and which to drop, since by using techniques of feeture engineering, we will most likely create additional features.
* For that reason, we will keep exactly 10 columns with the biggest and smallest correlation in regards to the Target column.

In [7]:
new_train = remove_target_correlated_cols(train_data)

In [8]:
#train_data = train_data.drop(columns = cols_to_remove)
#train_data.head(5)


* Apply feature engineering

In [9]:
new_train = log_transform(new_train)

In [10]:
new_train.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,...,NAME_EDUCATION_TYPE_Secondary / secondary special,OCCUPATION_TYPE_Laborers,ORGANIZATION_TYPE_XNA,HOUSETYPE_MODE_block of flats,EMERGENCYSTATE_MODE_No,AMT_GOODS_PRICE_log,DAYS_BIRTH_log,DAYS_EMPLOYED_log,DAYS_REGISTRATION_log,DAYS_ID_PUBLISH_log
0,307359,0,594000.0,16628,3785.0,5108.0,181,1,2,2,...,1,0,0,1,1,13.294636,9.718903,8.239065,8.538759,5.204007
1,120529,0,814500.0,21944,1648.0,1403.0,4501,0,3,3,...,1,0,1,0,0,13.610331,9.996295,7.407924,7.247081,8.412277
2,198439,1,450000.0,16831,806.0,438.0,386,1,3,3,...,1,1,0,0,0,13.017005,9.731037,6.693324,6.084499,5.958425
3,304860,0,900000.0,15851,189.0,1922.0,1922,1,3,1,...,1,1,0,1,1,13.710151,9.671051,5.247024,7.561642,7.561642
4,102965,0,454500.0,15978,501.0,6147.0,3907,1,2,2,...,1,0,0,0,0,13.026955,9.679031,6.2186,8.723882,8.270781


In [12]:
new_train = normalization(new_train)

Unnamed: 0,TARGET
TARGET,1.0
REGION_RATING_CLIENT_W_CITY,0.061792
REGION_RATING_CLIENT,0.060025
NAME_INCOME_TYPE_Working,0.057901
DAYS_LAST_PHONE_CHANGE,0.054899
CODE_GENDER_M,0.05477
REG_CITY_NOT_WORK_CITY,0.052113
NAME_EDUCATION_TYPE_Secondary / secondary special,0.051396
FLAG_EMP_PHONE,0.046135
REG_CITY_NOT_LIVE_CITY,0.044493


In [None]:
#train_numeric.head(10)

In [None]:
#corrs = train_numeric.corr()
#corrs = corrs.sort_values('TARGET', ascending = False)

#pd.DataFrame(corrs['TARGET'].head(25))