# Feature Engineering

* Imports

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

* Read data

In [2]:
train_data = pd.read_csv('./train.csv')
train_data = train_data.drop(columns = ['Unnamed: 0'])
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,254343,0,0,0,1,0,135000.0,900000.0,26316.0,900000.0,...,0,0,0,0,0,0,0,0,0,0
1,243348,0,0,0,1,0,135000.0,269550.0,14751.0,225000.0,...,0,0,0,0,0,0,1,0,1,0
2,369760,0,0,1,0,0,112500.0,1078200.0,34780.5,900000.0,...,0,0,0,0,0,0,0,0,0,0
3,218906,0,0,0,1,2,90000.0,650758.5,34668.0,603000.0,...,0,0,0,0,0,0,0,0,0,0
4,284201,0,0,1,1,1,112500.0,1067940.0,31356.0,765000.0,...,0,0,0,0,0,0,0,0,0,0


* Read already prepared data

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246008 entries, 0 to 246007
Columns: 243 entries, SK_ID_CURR to EMERGENCYSTATE_MODE_Yes
dtypes: float64(66), int64(177)
memory usage: 456.1 MB


In [4]:
from Functions.DataPreperation import drop_missing_columns

drop_missing_columns(train_data, 70, print_info = True)

There are 0 with greater than 70 missing values
Incomplete columns: 
[]


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,254343,0,0,0,1,0,135000.0,900000.0,26316.0,900000.0,...,0,0,0,0,0,0,0,0,0,0
1,243348,0,0,0,1,0,135000.0,269550.0,14751.0,225000.0,...,0,0,0,0,0,0,1,0,1,0
2,369760,0,0,1,0,0,112500.0,1078200.0,34780.5,900000.0,...,0,0,0,0,0,0,0,0,0,0
3,218906,0,0,0,1,2,90000.0,650758.5,34668.0,603000.0,...,0,0,0,0,0,0,0,0,0,0
4,284201,0,0,1,1,1,112500.0,1067940.0,31356.0,765000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246003,234487,0,0,1,0,0,225000.0,1695483.0,46624.5,1327500.0,...,0,0,0,0,0,0,0,0,0,0
246004,278629,0,0,0,1,0,78750.0,225000.0,12694.5,225000.0,...,0,0,0,0,0,1,0,0,1,0
246005,357294,0,0,0,0,0,94500.0,679500.0,26572.5,679500.0,...,0,0,0,0,0,0,0,0,0,0
246006,324660,1,0,0,0,1,119700.0,180000.0,5652.0,180000.0,...,0,0,0,0,0,1,0,0,1,0


* Import necessary functions for feature engineering

In [5]:
from Functions.FeatureEngineering import *

* Test correlations before applying feature engineering

In [6]:
corrs = train_data.corr()
corrs = corrs.sort_values('TARGET', ascending = False)

pd.DataFrame(corrs['TARGET'].head(10))

Unnamed: 0,TARGET
TARGET,1.0
REGION_RATING_CLIENT_W_CITY,0.059051
NAME_INCOME_TYPE_Working,0.057876
REGION_RATING_CLIENT,0.057461
DAYS_LAST_PHONE_CHANGE,0.055419
CODE_GENDER_M,0.054371
REG_CITY_NOT_WORK_CITY,0.050848
NAME_EDUCATION_TYPE_Secondary / secondary special,0.050657
FLAG_EMP_PHONE,0.045991
FLAG_DOCUMENT_3,0.044821


* Apply feature engineering

In [7]:
train_numeric = group_numeric_values(train_data, 'main')

In [8]:
train_numeric.shape

(246008, 1448)

In [9]:
train_numeric.head(10)

Unnamed: 0,SK_ID_CURR,main_NAME_CONTRACT_TYPE_count,main_NAME_CONTRACT_TYPE_min,main_NAME_CONTRACT_TYPE_max,main_NAME_CONTRACT_TYPE_mean,main_NAME_CONTRACT_TYPE_median,main_NAME_CONTRACT_TYPE_sum,main_FLAG_OWN_CAR_count,main_FLAG_OWN_CAR_min,main_FLAG_OWN_CAR_max,...,main_EMERGENCYSTATE_MODE_No_mean,main_EMERGENCYSTATE_MODE_No_median,main_EMERGENCYSTATE_MODE_No_sum,main_EMERGENCYSTATE_MODE_Yes_count,main_EMERGENCYSTATE_MODE_Yes_min,main_EMERGENCYSTATE_MODE_Yes_max,main_EMERGENCYSTATE_MODE_Yes_mean,main_EMERGENCYSTATE_MODE_Yes_median,main_EMERGENCYSTATE_MODE_Yes_sum,TARGET
0,100002,1,0,0,0,0,0,1,0,0,...,1,1,1,1,0,0,0,0,0,1
1,100003,1,0,0,0,0,0,1,0,0,...,1,1,1,1,0,0,0,0,0,0
2,100004,1,1,1,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,0,0
3,100006,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,100009,1,0,0,0,0,0,1,1,1,...,0,0,0,1,0,0,0,0,0,0
5,100010,1,0,0,0,0,0,1,1,1,...,0,0,0,1,0,0,0,0,0,0
6,100011,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
7,100012,1,1,1,1,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
8,100015,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
9,100018,1,0,0,0,0,0,1,0,0,...,1,1,1,1,0,0,0,0,0,0


In [None]:
corrs = train_numeric.corr()
corrs = corrs.sort_values('TARGET', ascending = False)

pd.DataFrame(corrs['TARGET'].head(25))