# **Libraries**

In [17]:
import importlib
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import AddMissingIndicator
from feature_engine.imputation import RandomSampleImputer

import functions

importlib.reload(functions)

# **Display**

In [4]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Pandas display limits. `display.max_rows` is set exactly once: the earlier
# value of 300000 was overridden to 200 a few lines later, so only 200 was
# ever in effect.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 500)

# Silence every warning for the rest of the session. This blanket filter
# already covers FutureWarning, so no category-specific filter is needed.
warnings.filterwarnings("ignore")

# NOTE(review): `size` is not used in the cells visible here - presumably a
# plot/font size consumed further down; confirm before removing.
size = 20

# **Variables**

In [5]:
# Notebook-wide settings shared by the cells below.
target = "TARGET"    # column indexed as the label in the correlation and WoE cells
random_state = 101   # NOTE(review): presumably the seed for stochastic steps - confirm usage

# **Data**

## **Load Data**

In [48]:
# Folder that holds the raw CSV files. It defaults to the original local
# folder, so existing runs behave the same; set the RISK_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("RISK_DATA_DIR", r"C:\Users\Dell\Documents\AI\Risk\Data"))

# index_col=False tells pandas not to use the first CSV column as the index.
application_train = pd.read_csv(DATA_DIR / "application_train.csv", index_col=False)
application_test = pd.read_csv(DATA_DIR / "application_test.csv", index_col=False)

## **Combine Train and Test Data**

In [49]:
# Stack the train rows on top of the test rows and renumber the index from 0.
data = pd.concat(
    [application_train, application_test],
    axis=0,
    ignore_index=True,
)

# **Data Cleaning**

## **Removing Single-Valued Features**

In [50]:
# Ask the project helper which columns of the test set it flags; per its name
# and the message printed below, these are columns with one unique value.
list_columns = functions.check_columns_with_one_uniquevalue(application_test)

# Show the flagged column names first, then how many there are.
print(
    list_columns,
    f"There are {len(list_columns)} columns with just 1 unique value",
    sep="\n",
)

['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
There are 11 columns with just 1 unique value


In [43]:
# Remove the columns flagged in `list_columns` from every frame. The train
# frame is included so that train, test and the combined data keep the same
# feature set; previously only `data` and `application_test` were reduced,
# while all later cells work on `application_train`.
# errors="ignore" keeps the cell re-runnable: columns that are already gone
# are skipped instead of raising KeyError.
data = data.drop(columns=list_columns, errors="ignore")
application_train = application_train.drop(columns=list_columns, errors="ignore")
application_test = application_test.drop(columns=list_columns, errors="ignore")

## **Reduce Memory Usage**

In [44]:
# Replace the train frame with the result of the project helper. Judging by the
# recorded output, the helper reports memory usage before and after;
# presumably it downcasts column dtypes - confirm in functions.py.
# NOTE(review): the cell overwrites its own input, so re-running it applies the
# helper to an already-processed frame.
application_train = functions.reduce_memory_usage(application_train)

Memory usage of dataframe is 260.42 MB
Memory usage after optimization is: 89.15 MB
Decreased by 65.8%


In [27]:
# Same project helper as for the train frame, applied to the test frame; the
# recorded output reports memory usage before and after.
# NOTE(review): presumably a dtype downcast - confirm in functions.py.
application_test = functions.reduce_memory_usage(application_test)

Memory usage of dataframe is 40.91 MB
Memory usage after optimization is: 14.09 MB
Decreased by 65.6%


### **Correlation between each Feature and the Target Feature**

In [45]:
# Pairwise correlations between the numeric columns of the train frame, then
# the target's column of that matrix, listed from highest to lowest value.
corr_matrix = application_train.corr(numeric_only=True)
corr_matrix.loc[:, target].sort_values(ascending=False)

TARGET                          1.000000
DAYS_BIRTH                      0.078239
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
DAYS_LAST_PHONE_CHANGE          0.055219
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
DAYS_REGISTRATION               0.041975
OWN_CAR_AGE                     0.037612
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032248
DEF_60_CNT_SOCIAL_CIRCLE        0.031276
FLAG_WORK_PHONE                 0.028524
AMT_REQ_CREDIT_BUREAU_YEAR      0.019930
CNT_CHILDREN                    0.019187
CNT_FAM_MEMBERS                 0.009308
OBS_30_CNT_SOCIAL_CIRCLE        0.009131
OBS_60_CNT_SOCIAL_CIRCLE        0.009022
REG_REGION_NOT_WORK_REGION      0.006942
REG_REGION_NOT_LIVE_REGION      0.005576
LIVE_REGION_NOT_WORK_REGION     0.002819
AMT_REQ_CREDIT_B

# **Missing Values**

In [39]:
# Display the project helper's missing-value summary for the train frame; the
# recorded output has NumberMissing and PercentageMissing columns.
# NOTE(review): the recorded table is empty, which suggests this cell was last
# executed after the imputation step below - re-run the notebook top to bottom.
functions.MissingValues(application_train)

Unnamed: 0,NumberMissing,PercentageMissing


### **Random Sample Imputer**

In [46]:
# Impute missing values with feature_engine's RandomSampleImputer. The imputer
# draws random samples, so it is seeded with the notebook-wide `random_state`;
# unseeded, the imputed values (and the correlations computed from them)
# changed from one run to the next.
rsi = RandomSampleImputer(random_state=random_state)
rsi.fit(application_train)
application_train = rsi.transform(application_train)

### **Correlation between each Feature and the Target Feature**

In [47]:
# Recompute the numeric correlation matrix on the imputed train frame and list
# the target's column from highest to lowest value.
corr_matrix = application_train.corr(numeric_only=True)
corr_matrix.loc[:, target].sort_values(ascending=False)

TARGET                          1.000000
DAYS_BIRTH                      0.078239
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
DAYS_LAST_PHONE_CHANGE          0.055218
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
DAYS_REGISTRATION               0.041975
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032301
DEF_60_CNT_SOCIAL_CIRCLE        0.031192
FLAG_WORK_PHONE                 0.028524
CNT_CHILDREN                    0.019187
AMT_REQ_CREDIT_BUREAU_YEAR      0.017877
OWN_CAR_AGE                     0.011295
CNT_FAM_MEMBERS                 0.009307
OBS_30_CNT_SOCIAL_CIRCLE        0.009016
OBS_60_CNT_SOCIAL_CIRCLE        0.009013
REG_REGION_NOT_WORK_REGION      0.006942
REG_REGION_NOT_LIVE_REGION      0.005576
NONLIVINGAPARTMENTS_MODE        0.003975
LIVE_REGION_NOT_

## **WoE Encoder**

In [38]:
# Fit the WoE encoder on the train frame, using the target column as the label,
# then replace the train frame with its encoded version.
# NOTE(review): fill_value=0.0001 is a magic number - presumably the stand-in
# for zero counts in the WoE ratio; confirm against the feature_engine docs.
woe = WoEEncoder(fill_value=0.0001)
woe.fit(X=application_train, y=application_train[target])
application_train = woe.transform(X=application_train)

### **Correlation between each Feature and the Target Feature**

In [40]:
# Recompute the numeric correlation matrix on the WoE-encoded train frame and
# list the target's column from highest to lowest value.
corr_matrix = application_train.corr(numeric_only=True)
corr_matrix.loc[:, target].sort_values(ascending=False)

TARGET                          1.000000
DAYS_BIRTH                      0.078239
ORGANIZATION_TYPE               0.071869
NAME_INCOME_TYPE                0.063318
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
OCCUPATION_TYPE                 0.058193
NAME_EDUCATION_TYPE             0.057441
DAYS_LAST_PHONE_CHANGE          0.055220
CODE_GENDER                     0.054633
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
DAYS_REGISTRATION               0.041975
NAME_FAMILY_STATUS              0.040291
NAME_HOUSING_TYPE               0.036947
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032141
DEF_60_CNT_SOCIAL_CIRCLE        0.031193
NAME_CONTRACT_TYPE              0.030896
FLAG_WORK_PHONE                 0.028524
FLAG_OWN_CAR                    0.021851
CNT_CHILDREN    