# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from feature_engine.imputation import AddMissingIndicator
from feature_engine.imputation import RandomSampleImputer
from feature_engine.encoding import WoEEncoder

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Variables**

In [3]:
random_state = 101
target = 'TARGET'

# **Data**

## **Load Data**

In [4]:
application_train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)


application_test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
    index_col=False
)

# **Combine Train and Test Data**

In [5]:
data = pd.concat([application_train, application_test], axis=0).reset_index (drop=True)

# **Data Cleaning**

## **Removing Empty Features**

In [6]:
list_columns = functions.check_columns_with_one_uniquevalue(application_test)
    
print(list_columns)
print(f"There are {len(list_columns)} columns with just 1 unique value")

['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
There are 11 columns with just 1 unique value


In [7]:
data = data.drop(list_columns, axis = 1)

## **Reduce Memory Usage**

In [9]:
data = functions.reduce_memory_usage(data)

Memory usage of dataframe is 301.70 MB
Memory usage after optimization is: 103.62 MB
Decreased by 65.7%


### **Correlation between each Feature and the Target Feature**

In [10]:
corr_matrix = data.corr(numeric_only=True)
corr_matrix[target].sort_values(ascending=False)

TARGET                          1.000000
DAYS_BIRTH                      0.078239
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
DAYS_LAST_PHONE_CHANGE          0.055219
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
DAYS_REGISTRATION               0.041975
OWN_CAR_AGE                     0.037612
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032248
DEF_60_CNT_SOCIAL_CIRCLE        0.031276
FLAG_WORK_PHONE                 0.028524
AMT_REQ_CREDIT_BUREAU_YEAR      0.019930
CNT_CHILDREN                    0.019187
CNT_FAM_MEMBERS                 0.009308
OBS_30_CNT_SOCIAL_CIRCLE        0.009131
OBS_60_CNT_SOCIAL_CIRCLE        0.009022
REG_REGION_NOT_WORK_REGION      0.006942
REG_REGION_NOT_LIVE_REGION      0.005576
LIVE_REGION_NOT_WORK_REGION     0.002819
AMT_REQ_CREDIT_B

# **Missing Values**

In [None]:
functions.MissingValues(data)

## **External Source Features**

In [None]:
selected_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

ext_source = data[selected_columns]
data.drop(selected_columns, axis=1, inplace=True)

### **Random Sample Imputer**

In [18]:
rsi = RandomSampleImputer()
rsi.fit(data)
data = rsi.transform(data)

### **Correlation between each Feature and the Target Feature**

In [19]:
corr_matrix = data.corr(numeric_only=True)
corr_matrix[target].sort_values(ascending=False)

TARGET                          1.000000
DAYS_BIRTH                      0.067364
REGION_RATING_CLIENT_W_CITY     0.051895
REGION_RATING_CLIENT            0.050461
DAYS_LAST_PHONE_CHANGE          0.047765
DAYS_ID_PUBLISH                 0.045284
REG_CITY_NOT_WORK_CITY          0.043265
FLAG_DOCUMENT_3                 0.039890
FLAG_EMP_PHONE                  0.039294
REG_CITY_NOT_LIVE_CITY          0.038268
DAYS_REGISTRATION               0.035678
LIVE_CITY_NOT_WORK_CITY         0.027048
DEF_30_CNT_SOCIAL_CIRCLE        0.026397
DEF_60_CNT_SOCIAL_CIRCLE        0.025728
FLAG_WORK_PHONE                 0.024067
CNT_CHILDREN                    0.016634
AMT_REQ_CREDIT_BUREAU_YEAR      0.014347
OWN_CAR_AGE                     0.011270
CNT_FAM_MEMBERS                 0.007942
OBS_30_CNT_SOCIAL_CIRCLE        0.007108
OBS_60_CNT_SOCIAL_CIRCLE        0.007025
REG_REGION_NOT_WORK_REGION      0.006659
REG_REGION_NOT_LIVE_REGION      0.006000
LIVE_REGION_NOT_WORK_REGION     0.003075
AMT_REQ_CREDIT_B

## **WoE Encoder**

In [None]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(application_train, application_train[target])
application_train = woe.transform(application_train)