# GR5293 - Proj1 - Group9
## Classification on the late payment of credit cards
#### Data cleaning pipeline 
* Drop rows with missing rate >= 30%
* After dropping rows, dropping columns with missing rate >= 10%
* Onehot Encoding
* Rule out outliers
* Implement imputations on missing data via KNNimputer
* Datatype conversion
* Drop columns with "DOCUMENTS"


#### Setup

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.impute import KNNImputer
import matplotlib
from matplotlib import pyplot as plt
import time
import os
import re
import gc
gc.enable()
print(os.getcwd())
mydir = os.getcwd() + "/"
%xmode plain
%matplotlib inline

/Users/kangshuoli/Documents/VScode_workspace/GR5293/EODS-Project1-Group9/doc
Exception reporting mode: Plain


#### Read in data

In [2]:
raw_df = pd.read_csv(
    '../data/application_data.csv', 
    header = 0
)
raw_df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### Split feature and labels

In [41]:
feature_df = raw_df.iloc[:,2:]
label_df = raw_df.loc[:,"TARGET"]
raw_df = raw_df.iloc[:,1:]

In [42]:
feature_df

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,...,0,0,0,0,,,,,,
4,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,Unaccompanied,...,0,0,0,0,,,,,,
307507,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,Unaccompanied,...,0,0,0,0,,,,,,
307508,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,Unaccompanied,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,Unaccompanied,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
label_df.astype(int)

0         1
1         0
2         0
3         0
4         0
         ..
307506    0
307507    0
307508    0
307509    1
307510    0
Name: TARGET, Length: 307511, dtype: int64

#### Drop rows with missing rate >= 30%

In [44]:
drop_seq = ["row", "col"]

def cal_missing_rate(df_, axis_):
    drop_seq = ["row", "col"]
    missing_rate_dict = {}
    if axis_ == 0:
        l = df_.shape[1]
        for i in range(df_.shape[axis_]):
            missing_rate_dict[i] = float(df_.iloc[i,:].isna().sum() / l)
    elif axis_ == 1:
        l = df_.shape[0]
        for name in df_.columns:
            missing_rate_dict[name] = float(df_.loc[:,name].isna().sum() / l)    
    return missing_rate_dict

missing_rate_dict_row = cal_missing_rate(df_ = raw_df, axis_ = 0)
# for key, value in missing_rate_dict_row.items():
#     print(key, value)

In [45]:
def drop_with_missing_rate(df_, axis_, threshold_, dict_):
    to_be_dropped = []
    for key, value in dict_.items():
        if value >= threshold_:
            to_be_dropped.append(key)
    new_df = df_.drop(
        to_be_dropped, 
        axis = axis_
    )
    return new_df

data_cleaned_df_row = drop_with_missing_rate(
    df_ = raw_df, 
    axis_ = 0, 
    threshold_ = 0.3, 
    dict_ = missing_rate_dict_row
)

In [46]:
data_cleaned_df_row.index = np.arange(data_cleaned_df_row.shape[0], dtype = int)

In [47]:
data_cleaned_df_row

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Cash loans,F,N,Y,0,67500.0,80865.0,5881.5,67500.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,Cash loans,M,Y,N,1,225000.0,918468.0,28966.5,697500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,Cash loans,F,N,Y,0,189000.0,773680.5,32778.0,679500.0,...,0,0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153789,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,0,,,,,,
153790,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,0,,,,,,
153791,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
153792,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### After dropping rows, dropping cols with missing rate >= 10%

In [48]:
missing_rate_dict_col = cal_missing_rate(df_ = data_cleaned_df_row, axis_ = 1)
data_cleaned_df = drop_with_missing_rate(
    df_ = data_cleaned_df_row, 
    axis_ = 1, 
    threshold_ = 0.1, 
    dict_ = missing_rate_dict_col
)

#### Drop columns with name "DOCUMENT"

In [49]:
import re
col_doc_list = []
pattern_doc = re.compile('.*DOCUMENT.*')
for name in data_cleaned_df.columns:
    if re.search(pattern_doc, name):
        col_doc_list.append(name)

data_cleaned_df.drop(
    col_doc_list, 
    axis = 1, 
    inplace = True
)

In [50]:
data_cleaned_df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,LIVINGAREA_MEDI,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0.0193,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0558,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0
2,0,Cash loans,F,N,Y,0,67500.0,80865.0,5881.5,67500.0,...,0.0792,block of flats,0.0612,,No,0.0,0.0,0.0,0.0,-2370.0
3,0,Cash loans,M,Y,N,1,225000.0,918468.0,28966.5,697500.0,...,0.1422,block of flats,0.1417,Panel,No,0.0,0.0,0.0,0.0,-4.0
4,0,Cash loans,F,N,Y,0,189000.0,773680.5,32778.0,679500.0,...,0.3842,block of flats,0.3811,Panel,No,0.0,0.0,0.0,0.0,-188.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153789,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0.2001,block of flats,0.2898,"Stone, brick",No,0.0,0.0,0.0,0.0,-273.0
153790,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,...,0.0261,block of flats,0.0214,"Stone, brick",No,0.0,0.0,0.0,0.0,0.0
153791,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0.9445,block of flats,0.7970,Panel,No,6.0,0.0,6.0,0.0,-1909.0
153792,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0.0062,block of flats,0.0086,"Stone, brick",No,0.0,0.0,0.0,0.0,-322.0


#### Correction on DAYS (days must be minus int)
Drop rows if DAYs are positive number

In [51]:
pattern_days = re.compile('.*DAYS.*')
col_days_list = []
for name in data_cleaned_df.columns:
    if re.search(pattern_days, name):
        col_days_list.append(name)
col_days_list

['DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE']

In [52]:
for name in col_days_list:
    print(f'Number of pos days in {name}: {data_cleaned_df.loc[data_cleaned_df[name] > 0, name].shape[0]}')

Number of pos days in DAYS_BIRTH: 0
Number of pos days in DAYS_EMPLOYED: 26959
Number of pos days in DAYS_REGISTRATION: 0
Number of pos days in DAYS_ID_PUBLISH: 0
Number of pos days in DAYS_LAST_PHONE_CHANGE: 0


In [53]:
index_to_be_dropped = []
for i in np.arange(data_cleaned_df.shape[0]):
    if data_cleaned_df.loc[i, "DAYS_EMPLOYED"] > 0:
        index_to_be_dropped.append(i)

data_cleaned_df.drop(
    index_to_be_dropped, 
    axis = 0, 
    inplace = True
)

In [54]:
data_cleaned_df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,LIVINGAREA_MEDI,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0.0193,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0558,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0
2,0,Cash loans,F,N,Y,0,67500.0,80865.0,5881.5,67500.0,...,0.0792,block of flats,0.0612,,No,0.0,0.0,0.0,0.0,-2370.0
3,0,Cash loans,M,Y,N,1,225000.0,918468.0,28966.5,697500.0,...,0.1422,block of flats,0.1417,Panel,No,0.0,0.0,0.0,0.0,-4.0
4,0,Cash loans,F,N,Y,0,189000.0,773680.5,32778.0,679500.0,...,0.3842,block of flats,0.3811,Panel,No,0.0,0.0,0.0,0.0,-188.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153787,0,Cash loans,F,N,Y,3,81000.0,269550.0,11871.0,225000.0,...,0.0192,block of flats,0.0149,"Stone, brick",No,2.0,1.0,2.0,1.0,-448.0
153789,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,...,0.2001,block of flats,0.2898,"Stone, brick",No,0.0,0.0,0.0,0.0,-273.0
153791,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,...,0.9445,block of flats,0.7970,Panel,No,6.0,0.0,6.0,0.0,-1909.0
153792,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,...,0.0062,block of flats,0.0086,"Stone, brick",No,0.0,0.0,0.0,0.0,-322.0


#### Onehot encoding

In [55]:
data_df_onehot = pd.get_dummies(
    data = data_cleaned_df, 
    drop_first = True
)

In [56]:
data_df_onehot

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_University,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,67500.0,80865.0,5881.5,67500.0,0.031329,-13439,-2717,-311.0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,225000.0,918468.0,28966.5,697500.0,0.016612,-14086,-3028,-643.0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,189000.0,773680.5,32778.0,679500.0,0.010006,-14583,-203,-615.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153787,0,3,81000.0,269550.0,11871.0,225000.0,0.009175,-12961,-1046,-1398.0,...,0,0,0,0,0,0,0,1,0,0
153789,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327,-236,-8456.0,...,0,0,0,0,0,0,0,1,0,0
153791,0,0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966,-7921,-6737.0,...,0,0,0,0,0,0,1,0,0,0
153792,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961,-4786,-2562.0,...,0,0,0,0,0,0,0,1,0,0


In [57]:
for name in data_df_onehot:
    print(f'{name}: {len(data_df_onehot[name].unique())}')

TARGET: 2
CNT_CHILDREN: 13
AMT_INCOME_TOTAL: 1439
AMT_CREDIT: 4601
AMT_ANNUITY: 11698
AMT_GOODS_PRICE: 701
REGION_POPULATION_RELATIVE: 79
DAYS_BIRTH: 16013
DAYS_EMPLOYED: 11189
DAYS_REGISTRATION: 13736
DAYS_ID_PUBLISH: 6023
FLAG_MOBIL: 2
FLAG_EMP_PHONE: 2
FLAG_WORK_PHONE: 2
FLAG_CONT_MOBILE: 2
FLAG_PHONE: 2
FLAG_EMAIL: 2
CNT_FAM_MEMBERS: 16
REGION_RATING_CLIENT: 3
REGION_RATING_CLIENT_W_CITY: 3
HOUR_APPR_PROCESS_START: 24
REG_REGION_NOT_LIVE_REGION: 2
REG_REGION_NOT_WORK_REGION: 2
LIVE_REGION_NOT_WORK_REGION: 2
REG_CITY_NOT_LIVE_CITY: 2
REG_CITY_NOT_WORK_CITY: 2
LIVE_CITY_NOT_WORK_CITY: 2
EXT_SOURCE_2: 75014
APARTMENTS_AVG: 2275
YEARS_BEGINEXPLUATATION_AVG: 276
ELEVATORS_AVG: 248
ENTRANCES_AVG: 277
FLOORSMAX_AVG: 387
LIVINGAREA_AVG: 5109
APARTMENTS_MODE: 751
YEARS_BEGINEXPLUATATION_MODE: 216
ELEVATORS_MODE: 27
ENTRANCES_MODE: 31
FLOORSMAX_MODE: 26
LIVINGAREA_MODE: 5201
APARTMENTS_MEDI: 1132
YEARS_BEGINEXPLUATATION_MEDI: 239
ELEVATORS_MEDI: 47
ENTRANCES_MEDI: 47
FLOORSMAX_MEDI: 50
LIVIN

#### Imputation

In [58]:
for name in data_df_onehot.columns:
    print(f'{name}: {data_df_onehot[name].isna().sum() / data_df_onehot.shape[0]:0.4f}')

TARGET: 0.0000
CNT_CHILDREN: 0.0000
AMT_INCOME_TOTAL: 0.0000
AMT_CREDIT: 0.0000
AMT_ANNUITY: 0.0000
AMT_GOODS_PRICE: 0.0009
REGION_POPULATION_RELATIVE: 0.0000
DAYS_BIRTH: 0.0000
DAYS_EMPLOYED: 0.0000
DAYS_REGISTRATION: 0.0000
DAYS_ID_PUBLISH: 0.0000
FLAG_MOBIL: 0.0000
FLAG_EMP_PHONE: 0.0000
FLAG_WORK_PHONE: 0.0000
FLAG_CONT_MOBILE: 0.0000
FLAG_PHONE: 0.0000
FLAG_EMAIL: 0.0000
CNT_FAM_MEMBERS: 0.0000
REGION_RATING_CLIENT: 0.0000
REGION_RATING_CLIENT_W_CITY: 0.0000
HOUR_APPR_PROCESS_START: 0.0000
REG_REGION_NOT_LIVE_REGION: 0.0000
REG_REGION_NOT_WORK_REGION: 0.0000
LIVE_REGION_NOT_WORK_REGION: 0.0000
REG_CITY_NOT_LIVE_CITY: 0.0000
REG_CITY_NOT_WORK_CITY: 0.0000
LIVE_CITY_NOT_WORK_CITY: 0.0000
EXT_SOURCE_2: 0.0018
APARTMENTS_AVG: 0.0233
YEARS_BEGINEXPLUATATION_AVG: 0.0045
ELEVATORS_AVG: 0.0692
ENTRANCES_AVG: 0.0126
FLOORSMAX_AVG: 0.0038
LIVINGAREA_AVG: 0.0261
APARTMENTS_MODE: 0.0233
YEARS_BEGINEXPLUATATION_MODE: 0.0045
ELEVATORS_MODE: 0.0692
ENTRANCES_MODE: 0.0126
FLOORSMAX_MODE: 0.0038
L

In [59]:
imputer = KNNImputer(
    n_neighbors = 10, 
    weights = "distance"
)
data_df_imputed = imputer.fit_transform(data_df_onehot)

In [60]:
data_df_imputed = pd.DataFrame(data = data_df_imputed, columns = data_df_onehot.columns)
data_df_imputed.isna().sum().sum()

0

In [62]:
data_df_imputed

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_University,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,67500.0,80865.0,5881.5,67500.0,0.031329,-13439.0,-2717.0,-311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,225000.0,918468.0,28966.5,697500.0,0.016612,-14086.0,-3028.0,-643.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,189000.0,773680.5,32778.0,679500.0,0.010006,-14583.0,-203.0,-615.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126830,0.0,3.0,81000.0,269550.0,11871.0,225000.0,0.009175,-12961.0,-1046.0,-1398.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
126831,0.0,0.0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327.0,-236.0,-8456.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
126832,0.0,0.0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966.0,-7921.0,-6737.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
126833,1.0,0.0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961.0,-4786.0,-2562.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Save data as fraud_cleaned_ver2.csv

In [63]:
data_df_imputed.to_csv('../data/fraud_cleaned_ver2.csv')

#### Screen outlier

In [64]:
categorical_col_list = []
numerical_col_list = []
for name in data_df_imputed.columns:
    if len(data_df_imputed[name].unique()) >= 10:
        numerical_col_list.append(name)
numerical_col_list

['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'CNT_FAM_MEMBERS',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_2',
 'APARTMENTS_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'LIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'LIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'LIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE']

In [71]:
data_df_screened = data_df_imputed
index_with_outlier = []
for name in numerical_col_list:
    Q1 = np.percentile(data_df_screened[name], 25)
    Q3 = np.percentile(data_df_screened[name], 75)
    IQR = Q3 - Q1
    outlier_step = 1.5 * IQR
    for i in np.arange(data_df_screened.shape[0]):
        if (data_df_screened.loc[i, name] < Q1 - outlier_step) | (data_df_screened.loc[i, name] > Q3 + outlier_step):
            index_with_outlier.append(i)

# remove duplicated index
index_with_outlier = list(set(index_with_outlier))
print(f'Totoal number of rows with outliers: {len(index_with_outlier)}')

Totoal number of rows with outliers: 54861


In [73]:
data_df_screened

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,ORGANIZATION_TYPE_University,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461.0,-637.0,-3648.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765.0,-1188.0,-1186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,67500.0,80865.0,5881.5,67500.0,0.031329,-13439.0,-2717.0,-311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,225000.0,918468.0,28966.5,697500.0,0.016612,-14086.0,-3028.0,-643.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,189000.0,773680.5,32778.0,679500.0,0.010006,-14583.0,-203.0,-615.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126830,0.0,3.0,81000.0,269550.0,11871.0,225000.0,0.009175,-12961.0,-1046.0,-1398.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
126831,0.0,0.0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327.0,-236.0,-8456.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
126832,0.0,0.0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966.0,-7921.0,-6737.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
126833,1.0,0.0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961.0,-4786.0,-2562.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Save data_df_screened as fraud_cleaned_ver3.csv

In [74]:
data_df_screened.to_csv('../data/fraud_cleaned_ver3.csv')