In [11]:
# import all external packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sys
import os


In [12]:
# Put the notebook's working directory on the module search path so that the
# project-local packages imported in the next cell can be resolved.
# The membership check keeps the cell idempotent: re-running it does not
# append the same directory to sys.path again.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

In [13]:
# import all created packages
from supplementing.simple.by_mode import by_mode
from supplementing.simple.by_zero import by_zero
from supplementing.complex.by_regression import by_regression
from converting.to_binary_columns import to_binary_columns
from scaling.min_max import min_max
from scaling.standard import standard

In [14]:
# Read the original data set into `data`. The path is relative to the
# notebook's working directory. The file name suggests the CSV contains
# deliberately nullified values (TODO confirm); the next cell fills them in.
file_path = "../../data/nullified__credit_card_approval.csv"
data = pd.read_csv(file_path)

In [15]:
# Fill in the missing data, using a different helper per group of columns.
# The helpers' return values are not assigned and later cells keep reading
# `data`, so they presumably fill `data` in place — TODO confirm.

# Columns handled by by_mode.
mode_columns = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "JOB",
    "STATUS",
    "FLAG_MOBIL",
]

# Columns handled by by_zero.
zero_columns = ["FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL"]

# Columns handled by by_regression.
regression_columns = [
    "AMT_INCOME_TOTAL",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "BEGIN_MONTHS",
]

by_mode(data, mode_columns)
by_zero(data, zero_columns)
# Last expression of the cell: its return value is what the notebook displays.
by_regression(data, regression_columns)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,JOB,BEGIN_MONTHS,STATUS,TARGET
0,5065438,F,Y,N,2+ children,270000.0,Secondary / secondary special,Married,With parents,-13258.0,-2300.0,1.0,0.0,0.0,0.0,Managers,-6.0,C,0
1,5142753,F,N,N,No children,81000.0,Secondary / secondary special,Single / not married,House / apartment,-17876.0,-377.0,1.0,1.0,1.0,0.0,Private service staff,-4.0,0,0
2,5111146,M,Y,Y,No children,270000.0,Higher education,Married,House / apartment,-19579.0,-1028.0,1.0,0.0,1.0,0.0,Laborers,0.0,C,0
3,5010310,F,Y,Y,1 children,112500.0,Secondary / secondary special,Married,House / apartment,-15109.0,-1956.0,1.0,0.0,0.0,0.0,Laborers,-3.0,0,0
4,5010835,M,Y,Y,2+ children,139500.0,Secondary / secondary special,Married,House / apartment,-17281.0,-5578.0,1.0,1.0,0.0,0.0,Drivers,-29.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537662,5142999,M,Y,N,1 children,166500.0,Secondary / secondary special,Married,With parents,-12372.0,-5401.0,1.0,0.0,1.0,0.0,Core staff,-8.0,0,0
537663,5010773,F,N,Y,No children,135000.0,Higher education,Married,With parents,-14160.0,-4635.0,1.0,0.0,0.0,0.0,Sales staff,-8.0,0,0
537664,5105601,M,N,Y,No children,180000.0,Higher education,Married,House / apartment,-24204.0,-2462.0,1.0,0.0,0.0,0.0,Private service staff,-7.0,0,0
537665,5132833,M,Y,N,No children,220500.0,Secondary / secondary special,Married,House / apartment,-22647.0,-3847.0,1.0,0.0,1.0,0.0,Laborers,-1.0,0,0


In [16]:
# Reduce the number of records by keeping a sample of `data`, stratified on
# the TARGET column. train_test_split is used purely as a stratified sampler:
# its second return value (the "test" part) is the sample that is kept.
SAMPLE_FRACTION = 0.1  # share of the rows that is kept
SAMPLE_SEED = 42  # fixed seed so every run draws the same rows

# The larger "train" part is deliberately not bound to a name. Assigning it to
# `_` would keep that frame alive for the rest of the session and would also
# override IPython's `_` (last output) variable.
reduced_data = train_test_split(
    data,
    stratify=data["TARGET"],
    test_size=SAMPLE_FRACTION,
    random_state=SAMPLE_SEED,
)[1].reset_index(drop=True)  # renumber rows 0..n-1, discarding the old index
reduced_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,JOB,BEGIN_MONTHS,STATUS,TARGET
0,5046172,M,Y,N,No children,180000.0,Secondary / secondary special,Single / not married,House / apartment,-20142.0,-108.0,1.0,0.0,0.0,0.0,High skill tech staff,-24.0,0,0
1,5068809,M,Y,Y,1 children,112500.0,Higher education,Married,House / apartment,-11809.0,-536.0,1.0,0.0,0.0,1.0,Accountants,0.0,X,0
2,5067957,M,Y,Y,No children,360000.0,Higher education,Married,House / apartment,-22111.0,-3854.0,1.0,0.0,0.0,0.0,Managers,-40.0,X,0
3,5146339,F,N,Y,No children,202500.0,Secondary / secondary special,Married,House / apartment,-16173.0,-674.0,1.0,0.0,0.0,0.0,Sales staff,-2.0,C,0
4,5118164,M,Y,N,2+ children,225000.0,Secondary / secondary special,Married,House / apartment,-12801.0,-4357.0,1.0,0.0,0.0,0.0,Laborers,-27.0,X,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53762,5029349,M,Y,Y,No children,270000.0,Higher education,Married,House / apartment,-10942.0,-412.0,1.0,0.0,0.0,0.0,Drivers,-43.0,0,0
53763,5090719,F,N,N,No children,234000.0,Secondary / secondary special,Widow,House / apartment,-21932.0,-10232.0,1.0,1.0,0.0,0.0,Laborers,-35.0,0,0
53764,5023812,F,N,Y,1 children,202500.0,Secondary / secondary special,Separated,House / apartment,-13956.0,-5429.0,1.0,0.0,0.0,0.0,Laborers,-15.0,C,0
53765,5099871,F,Y,Y,2+ children,112500.0,Secondary / secondary special,Married,House / apartment,-9994.0,-644.0,1.0,1.0,0.0,0.0,Sales staff,-4.0,C,0


In [17]:
# Save the reduced sample as it stands after the imputation step, before the
# encoding and scaling done in the following cells. index=False leaves the
# row index out of the CSV. The path is relative to the working directory.
reduced_data.to_csv("../../data/reduced__credit_card_approval.csv", index=False)

In [18]:
# Processing of categorical data.

# Two-valued columns are encoded as 0/1. The result of replace() is assigned
# back to the column instead of calling replace(..., inplace=True) on
# `reduced_data[column]`: that form is chained assignment, which raises a
# FutureWarning on recent pandas and no longer updates the frame once
# Copy-on-Write is enabled.
binary_encodings = {
    "CODE_GENDER": {'F': 0, 'M': 1},
    "FLAG_OWN_CAR": {'N': 0, 'Y': 1},
    "FLAG_OWN_REALTY": {'N': 0, 'Y': 1},
}
for column, encoding in binary_encodings.items():
    reduced_data[column] = reduced_data[column].replace(encoding)

# Multi-valued columns are handed to to_binary_columns; element 0 of its
# return value is the transformed frame. Judging by the displayed output it
# yields one 0/1 column per category value (e.g. STATUS_0 ... STATUS_X).
multi_valued_columns = [
    "CNT_CHILDREN",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "JOB",
    "STATUS",
]
reduced_data = to_binary_columns(reduced_data, multi_valued_columns)[0]
reduced_data

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,...,JOB_Security staff,JOB_Waiters/barmen staff,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X
0,5046172,1,1,0,180000.0,-20142.0,-108.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1,5068809,1,1,1,112500.0,-11809.0,-536.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,5067957,1,1,1,360000.0,-22111.0,-3854.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,5146339,0,0,1,202500.0,-16173.0,-674.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,5118164,1,1,0,225000.0,-12801.0,-4357.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53762,5029349,1,1,1,270000.0,-10942.0,-412.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
53763,5090719,0,0,0,234000.0,-21932.0,-10232.0,1.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
53764,5023812,0,0,1,202500.0,-13956.0,-5429.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
53765,5099871,0,1,1,112500.0,-9994.0,-644.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [19]:
# Scale data: every numeric column listed here is passed through `standard`
# and the result replaces the original column.
# NOTE(review): BEGIN_MONTHS is scaled here although a later cell drops it.
numeric_columns = ["AMT_INCOME_TOTAL", "DAYS_BIRTH", "DAYS_EMPLOYED", "BEGIN_MONTHS"]

# Compute all scaled columns before touching the frame, so every call to
# `standard` sees the unscaled values.
scaled_columns = {name: standard(reduced_data[name]) for name in numeric_columns}

# assign to the dataframe
for name, values in scaled_columns.items():
    reduced_data[name] = values

# display
reduced_data[numeric_columns]

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,BEGIN_MONTHS
0,-0.167214,-1.501189,1.108864,-0.344697
1,-0.814531,0.935212,0.929780,1.369386
2,1.558966,-2.076885,-0.458543,-1.487419
3,0.048559,-0.340734,0.872037,1.226546
4,0.264331,0.645171,-0.669009,-0.558957
...,...,...,...,...
53762,0.695876,1.188705,0.981664,-1.701679
53763,0.350640,-2.024549,-3.127236,-1.130318
53764,0.048559,0.307472,-1.117557,0.298084
53765,-0.814531,1.465881,0.884590,1.083706


In [20]:
# Remove the columns that are not wanted in the final data set.
# One drop() call replaces five separate in-place drops: either all columns
# are removed or, if any name is missing, a KeyError is raised and the frame
# is left untouched instead of partly modified.
columns_to_drop = [
    "FLAG_WORK_PHONE",
    "FLAG_MOBIL",
    "ID",
    "BEGIN_MONTHS",
    "FLAG_PHONE",
]
reduced_data = reduced_data.drop(columns=columns_to_drop)

In [21]:
# Save the fully processed frame (imputed, encoded, scaled, with the unwanted
# columns removed) to the output CSV. index=False leaves the row index out of
# the file. The path is relative to the working directory.
reduced_data.to_csv("../../data/postproduction__credit_card_approval.csv", index=False)