# Configuration

| Supplementing   | Converting | Scaling  |
|-----------------|------------|----------|
| random from distribution / zero | binary     | min-max  |


In [10]:
# Make the parent directory importable. This extends the module search path
# (it does not change the working directory), presumably so the project's
# internal packages imported further down can be resolved.
import os
import sys

module_path = os.path.abspath(os.pardir)
# guard against adding a duplicate entry when the cell is re-run
if module_path not in sys.path:
    sys.path.append(module_path)

In [11]:
# import all necessary packages
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# import all internal packages
from preprocessing.supplementing.complex.by_random_from_distribution import by_random_from_distribution
from preprocessing.supplementing.simple.by_zero import by_zero
from preprocessing.converting.to_binary_columns import to_binary_columns
from preprocessing.scaling.min_max import min_max
from cross_validation import stratified_cross_validation, validate

In [12]:
# Load the raw credit-card-approval dataset; the path is relative to the
# notebook's working directory.
db_path = "../../db/credit-card-approval/credit_card_approval.csv"
df = pd.read_csv(filepath_or_buffer=db_path)

In [13]:
# Fill in the missing data.
# Categorical / flag columns go through by_random_from_distribution, the
# numeric columns through by_zero.
# NOTE(review): the random fill has no visible seed, so two runs may impute
# different values -- confirm whether the helper can be seeded.
randomly_filled_columns = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "CNT_CHILDREN",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "JOB",
    "STATUS",
    "FLAG_MOBIL",
    "FLAG_WORK_PHONE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
]
zero_filled_columns = ["AMT_INCOME_TOTAL", "DAYS_BIRTH", "DAYS_EMPLOYED", "BEGIN_MONTHS"]

by_random_from_distribution(df, randomly_filled_columns)
# kept as the last expression so the cell displays whatever by_zero returns
by_zero(df, zero_filled_columns)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,JOB,BEGIN_MONTHS,STATUS,TARGET
0,5046172,M,Y,N,No children,180000.0,Secondary / secondary special,Single / not married,House / apartment,-20142.0,-108.0,1.0,0.0,0.0,0.0,High skill tech staff,-24.0,0,0
1,5068809,M,Y,Y,1 children,112500.0,Higher education,Married,House / apartment,-11809.0,-536.0,1.0,0.0,0.0,1.0,Accountants,0.0,X,0
2,5067957,M,Y,Y,No children,360000.0,Higher education,Married,House / apartment,-22111.0,-3854.0,1.0,0.0,0.0,0.0,Managers,-40.0,X,0
3,5146339,F,N,Y,No children,202500.0,Secondary / secondary special,Married,House / apartment,-16173.0,-674.0,1.0,0.0,0.0,0.0,Sales staff,-2.0,C,0
4,5118164,M,Y,N,2+ children,225000.0,Secondary / secondary special,Married,House / apartment,-12801.0,-4357.0,1.0,0.0,0.0,0.0,Laborers,-27.0,X,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53762,5029349,M,Y,Y,No children,270000.0,Higher education,Married,House / apartment,-10942.0,-412.0,1.0,0.0,0.0,0.0,Drivers,-43.0,0,0
53763,5090719,F,N,N,No children,234000.0,Secondary / secondary special,Widow,House / apartment,-21932.0,-10232.0,1.0,1.0,0.0,0.0,Laborers,-35.0,0,0
53764,5023812,F,N,Y,1 children,202500.0,Secondary / secondary special,Separated,House / apartment,-13956.0,-5429.0,1.0,0.0,0.0,0.0,Laborers,-15.0,C,0
53765,5099871,F,Y,Y,2+ children,112500.0,Secondary / secondary special,Married,House / apartment,-9994.0,-644.0,1.0,1.0,0.0,0.0,Sales staff,-4.0,C,0


In [14]:
# Processing of categorical data.
# Two-valued columns are encoded as 0/1. The result is assigned back to the
# column rather than calling `.replace(..., inplace=True)` on `df[col]`:
# that chained in-place form emits a FutureWarning in pandas 2.x and no
# longer updates `df` once Copy-on-Write is active (default from pandas 3.0).
df["CODE_GENDER"] = df["CODE_GENDER"].replace({ 'F': 0, 'M': 1 })
df["FLAG_OWN_CAR"] = df["FLAG_OWN_CAR"].replace({ 'N': 0, 'Y': 1 })
df["FLAG_OWN_REALTY"] = df["FLAG_OWN_REALTY"].replace({ 'N': 0, 'Y': 1 })

# Multi-valued categorical columns are handed to to_binary_columns; only the
# first element of its return value is kept as the new frame.
# NOTE(review): presumably the remaining elements describe the generated
# columns -- confirm against the helper.
df = to_binary_columns(df, ["CNT_CHILDREN", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "JOB", "STATUS"])[0]

In [15]:
# Rescale every numeric feature with min_max and write it back to the frame.
# (-1, 1) is passed as the second argument -- presumably the target range,
# which matches the values shown in the cell output.
scaled_columns = ["AMT_INCOME_TOTAL", "DAYS_BIRTH", "DAYS_EMPLOYED", "BEGIN_MONTHS"]
for numeric_column in scaled_columns:
    df[numeric_column] = min_max(df[numeric_column], (-1, 1))

# display
df[scaled_columns]

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,BEGIN_MONTHS
0,-0.771429,-0.636829,0.986253,0.200000
1,-0.857143,0.040348,0.931776,1.000000
2,-0.542857,-0.796839,0.509451,-0.333333
3,-0.742857,-0.314290,0.914211,0.933333
4,-0.714286,-0.040267,0.445427,0.100000
...,...,...,...,...
53762,-0.657143,0.110804,0.947559,-0.433333
53763,-0.702857,-0.782292,-0.302361,-0.166667
53764,-0.742857,-0.134127,0.308980,0.500000
53765,-0.857143,0.187843,0.918030,0.866667


In [16]:
# Remove the columns that are not used as model features; each one is
# dropped in place, in the same order as before.
for unused_column in ('FLAG_WORK_PHONE', 'FLAG_MOBIL', 'ID', 'BEGIN_MONTHS', 'FLAG_PHONE'):
    df.drop(columns=unused_column, inplace=True)

In [17]:
# Save model data
prepared_db_path = "../../db/models-data/delta.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing
os.makedirs(os.path.dirname(prepared_db_path), exist_ok=True)
# index=False keeps the row index out of the written file
df.to_csv(prepared_db_path, index=False)

# Training

In [18]:
# Prepare the base classifiers.
# A fixed seed makes the estimators reproducible across Restart & Run All;
# without it the fitted tree can differ between runs.
SEED = 42
xgb = XGBClassifier(eval_metric='logloss', random_state=SEED)
dtc = DecisionTreeClassifier(random_state=SEED)

# Prepare cross-validation data: 10 folds built on the "TARGET" column.
# NOTE(review): stratified_cross_validation is a project helper; whether it
# shuffles or accepts a seed cannot be seen from here -- confirm.
train_test_splits = stratified_cross_validation(df, "TARGET", n_folds=10)

In [19]:
# Hard-voting ensemble built from the XGBoost and decision-tree estimators.
voting_members = [('xgb', xgb), ('dtc', dtc)]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')

# Validate on the prepared splits and show the mean of the returned scores.
# NOTE(review): a mean this close to 1.0 may point to target leakage through
# one of the features -- worth confirming.
voting_scores = validate(train_test_splits, voting_clf, "TARGET")
np.mean(voting_scores)

0.9999255952380952

In [20]:
# Stacking ensemble: the XGBoost and decision-tree estimators feed a
# logistic-regression final estimator.
stacking_members = [('xgb', xgb), ('dtc', dtc)]
meta_learner = LogisticRegression()
stacking_clf = StackingClassifier(estimators=stacking_members, final_estimator=meta_learner)

# Validate on the same splits and show the mean of the returned scores.
stacking_scores = validate(train_test_splits, stacking_clf, "TARGET")
np.mean(stacking_scores)

0.9999255952380952