In [13]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
import json # will be needed for saving preprocessing details
import joblib 

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Dataset pre-processing

In [2]:
application_data = pd.read_csv('fapplication_train.csv')

In [3]:
label_vector = application_data['TARGET']
np.unique(label_vector, return_counts=True)

(array([0, 1]), array([282686,  24825]))

In [4]:
dataset_columns = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 
    'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
    'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 
    'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 
    'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 
    'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 
    'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 
    'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'TARGET'
]

In [5]:
categorical_features = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
    'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 
    'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'
]
numerical_features = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS', 
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 
    'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 
    'AMT_REQ_CREDIT_BUREAU_YEAR'
]

### Missing value treatment

In [6]:
application_data['AMT_ANNUITY'] = application_data['AMT_ANNUITY'].fillna(0)
application_data['OCCUPATION_TYPE'] = application_data['OCCUPATION_TYPE'].fillna('UNKNOWN')
application_data['CNT_FAM_MEMBERS'] = application_data['CNT_FAM_MEMBERS'].fillna(0)
application_data['EXT_SOURCE_1'] = application_data['EXT_SOURCE_1'].fillna(0)
application_data['EXT_SOURCE_2'] = application_data['EXT_SOURCE_2'].fillna(0)
application_data['EXT_SOURCE_3'] = application_data['EXT_SOURCE_3'].fillna(0)
application_data['OBS_30_CNT_SOCIAL_CIRCLE'] = application_data['OBS_30_CNT_SOCIAL_CIRCLE'].fillna(0)
application_data['DEF_30_CNT_SOCIAL_CIRCLE'] = application_data['DEF_30_CNT_SOCIAL_CIRCLE'].fillna(0)
application_data['OBS_60_CNT_SOCIAL_CIRCLE'] = application_data['OBS_60_CNT_SOCIAL_CIRCLE'].fillna(0)
application_data['DEF_60_CNT_SOCIAL_CIRCLE'] = application_data['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(0)
application_data['DAYS_LAST_PHONE_CHANGE'] = application_data['DAYS_LAST_PHONE_CHANGE'].fillna(3650)
application_data['AMT_REQ_CREDIT_BUREAU_HOUR'] = application_data['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0)
application_data['AMT_REQ_CREDIT_BUREAU_DAY'] = application_data['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(0)
application_data['AMT_REQ_CREDIT_BUREAU_WEEK'] = application_data['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(0)
application_data['AMT_REQ_CREDIT_BUREAU_MON'] = application_data['AMT_REQ_CREDIT_BUREAU_MON'].fillna(0)
application_data['AMT_REQ_CREDIT_BUREAU_QRT'] = application_data['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(0)
application_data['AMT_REQ_CREDIT_BUREAU_YEAR'] = application_data['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0)


In [14]:
sample_class_1 = application_data[application_data['TARGET'] == 1][:20000]
sample_class_0 = application_data[application_data['TARGET'] == 0][:20000]
treated_dataset = pd.concat([sample_class_1, sample_class_0])[dataset_columns]
training_dataset, testing_dataset = train_test_split(treated_dataset, shuffle=True, stratify=treated_dataset['TARGET'])
train_mode = dict(training_dataset.mode().iloc[0])
train_mode

{'CODE_GENDER': 'F',
 'FLAG_OWN_CAR': 'N',
 'FLAG_OWN_REALTY': 'Y',
 'AMT_INCOME_TOTAL': 135000.0,
 'AMT_CREDIT': 450000.0,
 'AMT_ANNUITY': 9000.0,
 'NAME_INCOME_TYPE': 'Working',
 'NAME_EDUCATION_TYPE': 'Secondary / secondary special',
 'NAME_FAMILY_STATUS': 'Married',
 'NAME_HOUSING_TYPE': 'House / apartment',
 'DAYS_BIRTH': -18248.0,
 'DAYS_EMPLOYED': 365243.0,
 'DAYS_ID_PUBLISH': -4102.0,
 'FLAG_MOBIL': 1.0,
 'FLAG_EMP_PHONE': 1.0,
 'FLAG_WORK_PHONE': 0.0,
 'FLAG_CONT_MOBILE': 1.0,
 'FLAG_PHONE': 0.0,
 'FLAG_EMAIL': 0.0,
 'OCCUPATION_TYPE': 'UNKNOWN',
 'CNT_FAM_MEMBERS': 2.0,
 'EXT_SOURCE_1': 0.0,
 'EXT_SOURCE_2': 0.0,
 'EXT_SOURCE_3': 0.0,
 'OBS_30_CNT_SOCIAL_CIRCLE': 0.0,
 'DEF_30_CNT_SOCIAL_CIRCLE': 0.0,
 'OBS_60_CNT_SOCIAL_CIRCLE': 0.0,
 'DEF_60_CNT_SOCIAL_CIRCLE': 0.0,
 'DAYS_LAST_PHONE_CHANGE': 0.0,
 'FLAG_DOCUMENT_2': 0.0,
 'FLAG_DOCUMENT_3': 1.0,
 'FLAG_DOCUMENT_4': 0.0,
 'FLAG_DOCUMENT_5': 0.0,
 'FLAG_DOCUMENT_6': 0.0,
 'FLAG_DOCUMENT_7': 0.0,
 'FLAG_DOCUMENT_8': 0.0,
 'FL

In [8]:
features = list(set(dataset_columns) - set(['TARGET'])) 
train_features, Y_train = training_dataset[features], training_dataset['TARGET']
test_features, Y_test = testing_dataset[features], testing_dataset['TARGET']

In [21]:
column_trans = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    (StandardScaler(), numerical_features)
    )
transformer = column_trans.fit(treated_dataset[features])
encoders = {}
for column in categorical_features:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(train_features[column])
    encoders[column] = categorical_convert

NameError: name 'LabelEncoder' is not defined

In [10]:
X_train = transformer.transform(train_features)
X_test = transformer.transform(test_features)

# Task 1: Decision Tree classification

In [18]:
for max_depth in [5,10,20,50,None]:
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    dt = clf.fit(X_train, Y_train)
    print("Accuracy score for depth: {} = {}".format(max_depth, accuracy_score(clf.predict(X_test), Y_test)))


Accuracy score for depth: 5 = 0.6468
Accuracy score for depth: 10 = 0.6426
Accuracy score for depth: 20 = 0.5986
Accuracy score for depth: 50 = 0.59
Accuracy score for depth: None = 0.5897


# Task 2: Random Forest classification

In [17]:
for max_depth in [5,10,20,50,None]:
    clf = RandomForestClassifier(criterion='entropy', max_depth=max_depth, random_state=0)
    ra= clf.fit(X_train, Y_train)
    print("Accuracy score for depth: {} = {}".format(max_depth, accuracy_score(clf.predict(X_test), Y_test)))
    
                                 

Accuracy score for depth: 5 = 0.6445
Accuracy score for depth: 10 = 0.6447
Accuracy score for depth: 20 = 0.6513
Accuracy score for depth: 50 = 0.6615
Accuracy score for depth: None = 0.6563


In [19]:
# save preprocessing objects and RF algorithm
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rs, "./ra_random_forest.joblib", compress=True)
joblib.dump(dt, "./ra_decision_trees.joblib", compress=True)

NameError: name 'encoders' is not defined

# Task 3: XGBoost classification

In [None]:
xgb_model = xgb.XGBClassifier(n_jobs=-1)
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 4, 6, 10, 20, 40, 60, 75, 100],
                    'n_estimators': [50, 100, 200]}, verbose=1, n_jobs=-1)
clf.fit(X_train, Y_train)

In [None]:
print("Best params for XGBoost classifier: {}".format(clf.best_params_))
print("XGBoost classifier accuracy: {}".format(accuracy_score(clf.predict(X_test), Y_test)))
disp = plot_confusion_matrix(clf, X_test, Y_test,
                                         cmap=plt.cm.Blues,
                                         normalize='true')
disp.ax_.set_title("Confusion matrix for XGBoost classifier")