In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from scipy import stats
import statsmodels as sm
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn import metrics
import sys
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf

In [2]:
# Let pandas display up to 100 columns so the 45-column frames below are shown in full.
pd.set_option('display.max_columns', 100)

# load data

## seg1

In [3]:
# Directory that holds the seg1 train / valid / test / model-validation CSV files.
# NOTE(review): hard-coded absolute Windows path; the notebook only runs on a machine
# with this exact folder layout.
data_rootpath = r'C:\Users\w10\Desktop\웰컴\AS\8주차(데이터추가추출부터)\model_data\seg1'

In [4]:
# Show which files are available in the data directory before loading them.
os.listdir(data_rootpath)

['model_validation_df.csv',
 'test_df.csv',
 'train_df.csv',
 'valid_df.csv',
 '필터링후']

In [5]:
# Read the four data splits from data_rootpath, one CSV file per split.
train_df, valid_df, test_df, model_valid_df = (
    pd.read_csv(os.path.join(data_rootpath, csv_name))
    for csv_name in (
        'train_df.csv',
        'valid_df.csv',
        'test_df.csv',
        'model_validation_df.csv',
    )
)

In [6]:
# Print each split's (rows, columns) shape under its own title line.
for split_title, split_frame in (
    ('Train', train_df),
    ('\nValid', valid_df),
    ('\nTest', test_df),
    ('\nmodel validation', model_valid_df),
):
    print(split_title)
    print(split_frame.shape)

Train
(73533, 45)

Valid
(24728, 45)

Test
(24547, 45)

model validation
(19936, 45)


In [7]:
# Preview the first two training rows to check the column layout.
train_df.head(2)

Unnamed: 0,no,BAD,A5WC0000000200,PE1000011,PE0000025,PS0001728,A5RCLSRL078700,PS0000188,A5RCLSRL091300,A5RCLSRL027400,A5RCLSRL027300,A5RCLSRL027500,IDT000004_1,IDT000003,CA0000601,C00000083,C00060606,LA0000604,LU0000902_1,CF1231601,LC0000609,LA0000222_s12,LC0000608,LA0000203_s9,LS0000607,LC0000901,EW0001601_1,LA0000204_s12,LRZ00124G,LA0000204_s6,LA0000020_s9,LA1200206,LA0000227_s9,LA0000204_s3,AE0000005_1,P11252001_1,LA0000001_s12,EH0001601_1,LS0000086,P2E000500_9_1,LH000000E,LA0000001_s6,LA1200203,LA0600203,EH1201002_1
0,1,1,16000,0,0,0,0,289,11830,73.94,73.94,73.94,84,51,0,692,692,0,10000,0,0,0.0,0,0.0,0,7385,691,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0.0,691,0,0.0,0,0.0,0,0,1
1,3,1,30000,9,0,11,3,170,29036,73.99,28.95,41.28,263,263,7752,476,3332,4153,4436,12,4153,843.96,833,1527.27,121,3058,274,1177.46,8899,2271.25,0.13,0,1655.59,-574.6,0,0,0.15,274,12649,0.0,121,0.21,18000,13000,1


In [8]:
# Show every training row whose 'no' value occurs more than once
# (keep=False marks all occurrences); an empty result means 'no' is unique.
train_df[train_df.duplicated(['no'], keep = False)]

Unnamed: 0,no,BAD,A5WC0000000200,PE1000011,PE0000025,PS0001728,A5RCLSRL078700,PS0000188,A5RCLSRL091300,A5RCLSRL027400,A5RCLSRL027300,A5RCLSRL027500,IDT000004_1,IDT000003,CA0000601,C00000083,C00060606,LA0000604,LU0000902_1,CF1231601,LC0000609,LA0000222_s12,LC0000608,LA0000203_s9,LS0000607,LC0000901,EW0001601_1,LA0000204_s12,LRZ00124G,LA0000204_s6,LA0000020_s9,LA1200206,LA0000227_s9,LA0000204_s3,AE0000005_1,P11252001_1,LA0000001_s12,EH0001601_1,LS0000086,P2E000500_9_1,LH000000E,LA0000001_s6,LA1200203,LA0600203,EH1201002_1


## set index

In [9]:
def set_index_and_sort(data, index_col):
    """Return a new frame indexed by ``index_col`` and sorted by that index.

    The input frame is left untouched, so the calling cell can be re-run
    without failing because the key column has already been moved into
    the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``index_col`` as a regular column.
    index_col : label or list of labels
        Column(s) to use as the index; removed from the columns of the result.

    Returns
    -------
    pandas.DataFrame
        New frame with ``index_col`` as its index, rows in ascending index order.

    Raises
    ------
    ValueError
        If ``index_col`` contains duplicate values (``verify_integrity=True``).
    KeyError
        If ``index_col`` is not a column of ``data``.
    """
    # Chaining the non-inplace calls builds a new object instead of mutating the argument.
    return data.set_index(index_col, drop=True, verify_integrity=True).sort_index()

In [10]:
# Key every split by the 'no' column and order its rows by that key.
train_df, valid_df, test_df, model_valid_df = (
    set_index_and_sort(split_frame, 'no')
    for split_frame in (train_df, valid_df, test_df, model_valid_df)
)

In [11]:
# Confirm that 'no' is now the index of the training frame.
train_df.head(1)

Unnamed: 0_level_0,BAD,A5WC0000000200,PE1000011,PE0000025,PS0001728,A5RCLSRL078700,PS0000188,A5RCLSRL091300,A5RCLSRL027400,A5RCLSRL027300,A5RCLSRL027500,IDT000004_1,IDT000003,CA0000601,C00000083,C00060606,LA0000604,LU0000902_1,CF1231601,LC0000609,LA0000222_s12,LC0000608,LA0000203_s9,LS0000607,LC0000901,EW0001601_1,LA0000204_s12,LRZ00124G,LA0000204_s6,LA0000020_s9,LA1200206,LA0000227_s9,LA0000204_s3,AE0000005_1,P11252001_1,LA0000001_s12,EH0001601_1,LS0000086,P2E000500_9_1,LH000000E,LA0000001_s6,LA1200203,LA0600203,EH1201002_1
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
1,1,16000,0,0,0,0,289,11830,73.94,73.94,73.94,84,51,0,692,692,0,10000,0,0,0.0,0,0.0,0,7385,691,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0.0,691,0,0.0,0,0.0,0,0,1


### 소득변수 제외

In [12]:
# Remove column A5WC0000000200 (the income variable named in this section's heading)
# from every split.
train_df, valid_df, test_df, model_valid_df = (
    split_frame.drop(columns=['A5WC0000000200'])
    for split_frame in (train_df, valid_df, test_df, model_valid_df)
)

## X,y split

In [13]:
# Separate each split into features (everything except 'BAD') and the target,
# which is kept as a single-column DataFrame.
X_train, y_train = train_df.drop(columns=['BAD']), train_df[['BAD']]
X_valid, y_valid = valid_df.drop(columns=['BAD']), valid_df[['BAD']]
X_test, y_test = test_df.drop(columns=['BAD']), test_df[['BAD']]
X_model_valid, y_model_valid = (
    model_valid_df.drop(columns=['BAD']),
    model_valid_df[['BAD']],
)

In [14]:
# Check the (rows, feature columns) shape of the training feature matrix.
X_train.shape

(73533, 42)

In [15]:
# List the feature columns that will be passed to the model.
X_train.columns

Index(['PE1000011', 'PE0000025', 'PS0001728', 'A5RCLSRL078700', 'PS0000188',
       'A5RCLSRL091300', 'A5RCLSRL027400', 'A5RCLSRL027300', 'A5RCLSRL027500',
       'IDT000004_1', 'IDT000003', 'CA0000601', 'C00000083', 'C00060606',
       'LA0000604', 'LU0000902_1', 'CF1231601', 'LC0000609', 'LA0000222_s12',
       'LC0000608', 'LA0000203_s9', 'LS0000607', 'LC0000901', 'EW0001601_1',
       'LA0000204_s12', 'LRZ00124G', 'LA0000204_s6', 'LA0000020_s9',
       'LA1200206', 'LA0000227_s9', 'LA0000204_s3', 'AE0000005_1',
       'P11252001_1', 'LA0000001_s12', 'EH0001601_1', 'LS0000086',
       'P2E000500_9_1', 'LH000000E', 'LA0000001_s6', 'LA1200203', 'LA0600203',
       'EH1201002_1'],
      dtype='object')

# load gbm model

In [16]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23,
# so prefer the standalone joblib package and fall back to the bundled copy only
# on old scikit-learn installations.
# NOTE(review): the saved model was pickled under an older scikit-learn; loading it
# under a much newer version may still fail or warn — confirm the environment.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [17]:
# Directory that contains the fitted seg1 GBM model and its result spreadsheets.
# NOTE(review): hard-coded absolute Windows path; adjust when running elsewhere.
model_path = r'C:\Users\w10\Desktop\웰컴\AS\8주차(데이터추가추출부터)\최종적합\seg1\gbm\final_model'

In [18]:
# Show the artefacts stored next to the saved model.
os.listdir(model_path)

['AS_seg1_1000epochs_result_temp4.xlsx',
 'AS_seg1_final_model_temp4.joblib',
 'AS_seg1_final_result_temp4.xlsx',
 'AS_seg1_random_search_result_temp4.xlsx']

In [19]:
# Load the persisted GBM model from model_path.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
gbm = joblib.load(
    os.path.join(model_path, 'AS_seg1_final_model_temp4.joblib')
)

In [20]:
# Display the loaded estimator and its hyper-parameters.
gbm

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.009, loss='deviance', max_depth=4,
                           max_features=8, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=0.02, min_samples_split=0.0255,
                           min_weight_fraction_leaf=0.0, n_estimators=2927,
                           n_iter_no_change=None, presort='auto',
                           random_state=1, subsample=0.89, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [21]:
def get_accuracy(probs, labels, accu_thre=0.5):
    """Accuracy of the hard predictions obtained by thresholding ``probs``.

    Parameters
    ----------
    probs : numpy.ndarray or pandas.Series
        Predicted scores, one per observation.
    labels : array-like
        True class labels, passed unchanged to ``metrics.accuracy_score``.
    accu_thre : float, default 0.5
        Scores greater than or equal to this value are predicted as 1.0,
        all others as 0.0.

    Returns
    -------
    The value returned by ``metrics.accuracy_score(labels, predictions)``.
    """
    # Vectorised thresholding: 1.0 where the score reaches the cut-off, else 0.0.
    hard_predictions = np.where(probs >= accu_thre, 1.0, 0.0)
    return metrics.accuracy_score(labels, hard_predictions)

def get_ks_statistics(probs, labels):
    """Two-sample Kolmogorov-Smirnov test between positive- and negative-class scores.

    Both inputs are converted to flat NumPy arrays and matched by position,
    so plain lists, arrays, pandas Series (whatever their index) and
    single-column DataFrames are all accepted. Positional matching is the
    same convention used when the scores are passed to the scikit-learn
    metrics elsewhere in this notebook.

    Parameters
    ----------
    probs : array-like of float
        Predicted score for each observation.
    labels : array-like
        Class label for each observation. Scores whose label equals 1 form
        the positive sample and those whose label equals 0 the negative
        sample; observations with any other label are ignored.

    Returns
    -------
    tuple
        ``(ks, ks_p_value)`` as produced by ``scipy.stats.ks_2samp``.

    Raises
    ------
    ValueError
        If there is no observation with label 1 or none with label 0.
    """
    # Drop any pandas index and flatten so the boolean masks select by position.
    scores = np.asarray(probs).ravel()
    classes = np.asarray(labels).ravel()

    probs_pos = scores[classes == 1]
    probs_neg = scores[classes == 0]
    if probs_pos.size == 0 or probs_neg.size == 0:
        raise ValueError(
            "KS statistic needs at least one observation with label 1 "
            "and one with label 0"
        )

    ks, ks_p_value = stats.ks_2samp(probs_pos, probs_neg)
    return ks, ks_p_value

def get_auc_ar(probs, labels):
    """Return the ROC AUC and the accuracy ratio derived from it.

    Parameters
    ----------
    probs : array-like
        Predicted scores, passed to ``metrics.roc_auc_score`` as the scores.
    labels : array-like
        True class labels, passed to ``metrics.roc_auc_score`` as the targets.

    Returns
    -------
    tuple
        ``(auc, ar)`` where ``ar = 2 * auc - 1``.
    """
    area_under_curve = metrics.roc_auc_score(labels, probs)
    return area_under_curve, 2 * area_under_curve - 1

class Performances:
    """Container for the evaluation metrics of one set of predicted scores.

    Attributes set by the constructor:
        accuracy    -- result of ``get_accuracy`` at ``accu_thre`` (not rescaled)
        ks          -- KS statistic from ``get_ks_statistics`` multiplied by 100
        ks_p_value  -- p-value returned with the KS statistic
        auc         -- AUC from ``get_auc_ar`` multiplied by 100
        ar          -- accuracy ratio from ``get_auc_ar`` multiplied by 100
    """

    def __init__(self, probs, labels, accu_thre=0.5):
        """Compute all metrics for ``probs`` against ``labels``.

        ``accu_thre`` is the cut-off forwarded to ``get_accuracy``.
        """
        self.accuracy = get_accuracy(probs, labels, accu_thre=accu_thre)

        ks_fraction, self.ks_p_value = get_ks_statistics(probs, labels)
        auc_fraction, ar_fraction = get_auc_ar(probs, labels)

        # KS, AUC and AR are stored as percentages; accuracy is left as returned.
        self.ks = ks_fraction * 100
        self.auc = auc_fraction * 100
        self.ar = ar_fraction * 100

In [22]:
# Score every split with the loaded model; later cells use column 1 of each result.
prob_tr, prob_valid, prob_test, prob_model_valid = (
    gbm.predict_proba(feature_frame)
    for feature_frame in (X_train, X_valid, X_test, X_model_valid)
)

In [23]:
# Compute KS / AR (and the other Performances metrics) for train, valid and test,
# using column 1 of the predicted probabilities as the score.
result_train, result_valid, result_test = (
    Performances(split_probs[:, 1], split_target['BAD'])
    for split_probs, split_target in (
        (prob_tr, y_train),
        (prob_valid, y_valid),
        (prob_test, y_test),
    )
)

In [24]:
# Print the KS and AR (both in percent) of each evaluated split.
print("\nModel Report")
for report_line, split_result in (
    ("Train KS : {:.5f}, / AR : {:.5f}", result_train),
    ("Valid KS : {:.5f}, / AR : {:.5f}", result_valid),
    ("Test KS : {:.5f}, / AR : {:.5f}", result_test),
):
    print(report_line.format(split_result.ks, split_result.ar))


Model Report
Train KS : 36.93015, / AR : 49.45567
Valid KS : 33.49129, / AR : 45.58551
Test KS : 36.10840, / AR : 47.68122


In [25]:
# Wrap column 1 of each probability array in a one-column frame named 'gbm_phat',
# indexed like the feature frame it was predicted from.
prob_train_df = pd.DataFrame({'gbm_phat': prob_tr[:, 1]}, index=X_train.index)
prob_valid_df = pd.DataFrame({'gbm_phat': prob_valid[:, 1]}, index=X_valid.index)
prob_test_df = pd.DataFrame({'gbm_phat': prob_test[:, 1]}, index=X_test.index)
prob_model_valid_df = pd.DataFrame(
    {'gbm_phat': prob_model_valid[:, 1]},
    index=X_model_valid.index,
)

In [26]:
# Put the true label and the predicted score side by side for every split,
# joining on the shared index with an inner join.
final_prob_train, final_prob_valid, final_prob_test, final_prob_model_valid = (
    pd.merge(target_frame, phat_frame, left_index=True, right_index=True, how='inner')
    for target_frame, phat_frame in (
        (y_train, prob_train_df),
        (y_valid, prob_valid_df),
        (y_test, prob_test_df),
        (y_model_valid, prob_model_valid_df),
    )
)

In [27]:
# Preview the merged label / score table for the training split.
final_prob_train.head()

Unnamed: 0_level_0,BAD,gbm_phat
no,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0.447528
3,1,0.438492
6,0,0.517598
9,0,0.344496
12,1,0.396566


In [28]:
# Directory the predicted-probability CSV files are written to.
# NOTE(review): hard-coded absolute Windows path; adjust when running elsewhere.
save_path = r'C:\Users\w10\Desktop\웰컴\AS\8주차(데이터추가추출부터)\phat\seg1\gbm'

In [29]:
# to_csv does not create missing folders, so make sure the output directory exists
# first (a no-op when it is already there).
os.makedirs(save_path, exist_ok=True)

# Write one label / score CSV per split; the index is written as the first column.
for phat_frame, csv_name in (
    (final_prob_train, 'GBM_train_phat_df.csv'),
    (final_prob_valid, 'GBM_valid_phat_df.csv'),
    (final_prob_test, 'GBM_test_phat_df.csv'),
    (final_prob_model_valid, 'GBM_model_valid_phat_df.csv'),
):
    phat_frame.to_csv(os.path.join(save_path, csv_name))