In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import re
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

import joblib

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def make_datetime2(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    #hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day)


Bad key "text.kerning_factor" on line 4 in
C:\Users\YH\anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = metrics.precision_recall_curve(labels, probas_pred)
    score=metrics.auc(r,p) 
    return "pr_auc", score, True

In [3]:
def dt2(X_train, X_test, y_train, y_test):
    model = DecisionTreeClassifier(random_state=0,
                                  max_depth=9,
                                  min_samples_leaf=35)
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'dt.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [4]:
def mlp(X_train, X_test, y_train, y_test):
    model = MLPClassifier(random_state=1,
                         hidden_layer_sizes=25,
                         learning_rate_init=0.01,
                         max_iter=1200,
                         solver='adam')
    
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'mlp.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [5]:
def rf(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(random_state=0,
                                  max_depth=14,
                                  min_samples_leaf=30,
                                  min_samples_split=30,
                                  n_estimators=100)
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'rf.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [6]:
def lgbm(X_train, X_test, y_train, y_test):
    
    model = LGBMClassifier(
        objective = 'binary',
        boosting_type='gbdt',
        learning_rate=0.005,
        max_bin=255,
        n_estimators=16,
        num_leaves=16,
        reg_alpha=1,
        reg_lambda=1,
        subsample=0.7
    )
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'lgbm.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [7]:
import lightgbm as lgb

def lgbm2(X_train, X_test, y_train, y_test):
    
    params =      {
                    'boosting_type' : 'gbdt',
                    'objective'     : 'binary',
                    'metric'        : 'auc',
                    'seed': 1015
                    }
    
    d_train = lgb.Dataset(X_train, y_train)
    d_val = lgb.Dataset(X_test, y_test)
    
    model = lgb.train(
        params,
        train_set       = d_train,
        num_boost_round = 1000,
        valid_sets      = d_val,
        feval           = f_pr_auc,
        verbose_eval    = 20, 
        early_stopping_rounds = 3
    )

    y_prob = model.predict(X_test)
    y_pred = np.where(y_prob > 0.5, 1, 0)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'lgbm2.model'
    joblib.dump(model, filename)
    
    return acc, precision, recall, auc, f1, model

In [8]:
def xgboost(X_train, X_test, y_train, y_test):
    
    model = XGBClassifier(
        booster='gbtree',
        gamma=3,
        max_depth=3,
        n_estimators=30
    )
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'xgboost.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [9]:
def GB(X_train, X_test, y_train, y_test):
    model = GradientBoostingClassifier(
                   criterion='friedman_mse',
                   learning_rate=0.1,
                   loss='exponential',
                   max_depth=8,
                   max_features='log2',
                   n_estimators=40)
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'GB.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [10]:
def catboost(X_train, X_test, y_train, y_test):
    model = CatBoostClassifier(
                   depth=4,
                   eval_metric='AUC',
                   iterations=500,
                   leaf_estimation_iterations=10,
                   logging_level='Silent',
                   loss_function='Logloss',
                   random_seed=42)
    model = model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)
    
    filename = 'catboost.model'
    joblib.dump(model, filename)
    
    return metric, acc, precision, recall, auc, f1, model

In [11]:
if __name__ == '__main__':
    label_encoder = preprocessing.LabelEncoder()
    pd.options.display.max_columns=None
    
    dataframe = pd.read_csv('train0127.csv')
    dataframe.index = np.arange(10000, 25000)
    dataframe2 = pd.read_csv('test0127.csv')
    dataframe2.index = np.arange(30000, 44999)
    dataframe2 = dataframe2.fillna(0)
    
    train_prob = pd.read_csv('train_problem_data.csv')
    problem = np.zeros(15000)
    problem[train_prob.user_id.unique()-10000] = 1 
    
    X = dataframe.astype(float).values
    y = problem
    X_te = dataframe2.astype(float).values
    
    kf = KFold(n_splits=5)
    i=1
    
    acc_list, precision_list, recall_list, f1_list, auc_list = [], [], [], [], []
    acc_list2, precision_list2, recall_list2, f1_list2, auc_list2 = [], [], [], [], []
    acc_list3, precision_list3, recall_list3, f1_list3, auc_list3 = [], [], [], [], []
    acc_list4, precision_list4, recall_list4, f1_list4, auc_list4 = [], [], [], [], []
    acc_list5, precision_list5, recall_list5, f1_list5, auc_list5 = [], [], [], [], []
    acc_list6, precision_list6, recall_list6, f1_list6, auc_list6 = [], [], [], [], []
    acc_list7, precision_list7, recall_list7, f1_list7, auc_list7 = [], [], [], [], []
    acc_list8, precision_list8, recall_list8, f1_list8, auc_list8 = [], [], [], [], []
    
    for train_index, test_index in kf.split(X):
        print('=========================', i, '=================================')
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        metric1, acc1, precision1, recall1, auc1, f11, model = dt2(X_train, X_test, y_train, y_test)
        metric2, acc2, precision2, recall2, auc2, f12, model2 = mlp(X_train, X_test, y_train, y_test)
        metric3, acc3, precision3, recall3, auc3, f13, model3 = rf(X_train, X_test, y_train, y_test)
        metric4, acc4, precision4, recall4, auc4, f14, model4 = lgbm(X_train, X_test, y_train, y_test)
        acc5, precision5, recall5, auc5, f15, model5 = lgbm2(X_train, X_test, y_train, y_test)
        metric6, acc6, precision6, recall6, auc6, f16, model6 = xgboost(X_train, X_test, y_train, y_test)
        metric7, acc7, precision7, recall7, auc7, f17, model7 = GB(X_train, X_test, y_train, y_test)
        metric8, acc8, precision8, recall8, auc8, f18, model8 = catboost(X_train, X_test, y_train, y_test)

        
        # dt
        print('dt accuracy: ', acc1)
        print('dt precision: ', precision1)
        print('dt recall: ', recall1)
        print('dt f1: ', f11)
        print('dt auc: ', auc1)
        print()
        acc_list.append(acc1)
        precision_list.append(precision1)
        recall_list.append(recall1)
        f1_list.append(f11)
        auc_list.append(auc1)
        
        # mlp
        print('mlp accuracy: ', acc2)
        print('mlp precision: ', precision2)
        print('mlp recall: ', recall2)
        print('mlp f1: ', f12)
        print('mlp auc: ', auc2)
        print()
        acc_list2.append(acc2)
        precision_list2.append(precision2)
        recall_list2.append(recall2)
        f1_list2.append(f12)
        auc_list2.append(auc2)
        
        # rf
        print('rf accuracy: ', acc3)
        print('rf precision: ', precision3)
        print('rf recall: ', recall3)
        print('rf f1: ', f13)
        print('rf auc: ', auc3)
        print()
        acc_list3.append(acc3)
        precision_list3.append(precision3)
        recall_list3.append(recall3)
        f1_list3.append(f13)
        auc_list3.append(auc3)
        
        # lgbm
        print('lgbm accuracy: ', acc4)
        print('lgbm precision: ', precision4)
        print('lgbm recall: ', recall4)
        print('lgbm f1: ', f14)
        print('lgbm auc: ', auc4)
        print()
        acc_list4.append(acc4)
        precision_list4.append(precision4)
        recall_list4.append(recall4)
        f1_list4.append(f14)
        auc_list4.append(auc4)
        
        # lgbm2
        print('lgbm2 accuracy: ', acc5)
        print('lgbm2 precision: ', precision5)
        print('lgbm2 recall: ', recall5)
        print('lgbm2 f1: ', f15)
        print('lgbm2 auc: ', auc5)
        print()
        acc_list5.append(acc5)
        precision_list5.append(precision5)
        recall_list5.append(recall5)
        f1_list5.append(f15)
        auc_list5.append(auc5)
        
        # xgboost
        print('xgboost accuracy: ', acc6)
        print('xgboost precision: ', precision6)
        print('xgboost recall: ', recall6)
        print('xgboost f1: ', f16)
        print('xgboost auc: ', auc6)
        print()
        acc_list6.append(acc6)
        precision_list6.append(precision6)
        recall_list6.append(recall6)
        f1_list6.append(f16)
        auc_list6.append(auc6)
        
        # GB
        print('GB accuracy: ', acc7)
        print('GB precision: ', precision7)
        print('GB recall: ', recall7)
        print('GB f1: ', f17)
        print('GB auc: ', auc7)
        print()
        acc_list7.append(acc7)
        precision_list7.append(precision7)
        recall_list7.append(recall7)
        f1_list7.append(f17)
        auc_list7.append(auc7)
        
        # catboost
        print('catboost accuracy: ', acc8)
        print('catboost precision: ', precision8)
        print('catboost recall: ', recall8)
        print('catboost f1: ', f18)
        print('catboost auc: ', auc8)
        
        acc_list8.append(acc8)
        precision_list8.append(precision8)
        recall_list8.append(recall8)
        f1_list8.append(f18)
        auc_list8.append(auc8)
        
        
        
        i+=1
        
    print('----------------------- final result ------------------------------')
    print('dt average of accuracy', np.mean(acc_list))
    print('dt average of precsion', np.mean(precision_list))
    print('dt average of recall', np.mean(recall_list))
    print('dt average of f1', np.mean(f1_list))
    print('dt average of AUC', np.mean(auc_list))
    print()
    print('mlp average of accuracy', np.mean(acc_list2))
    print('mlp average of precsion', np.mean(precision_list2))
    print('mlp average of recall', np.mean(recall_list2))
    print('mlp average of f1', np.mean(f1_list2))
    print('mlp average of AUC', np.mean(auc_list2))
    print()
    print('rf average of accuracy', np.mean(acc_list3))
    print('rf average of precsion', np.mean(precision_list3))
    print('rf average of recall', np.mean(recall_list3))
    print('rf average of f1', np.mean(f1_list3))
    print('rf average of AUC', np.mean(auc_list3))
    print()
    print('lgbm average of accuracy', np.mean(acc_list4))
    print('lgbm average of precsion', np.mean(precision_list4))
    print('lgbm average of recall', np.mean(recall_list4))
    print('lgbm average of f1', np.mean(f1_list4))
    print('lgbm average of AUC', np.mean(auc_list4))
    print()
    print('lgbm2 average of accuracy', np.mean(acc_list5))
    print('lgbm2 average of precsion', np.mean(precision_list5))
    print('lgbm2 average of recall', np.mean(recall_list5))
    print('lgbm2 average of f1', np.mean(f1_list5))
    print('lgbm2 average of AUC', np.mean(auc_list5))
    print()
    print('xgboost average of accuracy', np.mean(acc_list6))
    print('xgboost average of precsion', np.mean(precision_list6))
    print('xgboost average of recall', np.mean(recall_list6))
    print('xgboost average of f1', np.mean(f1_list6))
    print('xgboost average of AUC', np.mean(auc_list6))
    print()
    print('GB average of accuracy', np.mean(acc_list7))
    print('GB average of precsion', np.mean(precision_list7))
    print('GB average of recall', np.mean(recall_list7))
    print('GB average of f1', np.mean(f1_list7))
    print('GB average of AUC', np.mean(auc_list7))
    print()
    print('catboost average of accuracy', np.mean(acc_list8))
    print('catboost average of precsion', np.mean(precision_list8))
    print('catboost average of recall', np.mean(recall_list8))
    print('catboost average of f1', np.mean(f1_list8))
    print('catboost average of AUC', np.mean(auc_list8))

[LightGBM] [Info] Number of positive: 4024, number of negative: 7976
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53521
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 513
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.335333 -> initscore=-0.684161
[LightGBM] [Info] Start training from score -0.684161
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.80627	valid_0's pr_auc: 0.726087
dt accuracy:  0.7746666666666666
dt precision:  0.7329192546583851
dt recall:  0.48360655737704916
dt f1:  0.582716049382716
dt auc:  0.7746946984546104

mlp accuracy:  0.73
mlp precision:  0.7643312101910829
mlp recall:  0.2459016393442623
mlp f1:  0.37209302325581395
mlp auc:  0.7491500558867361

rf accuracy:  0.7873333333333333
rf precision:  0.8039568345323741
rf recall:  0.45799180327868855
rf f1:  0.5835509138381201
rf auc:  0.811929995788246

lgbm ac

[LightGBM] [Info] Number of positive: 3984, number of negative: 8016
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53588
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 513
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.332000 -> initscore=-0.699153
[LightGBM] [Info] Start training from score -0.699153
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[12]	valid_0's auc: 0.83873	valid_0's pr_auc: 0.777246
dt accuracy:  0.779
dt precision:  0.7678300455235205
dt recall:  0.49803149606299213
dt f1:  0.6041791044776119
dt auc:  0.8000839888398527

mlp accuracy:  0.7356666666666667
mlp precision:  0.6632503660322109
mlp recall:  0.44586614173228345
mlp f1:  0.5332548557975281
mlp auc:  0.7726769867602236

rf accuracy:  0.7943333333333333
rf precision:  0.8387096774193549
rf recall:  0.4862204724409449
rf f1:  0.6155763239875389
rf auc:  0.8346714662179324

lgbm