In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import re
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

import joblib

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

In [2]:
def dt2(X_train, X_test, y_train, y_test):
    param_range1 = np.arange(3,15,1)
    param_range2 = np.arange(30,60,5)
    
    param_grid = [{'max_depth':param_range1,
                  'min_samples_leaf':param_range2}]
    
    model = DecisionTreeClassifier()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_pred = gs.predict(X_test)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_pred, multi_class='ovr')

    print('DT')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return metric, acc, precision, recall, auc, model

In [3]:
def mlp(X_train, X_test, y_train, y_test):
    
    param_range1 = ['lbfgs', 'adam', 'sgd']
    param_range2 = np.arange(1000,2000,100)
    param_range3 = np.arange(10, 30, 5)
    param_range4 = np.array([0.0001,0.001,0.01])
    
    param_grid = [{'solver':param_range1,
                  'max_iter':param_range2,
                  'hidden_layer_sizes':param_range3,
                  'learning_rate_init':param_range4}]
    
    model = MLPClassifier()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_pred = gs.predict(X_test)

    metric = metrics.confusion_matrix(y_test, y_pred)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_pred, multi_class='ovr')

    print('MLP')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return metric, acc, precision, recall, auc, model

In [4]:
def rf(X_train, X_test, y_train, y_test):
    
    param_range1 = np.arange(3,15,1)
    param_range2 = np.arange(30,60,5)
    param_range3 = np.arange(30,70,5)
    param_range4 = np.array([10, 50, 100])
    
    param_grid = [{'max_depth':param_range1,
                  'min_samples_leaf':param_range2,
                  'min_samples_split':param_range3,
                  'n_estimators':param_range4}]
    
    model = RandomForestClassifier()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_pred = gs.predict(X_test)
    metric = metrics.confusion_matrix(y_test, y_pred)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
    
    print('RF')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return metric, acc, precision, recall, auc, model

In [5]:
def svm(X_train, X_test, y_train, y_test):
    
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1, 10, 100] }
    model = SVC()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_pred = gs.predict(X_test)
    metric = metrics.confusion_matrix(y_test, y_pred)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
    
    print('SVM')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return metric, acc, precision, recall, auc, model

In [6]:
def knn(X_train, X_test, y_train, y_test):
    
    param_grid = {'n_neighbors':[3,5,11,19],
                 'weights':['uniform','distance'],
                 'metric':['euclidean','manhattan']}
    model = KNeighborsClassifier()
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_pred = gs.predict(X_test)
    metric = metrics.confusion_matrix(y_test, y_pred)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
    
    print('KNN')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return metric, acc, precision, recall, auc, model

In [7]:
def lgbm(X_train, X_test, y_train, y_test):
    
    param_grid = {
        'learning_rate': [0.005, 0.01],
        'n_estimators': [8,16,24],
        'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
        'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
        'objective' : ['binary'],
        'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
        'random_state' : [500],
        'colsample_bytree' : [0.64, 0.65, 0.66],
        'subsample' : [0.7,0.75],
        'reg_alpha' : [1,1.2],
        'reg_lambda' : [1,1.2,1.4],
    }
    
    model = LGBMClassifier()
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_pred = gs.predict(X_test)
    metric = metrics.confusion_matrix(y_test, y_pred)
    metric = metrics.confusion_matrix(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='macro')
    recall = metrics.recall_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
    
    print('LGBM')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return metric, acc, precision, recall, auc, model

In [None]:
if __name__ == '__main__':
    label_encoder = preprocessing.LabelEncoder()
    pd.options.display.max_columns=None
    
    dataframe = pd.read_csv('trainset.csv')
    dataframe.index = np.arange(10000, 25000)
    dataframe = dataframe.drop(['label'], axis=1)
    train_prob = pd.read_csv('train_problem_data.csv')
    problem = np.zeros(15000)
    problem[train_prob.user_id.unique()-10000] = 1 
    
    X = dataframe.astype(float).values
    y = problem
    
    kf = KFold(n_splits=3)
    i=1
    
    acc_list, precision_list, recall_list, auc_list = [], [], [], []
    acc_list2, precision_list2, recall_list2, auc_list2 = [], [], [], []
    acc_list3, precision_list3, recall_list3, auc_list3 = [], [], [], []
    acc_list4, precision_list4, recall_list4, auc_list4 = [], [], [], []
    acc_list5, precision_list5, recall_list5, auc_list5 = [], [], [], []
    acc_list6, precision_list6, recall_list6, auc_list6 = [], [], [], []
    
    for train_index, test_index in kf.split(X):
        print('=========================', i, '=================================')
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        metric1, acc1, precision1, recall1, auc1, model = dt2(X_train, X_test, y_train, y_test)
        metric2, acc2, precision2, recall2, auc2, model2 = mlp(X_train, X_test, y_train, y_test)
        metric3, acc3, precision3, recall3, auc3, model3 = rf(X_train, X_test, y_train, y_test)
        metric4, acc4, precision4, recall4, auc4, model4 = svm(X_train, X_test, y_train, y_test)
        metric5, acc5, precision5, recall5, auc5, model5 = knn(X_train, X_test, y_train, y_test)
        metric6, acc6, precision6, recall6, auc6, model6 = lgbm(X_train, X_test, y_train, y_test)

        
        # dt
        print('dt accuracy: ', acc1)
        print('dt precision: ', precision1)
        print('dt recall: ', recall1)
        print('dt auc: ', auc1)
        
        acc_list.append(acc1)
        precision_list.append(precision1)
        recall_list.append(recall1)
        auc_list.append(auc1)
        
        # mlp
        print('mlp accuracy: ', acc2)
        print('mlp precision: ', precision2)
        print('mlp recall: ', recall2)
        print('mlp auc: ', auc2)
        
        acc_list2.append(acc2)
        precision_list2.append(precision2)
        recall_list2.append(recall2)
        auc_list2.append(auc2)
        
        # rf
        print('rf accuracy: ', acc3)
        print('rf precision: ', precision3)
        print('rf recall: ', recall3)
        print('rf auc: ', auc3)
        
        acc_list3.append(acc3)
        precision_list3.append(precision3)
        recall_list3.append(recall3)
        auc_list3.append(auc3)
        
        # svm
        print('svm accuracy: ', acc4)
        print('svm precision: ', precision4)
        print('svm recall: ', recall4)
        print('svm auc: ', auc4)
        
        acc_list4.append(acc4)
        precision_list4.append(precision4)
        recall_list4.append(recall4)
        auc_list4.append(auc4)
        
        # knn
        print('knn accuracy: ', acc5)
        print('knn precision: ', precision5)
        print('knn recall: ', recall5)
        print('knn auc: ', auc5)
        
        acc_list5.append(acc5)
        precision_list5.append(precision5)
        recall_list5.append(recall5)
        auc_list5.append(auc5)
        
        # lgbm
        print('lgbm accuracy: ', acc6)
        print('lgbm precision: ', precision6)
        print('lgbm recall: ', recall6)
        print('lgbm auc: ', auc6)
        
        acc_list6.append(acc6)
        precision_list6.append(precision6)
        recall_list6.append(recall6)
        auc_list6.append(auc6)
        
        i+=1
        
    print('----------------------- final result ------------------------------')
    print('dt average of accuracy', np.mean(acc_list))
    print('dt average of precision', np.mean(precision_list))
    print('dt average of recall', np.mean(recall_list))
    print('dt average of AUC', np.mean(auc_list))
    print()
    print('mlp average of accuracy', np.mean(acc_list2))
    print('mlp average of precision', np.mean(precision_list2))
    print('mlp average of recall', np.mean(recall_list2))
    print('mlp average of AUC', np.mean(auc_list2))
    print()
    print('rf average of accuracy', np.mean(acc_list3))
    print('rf average of precision', np.mean(precision_list3))
    print('rf average of recall', np.mean(recall_list3))
    print('rf average of AUC', np.mean(auc_list3))
    print()
    print('svm average of accuracy', np.mean(acc_list4))
    print('svm average of precision', np.mean(precision_list4))
    print('svm average of recall', np.mean(recall_list4))
    print('svm average of AUC', np.mean(auc_list4))
    print()
    print('knn average of accuracy', np.mean(acc_list5))
    print('knn average of precision', np.mean(precision_list5))
    print('knn average of recall', np.mean(recall_list5))
    print('knn average of AUC', np.mean(auc_list5))
    print()
    print('lgbm average of accuracy', np.mean(acc_list6))
    print('lgbm average of precision', np.mean(precision_list6))
    print('lgbm average of recall', np.mean(recall_list6))
    print('lgbm average of AUC', np.mean(auc_list6))

DT
0.7787053160765492
{'max_depth': 7, 'min_samples_leaf': 40}
MLP
0.7857254052786181
{'hidden_layer_sizes': 25, 'learning_rate_init': 0.01, 'max_iter': 1200, 'solver': 'adam'}
RF
0.8117116522388853
{'max_depth': 11, 'min_samples_leaf': 30, 'min_samples_split': 30, 'n_estimators': 100}
SVM
0.6652030400907522
{'C': 0.01, 'gamma': 0.001}
KNN
0.6775991283087816
{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}
LGBM
0.80084261997264
{'boosting_type': 'gbdt', 'colsample_bytree': 0.64, 'learning_rate': 0.005, 'max_bin': 255, 'n_estimators': 16, 'num_leaves': 16, 'objective': 'binary', 'random_state': 500, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
dt accuracy:  0.776
dt precision:  0.7794990042972434
dt recall:  0.6926227376742251
dt auc:  0.6926227376742251
mlp accuracy:  0.7712
mlp precision:  0.7636811508523007
mlp recall:  0.6933017243371934
mlp auc:  0.6933017243371935
rf accuracy:  0.7808
rf precision:  0.7934724215449427
rf recall:  0.6945347352240945
rf auc:  

In [None]:
0.713587