In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import re
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV

import joblib

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

In [2]:
def GB(X_train, X_test, y_train, y_test):

    param_grid = [{'loss':['exponential'],
                  'learning_rate':[0.01,0.1,0.2],
                  'max_depth':[8, 12],
                  'max_features':['log2'],
                  'criterion':['friedman_mse'],
                  'n_estimators':[30, 40]}]
    
    model = GradientBoostingClassifier()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_prob = np.round(gs.predict_proba(X_test), 2)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)

    print('gradient boosting')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return acc, precision, recall, auc, model

In [3]:
def SGD(X_train, X_test, y_train, y_test):

    param_grid = [{'loss':['log', 'modified_huber'],
                  'penalty': ['l2', 'l1', 'elasticnet'],
                  'alpha': [0.0001, 0.001, 0.01, 0.1]
                  #'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1]
                  }]
    
    model = SGDClassifier()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_prob = np.round(gs.predict_proba(X_test), 2)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)

    print('SGD')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return acc, precision, recall, auc, model

In [4]:
if __name__ == '__main__':
    label_encoder = preprocessing.LabelEncoder()
    pd.options.display.max_columns=None
    
    dataframe = pd.read_csv('train_set.csv')
    dataframe.index = np.arange(10000, 25000)
    
    train_prob = pd.read_csv('train_problem_data.csv')
    problem = np.zeros(15000)
    problem[train_prob.user_id.unique()-10000] = 1 
    
    X = dataframe.astype(float).values
    y = problem
    
    kf = KFold(n_splits=3)
    i=1
    
    acc_list, precision_list, recall_list, auc_list = [], [], [], []
    acc_list2, precision_list2, recall_list2, auc_list2 = [], [], [], []
    
    for train_index, test_index in kf.split(X):
        print('=========================', i, '=================================')
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        acc1, precision1, recall1, auc1, model = GB(X_train, X_test, y_train, y_test)
        acc2, precision2, recall2, auc2, model2 = SGD(X_train, X_test, y_train, y_test)
        
        # GB
        print('GB accuracy: ', acc1)
        print('GB auc: ', auc1)
        
        acc_list.append(acc1)
        precision_list.append(precision1)
        recall_list.append(recall1)
        auc_list.append(auc1)
        
        # SGD
        print('SGD accuracy: ', acc2)
        print('SGD auc: ', auc2)
        
        acc_list2.append(acc2)
        precision_list2.append(precision2)
        recall_list2.append(recall2)
        auc_list2.append(auc2)
        
        i+=1
        
    print('----------------------- final result ------------------------------')
    print('GB average of accuracy', np.mean(acc_list))
    print('GB average of AUC', np.mean(auc_list))
    print('SGD average of accuracy', np.mean(acc_list2))
    print('SGD average of AUC', np.mean(auc_list2))
  

gradient boosting
0.8125282927485984
{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 40}
SGD
0.697383834863951
{'alpha': 0.0001, 'loss': 'log', 'penalty': 'l1'}
GB accuracy:  0.7878
GB auc:  0.8088238109065482
SGD accuracy:  0.6618
SGD auc:  0.6144092527101681
gradient boosting
0.8115920234785365
{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 40}
SGD
0.6845025759430101
{'alpha': 0.001, 'loss': 'modified_huber', 'penalty': 'l1'}
GB accuracy:  0.7896
GB auc:  0.8124587065083542
SGD accuracy:  0.697
SGD auc:  0.6267854465149884
gradient boosting
0.8067572743765432
{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 40}
SGD
0.6903865608728992
{'alpha': 0.001, 'loss': 'log', 'penalty': 'l2'}
GB accuracy:  0.7932
GB auc:  0.8190103548536276
SGD accur