In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import re
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV

import joblib

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

In [2]:
def catboost(X_train, X_test, y_train, y_test):

    param_grid = [{'iterations': [500],
                  'depth': [4, 5, 6],
                  'loss_function': ['Logloss', 'CrossEntropy'],
                  'l2_leaf_reg': np.logspace(-20, -19, 3),
                  'leaf_estimation_iterations': [10],
                  'eval_metric': ['AUC'],
                  # 'use_best_model': ['True'],
                  'logging_level':['Silent'],
                  'random_seed': [42]
                 }]
    
    model = CatBoostClassifier()
    
    gs = GridSearchCV(estimator=model,
                     param_grid=param_grid,
                     scoring='roc_auc',
                     n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    y_prob = np.round(gs.predict_proba(X_test), 2)
    y_prob = y_prob[:, 1]
    y_pred = np.where(y_prob > 0.5, 1, 0)
    acc = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_prob)

    print('catboost')
    print(gs.best_score_)
    print(gs.best_params_)
    
    return acc, precision, recall, auc, model

In [3]:
if __name__ == '__main__':
    label_encoder = preprocessing.LabelEncoder()
    pd.options.display.max_columns=None
    
    dataframe = pd.read_csv('train0125.csv')
    dataframe.index = np.arange(10000, 25000)
    dataframe2 = pd.read_csv('test0125.csv')
    dataframe2.index = np.arange(30000, 44999)
    
    train_prob = pd.read_csv('train_problem_data.csv')
    problem = np.zeros(15000)
    problem[train_prob.user_id.unique()-10000] = 1 
    
    X = dataframe.astype(float).values
    y = problem
    
    kf = KFold(n_splits=3)
    i=1
    
    acc_list, precision_list, recall_list, auc_list = [], [], [], []
    
    for train_index, test_index in kf.split(X):
        print('=========================', i, '=================================')
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        acc1, precision1, recall1, auc1, model = catboost(X_train, X_test, y_train, y_test)

        
        # sgd
        print('catboost accuracy: ', acc1)
        print('catboost auc: ', auc1)
        
        acc_list.append(acc1)
        precision_list.append(precision1)
        recall_list.append(recall1)
        auc_list.append(auc1)
        
        i+=1
        
    print('----------------------- final result ------------------------------')
    print('catboost average of accuracy', np.mean(acc_list))
    print('catboost average of AUC', np.mean(auc_list))
  

catboost
0.8311812033338933
{'depth': 4, 'eval_metric': 'AUC', 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}
catboost accuracy:  0.7948
catboost auc:  0.8248534038901602
catboost
0.8283312301037655
{'depth': 5, 'eval_metric': 'AUC', 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}
catboost accuracy:  0.7994
catboost auc:  0.832126813619241
catboost
0.8271172156187243
{'depth': 4, 'eval_metric': 'AUC', 'iterations': 500, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'logging_level': 'Silent', 'loss_function': 'Logloss', 'random_seed': 42}
catboost accuracy:  0.8056
catboost auc:  0.84061030322071
----------------------- final result ------------------------------
catboost average of accuracy 0.7999333333333333
catboost average of AUC 0.8325301735767038


In [4]:
0.713587

0.713587