In [None]:
# Colab-only setup: mount Google Drive under /content/gdrive and append a
# Drive-hosted directory to sys.path so modules stored there can be imported.
import sys
from google.colab import drive
drive.mount('/content/gdrive') 
sys.path.append('/content/gdrive/MyDrive/My_env')
# ______________________________________________________________________

import os
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 

In [None]:
def NMAE(true, pred) :
  """Normalized mean absolute error (NMAE) used as a prediction-quality metric.

  Both inputs are converted to flat float arrays, so lists, Series, and
  single-column frames are accepted and always compared element by element.

  Args:
    true: array-like of ground-truth values; entries must be non-zero.
    pred: array-like of predictions with the same number of elements as `true`.

  Returns:
    Mean of ``|true - pred| / |true|`` over all elements.
  """
  true = np.asarray(true, dtype = float).ravel()
  pred = np.asarray(pred, dtype = float).ravel()
  # Normalize by the magnitude of the target: dividing by a signed value would
  # let negative targets contribute negative "errors" and cancel real ones out.
  score = np.mean(np.abs(true - pred) / np.abs(true))

  return score

In [None]:
def to_binary(pred) :
  """Threshold scores at 0.5 into hard 0/1 labels without touching the input.

  Args:
    pred: scores as a list/tuple (converted to a new NumPy array) or as any
      object providing ``.copy()`` and boolean-mask assignment, e.g. a NumPy
      array or a pandas Series.

  Returns:
    A copy of `pred` in which values >= 0.5 are 1 and values < 0.5 are 0.
    Entries matching neither comparison (e.g. NaN) are left unchanged.
  """
  if isinstance(pred, (list, tuple)) :
    # Plain sequences support neither mask assignment nor (for tuples) .copy();
    # np.array always builds a fresh array, so the caller's data stays intact.
    pred_conv = np.array(pred)
  else :
    pred_conv = pred.copy()
  pred_conv[pred_conv >= 0.5] = 1
  pred_conv[pred_conv < 0.5] = 0

  return pred_conv

In [None]:
# Competition folder name under .../DACON/; left empty here and must be filled
# in before running, since every path below is derived from it.
contest = ""
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because the later cells build their pickle and result paths from it.
dir = "/content/gdrive/MyDrive/ColabNotes/DACON/{}/".format(contest)
data_dir = dir + "data/"
submission_file = "sample_submission.csv"
test_file = "test_renew.csv"
X_train_file = "X_train.csv"
Y_train_file = "Y_train.csv"

# Load the submission template, the test features and the training
# features/targets from the data directory.
submission = pd.read_csv(data_dir + submission_file)
test = pd.read_csv(data_dir + test_file)
X_train = pd.read_csv(data_dir + X_train_file)
Y_train = pd.read_csv(data_dir + Y_train_file)

In [None]:
rows_train = X_train.shape[0] # number of rows in the given training data
rows_test = test.shape[0] # number of rows in the given test data
basic_seed = 52
num_trial = 100 # number of hyperparameter-tuning trials to run
splits_hp = 5 # number of k-fold splits used during hyperparameter tuning
splits_tr = 10 # number of k-fold splits used during model training
num_seed_tr = 10 # number of seeds used for training

# Per-seed prediction stores filled by the training loop:
# out-of-fold train predictions and fold-averaged test predictions.
pred_dict = {}
pred_test_dict = {}

## Hyperparameter (HP) Tuning

In [None]:
def objective_XGB(trial : Trial) -> float :
    """Optuna objective: out-of-fold accuracy of XGBoost for one trial.

    Samples booster hyperparameters from `trial`, trains one model per
    stratified fold of the module-level `X_train` / `Y_train` with early
    stopping on the held-out fold, thresholds the out-of-fold predictions
    with `to_binary` and scores them against `Y_train`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the thresholded out-of-fold predictions over the whole
        training set (to be maximized by the study).
    """
    seed_hp = 21

    # Only XGBoost parameters are passed: the former "gpu_platform_id",
    # "gpu_device_id" (LightGBM options) and "verbose" keys were dropped
    # because XGBoost does not use them.
    params = {
        "device" : "gpu",
        "random_state": seed_hp,
        "learning_rate": trial.suggest_float("learning_rate", 2e-3, 5e-2), # eta, default=0.3, range=[0,1]
        "gamma": trial.suggest_float("gamma", 1e-2, 1e+2, log = True), # min_split_loss, default=0, range=[0,inf]
        "max_depth": trial.suggest_int("max_depth", 4, 10), # default=6, range=[0,inf]
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), # default=1
        "max_delta_step" : trial.suggest_int("max_delta_step", 0, 10), # default=0
        # Valid range is (0,1]; the lower bound is kept away from 0 so a trial
        # can never ask for an (almost) empty row sample.
        "subsample": trial.suggest_float("subsample", 0.1, 1.0), # default=1
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0), # default=1, range=(0,1]
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.2, 1.0), # default=1, range=(0,1]
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.2, 1.0), # default=1, range=(0,1]
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-1, 1e+1, log = True), # default=0, range=[0,inf]
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-1, 1e+1, log = True), # default=1, range=[0,inf]
        "max_bin": trial.suggest_int("max_bin", 100, 500),
    }

    kfold = StratifiedKFold(n_splits = splits_hp, random_state = seed_hp, shuffle = True)
    cv_pred = np.zeros(rows_train)

    for train_idx, val_idx in kfold.split(X_train, Y_train) :

        x_tr, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        # Flatten the single-column target frame into 1-D label vectors.
        y_tr = Y_train.iloc[train_idx].values.ravel()
        y_val = Y_train.iloc[val_idx].values.ravel()

        dtrain = xgb.DMatrix(x_tr, label = y_tr)
        dvalid = xgb.DMatrix(x_val, label = y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        xgbmodel = xgb.train(params, dtrain, 50000, watchlist, early_stopping_rounds = 30, verbose_eval = None)
        # The native Booster predicts with every trained tree by default, so
        # restrict prediction to the best early-stopped iteration explicitly.
        pred = xgbmodel.predict(dvalid, iteration_range = (0, xgbmodel.best_iteration + 1))
        # accuracy_score needs hard labels, so store the thresholded scores
        # (previously they were overwritten by the raw continuous output).
        cv_pred[val_idx] = to_binary(pred)

    return float(accuracy_score(Y_train, cv_pred))

In [None]:
# Seeded TPE sampler so the sequence of suggested hyperparameters is repeatable.
sampler = TPESampler(seed = basic_seed)
xgb_study = optuna.create_study(study_name = "xgb_parameter_opt", direction = "maximize", sampler = sampler)
# Trials run one at a time: parallel trials would make the seeded search
# order-dependent (not reproducible) and would compete for the same device.
xgb_study.optimize(objective_XGB, n_trials = num_trial, n_jobs = 1)

In [None]:
# Tuned Params
# Combine the study's best trial values with the fixed base settings; the base
# entries are applied last, so they win if a key appears in both.
base_params = {"random_state": basic_seed}
best_params = {**xgb_study.best_trial.params, **base_params}

# Persist the merged dictionary so the modeling section can reload it without
# re-running the search.
hyperparams_path = dir + "pickle/xgb_best_hyperparams.pickle"
with open(hyperparams_path, 'wb') as fw :
    pickle.dump(best_params, fw)
print("The best hyperparameters are:\n", best_params)

In [None]:
# Plot the hyperparameter importances of the finished study (matplotlib
# backend); the trailing ';' suppresses the returned object's repr.
optuna.visualization.matplotlib.plot_param_importances(xgb_study) ;

In [None]:
# Slice plot of the study's trials per hyperparameter (matplotlib backend);
# the trailing ';' suppresses the returned object's repr.
optuna.visualization.matplotlib.plot_slice(xgb_study) ;

## Modeling with XGBoost

In [None]:
# Reload the tuned hyperparameters written by the tuning section, so modeling
# can start from here without repeating the search.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open(dir + "pickle/xgb_best_hyperparams.pickle", 'rb') as fw:
    best_params = pickle.load(fw)

In [None]:
# Draw the fold seeds from a generator seeded with basic_seed so the run is
# reproducible, and without replacement so two runs can never share (and
# overwrite) the same 'xgb<seed>' key in the result dictionaries.
rand_seeds = np.random.default_rng(basic_seed).choice(1000, size = num_seed_tr, replace = False)
# Keep the DMatrix under its own name: rebinding `test` would make this cell
# fail when it is executed a second time.
dtest = xgb.DMatrix(test)

for seed in rand_seeds :
  seed = int(seed)

  kfold = StratifiedKFold(n_splits = splits_tr, random_state = seed, shuffle = True)
  cv_pred = np.zeros(rows_train)   # out-of-fold predictions for this seed
  pred_test = np.zeros(rows_test)  # test predictions averaged over the folds

  for train_idx, val_idx in kfold.split(X_train, Y_train) :

    x_tr, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr = Y_train.iloc[train_idx].values.ravel()
    y_val = Y_train.iloc[val_idx].values.ravel()

    dtrain = xgb.DMatrix(x_tr, label = y_tr)
    dvalid = xgb.DMatrix(x_val, label = y_val)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    xgbmodel = xgb.train(best_params, dtrain, 100000, watchlist, early_stopping_rounds = 30, verbose_eval = None)
    # The native Booster predicts with every trained tree by default, so
    # restrict prediction to the best early-stopped iteration explicitly.
    best_range = (0, xgbmodel.best_iteration + 1)
    cv_pred[val_idx] = xgbmodel.predict(dvalid, iteration_range = best_range)
    pred_test += xgbmodel.predict(dtest, iteration_range = best_range) / splits_tr

  # Raw scores are stored; they are only thresholded for the accuracy report,
  # because accuracy_score rejects continuous predictions.
  pred_dict['xgb' + str(seed)] = cv_pred
  pred_test_dict['xgb' + str(seed)] = pred_test
  print(f'seed {seed}', 'Acc :', accuracy_score(Y_train, to_binary(cv_pred)))

In [None]:
# One column of out-of-fold predictions per seed; the ensemble prediction is
# the row-wise mean across the seeds.
val_result_df = pd.DataFrame(pred_dict)
val_result = val_result_df.mean(axis = 1).to_numpy()

# accuracy_score only accepts discrete labels, so the averaged scores are
# thresholded into 0/1 before scoring.
val_result = to_binary(val_result)

print("최종 모델 Accuracy : ", accuracy_score(Y_train, val_result))

In [None]:
model = "xgb"

# Out-of-fold train predictions: one column per seed, columns in sorted order.
pred_train_df = pd.DataFrame(pred_dict)
train_columns = np.sort(pred_train_df.columns)
pred_train_df = pred_train_df[train_columns]

train_pickle_path = dir + "pickle/pred_train_" + model + ".pickle"
with open(train_pickle_path, "wb") as ptr :
  pickle.dump(pred_train_df, ptr)

# Fold-averaged test predictions: same layout as the train frame.
pred_test_df = pd.DataFrame(pred_test_dict)
test_columns = np.sort(pred_test_df.columns)
pred_test_df = pred_test_df[test_columns]

test_pickle_path = dir + "pickle/pred_test_" + model + ".pickle"
with open(test_pickle_path, "wb") as pte :
  pickle.dump(pred_test_df, pte)

## Submission

In [None]:
# Reload the per-seed test predictions saved by the modeling section, so the
# submission can be built without retraining.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open(dir + "pickle/pred_test_xgb.pickle", 'rb') as pte :
    pred_test_df = pickle.load(pte)

In [None]:
# Display the reloaded per-seed test predictions for a quick visual check.
pred_test_df

In [None]:
# Ensemble the seeds: the final prediction for each test row is the mean of
# that row across all per-seed columns (vectorized instead of a Python loop).
final_result = pred_test_df.mean(axis = 1).tolist()

# Classifier: uncomment to threshold the averaged scores into 0/1 labels.
#final_result = to_binary(final_result)

# Preview only the first few values instead of dumping the whole list.
final_result[:10]

In [None]:
# Write the ensembled predictions into the "target" column of the submission
# template loaded earlier.
submission["target"] = final_result

# Classifier: uncomment to cast the whole submission frame to integers.
#submission = submission.astype(int)

# Display the filled-in submission frame.
submission

In [None]:
# File name (without extension) of the submission; left empty here and should
# be set before saving.
save_name = ""
results_dir = dir + "data/results/"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(results_dir, exist_ok = True)
submission.to_csv(results_dir + "{}.csv".format(save_name), index = False)
# Distribution of the submitted target values as a quick sanity check.
sns.countplot(x = submission["target"]) ;