# Basic Concepts

In [1]:
# Uses the Kaggle dataset cited as a reference in "Machine Learning with Shopping Data" (쇼핑데이터를 활용한 머신러닝)
# Data source: https://www.kaggle.com/competitions/tabular-playground-series-jan-2021/overview
# Reproduced for study from this Kaggle notebook: https://www.kaggle.com/code/hamzaghanmi/xgboost-catboost-using-optuna

In [2]:
# Load the competition CSV files from Google Drive.
# NOTE: Google Drive must already be mounted at /content/drive before this
# cell runs; nothing in this cell performs the mount.
from pathlib import Path

import pandas as pd

# Single place to change when the data lives somewhere else.
DATA_DIR = Path('/content/drive/MyDrive/KaggleData/tabular-playground-series-jan-2021')

sub = pd.read_csv(DATA_DIR / 'sample_submission.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
train = pd.read_csv(DATA_DIR / 'train.csv')

In [None]:
# optuna 인스톨
!pip install optuna

In [None]:
# catboost 인스톨 
!pip install catboost

In [5]:
import optuna
import xgboost as xgb
from catboost import CatBoostRegressor
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,1,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,7.243043
1,3,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,8.203331
2,4,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,7.776091
3,5,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,6.957716
4,8,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,7.951046


In [7]:
# List every column name of the training frame as a plain Python list.
train.columns.to_list()

['id',
 'cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14',
 'target']

In [8]:
# Feature columns: every column of `train` except the row identifier ('id')
# and the regression label ('target'), in their original order.
columns = [
    name
    for name in train.columns.to_list()
    if name not in ['id', 'target']
]

In [9]:
# Display the column names kept after excluding 'id' and 'target'.
columns

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14']

In [10]:
# Split the training frame into the label series and the feature frame.
target = train['target']   # regression label (pandas Series)
data = train[columns]      # feature columns only

In [11]:
# Display the feature frame (the rendered output is abbreviated with "..." rows).
data

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0.670390,0.811300,0.643968,0.291791,0.284117,0.855953,0.890700,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411
1,0.388053,0.621104,0.686102,0.501149,0.643790,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484
2,0.834950,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047
3,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528
4,0.935278,0.421235,0.303801,0.880214,0.665610,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.216974,0.735265,0.648648,0.255387,0.616353,0.345197,0.295718,0.304357,0.314351,0.860504,0.315397,0.247682,0.486542,0.288750
299996,0.545799,0.165139,0.220966,0.190053,0.359362,0.386336,0.365767,0.344217,0.466446,0.454581,0.360251,0.360755,0.292535,0.619984
299997,0.284401,0.841542,0.957585,0.340383,0.396279,0.330376,0.525687,0.260039,0.378174,0.526925,0.491735,0.516629,0.173521,0.714552
299998,0.481900,0.622346,0.540032,0.823118,0.283066,0.434283,0.174342,0.710843,0.358690,0.648272,0.984647,1.001110,0.063956,0.377693


In [12]:
# Print the target values, then show the object's type as the cell result.
print(target)
type(target)

0         7.243043
1         8.203331
2         7.776091
3         6.957716
4         7.951046
            ...   
299995    7.385215
299996    7.242617
299997    9.592487
299998    8.207951
299999    8.890285
Name: target, Length: 300000, dtype: float64


pandas.core.series.Series

# XGBoost using Optuna

In [13]:
def objective(trial, data=data, target=target):
  """Optuna objective: fit an XGBoost regressor and return its hold-out RMSE.

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    data: Feature table; defaults to the notebook-level `data`.
    target: Regression target aligned with `data`; defaults to the
      notebook-level `target`.

  Returns:
    RMSE of the fitted model on the 15% hold-out split (lower is better).
  """
  # The split seed is fixed, so every trial is scored on the same 85/15 split.
  train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15, random_state=42)

  # Hyperparameter search space.
  # NOTE(review): 'gpu_hist' needs a GPU-enabled xgboost runtime; newer xgboost
  # releases deprecate it -- confirm against the installed version.
  param = {
      'tree_method': 'gpu_hist',
      # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
      # the parameter names and ranges are unchanged.
      'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
      'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
      'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
      'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
      'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
      # Upper bound on boosting rounds; early stopping below ends training sooner.
      'n_estimators': 10000,
      'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
      # Single-choice categorical: fixes the seed while still recording it in the trial params.
      'random_state': trial.suggest_categorical('random_state', [2020]),
      'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
  }

  model = xgb.XGBRegressor(**param)

  # Train, stopping once the hold-out metric has not improved for 100 rounds.
  # NOTE(review): the same hold-out set drives early stopping and the final
  # score, so the returned RMSE is likely optimistic.
  # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
  # estimator constructor rather than fit() -- confirm against the installed version.
  model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)

  # Predict on the hold-out split.
  preds = model.predict(test_x)

  # RMSE as the square root of the MSE; this avoids the `squared=False`
  # keyword, which newer scikit-learn releases no longer accept.
  rmse = np.sqrt(mean_squared_error(test_y, preds))

  return rmse







In [15]:
# Run the hyperparameter search, minimizing the RMSE returned by `objective`.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable across reruns.
# NOTE(review): the trained models may still vary slightly between runs
# (e.g. GPU training) -- confirm if exact scores must match.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-09-29 08:04:19,677][0m A new study created in memory with name: no-name-2e1807ab-0570-4e6d-b56e-a63d9abec40f[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-29 08:04:49,576][0m Trial 0 finished with value: 0.6953031694190056 and parameters: {'lambda': 9.16925241462453, 'alpha': 1.3755051390713275, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.016, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 101}. Best is trial 0 with value: 0.6953031694190056.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-29 08:05:13,100][0m Trial 1 finished with value: 0.6935876181060265 and parameters: {'lambda': 1.3813333848411027, 'alpha': 0.887849853579111, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.018, 'max_depth': 13, 'random_state': 2020, 'min_child_weight': 233}. Best is trial 1 with value: 0.6935876181060265.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-29 08:05:38,066][0m Trial 2 finished with value: 0.6940776910087852 and parameters: {'lambda': 0.3487294319331634, 'alpha': 0.007978089353162751, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 225}. Best is trial 1 with value: 0.6935876181060265.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-29 08:06:12,546][0m Trial 3 finished with value: 0.6941931919489813 and parameters: {'lambda': 0.0016906284851610851, 'alpha': 3.4204813712774964, 'colsample_bytree': 0.9, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 256}. Best is trial 1 with value: 0.6935876181060265.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-29 08:06:43,101][0m Trial 4 finished with value: 0.6956479782838288 and parameters: {'lambda': 0.08956180251476895, 'alpha': 0.002124834540870713, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 9, 'random_state': 2020, 'min_child_weight': 13}. Best is trial 1 with value: 0.6935876181060265.[0m


Number of finished trials: 5
Best trial: {'lambda': 1.3813333848411027, 'alpha': 0.887849853579111, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.018, 'max_depth': 13, 'random_state': 2020, 'min_child_weight': 233}


In [16]:
# Tabulate every trial: objective value, timestamps, duration, sampled parameters and state.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,0.695303,2022-09-29 08:04:19.684712,2022-09-29 08:04:49.575742,0 days 00:00:29.891030,1.375505,0.9,9.169252,0.016,5,101,2020,0.7,COMPLETE
1,1,0.693588,2022-09-29 08:04:49.578024,2022-09-29 08:05:13.100437,0 days 00:00:23.522413,0.88785,0.5,1.381333,0.018,13,233,2020,0.7,COMPLETE
2,2,0.694078,2022-09-29 08:05:13.102479,2022-09-29 08:05:38.065689,0 days 00:00:24.963210,0.007978,0.6,0.348729,0.014,15,225,2020,0.4,COMPLETE
3,3,0.694193,2022-09-29 08:05:38.067918,2022-09-29 08:06:12.545743,0 days 00:00:34.477825,3.420481,0.9,0.001691,0.01,15,256,2020,0.5,COMPLETE
4,4,0.695648,2022-09-29 08:06:12.547794,2022-09-29 08:06:43.101259,0 days 00:00:30.553465,0.002125,0.9,0.089562,0.012,9,13,2020,1.0,COMPLETE


In [17]:
# Plot the optimization history of the study.
optuna.visualization.plot_optimization_history(study)

In [18]:
# Parallel-coordinate plot of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [19]:
# Slice plot of the study.
optuna.visualization.plot_slice(study)

In [20]:
# Contour plots of the objective over pairs of the listed hyperparameters.
# Each name is listed exactly once; add 'max_depth' to the list to include it.
optuna.visualization.plot_contour(
    study,
    params=['alpha', 'lambda', 'subsample', 'learning_rate'],
)

In [21]:
# Plot hyperparameter importances for the study.
optuna.visualization.plot_param_importances(study)

In [22]:
# Plot the empirical distribution function (EDF) of the study's objective values.
optuna.visualization.plot_edf(study)