# Basic Concepts

In [1]:
# 쇼핑데이터를 활용한 머신러닝에서 참고로 나온 Kaggle Data 활용
# https://www.kaggle.com/competitions/tabular-playground-series-jan-2021/overview -> 데이터 사용
# https://www.kaggle.com/code/hamzaghanmi/xgboost-catboost-using-optuna -> Kaggle 필사 

In [2]:
# Google Drive에서 data load
import pandas as pd
sub = pd.read_csv('/content/drive/MyDrive/KaggleData/tabular-playground-series-jan-2021/sample_submission.csv')
test = pd.read_csv('/content/drive/MyDrive/KaggleData/tabular-playground-series-jan-2021/test.csv')
train = pd.read_csv('/content/drive/MyDrive/KaggleData/tabular-playground-series-jan-2021/train.csv')

In [None]:
# optuna 인스톨
!pip install optuna

In [None]:
# catboost 인스톨 
!pip install catboost

In [5]:
import optuna
import xgboost as xgb
from catboost import CatBoostRegressor
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [6]:
train.head()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,1,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,7.243043
1,3,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,8.203331
2,4,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,7.776091
3,5,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,6.957716
4,8,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,7.951046


In [7]:
train.columns.to_list()

['id',
 'cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14',
 'target']

In [8]:
# train.columns.to_list()는 list 형태로 train의 column을 반환
# 아래의 코드는 id 와 target을 제외한 column을 가져온다
columns = [col for col in train.columns.to_list() if col not in ['id','target']]

In [9]:
columns

['cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10',
 'cont11',
 'cont12',
 'cont13',
 'cont14']

In [10]:
# data의 column 재구성
# target series 만들기
data=train[columns]
target=train['target']

In [11]:
data

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,0.670390,0.811300,0.643968,0.291791,0.284117,0.855953,0.890700,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411
1,0.388053,0.621104,0.686102,0.501149,0.643790,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484
2,0.834950,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047
3,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528
4,0.935278,0.421235,0.303801,0.880214,0.665610,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.216974,0.735265,0.648648,0.255387,0.616353,0.345197,0.295718,0.304357,0.314351,0.860504,0.315397,0.247682,0.486542,0.288750
299996,0.545799,0.165139,0.220966,0.190053,0.359362,0.386336,0.365767,0.344217,0.466446,0.454581,0.360251,0.360755,0.292535,0.619984
299997,0.284401,0.841542,0.957585,0.340383,0.396279,0.330376,0.525687,0.260039,0.378174,0.526925,0.491735,0.516629,0.173521,0.714552
299998,0.481900,0.622346,0.540032,0.823118,0.283066,0.434283,0.174342,0.710843,0.358690,0.648272,0.984647,1.001110,0.063956,0.377693


In [12]:
print(target)
type(target)

0         7.243043
1         8.203331
2         7.776091
3         6.957716
4         7.951046
            ...   
299995    7.385215
299996    7.242617
299997    9.592487
299998    8.207951
299999    8.890285
Name: target, Length: 300000, dtype: float64


pandas.core.series.Series

# XGBoost using Optuna

In [13]:
def objective(trial, data=data, target=target):
  # train, test의 data에서 x, y split 
  # test의 크기는 0.15, random_state = 42 (자유롭게 정하면 됨)
  train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15, random_state=42)
  # parameters 설정 (각각은 더 알아보아야 할 듯)
  param = {
      'tree_method':'gpu_hist',
      'lambda' : trial.suggest_loguniform('lambda', 1e-3, 10.0),
      'alpha' : trial.suggest_loguniform('alpha', 1e-3, 10.0),
      'colsample_bytree' : trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
      'subsample' : trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
      'learning_rate' : trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
      'n_estimators' : 10000,
      'max_depth' : trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
      'random_state' : trial.suggest_categorical('random_state', [2020]),
      'min_child_weight' : trial.suggest_int('min_child_weight', 1, 300),
  }

  model = xgb.XGBRegressor(**param)

  #model 학습
  model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
  
  #model이 예측한 값을 할당
  preds = model.predict(test_x)

  #rmse를 사용하여 평가지표
  rmse = mean_squared_error(test_y, preds, squared=False)

  return rmse 







In [15]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-09-30 07:21:31,952][0m A new study created in memory with name: no-name-40b183bd-83a3-4d10-92b2-610955cc3fc1[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-30 07:22:12,857][0m Trial 0 finished with value: 0.6948411505306249 and parameters: {'lambda': 0.0049959261065759585, 'alpha': 1.5443870500056027, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 208}. Best is trial 0 with value: 0.6948411505306249.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-30 07:23:25,987][0m Trial 1 finished with value: 0.6941950966827731 and parameters: {'lambda': 0.08670440835299113, 'alpha': 2.382384028210191, 'colsample_bytree': 0.9, 'subsample': 0.5, 'learning_rate': 0.008, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 122}. Best is trial 1 with value: 0.6941950966827731.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-30 07:23:58,542][0m Trial 2 finished with value: 0.6945521407665127 and parameters: {'lambda': 0.07084282930619408, 'alpha': 0.001232256068946141, 'colsample_bytree': 0.6, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 100}. Best is trial 1 with value: 0.6941950966827731.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-30 07:24:20,328][0m Trial 3 finished with value: 0.6949681243280454 and parameters: {'lambda': 2.772854435635693, 'alpha': 3.021630227727984, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.016, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 158}. Best is trial 1 with value: 0.6941950966827731.[0m
  
  if __name__ == '__main__':




[32m[I 2022-09-30 07:25:03,906][0m Trial 4 finished with value: 0.6940325089695049 and parameters: {'lambda': 0.15321838018206482, 'alpha': 0.734803502696085, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 215}. Best is trial 4 with value: 0.6940325089695049.[0m


Number of finished trials: 5
Best trial: {'lambda': 0.15321838018206482, 'alpha': 0.734803502696085, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 215}


In [16]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,0.694841,2022-09-30 07:21:31.955260,2022-09-30 07:22:12.857476,0 days 00:00:40.902216,1.544387,0.6,0.004996,0.01,7,208,2020,1.0,COMPLETE
1,1,0.694195,2022-09-30 07:22:12.859717,2022-09-30 07:23:25.987124,0 days 00:01:13.127407,2.382384,0.9,0.086704,0.008,17,122,2020,0.5,COMPLETE
2,2,0.694552,2022-09-30 07:23:25.989537,2022-09-30 07:23:58.542228,0 days 00:00:32.552691,0.001232,0.6,0.070843,0.01,7,100,2020,0.5,COMPLETE
3,3,0.694968,2022-09-30 07:23:58.544310,2022-09-30 07:24:20.327930,0 days 00:00:21.783620,3.02163,1.0,2.772854,0.016,7,158,2020,0.4,COMPLETE
4,4,0.694033,2022-09-30 07:24:20.330063,2022-09-30 07:25:03.905872,0 days 00:00:43.575809,0.734804,0.9,0.153218,0.01,15,215,2020,0.4,COMPLETE


In [17]:
optuna.visualization.plot_optimization_history(study)

In [18]:
optuna.visualization.plot_parallel_coordinate(study)

In [19]:
optuna.visualization.plot_slice(study)

In [20]:
optuna.visualization.plot_contour(study, params=['alpha',
                            #'max_depth',
                            'lambda',
                            'subsample',
                            'learning_rate',
                            'subsample'])

In [21]:
optuna.visualization.plot_param_importances(study)

In [22]:
optuna.visualization.plot_edf(study)

In [23]:
Best_trial = study.best_trial.params
Best_trial["n_estimators"], Best_trial["tree_method"] = 10000, 'gpu_hist'
Best_trial

{'lambda': 0.15321838018206482,
 'alpha': 0.734803502696085,
 'colsample_bytree': 0.9,
 'subsample': 0.4,
 'learning_rate': 0.01,
 'max_depth': 15,
 'random_state': 2020,
 'min_child_weight': 215,
 'n_estimators': 10000,
 'tree_method': 'gpu_hist'}

In [28]:
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=42, shuffle=True)
rmse = [] # 각각의 list가 매 fold 마다 rmse를 가지게 만든다.
n = 0 
for trn_idx, test_idx in kf.split(train[columns], train['target']):
  X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[test_idx]
  y_tr, y_val = train['target'].iloc[trn_idx], train['target'].iloc[test_idx]
  model = xgb.XGBRegressor(**Best_trial)
  model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],early_stopping_rounds = 100, verbose=False)
  preds+=model.predict(test[columns])/kf.n_splits
  rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
  print(f"fold: {n+1} ==> rmse: {rmse[n]}")
  n+=1

fold: 1 ==> rmse: 0.694860001497967
fold: 2 ==> rmse: 0.6965315151585909
fold: 3 ==> rmse: 0.6953200935123252
fold: 4 ==> rmse: 0.6979811701320121
fold: 5 ==> rmse: 0.699000322229533


In [29]:
np.mean(rmse)

0.6967386205060857

# Catboost Using Optuna

In [33]:
def objective(trial,data=data,target=target):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        'loss_function': 'RMSE',
        'task_type': 'GPU',
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        #'rsm': trial.suggest_uniform('rsm', 0.3, 1.0),
        # 'subsample': trial.suggest_uniform('bagging_fraction', 0.4, 1.0), # Error: default bootstrap type (bayesian) doesn't support taken fraction option 로 인해 비활성화 
        'learning_rate': trial.suggest_uniform('learning_rate', 0.006, 0.018),
        'n_estimators':  25000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
    }
    model = CatBoostRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=200,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [35]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-09-30 07:34:48,707][0m A new study created in memory with name: no-name-422fac61-988d-401d-9c85-e3f9c276871b[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.


suggest_uniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.

[32m[I 2022-09-30 07:36:31,129][0m Trial 0 finished with value: 0.6989275566566537 and parameters: {'l2_leaf_reg': 0.0011750232960752493, 'max_bin': 364, 'learning_rate': 0.013656326854334384, 'max_depth': 13, 'random_state': 2020, 'min_data_in_leaf': 195}. Best is trial 0 with value: 0.6989275566566537.[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :

Number of finished trials: 5
Best trial: {'l2_leaf_reg': 4.595653682935372, 'max_bin': 232, 'learning_rate': 0.008376992894071872, 'max_depth': 9, 'random_state': 2020, 'min_data_in_leaf': 115}


In [36]:
optuna.visualization.plot_optimization_history(study)

In [37]:
optuna.visualization.plot_parallel_coordinate(study)

In [38]:
optuna.visualization.plot_slice(study)

In [39]:
optuna.visualization.plot_param_importances(study)

In [40]:
optuna.visualization.plot_edf(study)

In [43]:
Best_trial = {'l2_leaf_reg': 4.595653682935372, 'max_bin': 232, 'learning_rate': 0.008376992894071872, 'max_depth': 9, 'random_state': 2020, 'min_data_in_leaf': 115}

In [44]:
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=10,random_state=48,shuffle=True)
rmse=[]   # list contains rmse for each fold
n=0
for trn_idx, test_idx in kf.split(train[columns],train['target']):
    X_tr,X_val=train[columns].iloc[trn_idx],train[columns].iloc[test_idx]
    y_tr,y_val=train['target'].iloc[trn_idx],train['target'].iloc[test_idx]
    model = CatBoostRegressor(**Best_trial)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=200,verbose=False)
    preds+=model.predict(test[columns])/kf.n_splits
    rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    print(f"fold: {n+1} ==> rmse: {rmse[n]}")
    n+=1

fold: 1 ==> rmse: 0.7097585473899558
fold: 2 ==> rmse: 0.7060180429665153
fold: 3 ==> rmse: 0.7060435274745309
fold: 4 ==> rmse: 0.7063483594067501
fold: 5 ==> rmse: 0.7086069457458282
fold: 6 ==> rmse: 0.706898503024904
fold: 7 ==> rmse: 0.7049450024983802
fold: 8 ==> rmse: 0.7079464210047175
fold: 9 ==> rmse: 0.7064541398573513
fold: 10 ==> rmse: 0.7047775795902657


In [45]:
np.mean(rmse)

0.7067797068959198

In [46]:
sub['target']=preds
sub.to_csv('submission.csv', index=False)