In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[?25l[K     |█                               | 10 kB 14.9 MB/s eta 0:00:01[K     |██▏                             | 20 kB 19.5 MB/s eta 0:00:01[K     |███▏                            | 30 kB 13.0 MB/s eta 0:00:01[K     |████▎                           | 40 kB 9.4 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 5.6 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 6.1 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.8 MB/s eta 0:00:01[K     |████████▌                       | 81 kB 6.4 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 5.3 MB/s eta 0:00:01[K     |██████████▋                     | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████▊                   | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████▉                  | 133 kB 5.3 MB/s eta 0:00:01[K 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import pickle
import xgboost
import optuna
import numpy as np
import pandas as pd

In [None]:
# Accumulator for the leading token of every "Weight" entry; filled by a later cell.
weight = []
# Training table; the first CSV column is used as the row index.
df = pd.read_csv("train_data.csv", index_col=0)

In [None]:
# Keep only rows whose "Weight" field holds a value. The explicit copy makes the
# filtered frame independent of the original, so the column added to it later
# does not raise pandas' SettingWithCopyWarning.
df = df[df["Weight"] != 'Not Available'].copy()

In [None]:
# Take the first whitespace-separated token of each "Weight" entry.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell cannot duplicate entries and break the later column assignment.
weight = [entry.split()[0] for entry in df["Weight"]]

In [None]:
# Store the parsed weight tokens (still strings at this point) in a new
# lowercase "weight" column; the list needs one entry per row of df.
df["weight"]=weight

In [None]:
# Convert the parsed weight tokens to integers; every other column keeps its dtype.
df = df.astype(dtype={"weight": int})

In [None]:
# Feature matrix: every column except the raw/parsed target and the "ID" column.
X = df.drop(columns=["Weight", "weight", "ID"]).to_numpy()

In [None]:
# Target vector: the integer "weight" column as a NumPy array.
y = df.loc[:, "weight"].to_numpy()

In [None]:
# 5-fold splitter; shuffle=False (the default) is spelled out so it is clear the
# folds are not shuffled before splitting.
kf = KFold(n_splits=5, shuffle=False)

In [None]:
def define_model(trial):
    """Sample one XGBoost hyperparameter configuration from a trial object.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_int`` (an
            Optuna trial when called from a study).

    Returns:
        A 7-tuple ``(lr, n_estimators, max_depth, subsample,
        colsample_bytree, gamma, early_stopping_rounds)``, in that order.
    """
    # A dict literal evaluates its values top to bottom and preserves insertion
    # order, so both the suggest_* call order and the tuple order are fixed.
    sampled = {
        "lr": trial.suggest_float("lr", 1e-5, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 3000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.8, 1, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.8, log=True),
        "gamma": trial.suggest_int("gamma", 0, 5),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 10, 100),
    }
    return tuple(sampled.values())

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated MAE of an XGBoost regressor.

    Hyperparameters come from ``define_model``. For every split produced by the
    module-level ``kf`` over the module-level ``X``/``y`` arrays, a new
    ``XGBRegressor`` is fitted with early stopping and scored with
    ``mean_absolute_error`` on the held-out part of the split.

    Side effect: the regressor fitted on the last split is written to a JSON
    file in the working directory whose name is the sampled values
    concatenated without separators, followed by ``.json``.

    Args:
        trial: Trial object handed in by Optuna; forwarded to ``define_model``.

    Returns:
        The arithmetic mean of the per-split MAE values.
    """
    sampled = define_model(trial)
    (lr, n_estimators, max_depth, subsample,
     colsample_bytree, gamma, early_stopping_rounds) = sampled

    fold_errors = []

    for fit_idx, holdout_idx in kf.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_holdout, y_holdout = X[holdout_idx], y[holdout_idx]

        regressor = xgboost.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=n_estimators,
            learning_rate=lr,
            max_depth=max_depth,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            gamma=gamma,
            seed=1,
        )

        # NOTE(review): the held-out part is also the early-stopping eval set, so
        # the MAE recorded below is measured on data that already influenced when
        # training stopped -- confirm this is intended.
        regressor.fit(
            X_fit,
            y_fit,
            early_stopping_rounds=early_stopping_rounds,
            eval_set=[(X_holdout, y_holdout)],
            verbose=False,
            eval_metric="mae",
        )

        holdout_pred = regressor.predict(X_holdout)
        fold_errors.append(mean_absolute_error(y_holdout, holdout_pred))

    # The tuple order matches the order in which the values appear in the file name.
    filename = "".join(str(value) for value in sampled) + ".json"

    # Only the regressor from the final split is still in scope and gets saved.
    regressor.save_model(filename)

    return sum(fold_errors) / len(fold_errors)

In [None]:
# Seed the sampler explicitly: without a seed every run of this cell proposes a
# different sequence of hyperparameters, so the reported best trial cannot be
# reproduced. The seed mirrors the fixed seed passed to the XGBoost regressor.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=1),
)
study.optimize(objective, n_trials=50)

# Report the trial with the lowest objective value.
print("Best trial:")
trial = study.best_trial

print(" Value: ", trial.value)

print(" Params: ")

for key, value in trial.params.items():
    print(f"  {key}: {value}")

[32m[I 2021-11-02 12:43:24,039][0m A new study created in memory with name: no-name-ad9200fe-cab6-4dac-a7cd-3886278dc742[0m
[32m[I 2021-11-02 12:55:28,880][0m Trial 0 finished with value: 170.35404799258873 and parameters: {'lr': 4.486365122761064e-05, 'n_estimators': 2311, 'max_depth': 7, 'subsample': 0.9571045003672071, 'colsample_bytree': 0.6949873096950373, 'gamma': 3, 'early_stopping_rounds': 59}. Best is trial 0 with value: 170.35404799258873.[0m
[32m[I 2021-11-02 13:05:57,294][0m Trial 1 finished with value: 27.414316247957277 and parameters: {'lr': 0.003063497890536736, 'n_estimators': 1747, 'max_depth': 8, 'subsample': 0.9634927963360753, 'colsample_bytree': 0.6915140941626904, 'gamma': 0, 'early_stopping_rounds': 81}. Best is trial 1 with value: 27.414316247957277.[0m
[32m[I 2021-11-02 13:07:13,055][0m Trial 2 finished with value: 27.387844246802054 and parameters: {'lr': 0.023458942153364148, 'n_estimators': 2761, 'max_depth': 5, 'subsample': 0.9544373049552803, '

Best trial:
 Value:  27.376270629318903
 Params: 
  lr: 0.011334813373338989
  n_estimators: 2333
  max_depth: 5
  subsample: 0.8486443999034436
  colsample_bytree: 0.767195468916397
  gamma: 2
  early_stopping_rounds: 55
