In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import importlib
import lightgbm as lgb
import optuna
#import airbnb_test_func_ver2 as tf
#importlib.reload(tf)

%matplotlib inline

In [2]:
df = pd.read_csv("train_perday.csv")

#df = tf.data_pre(df)

#model = tf.model_lgb(df)

In [3]:
def objective(trial):

    df_train, df_val = train_test_split(df, test_size=0.2)
    
    col = "n"
    train_y = df_train[col]
    train_x = df_train.drop(col, axis=1)

    val_y = df_val[col]
    val_x = df_val.drop(col, axis=1)

    trains = lgb.Dataset(train_x, train_y)
    valids = lgb.Dataset(val_x, val_y)

    #https://lightgbm.readthedocs.io/en/latest/Parameters.html
    #https://knknkn.hatenablog.com/entry/2021/06/29/125226
    #https://zenn.dev/megane_otoko/articles/2021ad_09_optuna_optimization
    
    
    learning_rate = trial.suggest_float('learning_rate', 0.0, 1.0)
    num_leaves =  trial.suggest_int("num_leaves", 2, 50)
    tree_learner = trial.suggest_categorical('tree_learner', ["serial", "feature", "data", "voting"])
    lambda_l1= trial.suggest_float("lambda_l1", 0.0, 200.0)
    lambda_l2 = trial.suggest_float("lambda_l2", 0.0, 200.0)
    max_depth = trial.suggest_int("max_depth", 2, 12)
    
    params = {
        "task": "train", 
        "objective": "regression",
        "boosting_type": "gbdt", 
        "metrics": {"rmse"}, 
        "learning_rate": learning_rate, 
        "num_leaves": num_leaves, 
        "tree_learner": tree_learner,
        "lambda_l1": lambda_l1, 
        "lambda_l2": lambda_l2, 
        "seed": 123, 
        "max_depth": max_depth,
        "num_iterations": 100      
    }

        #カテゴリカルデータをリストで渡す
    categorical_list = [
                        "DoW",
                        "Holiday",
                        "shopID",
                        "Cat",
                        "itemID"
                        ]
    #https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html
    #https://lightgbm.readthedocs.io/en/latest/Python-API.html
    model = lgb.train(params, 
                        trains, 
                        valid_sets=valids, 
                        categorical_feature=categorical_list, 
                        num_boost_round=1000, 
                        )

    pred_y = model.predict(val_x)
    rmse = np.sqrt(mse(val_y, pred_y))
    
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2023-09-06 20:45:24,189] A new study created in memory with name: no-name-e599c9c1-7dad-43f0-86f8-83b027fe683b
[W 2023-09-06 20:45:24,195] Trial 0 failed with parameters: {'learning_rate': 0.44679913709014296, 'num_leaves': 49, 'tree_learner': 'voting', 'lambda_l1': 63.28673550440549, 'lambda_l2': 108.2203089676629, 'max_depth': 12} because of the following error: ValueError('pandas dtypes must be int, float or bool.\nFields with bad pandas dtypes: DoW: object').
Traceback (most recent call last):
  File "/Users/makoto/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/y6/cy0bdhj54v19n79mw0xn1vq40000gn/T/ipykernel_3480/2645578566.py", line 52, in objective
    model = lgb.train(params,
  File "/Users/makoto/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lightgbm/engine.py", line 245, in train
    booster = Booster(params=params, train_set=train_set)
  

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: DoW: object

In [None]:
print("Best trial: {}".format(study.best_trial.number))
print("Best params: {}".format(study.best_trial.params))
print("Best score: {}".format(study.best_trial.value)) 

Best trial: 88
Best params: {'learning_rate': 0.7762661554061182, 'num_leaves': 39, 'tree_learner': 'voting', 'lambda_l1': 85.68378855219873, 'lambda_l2': 154.16051855835047, 'max_depth': 9}
Best score: 0.9858088852159445


In [None]:
df_train, df_val = train_test_split(df, test_size=0.2)
    
col = "n"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y)

categorical_list = [
                    "DoW",
                    "Holiday",
                    "shopID",
                    "Cat",
                    "itemID"
                    ]

model = lgb.train(study.best_trial.params, 
                    trains, 
                    valid_sets=valids, 
                    categorical_feature=categorical_list, 
                    num_boost_round=1000, 
                    )

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 457
[LightGBM] [Info] Number of data points in the train set: 53856, number of used features: 5
[LightGBM] [Info] Start training from score 0.748942


In [None]:
#テストデータを予測
df_test = pd.read_csv("test_perday.csv")

predict = model.predict(df_test)
df_test["n"] = predict


In [None]:
df_test

In [None]:
#df_test.to_csv("predict_perday.csv", index=True, header=False)

In [None]:
#lightgbmのfeature_importance関数を使って特徴量重要度を見てみる
#pd.DataFrame(model.feature_importance(), index=val_x.columns, columns=["importance"]).sort_values("importance", ascending=False)

Unnamed: 0,importance
MeanPrice,1292
itemID,832
shopID,535
Month,432
Cat,5
