In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
import importlib
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import optuna

%matplotlib inline

In [118]:
df = pd.read_csv("train_meanprice.csv")
df_test = pd.read_csv("test_meanprice.csv")


In [119]:
df_train, df_val = train_test_split(df, test_size=0.2, shuffle=True, random_state=123)
    
col = "n"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

stacking_answer= df_val[col].to_list()

categorical_features = [
                    "Month",
                    "shopID",
                    "itemID"
                    ]

In [120]:
'''
# OptunaによるlightGBMの最適パラメータ探索

def objective(trial):

    lgb_trains = lgb.Dataset(train_x, train_y)
    lgb_valids = lgb.Dataset(val_x, val_y)
    
    learning_rate = trial.suggest_float('learning_rate', 0.0, 1.0)
    num_leaves =  trial.suggest_int("num_leaves", 2, 100)
    tree_learner = trial.suggest_categorical('tree_learner', ["serial", "feature", "data", "voting"])
    lambda_l1= trial.suggest_float("lambda_l1", 0.0, 200.0)
    lambda_l2 = trial.suggest_float("lambda_l2", 0.0, 200.0)
    max_depth = trial.suggest_int("max_depth", 2, 12)
    num_iterations = trial.suggest_int("num_iterations", 10, 100)

    lgb_params = {
        "objective": "regression",
        "boosting_type": "gbdt", 
        "metrics": "rmse", 
        "learning_rate": learning_rate, 
        "num_leaves": num_leaves, 
        "tree_learner": tree_learner,
        "lambda_l1": lambda_l1, 
        "lambda_l2": lambda_l2, 
        "seed": 123, 
        "max_depth": max_depth,
        "num_iterations": num_iterations      
    }

    regressor = lgb.train(lgb_params,
                        lgb_trains,
                        valid_sets=lgb_valids, 
                        categorical_feature=categorical_features,
                        )

    lgb_pred_val_y = regressor.predict(val_x)

    rmse = np.sqrt(mse(val_y, lgb_pred_val_y))
    
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
'''

'\n# OptunaによるlightGBMの最適パラメータ探索\n\ndef objective(trial):\n\n    lgb_trains = lgb.Dataset(train_x, train_y)\n    lgb_valids = lgb.Dataset(val_x, val_y)\n    \n    learning_rate = trial.suggest_float(\'learning_rate\', 0.0, 1.0)\n    num_leaves =  trial.suggest_int("num_leaves", 2, 100)\n    tree_learner = trial.suggest_categorical(\'tree_learner\', ["serial", "feature", "data", "voting"])\n    lambda_l1= trial.suggest_float("lambda_l1", 0.0, 200.0)\n    lambda_l2 = trial.suggest_float("lambda_l2", 0.0, 200.0)\n    max_depth = trial.suggest_int("max_depth", 2, 12)\n    num_iterations = trial.suggest_int("num_iterations", 10, 100)\n\n    lgb_params = {\n        "objective": "regression",\n        "boosting_type": "gbdt", \n        "metrics": "rmse", \n        "learning_rate": learning_rate, \n        "num_leaves": num_leaves, \n        "tree_learner": tree_learner,\n        "lambda_l1": lambda_l1, \n        "lambda_l2": lambda_l2, \n        "seed": 123, \n        "max_depth": max_depth,\

In [121]:
'''
# パラメータ最適化したlightGBMによる予想

trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y)

model_lgb = lgb.train(study.best_trial.params, 
                    trains, 
                    valid_sets=valids, 
                    categorical_feature=categorical_features
                    )

val_lgb = model_lgb.predict(val_x)
rmse = np.sqrt(mse(val_y, val_lgb))

lgb_pred = model_lgb.predict(df_test)

rmse
'''

'\n# パラメータ最適化したlightGBMによる予想\n\ntrains = lgb.Dataset(train_x, train_y)\nvalids = lgb.Dataset(val_x, val_y)\n\nmodel_lgb = lgb.train(study.best_trial.params, \n                    trains, \n                    valid_sets=valids, \n                    categorical_feature=categorical_features\n                    )\n\nval_lgb = model_lgb.predict(val_x)\nrmse = np.sqrt(mse(val_y, val_lgb))\n\nlgb_pred = model_lgb.predict(df_test)\n\nrmse\n'

In [122]:
def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = cb.CatBoostRegressor(**params, silent=True)
    model.fit(train_x, train_y)
    predictions = model.predict(val_x)
    rmse = mse(val_y, predictions, squared=False)
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2023-09-08 21:23:57,182] A new study created in memory with name: no-name-381a2710-09e2-4825-815b-eb1e2ea83c4c
[I 2023-09-08 21:23:58,109] Trial 0 finished with value: 1.1912550209764636 and parameters: {'learning_rate': 0.006481634893725254, 'depth': 10, 'subsample': 0.09635724972896968, 'colsample_bylevel': 0.26271434702369945, 'min_data_in_leaf': 80}. Best is trial 0 with value: 1.1912550209764636.
[I 2023-09-08 21:24:02,273] Trial 1 finished with value: 0.8783206635347132 and parameters: {'learning_rate': 0.017933261894093496, 'depth': 10, 'subsample': 0.8940943744644371, 'colsample_bylevel': 0.8273851212221864, 'min_data_in_leaf': 55}. Best is trial 1 with value: 0.8783206635347132.
[I 2023-09-08 21:24:04,655] Trial 2 finished with value: 1.1495031382863004 and parameters: {'learning_rate': 0.0021294857378186304, 'depth': 8, 'subsample': 0.5223602347636219, 'colsample_bylevel': 0.6942229613686828, 'min_data_in_leaf': 56}. Best is trial 1 with value: 0.8783206635347132.
[I 2023-

In [123]:
#Catboostによる予想

cb_train = cb.Pool(train_x, label=train_y, cat_features=categorical_features)
cb_test = cb.Pool(val_x, label=val_y, cat_features=categorical_features)

model_cb = cb.CatBoostRegressor(**study.best_params, loss_function='RMSE')
model_cb.fit(cb_train)

val_cb = model_cb.predict(val_x)
rmse = np.sqrt(mse(val_y, val_cb))

cb_pred = model_cb.predict(df_test)

rmse


0:	learn: 1.3022765	total: 8.73ms	remaining: 8.72s
1:	learn: 1.2755493	total: 15.5ms	remaining: 7.75s
2:	learn: 1.2523330	total: 21.2ms	remaining: 7.05s
3:	learn: 1.2350987	total: 27.5ms	remaining: 6.84s
4:	learn: 1.2194961	total: 34.7ms	remaining: 6.91s
5:	learn: 1.2063174	total: 41.5ms	remaining: 6.87s
6:	learn: 1.1932540	total: 47.6ms	remaining: 6.75s
7:	learn: 1.1780692	total: 54.9ms	remaining: 6.81s
8:	learn: 1.1681914	total: 61ms	remaining: 6.72s
9:	learn: 1.1582223	total: 69.8ms	remaining: 6.91s
10:	learn: 1.1582214	total: 71ms	remaining: 6.38s
11:	learn: 1.1494967	total: 77.2ms	remaining: 6.35s
12:	learn: 1.1409472	total: 83.6ms	remaining: 6.35s
13:	learn: 1.1356748	total: 89.4ms	remaining: 6.3s
14:	learn: 1.1260433	total: 97ms	remaining: 6.37s
15:	learn: 1.1219393	total: 101ms	remaining: 6.23s
16:	learn: 1.1133689	total: 109ms	remaining: 6.3s
17:	learn: 1.1051984	total: 117ms	remaining: 6.4s
18:	learn: 1.1015783	total: 123ms	remaining: 6.37s
19:	learn: 1.0952374	total: 132ms	r

0.776492722588476

In [124]:
submit = pd.DataFrame(cb_pred)
submit.to_csv("submit20.csv", index=True, header=False)


In [125]:
'''
# XGBoostによる予想

xgb_train = xgb.DMatrix(train_x, label=train_y)
xgb_val = xgb.DMatrix(val_x, label=val_y)
xgb_test = xgb.DMatrix(df_test)

params = {'objective':'reg:squarederror',
          'random_state':123}

model_xgb = xgb.train(params, dtrain=xgb_train, num_boost_round=1000)

val_xgb = model_xgb.predict(xgb_val)
rmse = np.sqrt(mse(val_y, val_xgb))

xgb_pred = model_xgb.predict(xgb_test)

rmse

'''

"\n# XGBoostによる予想\n\nxgb_train = xgb.DMatrix(train_x, label=train_y)\nxgb_val = xgb.DMatrix(val_x, label=val_y)\nxgb_test = xgb.DMatrix(df_test)\n\nparams = {'objective':'reg:squarederror',\n          'random_state':123}\n\nmodel_xgb = xgb.train(params, dtrain=xgb_train, num_boost_round=1000)\n\nval_xgb = model_xgb.predict(xgb_val)\nrmse = np.sqrt(mse(val_y, val_xgb))\n\nxgb_pred = model_xgb.predict(xgb_test)\n\nrmse\n\n"

In [126]:
'''
stacking_list = []
stacking_list.append(val_lgb.tolist())
stacking_list.append(val_cb.tolist())
#stacking_list.append(val_xgb.tolist())
stacking_list.append(stacking_answer)

stacking_df_t = pd.DataFrame(stacking_list, index=["lgb","cb", "answer"])

stacking_df = stacking_df_t.T
'''

'\nstacking_list = []\nstacking_list.append(val_lgb.tolist())\nstacking_list.append(val_cb.tolist())\n#stacking_list.append(val_xgb.tolist())\nstacking_list.append(stacking_answer)\n\nstacking_df_t = pd.DataFrame(stacking_list, index=["lgb","cb", "answer"])\n\nstacking_df = stacking_df_t.T\n'

In [127]:
'''
st_train, st_val = train_test_split(stacking_df, test_size=0.2, shuffle=True, random_state=123)

st_train_y = st_train["answer"]
st_train_x = st_train.drop("answer", axis=1)

st_val_y = st_val["answer"]
st_val_x = st_val.drop("answer", axis=1)
'''


'\nst_train, st_val = train_test_split(stacking_df, test_size=0.2, shuffle=True, random_state=123)\n\nst_train_y = st_train["answer"]\nst_train_x = st_train.drop("answer", axis=1)\n\nst_val_y = st_val["answer"]\nst_val_x = st_val.drop("answer", axis=1)\n'

In [128]:
'''
def objective(trial):

    st_lgb_trains = lgb.Dataset(st_train_x, st_train_y)
    st_lgb_valids = lgb.Dataset(st_val_x, st_val_y)
    
    learning_rate = trial.suggest_float('learning_rate', 0.0, 1.0)
    num_leaves =  trial.suggest_int("num_leaves", 2, 100)
    tree_learner = trial.suggest_categorical('tree_learner', ["serial", "feature", "data", "voting"])
    lambda_l1= trial.suggest_float("lambda_l1", 0.0, 200.0)
    lambda_l2 = trial.suggest_float("lambda_l2", 0.0, 200.0)
    max_depth = trial.suggest_int("max_depth", 2, 12)
    num_iterations = trial.suggest_int("num_iterations", 10, 100)

    lgb_params = {
        "objective": "regression",
        "boosting_type": "gbdt", 
        "metrics": "rmse", 
        "learning_rate": learning_rate, 
        "num_leaves": num_leaves, 
        "tree_learner": tree_learner,
        "lambda_l1": lambda_l1, 
        "lambda_l2": lambda_l2, 
        "seed": 123, 
        "max_depth": max_depth,
        "num_iterations": num_iterations      
    }

    regressor = lgb.train(lgb_params,
                        st_lgb_trains,
                        valid_sets=st_lgb_valids
                        )

    lgb_pred_st_val_y = regressor.predict(st_val_x)

    rmse = np.sqrt(mse(st_val_y, lgb_pred_st_val_y))
    
    return rmse

st_study = optuna.create_study(direction='minimize')
st_study.optimize(objective, n_trials=100)
'''

'\ndef objective(trial):\n\n    st_lgb_trains = lgb.Dataset(st_train_x, st_train_y)\n    st_lgb_valids = lgb.Dataset(st_val_x, st_val_y)\n    \n    learning_rate = trial.suggest_float(\'learning_rate\', 0.0, 1.0)\n    num_leaves =  trial.suggest_int("num_leaves", 2, 100)\n    tree_learner = trial.suggest_categorical(\'tree_learner\', ["serial", "feature", "data", "voting"])\n    lambda_l1= trial.suggest_float("lambda_l1", 0.0, 200.0)\n    lambda_l2 = trial.suggest_float("lambda_l2", 0.0, 200.0)\n    max_depth = trial.suggest_int("max_depth", 2, 12)\n    num_iterations = trial.suggest_int("num_iterations", 10, 100)\n\n    lgb_params = {\n        "objective": "regression",\n        "boosting_type": "gbdt", \n        "metrics": "rmse", \n        "learning_rate": learning_rate, \n        "num_leaves": num_leaves, \n        "tree_learner": tree_learner,\n        "lambda_l1": lambda_l1, \n        "lambda_l2": lambda_l2, \n        "seed": 123, \n        "max_depth": max_depth,\n        "num_i

In [129]:
'''
st_trains = lgb.Dataset(st_train_x, st_train_y)
st_valids = lgb.Dataset(st_val_x, st_val_y)

st_model_lgb = lgb.train(st_study.best_trial.params, 
                    st_trains, 
                    valid_sets=st_valids
                    )

st_val_lgb = st_model_lgb.predict(st_val_x)
rmse = np.sqrt(mse(st_val_y, st_val_lgb))
'''

'\nst_trains = lgb.Dataset(st_train_x, st_train_y)\nst_valids = lgb.Dataset(st_val_x, st_val_y)\n\nst_model_lgb = lgb.train(st_study.best_trial.params, \n                    st_trains, \n                    valid_sets=st_valids\n                    )\n\nst_val_lgb = st_model_lgb.predict(st_val_x)\nrmse = np.sqrt(mse(st_val_y, st_val_lgb))\n'

In [130]:
'''
test_stacking_list = []
test_stacking_list.append(lgb_pred.tolist())
test_stacking_list.append(cb_pred.tolist())
#test_stacking_list.append(xgb_pred.tolist())

test_stacking_df_t = pd.DataFrame(test_stacking_list, index=["lgb","cb"])

test_stacking_df = test_stacking_df_t.T
'''

'\ntest_stacking_list = []\ntest_stacking_list.append(lgb_pred.tolist())\ntest_stacking_list.append(cb_pred.tolist())\n#test_stacking_list.append(xgb_pred.tolist())\n\ntest_stacking_df_t = pd.DataFrame(test_stacking_list, index=["lgb","cb"])\n\ntest_stacking_df = test_stacking_df_t.T\n'

In [131]:
#final_predict = st_model_lgb.predict(test_stacking_df)

In [132]:
#test_stacking_df["final"] = final_predict

In [133]:
#test_stacking_df

In [134]:
#submit = np.exp(test_stacking_df["final"]) - 10

In [135]:
#submit.to_csv("submit19.csv", index=True, header=False)

In [136]:
#df_test["n"].to_csv("submit16.csv", index=True, header=False)

In [137]:
#lightgbmのfeature_importance関数を使って特徴量重要度を見てみる
#pd.DataFrame(model_lgbm.feature_importance(), index=val_x.columns, columns=["importance"]).sort_values("importance", ascending=False)