In [3]:
#!pip install shap optuna
#from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
import shap

import optuna
from sklearn.model_selection import train_test_split
from optuna.integration import LightGBMPruningCallback
from google.colab import files

#Reading the csv data into a variable. CSV was stored on Google Drive.
df=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Project/train.csv")

#Removing the non-number columns from df. Storing all but last columns in x, last column in y
df_int = df._get_numeric_data()
x=df_int.drop("SalePrice",axis=1)
y=df_int.SalePrice
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

#Tuning the hyperparameters with Optuna
def objective(trial):
    """
    Objective function to be minimized.
    """
    param = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        'metric': 'rmse',
        "n_estimators": trial.suggest_categorical("n_estimators", [20000]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 10, 100),
        #"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 150),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    olgbm = lgbm.LGBMRegressor(**param)
    olgbm.fit(X_train, y_train,eval_set=[(X_test,y_test)], callbacks=[lgbm.early_stopping(100), lgbm.log_evaluation, LightGBMPruningCallback(trial,"rmse")])
    predictions = olgbm.predict(X_test)
    accuracy = mean_squared_error(y_test, predictions)
    return accuracy

study = optuna.create_study(study_name="lightgbm", direction="minimize")
study.optimize(objective, n_trials=50)

model = lgbm.LGBMRegressor(**study.best_params)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
#model.booster_.save_model("OptimizedModel.txt")
#files.download("OptimizedModel.txt")

print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

fig1 = optuna.visualization.plot_optimization_history(study)
fig2 = optuna.visualization.plot_slice(study)
fig3 = optuna.visualization.plot_param_importances(study)
fig1.show()
fig2.show()
fig3.show()

for i in range(len(prediction)):
  print("Optuna Price: {}".format(prediction[i]))

[32m[I 2023-04-17 03:04:46,888][0m A new study created in memory with name: lightgbm[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:48,469][0m Trial 0 finished with value: 850025325.7283984 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.1029805430313656, 'num_leaves': 1760, 'max_depth': 23, 'lambda_l1': 50, 'lambda_l2': 75, 'min_gain_to_split': 8.532899149230442, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 74}. Best is trial 0 with value: 850025325.7283984.[0m


Early stopping, best iteration is:
[680]	valid_0's rmse: 29155.2
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:51,837][0m Trial 1 finished with value: 808212359.0774509 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.6, 'subsample': 0.8, 'learning_rate': 0.16926232578759773, 'num_leaves': 2800, 'max_depth': 59, 'lambda_l1': 5, 'lambda_l2': 0, 'min_gain_to_split': 4.051592564686907, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.5, 'min_data_per_groups': 39}. Best is trial 1 with value: 808212359.0774509.[0m


Early stopping, best iteration is:
[43]	valid_0's rmse: 28429.1
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:52,607][0m Trial 2 finished with value: 812943515.9793749 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.17330069864250813, 'num_leaves': 2340, 'max_depth': 24, 'lambda_l1': 30, 'lambda_l2': 60, 'min_gain_to_split': 3.1752223871790903, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.8, 'min_data_per_groups': 94}. Best is trial 1 with value: 808212359.0774509.[0m


Early stopping, best iteration is:
[170]	valid_0's rmse: 28512.2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1499]	valid_0's rmse: 28182.4


[32m[I 2023-04-17 03:04:56,427][0m Trial 3 finished with value: 794245034.0183307 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.017684339539257493, 'num_leaves': 120, 'max_depth': 34, 'lambda_l1': 70, 'lambda_l2': 70, 'min_gain_to_split': 3.0631499585597903, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.5, 'min_data_per_groups': 45}. Best is trial 3 with value: 794245034.0183307.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:57,312][0m Trial 4 finished with value: 930135052.1832384 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.20514584099913596, 'num_leaves': 2980, 'max_depth': 29, 'lambda_l1': 70, 'lambda_l2': 95, 'min_gain_to_split': 3.673022488126125, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.9, 'min_data_per_groups': 79}. Best is trial 3 with value: 794245034.0183307.[0m
[32m[I 2023-04-17 03:04:57,354][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[376]	valid_0's rmse: 30498.1
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:58,732][0m Trial 6 finished with value: 907419441.4837323 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.23694080769245046, 'num_leaves': 2160, 'max_depth': 75, 'lambda_l1': 45, 'lambda_l2': 95, 'min_gain_to_split': 4.977464274689595, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.9, 'min_data_per_groups': 77}. Best is trial 3 with value: 794245034.0183307.[0m


Early stopping, best iteration is:
[316]	valid_0's rmse: 30123.4
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:59,288][0m Trial 7 finished with value: 846860169.5800471 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.23004073848984968, 'num_leaves': 2120, 'max_depth': 22, 'lambda_l1': 25, 'lambda_l2': 35, 'min_gain_to_split': 10.814441149776977, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 86}. Best is trial 3 with value: 794245034.0183307.[0m


Early stopping, best iteration is:
[61]	valid_0's rmse: 29100.9
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:04:59,740][0m Trial 8 finished with value: 859537277.6015306 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.2678309206541556, 'num_leaves': 1720, 'max_depth': 29, 'lambda_l1': 95, 'lambda_l2': 10, 'min_gain_to_split': 11.337307834790998, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 2}. Best is trial 3 with value: 794245034.0183307.[0m
[32m[I 2023-04-17 03:04:59,774][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:04:59,839][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:04:59,917][0m Trial 11 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[54]	valid_0's rmse: 29317.9


[32m[I 2023-04-17 03:04:59,994][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,067][0m Trial 13 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,147][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,245][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,339][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,423][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,519][0m Trial 18 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,606][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,697][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,790][0m Trial 21 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:00,897][0m Trial 22 pruned. Trial was pruned at iteration 0.[0m

Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:05,842][0m Trial 31 finished with value: 853854774.0806085 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.9, 'subsample': 0.6, 'learning_rate': 0.21364663081300395, 'num_leaves': 2280, 'max_depth': 20, 'lambda_l1': 25, 'lambda_l2': 35, 'min_gain_to_split': 5.496195944697495, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 88}. Best is trial 3 with value: 794245034.0183307.[0m


Early stopping, best iteration is:
[126]	valid_0's rmse: 29220.8
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:06,100][0m Trial 32 pruned. Trial was pruned at iteration 51.[0m
[32m[I 2023-04-17 03:05:06,201][0m Trial 33 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:06,921][0m Trial 34 finished with value: 814667495.6535538 and parameters: {'n_estimators': 20000, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.14845353467486364, 'num_leaves': 2940, 'max_depth': 23, 'lambda_l1': 15, 'lambda_l2': 0, 'min_gain_to_split': 4.581064023902234, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 81}. Best is trial 3 with value: 794245034.0183307.[0m
[32m[I 2023-04-17 03:05:07,039][0m Trial 35 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[53]	valid_0's rmse: 28542.4


[32m[I 2023-04-17 03:05:07,146][0m Trial 36 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:07,252][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:07,870][0m Trial 38 finished with value: 782499428.8953199 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.4, 'subsample': 0.6, 'learning_rate': 0.24489617225435922, 'num_leaves': 2800, 'max_depth': 17, 'lambda_l1': 70, 'lambda_l2': 5, 'min_gain_to_split': 3.318183082127133, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 71}. Best is trial 38 with value: 782499428.8953199.[0m
[32m[I 2023-04-17 03:05:07,983][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[51]	valid_0's rmse: 27973.2


[32m[I 2023-04-17 03:05:08,085][0m Trial 40 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-17 03:05:08,361][0m Trial 41 pruned. Trial was pruned at iteration 64.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:08,452][0m Trial 42 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:09,058][0m Trial 43 finished with value: 777931457.2987448 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.247984272795805, 'num_leaves': 2480, 'max_depth': 24, 'lambda_l1': 75, 'lambda_l2': 0, 'min_gain_to_split': 3.160242356640873, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 69}. Best is trial 43 with value: 777931457.2987448.[0m


Early stopping, best iteration is:
[19]	valid_0's rmse: 27891.4
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:09,735][0m Trial 44 finished with value: 824237709.4418552 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.24451379549708505, 'num_leaves': 2400, 'max_depth': 28, 'lambda_l1': 75, 'lambda_l2': 10, 'min_gain_to_split': 3.246366260955959, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.5, 'min_data_per_groups': 44}. Best is trial 43 with value: 777931457.2987448.[0m
[32m[I 2023-04-17 03:05:09,843][0m Trial 45 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[86]	valid_0's rmse: 28709.5
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:10,330][0m Trial 46 finished with value: 823267149.9444742 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.2600871057655526, 'num_leaves': 220, 'max_depth': 17, 'lambda_l1': 90, 'lambda_l2': 5, 'min_gain_to_split': 0.8330052369698073, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 59}. Best is trial 43 with value: 777931457.2987448.[0m


Early stopping, best iteration is:
[43]	valid_0's rmse: 28692.6
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-17 03:05:11,006][0m Trial 47 finished with value: 849572619.6934067 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.23975455867196552, 'num_leaves': 1780, 'max_depth': 54, 'lambda_l1': 75, 'lambda_l2': 10, 'min_gain_to_split': 2.9288248148567515, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 70}. Best is trial 43 with value: 777931457.2987448.[0m
[32m[I 2023-04-17 03:05:11,108][0m Trial 48 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[98]	valid_0's rmse: 29147.4


[32m[I 2023-04-17 03:05:11,214][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m


	Best value (rmse): 777931457.29874
	Best params:
		n_estimators: 20000
		colsample_bytree: 0.7
		subsample: 0.6
		learning_rate: 0.247984272795805
		num_leaves: 2480
		max_depth: 24
		lambda_l1: 75
		lambda_l2: 0
		min_gain_to_split: 3.160242356640873
		bagging_fraction: 0.9
		bagging_freq: 1
		feature_fraction: 0.7
		min_data_per_groups: 69


Optuna Price: 225337.30034241217
Optuna Price: 182019.6523214714
Optuna Price: 111352.21606774292
Optuna Price: 66325.56311194341
Optuna Price: 151272.80504644866
Optuna Price: 346979.96954410255
Optuna Price: 315074.5425273202
Optuna Price: 146623.18295361853
Optuna Price: 215871.43893933244
Optuna Price: 215912.4565592424
Optuna Price: 179227.20201527252
Optuna Price: 80636.76889063501
Optuna Price: 183350.28622159772
Optuna Price: 316231.5072902837
Optuna Price: 243178.67528713957
Optuna Price: 109117.04577070524
Optuna Price: 110324.31948691314
Optuna Price: 113779.51182541635
Optuna Price: 238683.56555092987
Optuna Price: 119952.39492711765
Optuna Price: 118924.975305371
Optuna Price: 123200.95857740453
Optuna Price: 275322.7634276986
Optuna Price: 307252.02703249076
Optuna Price: 102577.61861432229
Optuna Price: 218139.89452493846
Optuna Price: 139251.53452757892
Optuna Price: 175445.78364563372
Optuna Price: 515635.4439715337
Optuna Price: 138877.07914641002
Optuna Price: 111410