In [17]:
#!pip install shap optuna
#from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
import shap


import optuna
from sklearn.model_selection import train_test_split
from optuna.integration import LightGBMPruningCallback

#Reading the csv data into a variable. CSV was stored on Google Drive.
df=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Project/train.csv")

#Removing the non-number columns from df. Storing all but last columns in x, last column in y
df_int = df._get_numeric_data()
x=df_int.drop("SalePrice",axis=1)
y=df_int.SalePrice
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

#Tuning the hyperparameters with Optuna
def objective(trial):
    """
    Objective function to be minimized.
    """
    param = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        'metric': 'rmse',
        "n_estimators": trial.suggest_categorical("n_estimators", [20000]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 10, 100),
        #"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 150),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    olgbm = lgbm.LGBMRegressor(**param)
    olgbm.fit(X_train, y_train,eval_set=[(X_test,y_test)], callbacks=[lgbm.early_stopping(100), lgbm.log_evaluation, LightGBMPruningCallback(trial,"rmse")])
    predictions = olgbm.predict(X_test)
    accuracy = mean_squared_error(y_test, predictions)
    return accuracy

study = optuna.create_study(study_name="lightgbm", direction="minimize")
study.optimize(objective, n_trials=50)

model = lgbm.LGBMRegressor(**study.best_params)
model.fit(X_train, y_train)
prediction = model.predict(X_test)

print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

fig1 = optuna.visualization.plot_optimization_history(study)
fig2 = optuna.visualization.plot_slice(study)
fig3 = optuna.visualization.plot_param_importances(study)
fig1.show()
fig2.show()
fig3.show()

#for i in range(len(prediction)):
#  print("Optuna Price: {}".format(prediction[i]))

[32m[I 2023-04-16 21:32:24,789][0m A new study created in memory with name: lightgbm[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:25,646][0m Trial 0 finished with value: 876583143.4029151 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.16476035391376567, 'num_leaves': 520, 'max_depth': 19, 'lambda_l1': 30, 'lambda_l2': 55, 'min_gain_to_split': 11.360904177316035, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.5, 'min_data_per_groups': 1}. Best is trial 0 with value: 876583143.4029151.[0m


Early stopping, best iteration is:
[368]	valid_0's rmse: 29607.1
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:26,991][0m Trial 1 finished with value: 805510584.6822413 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.06934656400876338, 'num_leaves': 1580, 'max_depth': 85, 'lambda_l1': 65, 'lambda_l2': 45, 'min_gain_to_split': 8.214765513950292, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 52}. Best is trial 1 with value: 805510584.6822413.[0m


Early stopping, best iteration is:
[487]	valid_0's rmse: 28381.5
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:27,704][0m Trial 2 finished with value: 864819671.9079615 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.2661346318580753, 'num_leaves': 2640, 'max_depth': 57, 'lambda_l1': 75, 'lambda_l2': 45, 'min_gain_to_split': 0.48799759991146663, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8, 'min_data_per_groups': 57}. Best is trial 1 with value: 805510584.6822413.[0m


Early stopping, best iteration is:
[172]	valid_0's rmse: 29407.8
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:28,217][0m Trial 3 finished with value: 864368795.2849247 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.6, 'subsample': 0.5, 'learning_rate': 0.19144603127386298, 'num_leaves': 2200, 'max_depth': 98, 'lambda_l1': 5, 'lambda_l2': 30, 'min_gain_to_split': 14.453684196294947, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 21}. Best is trial 1 with value: 805510584.6822413.[0m


Early stopping, best iteration is:
[99]	valid_0's rmse: 29400.1
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:28,609][0m Trial 4 finished with value: 799428141.3458573 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.2818208429070861, 'num_leaves': 1440, 'max_depth': 83, 'lambda_l1': 90, 'lambda_l2': 20, 'min_gain_to_split': 8.997291176189481, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 36}. Best is trial 4 with value: 799428141.3458573.[0m
[32m[I 2023-04-16 21:32:28,652][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[112]	valid_0's rmse: 28274.2
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:28,965][0m Trial 6 finished with value: 823010371.297512 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.28633065285838893, 'num_leaves': 340, 'max_depth': 42, 'lambda_l1': 45, 'lambda_l2': 20, 'min_gain_to_split': 1.8237835462611995, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4, 'min_data_per_groups': 24}. Best is trial 4 with value: 799428141.3458573.[0m
[32m[I 2023-04-16 21:32:28,999][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:29,062][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[43]	valid_0's rmse: 28688.2
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:29,454][0m Trial 9 finished with value: 867538523.2420225 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.2672089551425359, 'num_leaves': 1280, 'max_depth': 89, 'lambda_l1': 55, 'lambda_l2': 5, 'min_gain_to_split': 11.05458412473893, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.2, 'min_data_per_groups': 7}. Best is trial 4 with value: 799428141.3458573.[0m
[32m[I 2023-04-16 21:32:29,534][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:29,632][0m Trial 11 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[42]	valid_0's rmse: 29454


[32m[I 2023-04-16 21:32:29,717][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:29,793][0m Trial 13 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:30,020][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:30,176][0m Trial 15 pruned. Trial was pruned at iteration 37.[0m
[32m[I 2023-04-16 21:32:30,255][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:30,336][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:30,433][0m Trial 18 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:30,515][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:30,603][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:30,975][0m Trial 21 finished with value: 874880006.9041759 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.29947229367836553, 'num_leaves': 40, 'max_depth': 34, 'lambda_l1': 45, 'lambda_l2': 20, 'min_gain_to_split': 2.5095940962150314, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4, 'min_data_per_groups': 23}. Best is trial 4 with value: 799428141.3458573.[0m
[32m[I 2023-04-16 21:32:31,056][0m Trial 22 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:31,130][0m Trial 23 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[44]	valid_0's rmse: 29578.4


[32m[I 2023-04-16 21:32:31,206][0m Trial 24 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:31,346][0m Trial 25 pruned. Trial was pruned at iteration 22.[0m
[32m[I 2023-04-16 21:32:31,432][0m Trial 26 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:31,883][0m Trial 27 finished with value: 732968729.6324134 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.230947124846262, 'num_leaves': 640, 'max_depth': 84, 'lambda_l1': 65, 'lambda_l2': 0, 'min_gain_to_split': 3.8593312879416466, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 18}. Best is trial 27 with value: 732968729.6324134.[0m


Early stopping, best iteration is:
[22]	valid_0's rmse: 27073.4
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:32,353][0m Trial 28 finished with value: 851373358.8096335 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.2148197713054671, 'num_leaves': 1000, 'max_depth': 85, 'lambda_l1': 65, 'lambda_l2': 0, 'min_gain_to_split': 4.411987062665667, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.7, 'min_data_per_groups': 17}. Best is trial 27 with value: 732968729.6324134.[0m
[32m[I 2023-04-16 21:32:32,484][0m Trial 29 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[19]	valid_0's rmse: 29178.3


[32m[I 2023-04-16 21:32:32,699][0m Trial 30 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:33,277][0m Trial 31 pruned. Trial was pruned at iteration 40.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:36,110][0m Trial 32 finished with value: 812489805.23791 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.2743194526870926, 'num_leaves': 300, 'max_depth': 52, 'lambda_l1': 30, 'lambda_l2': 0, 'min_gain_to_split': 3.4080848570782662, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 18}. Best is trial 27 with value: 732968729.6324134.[0m


Early stopping, best iteration is:
[22]	valid_0's rmse: 28504.2
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:36,508][0m Trial 33 finished with value: 860300009.125689 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.26125252169176333, 'num_leaves': 620, 'max_depth': 52, 'lambda_l1': 0, 'lambda_l2': 0, 'min_gain_to_split': 3.4022632626776743, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.8, 'min_data_per_groups': 16}. Best is trial 27 with value: 732968729.6324134.[0m


Early stopping, best iteration is:
[16]	valid_0's rmse: 29330.9
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:36,912][0m Trial 34 finished with value: 780155490.6117944 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.26867008072150655, 'num_leaves': 200, 'max_depth': 93, 'lambda_l1': 30, 'lambda_l2': 5, 'min_gain_to_split': 3.016547077429239, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 52}. Best is trial 27 with value: 732968729.6324134.[0m
[32m[I 2023-04-16 21:32:37,004][0m Trial 35 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:37,085][0m Trial 36 pruned. Trial was pruned at iteration 1.[0m


Early stopping, best iteration is:
[24]	valid_0's rmse: 27931.3
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:37,172][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:37,264][0m Trial 38 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:37,349][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:37,462][0m Trial 40 pruned. Trial was pruned at iteration 12.[0m


Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:37,921][0m Trial 41 finished with value: 760640348.1798556 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.2759449419622397, 'num_leaves': 280, 'max_depth': 87, 'lambda_l1': 35, 'lambda_l2': 5, 'min_gain_to_split': 3.0234862552530943, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 12}. Best is trial 27 with value: 732968729.6324134.[0m


Early stopping, best iteration is:
[42]	valid_0's rmse: 27579.7
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:38,326][0m Trial 42 finished with value: 805485858.780007 and parameters: {'n_estimators': 20000, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.2699919442964921, 'num_leaves': 320, 'max_depth': 87, 'lambda_l1': 35, 'lambda_l2': 5, 'min_gain_to_split': 1.819184512506061, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 12}. Best is trial 27 with value: 732968729.6324134.[0m


Early stopping, best iteration is:
[26]	valid_0's rmse: 28381.1
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:38,800][0m Trial 43 finished with value: 759706584.8334247 and parameters: {'n_estimators': 20000, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.2720389359513816, 'num_leaves': 320, 'max_depth': 89, 'lambda_l1': 30, 'lambda_l2': 5, 'min_gain_to_split': 1.610474403916649, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001, 'min_data_per_groups': 2}. Best is trial 27 with value: 732968729.6324134.[0m
[32m[I 2023-04-16 21:32:38,891][0m Trial 44 pruned. Trial was pruned at iteration 2.[0m


Early stopping, best iteration is:
[44]	valid_0's rmse: 27562.8
Training until validation scores don't improve for 100 rounds
Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:39,036][0m Trial 45 pruned. Trial was pruned at iteration 24.[0m
[32m[I 2023-04-16 21:32:39,117][0m Trial 46 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-04-16 21:32:39,191][0m Trial 47 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 100 rounds


[32m[I 2023-04-16 21:32:39,684][0m Trial 48 finished with value: 814150748.4501517 and parameters: {'n_estimators': 20000, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.23892306989780568, 'num_leaves': 980, 'max_depth': 85, 'lambda_l1': 10, 'lambda_l2': 0, 'min_gain_to_split': 3.900707739501007, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.8, 'min_data_per_groups': 21}. Best is trial 27 with value: 732968729.6324134.[0m
[32m[I 2023-04-16 21:32:39,766][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m


Early stopping, best iteration is:
[51]	valid_0's rmse: 28533.3
	Best value (rmse): 732968729.63241
	Best params:
		n_estimators: 20000
		colsample_bytree: 0.8
		subsample: 0.7
		learning_rate: 0.230947124846262
		num_leaves: 640
		max_depth: 84
		lambda_l1: 65
		lambda_l2: 0
		min_gain_to_split: 3.8593312879416466
		bagging_fraction: 0.9
		bagging_freq: 1
		feature_fraction: 0.7
		min_data_per_groups: 18
