In [None]:
# !pip install --upgrade tensorflow 

In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from IPython.display import display
import optuna

In [2]:
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this cell runs is hidden (deprecation notices included).
from warnings import filterwarnings
filterwarnings("ignore")

In [3]:
# Competition data directory; both CSV files below are read relative to it.
BASE_PATH = Path("/kaggle/input/playground-series-s3e6")

# Training rows: the `id` column is dropped right after loading.
train_csv = BASE_PATH / "train.csv"
train = pd.read_csv(train_csv).drop(columns=["id"])

# Test rows: keep the ids in `test_idx`, then drop the column from the frame.
test_csv = BASE_PATH / "test.csv"
test = pd.read_csv(test_csv)
test_idx = test["id"]
test = test.drop(columns=["id"])

In [4]:
# Split the training frame into the `price` target and the remaining feature columns.
# Both objects keep the row order of `train`, so they can be indexed with the same positions.
y = train["price"]
X = train.drop(columns=["price"])

In [48]:
# Scratch table: one row per fold count, with a column for an XGB result.
pd.DataFrame(
    data={
        "N_Folds": [3, 4, 5],
        "XGB_results": [20, 20, 30],
    }
)

Unnamed: 0,N_Folds,XGB_results
0,3,20
1,4,20
2,5,30


In [76]:
# Compare K-fold configurations: for every candidate number of folds, fit three
# default gradient-boosting models (XGBoost, LightGBM, CatBoost) on each split and
# measure the validation RMSE. Every measurement is recorded in `all_results`, and
# the mean RMSE per fold count is displayed once the search is finished.
N_FOLDS = [3, 4, 5, 6, 7, 8]

# Has to exist before the loop writes into it (it was previously used without being
# created, which fails on a fresh kernel). Keys are "fold_<k>" only, so the splits of
# a later fold count overwrite those of an earlier one.
all_folds_datasets = {}

all_results = []  # one record per (fold count, fold id)
for N_F in N_FOLDS:
    print("Number of folds: ", N_F)
    kf = KFold(n_splits=N_F, random_state=1337, shuffle=True)

    for fold_id, (train_idx, val_idx) in enumerate(kf.split(X)):
        print("\t fold_id: ", fold_id)
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        fold_data = {"train": X_tr, "train_target": y_tr,
                     "val": X_val, "val_target": y_val}
        all_folds_datasets["fold_"+str(fold_id+1)] = fold_data

        # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
        # `mean_squared_error` is not accepted by recent scikit-learn releases.
        xgb_model = xgb.XGBRegressor(tree_method="gpu_hist")
        xgb_model.fit(X_tr, y_tr, verbose=False)
        y_preds_xgb = xgb_model.predict(X_val)
        rmse_xgb = np.sqrt(mean_squared_error(y_val, y_preds_xgb))
        print("\t\t XGBoost: ", rmse_xgb)

        # Verbosity is set on the estimator: `fit()` no longer takes a `verbose`
        # argument in LightGBM 4.x, while the constructor parameter works in 3.x too.
        lgbm_model = lgbm.LGBMRegressor(device_type="gpu", verbose=-1)
        lgbm_model.fit(X_tr, y_tr)
        y_preds_lgbm = lgbm_model.predict(X_val)
        rmse_lgbm = np.sqrt(mean_squared_error(y_val, y_preds_lgbm))
        print("\t\t LGBM: ", rmse_lgbm)

        cat_model = catboost.CatBoostRegressor()
        cat_model.fit(X_tr, y_tr, verbose=False)
        y_preds_cat = cat_model.predict(X_val)
        rmse_cat = np.sqrt(mean_squared_error(y_val, y_preds_cat))
        print("\t\t CATBOOST: ", rmse_cat)

        average = np.mean([rmse_xgb, rmse_lgbm, rmse_cat])
        print("\t\t\t AVERAGE: ", average)

        all_results.append({"n_folds": N_F, "fold_id": fold_id,
                            "xgb": rmse_xgb, "lgbm": rmse_lgbm, "cat": rmse_cat,
                            "average": average})

# Mean RMSE of every model (and of their average) per fold count, best configuration first.
results_df = pd.DataFrame(all_results)
display(
    results_df
    .groupby("n_folds")[["xgb", "lgbm", "cat", "average"]]
    .mean()
    .sort_values("average")
)

Number of folds:  3
	 fold_id:  0
		 XGBoost:  159432.1585679272
		 LGBM:  145281.55768531864
		 CATBOOST:  147042.50201901066
			 AVERAGE:  150585.40609075216
	 fold_id:  1
		 XGBoost:  176691.47354847848
		 LGBM:  170494.04018474574
		 CATBOOST:  169620.16982847906
			 AVERAGE:  172268.56118723445
	 fold_id:  2
		 XGBoost:  202772.90152157153
		 LGBM:  204252.09813681437
		 CATBOOST:  198732.53077452036
			 AVERAGE:  201919.17681096875
Number of folds:  4
	 fold_id:  0
		 XGBoost:  158581.13665705323
		 LGBM:  164256.53966637346
		 CATBOOST:  158943.65310916188
			 AVERAGE:  160593.77647752952
	 fold_id:  1
		 XGBoost:  162610.85191605688
		 LGBM:  145905.82938447042
		 CATBOOST:  147061.81966354063
			 AVERAGE:  151859.50032135597
	 fold_id:  2
		 XGBoost:  212867.11395255371
		 LGBM:  210559.09452542642
		 CATBOOST:  216335.28822046594
			 AVERAGE:  213253.83223281536
	 fold_id:  3
		 XGBoost:  155590.86809423703
		 LGBM:  174764.44050747747
		 CATBOOST:  146728.93826268555
			 AVE

In [77]:
# Discard the fold splits stored by the fold-count search; the next cell rebuilds
# the dict from scratch. Raises NameError if the dict was never created.
del all_folds_datasets

In [82]:
# 8-fold cross-validation with a default XGBoost regressor.
# Every split is kept in `all_folds_datasets` under the keys "fold_1" .. "fold_8",
# and the validation RMSE of each fold is stored in `cv_scores`.
N_FOLDS = 8

all_folds_datasets = {}
cv_scores = np.zeros(N_FOLDS)
kf_8 = KFold(n_splits=N_FOLDS, shuffle=True, random_state=1337)

for fold_id, (train_idx, val_idx) in enumerate(kf_8.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    fold_data = {"train": X_tr, "train_target": y_tr,
                 "val": X_val, "val_target": y_val}
    all_folds_datasets[f"fold_{fold_id + 1}"] = fold_data

    xgb_model = xgb.XGBRegressor(tree_method="gpu_hist")
    xgb_model.fit(X_tr, y_tr, verbose=False)
    y_preds_xgb = xgb_model.predict(X_val)
    # RMSE as sqrt(MSE): the `squared=False` shortcut of `mean_squared_error`
    # is not accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_val, y_preds_xgb))

    print(f"Fold {fold_id+1} | RMSE: {rmse}")
    cv_scores[fold_id] = rmse

avg_rmse = np.mean(cv_scores)
print(f"Average RMSE: {avg_rmse}")

Fold 1 | RMSE: 185137.4392055286
Fold 2 | RMSE: 140142.53591379174
Fold 3 | RMSE: 77068.04573786366
Fold 4 | RMSE: 198702.14350566862
Fold 5 | RMSE: 132782.73050542094
Fold 6 | RMSE: 296780.37450702506
Fold 7 | RMSE: 191349.7818681119
Fold 8 | RMSE: 91946.05486614653
Average RMSE: 164238.63826369465


In [83]:
all_folds_datasets.keys()

dict_keys(['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5', 'fold_6', 'fold_7', 'fold_8'])

In [84]:
# Work with the split stored under "fold_3": its training part is used to fit the
# models below and its held-out part is used as the validation set.
chosen_fold = all_folds_datasets["fold_3"]

f_X, f_y = chosen_fold["train"], chosen_fold["train_target"]
X_val, y_val = chosen_fold["val"], chosen_fold["val_target"]

In [23]:
# def objective_xgb(trial, X, y, X_val, y_val):
#     params = {
#         'tree_method': "gpu_hist",
#         'n_estimators': trial.suggest_int('n_estimators', 50, 600),
#         'max_depth': trial.suggest_int('max_depth', 2, 30),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
#         'gamma': trial.suggest_loguniform('gamma', 0.00001, 0.3),
#         'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.05),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, step=0.05),
#         'early_stopping_rounds': trial.suggest_int("early_stoppig_rounds", 40, 100)
#     }
    
    
#     model = xgb.XGBRegressor(**params)
#     model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False)

#     y_pred = model.predict(X_val)

#     auc = mean_squared_error(y_val, y_pred, squared=False)        
        
#     print(f"AUC: {auc}")
    
#     return auc

In [24]:
# study_xgb = optuna.create_study(study_name="xgboost_tuning", direction="minimize")
# func = lambda trial: objective_xgb(trial, f_X, f_y, X_val, y_val)
# study_xgb.optimize(func, n_trials=100)

[32m[I 2023-02-16 09:58:40,643][0m A new study created in memory with name: xgboost_tuning[0m
[32m[I 2023-02-16 09:58:52,527][0m Trial 0 finished with value: 1196424.893800973 and parameters: {'n_estimators': 476, 'max_depth': 19, 'learning_rate': 0.010792949917844335, 'min_child_weight': 25, 'gamma': 0.0833587296491035, 'subsample': 0.8, 'colsample_bytree': 0.2, 'early_stoppig_rounds': 96}. Best is trial 0 with value: 1196424.893800973.[0m


AUC: 1196424.893800973


[33m[W 2023-02-16 09:59:47,143][0m Trial 1 failed with parameters: {'n_estimators': 283, 'max_depth': 26, 'learning_rate': 0.023662574432960402, 'min_child_weight': 22, 'gamma': 0.21335029876439185, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 63} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_24/1245654727.py", line 2, in <lambda>
    func = lambda trial: objective_xgb(trial, f_X, f_y, X_val, y_val)
  File "/tmp/ipykernel_24/3155177625.py", line 16, in objective_xgb
    model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False)
  File "/opt/conda/lib/python3.7/site-packages/xgboost/core.py", line 575, in inner_f
    return f(**kwargs)
  File "/opt/conda/lib/python3.7/site-packages/xgboost/sklearn.py", line 972, in fit
    callbacks=cal

KeyboardInterrupt: 

In [87]:
# from optuna.integration import LightGBMPruningCallback

# def objective_lgbm(trial, f_X, f_y, X_val, y_val):
#     param_grid = {
#         "device_type": "gpu",
#         "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
#         "num_rounds": trial.suggest_int("num_rounds", 100, 1000),
#         "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.3),
#         "num_leaves": trial.suggest_int("num_leaves", 2, 300),
#         "max_depth": trial.suggest_int("max_depth", 2, 30),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 1000),
#         "lambda_l1": trial.suggest_loguniform('lambda_l1', 0.00001, 1.0),
#         "lambda_l2": trial.suggest_loguniform('lambda_l2', 0.00001, 1.0),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
#         "bagging_fraction":  trial.suggest_loguniform('bagging_fraction', 0.2, 1.0),
#         "feature_fraction": trial.suggest_loguniform('feature_fraction', 0.2, 1.0),
#         "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 50, 200),
#         "verbose": -1,
#     }

            
#     model = lgbm.LGBMRegressor(**param_grid)
#     model.fit(f_X, f_y,
#         eval_set=[(X_val, y_val)],
#         eval_metric="auc",
#         verbose=-1,
#     )
#     y_preds = model.predict(X_val)
#     rmse = mean_squared_error(y_val, y_preds, squared=False)
    
#     print(f"RMSE: \t {rmse}")
#     return rmse

In [88]:
# study_lgbm = optuna.create_study(direction="minimize", study_name="LGBM Tuning")
# func = lambda trial: objective_lgbm(trial, f_X, f_y, X_val, y_val)
# study_lgbm.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-02-16 10:57:34,350][0m A new study created in memory with name: LGBM Tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

RMSE: 	 2935219.9329150515
[32m[I 2023-02-16 10:57:35,084][0m Trial 0 finished with value: 2935219.9329150515 and parameters: {'n_estimators': 1893, 'num_rounds': 626, 'learning_rate': 0.051941549782402605, 'num_leaves': 160, 'max_depth': 13, 'min_data_in_leaf': 205, 'lambda_l1': 0.21621144981711818, 'lambda_l2': 0.059767748509954435, 'min_gain_to_split': 12.797260236190297, 'bagging_fraction': 0.6931773116431083, 'feature_fraction': 0.5106276523995611, 'early_stopping_rounds': 87}. Best is trial 0 with value: 2935219.9329150515.[0m
RMSE: 	 2897145.494925644
[32m[I 2023-02-16 10:57:35,789][0m Trial 1 finished with value: 2897145.494925644 and parameters: {'n_estimators': 1123, 'num_rounds': 529, 'learning_rate': 0.225405395069397, 'num_leaves': 193, 'max_depth': 30, 'min_data_in_leaf': 561, 'lambda_l1': 0.019227984817903433, 'lambda_l2': 2.3972883761592648e-05, 'min_gain_to_split': 0.11900481077082525, 'bagging_fraction': 0.949340498769101, 'feature_fraction': 0.6006627724566631, 

KeyboardInterrupt: 

In [93]:
# Baseline LightGBM regressor with default hyper-parameters: fitted on the selected
# fold's training part (`f_X`, `f_y`) and scored on its validation part.
lgbm_model = lgbm.LGBMRegressor()
lgbm_model.fit(f_X, f_y)
y_preds_val_lgbm = lgbm_model.predict(X_val)
# RMSE as sqrt(MSE): the `squared=False` shortcut of `mean_squared_error`
# is not accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_preds_val_lgbm))
rmse

75316.96471475216

In [94]:
# Baseline XGBoost regressor with default hyper-parameters: fitted on the selected
# fold's training part (`f_X`, `f_y`) and scored on its validation part.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(f_X, f_y)
y_preds_val_xgb = xgb_model.predict(X_val)
# RMSE as sqrt(MSE): the `squared=False` shortcut of `mean_squared_error`
# is not accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_preds_val_xgb))
rmse

96701.27300983373

In [97]:
# Baseline CatBoost regressor with default hyper-parameters: fitted silently on the
# selected fold's training part (`f_X`, `f_y`) and scored on its validation part.
cat_model = catboost.CatBoostRegressor()
cat_model.fit(f_X, f_y, verbose=False)
y_preds_val_cat = cat_model.predict(X_val)
# RMSE as sqrt(MSE): the `squared=False` shortcut of `mean_squared_error`
# is not accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_val, y_preds_val_cat))
rmse

64595.906140480845

In [98]:
# Predict the test set with each of the three fitted baseline models.
xgb_preds, lgbm_preds, cat_preds = (
    xgb_model.predict(test),
    lgbm_model.predict(test),
    cat_model.predict(test),
)

In [102]:
# Predictions that go into the submission: the XGBoost test-set predictions.
y_preds_final = xgb_preds

In [103]:
# Submission table (xgb): one row per test id with its predicted price.
submission_columns = {"id": test_idx, "price": y_preds_final}
submission = pd.DataFrame(submission_columns)
submission.head()

Unnamed: 0,id,price
0,22730,4760994.0
1,22731,6197591.0
2,22732,9066677.0
3,22733,1633763.875
4,22734,6760227.5


In [100]:
# # cat
# submission = pd.DataFrame({"id": test_idx, "price": y_preds_final})
# submission.head()

Unnamed: 0,id,price
0,22730,4754707.0
1,22731,6225202.0
2,22732,9067827.0
3,22733,1605656.0
4,22734,6736176.0


In [31]:
# submission = pd.DataFrame({"id": test_idx, "price": y_preds_final})
# submission.head()

Unnamed: 0,id,price
0,22730,4765106.0
1,22731,6203692.0
2,22732,9070060.0
3,22733,1610015.375
4,22734,6775811.0


In [104]:
# Write the submission to "submission.csv" in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)