In [1]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    print(f'Data successfully loaded with {df.shape[0]} rows and {df.shape[1]} features')
    return df

train_set = load_data('../data/processed/train_processed_v1.csv')
dev_set = load_data('../data/processed/dev_processed_v1.csv')
test_set = load_data('../data/processed/test_processed_v1.csv')

Data successfully loaded with 4751 rows and 19 features
Data successfully loaded with 1356 rows and 19 features
Data successfully loaded with 910 rows and 19 features


In [3]:
def split_feature_target(df, target_variable: str = 'selling_price_log'):
    y = df[target_variable]
    x = df.drop(columns=[target_variable])
    print(f'target_shape: {y.shape} - feature_shape: {x.shape}')
    return x, y

x_train, y_train = split_feature_target(train_set)
x_dev, y_dev = split_feature_target(dev_set)
x_test, y_test = split_feature_target(test_set)

target_shape: (4751,) - feature_shape: (4751, 18)
target_shape: (1356,) - feature_shape: (1356, 18)
target_shape: (910,) - feature_shape: (910, 18)


In [4]:
# import libraries
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.metrics import root_mean_squared_error, r2_score
import time

In [5]:
def format_duration(seconds: float) -> str:
    if seconds < 60:
        return f"{seconds:.2f}s"
    elif seconds < 3600:
        minutes = int(seconds // 60)
        secs = seconds % 60
        return f"{minutes}m {secs:.2f}s"
    else:
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = seconds % 60
        return f"{hours}h {minutes}m {secs:.2f}s"

In [6]:
# baseline model training
baseline_models = {
    'Ridge' : Ridge(alpha=0.3, max_iter=2000, random_state=1),
    'XGBoost' : XGBRegressor(random_state=42, n_estimators=200, max_depth=3),
    'LightGBM' : LGBMRegressor(max_depth=5, learning_rate=0.3, n_estimators=200, random_state=42),
    'RandomForest' : RandomForestRegressor(n_estimators=200, max_depth=4, random_state=42)
}

In [7]:
results = {}
print("="*50)
print(f"BASELINE TRAINING INITIALIZED")
print("="*50)
for name, model in baseline_models.items():
    print(f'Training {name} model...')
    start_time = time.time()
    model.fit(x_train, y_train)
    time_elapsed = time.time() - start_time

    preds_train = model.predict(x_train)

    results[name] = {
        "Time_elapsed" : format_duration(time_elapsed),
        "R^2_score" : round(r2_score(y_train, preds_train),4),
        "RMSE" : round(root_mean_squared_error(y_train, preds_train))
    }

for name, result in results.items():
        print(f"{name}")
        print('-'*20)
        print("Time elapsed: ",result['Time_elapsed'])
        print("R^2_Score: ", result['R^2_score'])
        print("RMSE: ",result['RMSE'])
        print ("="*30)

print(f'BASELINE MODEL TRAINING COMPLETED')

BASELINE TRAINING INITIALIZED
Training Ridge model...
Training XGBoost model...
Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 870
[LightGBM] [Info] Number of data points in the train set: 4751, number of used features: 18
[LightGBM] [Info] Start training from score -0.000000
Training RandomForest model...
Ridge
--------------------
Time elapsed:  0.01s
R^2_Score:  0.842
RMSE:  0
XGBoost
--------------------
Time elapsed:  0.27s
R^2_Score:  0.947
RMSE:  0
LightGBM
--------------------
Time elapsed:  6.90s
R^2_Score:  0.9626
RMSE:  0
RandomForest
--------------------
Time elapsed:  1.62s
R^2_Score:  0.8355
RMSE:  0
BASELINE MODEL TRAINING COMPLETED


In [8]:
results = {}
print("="*50)
print(f"CROSS VALIDATION INITIALIZED")
print(f"="*50)

for name, model in baseline_models.items():
    start_time = time.time()
    cv_scores = cross_val_score(model, x_train, y_train, cv=KFold(5), scoring='neg_root_mean_squared_error', n_jobs=-1)
    time_elapsed = time.time() - start_time

    abs_cv_score = abs(cv_scores)
    
    results[name] = {
        'cv_scores_across_folds' : abs_cv_score,
        'cv_score_mean' : round(abs_cv_score.mean(), 4),
        'cv_score_std' : round(abs_cv_score.std(), 4),
        'Time_elapsed' : format_duration(time_elapsed)
    }

for name, result in results.items():
    print(f'Model_name: {name}')
    print(f'='*40)
    print(f'CV Scores: ',result['cv_scores_across_folds'])
    print(f'CV scores_mean: ',result['cv_score_mean'])
    print(f'CV score_std: +/-',result['cv_score_std'])
    print(f'Time_elapsed: ',result['Time_elapsed'])
    print(f'='*40)

print(f'CROSS-VALIDATION MODEL TRAINING COMPLETED')

CROSS VALIDATION INITIALIZED
Model_name: Ridge
CV Scores:  [0.41716765 0.3952506  0.39828129 0.39223018 0.40086055]
CV scores_mean:  0.4008
CV score_std: +/- 0.0087
Time_elapsed:  6.38s
Model_name: XGBoost
CV Scores:  [0.29129551 0.30042265 0.2986982  0.27874239 0.29366269]
CV scores_mean:  0.2926
CV score_std: +/- 0.0077
Time_elapsed:  4.92s
Model_name: LightGBM
CV Scores:  [0.28973033 0.30676242 0.3128978  0.29391002 0.30420172]
CV scores_mean:  0.3015
CV score_std: +/- 0.0085
Time_elapsed:  5.43s
Model_name: RandomForest
CV Scores:  [0.44352197 0.41886004 0.40985105 0.42159388 0.41798678]
CV scores_mean:  0.4224
CV score_std: +/- 0.0113
Time_elapsed:  1.81s
CROSS-VALIDATION MODEL TRAINING COMPLETED


In [9]:
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

In [10]:
def objective(trial):
            n_estimators = trial.suggest_int('n_estimators', 100,1000)
            max_depth = trial.suggest_int('max_depth', 3, 15)
            learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
            min_child_weight = trial.suggest_int('min_child_weight', 1 , 10)

            xgbr = XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                min_child_weight=min_child_weight
            )


            scores = cross_val_score(
                    estimator=xgbr,
                    X=x_train, y=y_train,
                    cv=KFold(5),scoring='neg_mean_squared_error',
                    n_jobs=-1
            )

            return -scores.mean()

In [11]:
study = optuna.create_study(direction='minimize',sampler=TPESampler(),pruner=MedianPruner())
study.optimize(objective, n_trials=160)

[32m[I 2026-01-21 15:46:52,415][0m A new study created in memory with name: no-name-48fc3725-5cf7-4e6f-b9bc-892a981ed8cb[0m
[32m[I 2026-01-21 15:46:55,538][0m Trial 0 finished with value: 0.10113028937406301 and parameters: {'n_estimators': 410, 'max_depth': 14, 'learning_rate': 0.06038829392804373, 'min_child_weight': 4}. Best is trial 0 with value: 0.10113028937406301.[0m
[32m[I 2026-01-21 15:46:56,774][0m Trial 1 finished with value: 0.08837976943545418 and parameters: {'n_estimators': 884, 'max_depth': 5, 'learning_rate': 0.10708184112555015, 'min_child_weight': 1}. Best is trial 1 with value: 0.08837976943545418.[0m
[32m[I 2026-01-21 15:46:58,682][0m Trial 2 finished with value: 0.0866038962032633 and parameters: {'n_estimators': 655, 'max_depth': 7, 'learning_rate': 0.025008013666044538, 'min_child_weight': 1}. Best is trial 2 with value: 0.0866038962032633.[0m
[32m[I 2026-01-21 15:47:02,252][0m Trial 3 finished with value: 0.10035126443691153 and parameters: {'n_es

In [15]:
import matplotlib.pyplot as plt
import plotly
optuna.visualization.plot_optimization_history(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [17]:
optuna.visualization.plot_param_importances(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
optuna.visualization.plot_slice(study)