## Cleaning The Data

In [79]:
import pandas as pd
import numpy as np

In [24]:
df=pd.read_csv('quikr_car.csv')
df.sample(5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
455,Mahindra TUV300 T4 Plus,Mahindra,2016,540000,"35,000 kms",Diesel
862,Toyota Qualis,Toyota,2003,180000,"1,00,000 kms",Diesel
132,Toyota Corolla,Toyota,2009,275000,"26,000 kms",
608,tata Indica,tata,sale,130000,,
405,Hyundai Sonata Transform 2.4 GDi MT,Hyundai,2017,190000,"36,469 kms",Diesel


In [47]:
df.isnull().mean()*100

name          0.000000
company       0.000000
year          0.000000
Price         0.000000
kms_driven    0.000000
fuel_type     0.122399
dtype: float64

In [26]:
df.dtypes

name          object
company       object
year          object
Price         object
kms_driven    object
fuel_type     object
dtype: object

In [29]:
df['kms_driven']=df['kms_driven'].astype(str).str.split(' ').str[0].str.replace(',','')
df=df[df['kms_driven'].astype(str).str.isnumeric()]
df['kms_driven']=df['kms_driven'].astype(int)

In [35]:
df['Price']=df['Price'].astype(str).str.replace(',','')
df=df[df['Price'].astype(str).str.isnumeric()]
df['Price']=df['Price'].astype(int)

In [38]:
df['year']=df['year'].astype(int)

In [45]:
df['fuel_type'].value_counts()

fuel_type
Petrol    428
Diesel    386
LPG         2
Name: count, dtype: int64

In [49]:
mode=df['fuel_type'].mode()[0]
df['fuel_type']=df['fuel_type'].fillna(mode)

In [59]:
df['name']=df['name'].astype(str).str.split(' ').str.slice(0,3).str.join(' ')
df['name']

0         Hyundai Santro Xing
1         Mahindra Jeep CL550
3           Hyundai Grand i10
4      Ford EcoSport Titanium
6                   Ford Figo
                ...          
883        Maruti Suzuki Ritz
885            Tata Indica V2
886      Toyota Corolla Altis
888              Tata Zest XM
889        Mahindra Quanto C8
Name: name, Length: 817, dtype: object

In [62]:
df.isnull().mean()*100

name          0.0
company       0.0
year          0.0
Price         0.0
kms_driven    0.0
fuel_type     0.0
dtype: float64

In [63]:
df.describe()

Unnamed: 0,year,Price,kms_driven
count,817.0,817.0,817.0
mean,2012.440636,411550.3,46250.71481
std,4.002354,474917.3,34283.745254
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,490000.0,56758.0
max,2019.0,8500003.0,400000.0


In [68]:
df=df[df['Price']<5e6]

In [77]:
df=df[df['kms_driven']<190000]

In [78]:
df.describe()

Unnamed: 0,year,Price,kms_driven
count,811.0,811.0,811.0
mean,2012.453761,402184.4,45063.913687
std,3.991196,381510.7,30221.014354
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,26750.0
50%,2013.0,299999.0,41000.0
75%,2015.0,490000.0,56000.0
max,2019.0,3100000.0,175430.0


## Model Training

In [81]:
x=df.drop(columns='Price')
df['Price']=np.log1p(df['Price'])
y=df['Price']

In [83]:
import optuna
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

In [85]:
categorical_cols = ['fuel_type','company','name']
numerical_cols = ['kms_driven', 'year']

preprocessor=ColumnTransformer(transformers=[
    ('ohe',OneHotEncoder(handle_unknown='ignore'),categorical_cols),
    ('num',StandardScaler(),numerical_cols)
])

In [86]:
def objective(trial):
    test_size = trial.suggest_float('test_size', 0.15, 0.30)
    random_state = trial.suggest_int('random_state', 0, 1000)
    model_name = trial.suggest_categorical('model', ['XGBoost', 'RandomForest', 'LinearRegression', 'LightGBM'])

    if model_name == 'XGBoost':
        model = XGBRegressor(
            n_estimators=trial.suggest_int('n_estimators', 100, 500),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
            reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
            random_state=random_state,
            n_jobs=-1
        )

    elif model_name == 'RandomForest':
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int('n_estimators', 100, 500),
            max_depth=trial.suggest_int('max_depth', 3, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            random_state=random_state,
            n_jobs=-1
        )

    elif model_name == 'LightGBM':
        model = LGBMRegressor(
            n_estimators=trial.suggest_int('n_estimators', 100, 500),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
            reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
            random_state=random_state,
            n_jobs=-1
        )

    else:
        model = LinearRegression()

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(x, y, test_size=test_size, random_state=random_state)

    pipeline.fit(X_train_sub, y_train_sub)
    y_pred_sub = pipeline.predict(X_test_sub)
    score = r2_score(y_test_sub, y_pred_sub)

    return score

In [87]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150,show_progress_bar=True)

[I 2025-07-10 19:09:06,921] A new study created in memory with name: no-name-37a74ab1-4d08-4a6a-92d4-64eec09aabf3
Best trial: 0. Best value: 0.758012:   1%|▏         | 2/150 [00:00<00:09, 14.97it/s]

[I 2025-07-10 19:09:06,953] Trial 0 finished with value: 0.7580123741213222 and parameters: {'test_size': 0.17623126621004537, 'random_state': 775, 'model': 'LinearRegression'}. Best is trial 0 with value: 0.7580123741213222.
[I 2025-07-10 19:09:07,054] Trial 1 finished with value: 0.47480708849627307 and parameters: {'test_size': 0.21638432780594208, 'random_state': 343, 'model': 'XGBoost', 'n_estimators': 344, 'max_depth': 6, 'learning_rate': 0.21373526727897507, 'subsample': 0.5412861165444609, 'colsample_bytree': 0.7355855631242694, 'reg_alpha': 2.371332080829775, 'reg_lambda': 1.7910633449030806}. Best is trial 0 with value: 0.7580123741213222.
[I 2025-07-10 19:09:07,127] Trial 2 finished with value: 0.4662151327665569 and parameters: {'test_size': 0.2113312142558985, 'random_state': 994, 'model': 'XGBoost', 'n_estimators': 365, 'max_depth': 6, 'learning_rate': 0.2639107326059824, 'subsample': 0.9259131628880357, 'colsample_bytree': 0.5555799729878976, 'reg_alpha': 2.6258752629808

Best trial: 0. Best value: 0.758012:   4%|▍         | 6/150 [00:00<00:09, 14.55it/s]

[I 2025-07-10 19:09:07,219] Trial 3 finished with value: 0.4084671767126443 and parameters: {'test_size': 0.20189339375005244, 'random_state': 681, 'model': 'LightGBM', 'n_estimators': 394, 'max_depth': 12, 'learning_rate': 0.21616510844815906, 'subsample': 0.5273989973820081, 'colsample_bytree': 0.687333844143152, 'reg_alpha': 2.9399290653381005, 'reg_lambda': 0.7364467335054897}. Best is trial 0 with value: 0.7580123741213222.
[I 2025-07-10 19:09:07,249] Trial 4 finished with value: 0.7394193473796764 and parameters: {'test_size': 0.2825907123402033, 'random_state': 783, 'model': 'LinearRegression'}. Best is trial 0 with value: 0.7580123741213222.
[I 2025-07-10 19:09:07,339] Trial 5 finished with value: 0.606752159977402 and parameters: {'test_size': 0.2974212879571465, 'random_state': 436, 'model': 'XGBoost', 'n_estimators': 441, 'max_depth': 12, 'learning_rate': 0.17490645733412957, 'subsample': 0.9184183572268014, 'colsample_bytree': 0.5174455023766755, 'reg_alpha': 1.458119099310

Best trial: 9. Best value: 0.778553:   6%|▌         | 9/150 [00:00<00:11, 12.39it/s]

[I 2025-07-10 19:09:07,452] Trial 6 finished with value: 0.6582930395870545 and parameters: {'test_size': 0.2919128551372298, 'random_state': 80, 'model': 'XGBoost', 'n_estimators': 473, 'max_depth': 13, 'learning_rate': 0.23757725293319915, 'subsample': 0.7717908061795029, 'colsample_bytree': 0.9821070921460799, 'reg_alpha': 1.7039813494801137, 'reg_lambda': 4.790657584840236}. Best is trial 0 with value: 0.7580123741213222.
[I 2025-07-10 19:09:07,537] Trial 7 finished with value: 0.46692721381486324 and parameters: {'test_size': 0.1524221060735846, 'random_state': 475, 'model': 'XGBoost', 'n_estimators': 411, 'max_depth': 5, 'learning_rate': 0.11225408061248737, 'subsample': 0.5470279326528177, 'colsample_bytree': 0.6923942184002495, 'reg_alpha': 1.9927697291591588, 'reg_lambda': 2.1496723342385677}. Best is trial 0 with value: 0.7580123741213222.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` 

Best trial: 13. Best value: 0.825357:  10%|█         | 15/150 [00:01<00:08, 15.75it/s]

[I 2025-07-10 19:09:07,875] Trial 10 finished with value: 0.665912648646046 and parameters: {'test_size': 0.2512605071841907, 'random_state': 986, 'model': 'RandomForest', 'n_estimators': 108, 'max_depth': 20, 'min_samples_split': 2}. Best is trial 9 with value: 0.7785533575612352.
[I 2025-07-10 19:09:07,903] Trial 11 finished with value: 0.7857150466860545 and parameters: {'test_size': 0.17035996070655174, 'random_state': 750, 'model': 'LinearRegression'}. Best is trial 11 with value: 0.7857150466860545.
[I 2025-07-10 19:09:07,929] Trial 12 finished with value: 0.7278936622271002 and parameters: {'test_size': 0.24310209680325995, 'random_state': 680, 'model': 'LinearRegression'}. Best is trial 11 with value: 0.7857150466860545.
[I 2025-07-10 19:09:07,955] Trial 13 finished with value: 0.825356934806017 and parameters: {'test_size': 0.15153007633014848, 'random_state': 883, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:07,984] Trial 1

Best trial: 13. Best value: 0.825357:  13%|█▎        | 20/150 [00:01<00:09, 13.23it/s]

[I 2025-07-10 19:09:08,294] Trial 15 finished with value: 0.7169648320284361 and parameters: {'test_size': 0.17296180649091156, 'random_state': 599, 'model': 'RandomForest', 'n_estimators': 208, 'max_depth': 20, 'min_samples_split': 10}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:08,321] Trial 16 finished with value: 0.7731611467278539 and parameters: {'test_size': 0.16484146051765702, 'random_state': 859, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:08,349] Trial 17 finished with value: 0.7329080512957126 and parameters: {'test_size': 0.19251627481435415, 'random_state': 582, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:08,380] Trial 18 finished with value: 0.7294843435403648 and parameters: {'test_size': 0.2324206750808187, 'random_state': 895, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[LightGBM] [Info] Auto-choosing co

Best trial: 25. Best value: 0.829953:  17%|█▋        | 26/150 [00:02<00:09, 12.72it/s]

[I 2025-07-10 19:09:08,883] Trial 20 finished with value: 0.7379108286795477 and parameters: {'test_size': 0.26521236710662327, 'random_state': 339, 'model': 'RandomForest', 'n_estimators': 278, 'max_depth': 16, 'min_samples_split': 6}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:08,914] Trial 21 finished with value: 0.7184219733069084 and parameters: {'test_size': 0.18759997570911702, 'random_state': 916, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:08,941] Trial 22 finished with value: 0.8250100482677533 and parameters: {'test_size': 0.15208358120022974, 'random_state': 942, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:08,972] Trial 23 finished with value: 0.7064034490469726 and parameters: {'test_size': 0.15302844250297684, 'random_state': 744, 'model': 'LinearRegression'}. Best is trial 13 with value: 0.825356934806017.
[I 2025-07-10 19:09:09,002] Trial 

Best trial: 25. Best value: 0.829953:  19%|█▉        | 29/150 [00:02<00:09, 12.80it/s]

[I 2025-07-10 19:09:09,088] Trial 27 finished with value: 0.8254288917647901 and parameters: {'test_size': 0.1589188205296671, 'random_state': 362, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:09,259] Trial 28 finished with value: 0.6596233115675705 and parameters: {'test_size': 0.20133910026683496, 'random_state': 260, 'model': 'RandomForest', 'n_estimators': 100, 'max_depth': 9, 'min_samples_split': 10}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:09,288] Trial 29 finished with value: 0.773828627198917 and parameters: {'test_size': 0.16148433292874578, 'random_state': 369, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.


Best trial: 25. Best value: 0.829953:  23%|██▎       | 34/150 [00:02<00:07, 16.05it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 665, number of used features: 16
[LightGBM] [Info] Start training from score 2.604957
[I 2025-07-10 19:09:09,372] Trial 30 finished with value: 0.5914783033986488 and parameters: {'test_size': 0.17973181269454674, 'random_state': 236, 'model': 'LightGBM', 'n_estimators': 171, 'max_depth': 9, 'learning_rate': 0.298930892441157, 'subsample': 0.6826102483804961, 'colsample_bytree': 0.8424549366453491, 'reg_alpha': 0.1142307452269895, 'reg_lambda': 3.502636638426562}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:09,420] Trial 31 finished with value: 0.6641661398137158 and parameters: {'test_size': 0.15222230322364147, 'random_state': 3, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[I 

Best trial: 25. Best value: 0.829953:  25%|██▌       | 38/150 [00:02<00:06, 16.62it/s]

[I 2025-07-10 19:09:09,554] Trial 35 finished with value: 0.7631452954976689 and parameters: {'test_size': 0.15039200188731006, 'random_state': 634, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:09,586] Trial 36 finished with value: 0.8103865820364915 and parameters: {'test_size': 0.19957589187841895, 'random_state': 821, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:09,720] Trial 37 finished with value: 0.35506572807331194 and parameters: {'test_size': 0.22521228003424243, 'random_state': 937, 'model': 'XGBoost', 'n_estimators': 499, 'max_depth': 3, 'learning_rate': 0.011465687508909639, 'subsample': 0.99735165917729, 'colsample_bytree': 0.8189447226120233, 'reg_alpha': 4.688487390252275, 'reg_lambda': 1.3872040283982745}. Best is trial 25 with value: 0.8299529621720793.


Best trial: 25. Best value: 0.829953:  27%|██▋       | 40/150 [00:02<00:06, 15.99it/s]

[I 2025-07-10 19:09:09,762] Trial 38 finished with value: 0.7611703881391267 and parameters: {'test_size': 0.16002082180060784, 'random_state': 495, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 157
[LightGBM] [Info] Number of data points in the train set: 669, number of used features: 16
[LightGBM] [Info] Start training from score 2.607069
[I 2025-07-10 19:09:09,862] Trial 39 finished with value: 0.3103811037437145 and parameters: {'test_size': 0.17437396090522833, 'random_state': 861, 'model': 'LightGBM', 'n_estimators': 303, 'max_depth': 8, 'learning_rate': 0.08408110871115568, 'subsample': 0.6462094981772978, 'colsample_bytree': 0.996730450838033, 'reg_alpha': 3.8282610355369897, 'reg_lambda': 3.180127427188221}. Best is trial 25 with value: 0.8299529621720793.


Best trial: 43. Best value: 0.848853:  30%|███       | 45/150 [00:03<00:07, 14.58it/s]

[I 2025-07-10 19:09:10,107] Trial 40 finished with value: 0.7477378390667161 and parameters: {'test_size': 0.18524213177101126, 'random_state': 164, 'model': 'XGBoost', 'n_estimators': 157, 'max_depth': 14, 'learning_rate': 0.07381241491811887, 'subsample': 0.8374163717580833, 'colsample_bytree': 0.5930280220949256, 'reg_alpha': 0.059356312041581205, 'reg_lambda': 0.020566525186546336}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:10,154] Trial 41 finished with value: 0.7699704474385196 and parameters: {'test_size': 0.1999755118860354, 'random_state': 800, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:10,198] Trial 42 finished with value: 0.8151914267464337 and parameters: {'test_size': 0.20662294853652094, 'random_state': 830, 'model': 'LinearRegression'}. Best is trial 25 with value: 0.8299529621720793.
[I 2025-07-10 19:09:10,230] Trial 43 finished with value: 0.8488529234204701 and parameters: {'test_size':

Best trial: 43. Best value: 0.848853:  31%|███▏      | 47/150 [00:03<00:07, 14.06it/s]

[I 2025-07-10 19:09:10,319] Trial 45 finished with value: 0.8353404716614988 and parameters: {'test_size': 0.15773919032314115, 'random_state': 993, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:10,437] Trial 46 finished with value: 0.6031586798943112 and parameters: {'test_size': 0.16949291402798972, 'random_state': 411, 'model': 'XGBoost', 'n_estimators': 266, 'max_depth': 5, 'learning_rate': 0.13595566672308584, 'subsample': 0.6232153506619391, 'colsample_bytree': 0.805306724841219, 'reg_alpha': 0.9600522775859133, 'reg_lambda': 4.870828563103173}. Best is trial 43 with value: 0.8488529234204701.


Best trial: 43. Best value: 0.848853:  34%|███▍      | 51/150 [00:04<00:10,  9.71it/s]

[I 2025-07-10 19:09:10,920] Trial 47 finished with value: 0.7247113298250052 and parameters: {'test_size': 0.2241335065741612, 'random_state': 997, 'model': 'RandomForest', 'n_estimators': 321, 'max_depth': 17, 'min_samples_split': 2}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:10,951] Trial 48 finished with value: 0.8366546781891817 and parameters: {'test_size': 0.15704758586876236, 'random_state': 867, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 655, number of used features: 16
[LightGBM] [Info] Start training from score 2.607037
[I 2025-07-10 19:09:11,028] Trial 49 finished with value: 0.30097684430457317 and parameters: {'test_size': 0.1919317603494324, 'random_state': 743,

Best trial: 43. Best value: 0.848853:  38%|███▊      | 57/150 [00:04<00:06, 14.61it/s]

[I 2025-07-10 19:09:11,170] Trial 52 finished with value: 0.7704394148751518 and parameters: {'test_size': 0.1576361714320999, 'random_state': 777, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:11,212] Trial 53 finished with value: 0.7615130000750221 and parameters: {'test_size': 0.1684200472328835, 'random_state': 890, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:11,242] Trial 54 finished with value: 0.7343938448670624 and parameters: {'test_size': 0.17224205192611458, 'random_state': 699, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:11,275] Trial 55 finished with value: 0.8328292505012933 and parameters: {'test_size': 0.16315847164649044, 'random_state': 841, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:11,307] Trial 56 finished with value: 0.6784765998425768 and parameter

Best trial: 43. Best value: 0.848853:  41%|████      | 61/150 [00:04<00:07, 12.07it/s]

[I 2025-07-10 19:09:11,578] Trial 58 finished with value: 0.6380070273992378 and parameters: {'test_size': 0.15712079800623885, 'random_state': 641, 'model': 'RandomForest', 'n_estimators': 159, 'max_depth': 17, 'min_samples_split': 6}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:11,622] Trial 59 finished with value: 0.7424521667366276 and parameters: {'test_size': 0.16663171857314857, 'random_state': 457, 'model': 'LinearRegression'}. Best is trial 43 with value: 0.8488529234204701.
[I 2025-07-10 19:09:11,763] Trial 60 finished with value: 0.6836083069449523 and parameters: {'test_size': 0.1921181065709028, 'random_state': 840, 'model': 'XGBoost', 'n_estimators': 373, 'max_depth': 7, 'learning_rate': 0.18269741382064156, 'subsample': 0.7270435642596749, 'colsample_bytree': 0.6354377048667539, 'reg_alpha': 0.7559132077441792, 'reg_lambda': 2.944898824128637}. Best is trial 43 with value: 0.8488529234204701.


Best trial: 61. Best value: 0.856831:  44%|████▍     | 66/150 [00:05<00:05, 14.41it/s]

[I 2025-07-10 19:09:11,816] Trial 61 finished with value: 0.856830985236027 and parameters: {'test_size': 0.1511681996015597, 'random_state': 998, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:11,862] Trial 62 finished with value: 0.7577631717952916 and parameters: {'test_size': 0.15654933211148767, 'random_state': 990, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:11,892] Trial 63 finished with value: 0.803705571374496 and parameters: {'test_size': 0.16225939426510846, 'random_state': 913, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:11,927] Trial 64 finished with value: 0.8548068513592264 and parameters: {'test_size': 0.1501446177440342, 'random_state': 998, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:11,959] Trial 65 finished with value: 0.8030885584868616 and parameters: {'t

Best trial: 61. Best value: 0.856831:  47%|████▋     | 71/150 [00:05<00:04, 18.36it/s]

[I 2025-07-10 19:09:12,022] Trial 67 finished with value: 0.7763953827357122 and parameters: {'test_size': 0.15005945401453247, 'random_state': 917, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153
[LightGBM] [Info] Number of data points in the train set: 620, number of used features: 16
[LightGBM] [Info] Start training from score 2.607169
[I 2025-07-10 19:09:12,098] Trial 68 finished with value: 0.2370807251890279 and parameters: {'test_size': 0.23508494302008953, 'random_state': 857, 'model': 'LightGBM', 'n_estimators': 194, 'max_depth': 4, 'learning_rate': 0.2909821843898417, 'subsample': 0.9998219319775574, 'colsample_bytree': 0.7625523450453955, 'reg_alpha': 4.235961220512495, 'reg_lambda': 4.107665922286728}. Best is trial 61 with value: 0.856830985236027.
[I 

Best trial: 61. Best value: 0.856831:  50%|█████     | 75/150 [00:05<00:03, 20.59it/s]

[I 2025-07-10 19:09:12,225] Trial 71 finished with value: 0.8387029028796413 and parameters: {'test_size': 0.1574075071612547, 'random_state': 925, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,257] Trial 72 finished with value: 0.8403835360076096 and parameters: {'test_size': 0.15449111081568714, 'random_state': 926, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,290] Trial 73 finished with value: 0.8363454089020456 and parameters: {'test_size': 0.1554269652444677, 'random_state': 928, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,321] Trial 74 finished with value: 0.7993883404995044 and parameters: {'test_size': 0.15685026837637223, 'random_state': 924, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,353] Trial 75 finished with value: 0.8345056968078015 and parameters: {

Best trial: 61. Best value: 0.856831:  55%|█████▍    | 82/150 [00:05<00:03, 19.42it/s]

[I 2025-07-10 19:09:12,580] Trial 76 finished with value: 0.7310117085163544 and parameters: {'test_size': 0.15495378923856007, 'random_state': 975, 'model': 'RandomForest', 'n_estimators': 139, 'max_depth': 14, 'min_samples_split': 4}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,609] Trial 77 finished with value: 0.726602220027911 and parameters: {'test_size': 0.16675827074476537, 'random_state': 893, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,642] Trial 78 finished with value: 0.7688780357836524 and parameters: {'test_size': 0.2625705140331856, 'random_state': 997, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,676] Trial 79 finished with value: 0.7755848482527785 and parameters: {'test_size': 0.2992451660345178, 'random_state': 921, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,706] Trial 80 

Best trial: 61. Best value: 0.856831:  57%|█████▋    | 86/150 [00:06<00:02, 22.25it/s]

[I 2025-07-10 19:09:12,801] Trial 83 finished with value: 0.7752308866388032 and parameters: {'test_size': 0.1600306550890328, 'random_state': 870, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,835] Trial 84 finished with value: 0.8287083512563638 and parameters: {'test_size': 0.16020236588817416, 'random_state': 968, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,868] Trial 85 finished with value: 0.7292104715428747 and parameters: {'test_size': 0.16718879455096186, 'random_state': 904, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:12,966] Trial 86 finished with value: 0.4641462391026998 and parameters: {'test_size': 0.15394552358467153, 'random_state': 1000, 'model': 'XGBoost', 'n_estimators': 268, 'max_depth': 10, 'learning_rate': 0.11136059642395081, 'subsample': 0.5984138573622498, 'colsample_bytree': 0.5011234231454385, '

Best trial: 61. Best value: 0.856831:  61%|██████    | 91/150 [00:06<00:02, 19.68it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 679, number of used features: 16
[LightGBM] [Info] Start training from score 2.606266
[I 2025-07-10 19:09:13,071] Trial 87 finished with value: 0.3697792510979021 and parameters: {'test_size': 0.16213864456482638, 'random_state': 945, 'model': 'LightGBM', 'n_estimators': 323, 'max_depth': 8, 'learning_rate': 0.2624423420926819, 'subsample': 0.7451952682494218, 'colsample_bytree': 0.910261221621202, 'reg_alpha': 3.4454020595723396, 'reg_lambda': 2.797247925828869}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:13,117] Trial 88 finished with value: 0.7909569168424584 and parameters: {'test_size': 0.18275245276455468, 'random_state': 884, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 

Best trial: 61. Best value: 0.856831:  63%|██████▎   | 95/150 [00:06<00:02, 21.78it/s]

[I 2025-07-10 19:09:13,264] Trial 92 finished with value: 0.7267977327986823 and parameters: {'test_size': 0.21738031952149364, 'random_state': 932, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:13,299] Trial 93 finished with value: 0.7447771052974401 and parameters: {'test_size': 0.1647645182189889, 'random_state': 755, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:13,336] Trial 94 finished with value: 0.7690587881891148 and parameters: {'test_size': 0.15912560638240997, 'random_state': 917, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:13,398] Trial 95 finished with value: 0.7306572206385576 and parameters: {'test_size': 0.15224991114074476, 'random_state': 979, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.


Best trial: 61. Best value: 0.856831:  67%|██████▋   | 101/150 [00:07<00:03, 12.72it/s]

[I 2025-07-10 19:09:13,961] Trial 96 finished with value: 0.6777975074917414 and parameters: {'test_size': 0.16250509320814416, 'random_state': 954, 'model': 'RandomForest', 'n_estimators': 429, 'max_depth': 18, 'min_samples_split': 8}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:13,992] Trial 97 finished with value: 0.6849301098262042 and parameters: {'test_size': 0.15005738984454964, 'random_state': 832, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,023] Trial 98 finished with value: 0.783306426828319 and parameters: {'test_size': 0.17276426448356988, 'random_state': 790, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,052] Trial 99 finished with value: 0.7402139144824023 and parameters: {'test_size': 0.16986006113782767, 'random_state': 883, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,146] Trial 1

Best trial: 61. Best value: 0.856831:  70%|███████   | 105/150 [00:07<00:03, 14.63it/s]

[I 2025-07-10 19:09:14,190] Trial 101 finished with value: 0.6962088497252381 and parameters: {'test_size': 0.1528532625218539, 'random_state': 978, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,238] Trial 102 finished with value: 0.7364114007823896 and parameters: {'test_size': 0.1623403600244979, 'random_state': 949, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,271] Trial 103 finished with value: 0.8228309731738843 and parameters: {'test_size': 0.15531984207391794, 'random_state': 724, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,303] Trial 104 finished with value: 0.7417432176799992 and parameters: {'test_size': 0.16597559884859484, 'random_state': 927, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,333] Trial 105 finished with value: 0.7646762322827438 and paramete

Best trial: 61. Best value: 0.856831:  73%|███████▎  | 110/150 [00:07<00:02, 16.79it/s]

[I 2025-07-10 19:09:14,448] Trial 106 finished with value: 0.48431575968932916 and parameters: {'test_size': 0.1542817183094089, 'random_state': 852, 'model': 'LightGBM', 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.04592997216242255, 'subsample': 0.5038350121045336, 'colsample_bytree': 0.9497114824198709, 'reg_alpha': 0.8893496299486559, 'reg_lambda': 1.6483751651148575}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,498] Trial 107 finished with value: 0.7965832386943582 and parameters: {'test_size': 0.15869794634311957, 'random_state': 817, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,549] Trial 108 finished with value: 0.8005318174635695 and parameters: {'test_size': 0.16193171829434236, 'random_state': 547, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,581] Trial 109 finished with value: 0.7450981233628934 and parameters: {'test_size': 0

Best trial: 61. Best value: 0.856831:  77%|███████▋  | 115/150 [00:07<00:01, 19.12it/s]

[I 2025-07-10 19:09:14,616] Trial 110 finished with value: 0.8459603535177593 and parameters: {'test_size': 0.15683531933699532, 'random_state': 877, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,651] Trial 111 finished with value: 0.7544070626090712 and parameters: {'test_size': 0.15602400937096678, 'random_state': 874, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,685] Trial 112 finished with value: 0.7408152231066752 and parameters: {'test_size': 0.15353976415463272, 'random_state': 960, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,718] Trial 113 finished with value: 0.756107277727619 and parameters: {'test_size': 0.1586842656736602, 'random_state': 982, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,751] Trial 114 finished with value: 0.8518155108737895 and paramete

Best trial: 61. Best value: 0.856831:  78%|███████▊  | 117/150 [00:07<00:01, 21.95it/s]

[I 2025-07-10 19:09:14,818] Trial 116 finished with value: 0.787512145685666 and parameters: {'test_size': 0.16388225068545514, 'random_state': 900, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:14,851] Trial 117 finished with value: 0.7912963531788442 and parameters: {'test_size': 0.24035626807336793, 'random_state': 923, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.


Best trial: 61. Best value: 0.856831:  82%|████████▏ | 123/150 [00:08<00:01, 13.99it/s]

[I 2025-07-10 19:09:15,359] Trial 118 finished with value: 0.7622375071062818 and parameters: {'test_size': 0.16496548783990272, 'random_state': 1000, 'model': 'RandomForest', 'n_estimators': 380, 'max_depth': 15, 'min_samples_split': 8}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,391] Trial 119 finished with value: 0.6339218420694052 and parameters: {'test_size': 0.16061174155864383, 'random_state': 943, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,427] Trial 120 finished with value: 0.7570532202950385 and parameters: {'test_size': 0.2812921012147205, 'random_state': 867, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,463] Trial 121 finished with value: 0.6610734090200037 and parameters: {'test_size': 0.15211580324217466, 'random_state': 933, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,497] Tr

Best trial: 61. Best value: 0.856831:  84%|████████▍ | 126/150 [00:08<00:01, 16.41it/s]

[I 2025-07-10 19:09:15,564] Trial 124 finished with value: 0.6539215785109489 and parameters: {'test_size': 0.1596179144148527, 'random_state': 836, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,602] Trial 125 finished with value: 0.6773047967186234 and parameters: {'test_size': 0.16758323595842345, 'random_state': 167, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,637] Trial 126 finished with value: 0.6851762879862532 and parameters: {'test_size': 0.2919259955109176, 'random_state': 956, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.


Best trial: 61. Best value: 0.856831:  87%|████████▋ | 130/150 [00:09<00:01, 14.87it/s]

[I 2025-07-10 19:09:15,802] Trial 127 finished with value: 0.6657037020120675 and parameters: {'test_size': 0.15419287182592764, 'random_state': 917, 'model': 'XGBoost', 'n_estimators': 469, 'max_depth': 12, 'learning_rate': 0.11165839600785021, 'subsample': 0.5958742901293215, 'colsample_bytree': 0.8686781072487413, 'reg_alpha': 1.3601734019711258, 'reg_lambda': 4.4303971321639315}. Best is trial 61 with value: 0.856830985236027.
[I 2025-07-10 19:09:15,850] Trial 128 finished with value: 0.7468300572316864 and parameters: {'test_size': 0.27025389020904605, 'random_state': 985, 'model': 'LinearRegression'}. Best is trial 61 with value: 0.856830985236027.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157
[LightGBM] [Info] Number of data points in the train set: 683, number of used 

Best trial: 133. Best value: 0.88034:  91%|█████████ | 136/150 [00:09<00:00, 17.39it/s] 

[I 2025-07-10 19:09:16,043] Trial 131 finished with value: 0.8674712451330541 and parameters: {'test_size': 0.16058284784313048, 'random_state': 966, 'model': 'LinearRegression'}. Best is trial 131 with value: 0.8674712451330541.
[I 2025-07-10 19:09:16,082] Trial 132 finished with value: 0.8259353876160718 and parameters: {'test_size': 0.16158953332843798, 'random_state': 934, 'model': 'LinearRegression'}. Best is trial 131 with value: 0.8674712451330541.
[I 2025-07-10 19:09:16,120] Trial 133 finished with value: 0.8803401641685645 and parameters: {'test_size': 0.15892940339566472, 'random_state': 959, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,152] Trial 134 finished with value: 0.7382966919816571 and parameters: {'test_size': 0.1662989579692765, 'random_state': 960, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,184] Trial 135 finished with value: 0.7205064425822498 and

Best trial: 133. Best value: 0.88034:  95%|█████████▌| 143/150 [00:09<00:00, 22.95it/s]

[I 2025-07-10 19:09:16,252] Trial 137 finished with value: 0.8382991220291005 and parameters: {'test_size': 0.1633123164995904, 'random_state': 944, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,286] Trial 138 finished with value: 0.8291004863941801 and parameters: {'test_size': 0.16094020995784342, 'random_state': 999, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,318] Trial 139 finished with value: 0.7365479467049989 and parameters: {'test_size': 0.1744686672555651, 'random_state': 945, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,352] Trial 140 finished with value: 0.8315589634069012 and parameters: {'test_size': 0.22093947609321787, 'random_state': 909, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,384] Trial 141 finished with value: 0.8787015320954925 and 

Best trial: 133. Best value: 0.88034:  98%|█████████▊| 147/150 [00:09<00:00, 24.31it/s]

[I 2025-07-10 19:09:16,481] Trial 144 finished with value: 0.7847810768888182 and parameters: {'test_size': 0.15515798234775385, 'random_state': 984, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,513] Trial 145 finished with value: 0.7862614890454683 and parameters: {'test_size': 0.16842719062242148, 'random_state': 951, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,563] Trial 146 finished with value: 0.6840176197872657 and parameters: {'test_size': 0.15237049122846905, 'random_state': 912, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.


Best trial: 133. Best value: 0.88034: 100%|██████████| 150/150 [00:09<00:00, 15.09it/s]

[I 2025-07-10 19:09:16,796] Trial 147 finished with value: 0.8105046695975897 and parameters: {'test_size': 0.15953962974597494, 'random_state': 966, 'model': 'RandomForest', 'n_estimators': 130, 'max_depth': 19, 'min_samples_split': 4}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,826] Trial 148 finished with value: 0.6752878913212851 and parameters: {'test_size': 0.16438165953876877, 'random_state': 872, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.
[I 2025-07-10 19:09:16,858] Trial 149 finished with value: 0.8459042586663004 and parameters: {'test_size': 0.15622798097235246, 'random_state': 930, 'model': 'LinearRegression'}. Best is trial 133 with value: 0.8803401641685645.





In [88]:
print("Best Parameters:", study.best_trial.params)
print("Best R² Score:", round(study.best_value, 4))

Best Parameters: {'test_size': 0.15892940339566472, 'random_state': 959, 'model': 'LinearRegression'}
Best R² Score: 0.8803


In [90]:
best_params = study.best_trial.params

if best_params['model'] == 'XGBoost':
    best_model = XGBRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        subsample=best_params['subsample'],
        colsample_bytree=best_params['colsample_bytree'],
        reg_alpha=best_params['reg_alpha'],
        reg_lambda=best_params['reg_lambda'],
        random_state=best_params['random_state'],
        n_jobs=-1
    )
elif best_params['model'] == 'RandomForest':
    best_model = RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        random_state=best_params['random_state'],
        n_jobs=-1
    )
elif best_params['model'] == 'LightGBM':
    best_model = LGBMRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        subsample=best_params['subsample'],
        colsample_bytree=best_params['colsample_bytree'],
        reg_alpha=best_params['reg_alpha'],
        reg_lambda=best_params['reg_lambda'],
        random_state=best_params['random_state'],
        n_jobs=-1
    )
else:
    best_model = LinearRegression()

final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(x, y, test_size=best_params['test_size'], random_state=best_params['random_state'])

final_pipeline.fit(X_train_final, y_train_final)
y_pred_final = final_pipeline.predict(X_test_final)

final_r2 = r2_score(y_test_final, y_pred_final)
print("Final R² on Test (log-scale):", round(final_r2, 4))

Final R² on Test (log-scale): 0.8803


In [92]:
with open('best_price_model.pkl', 'wb') as f:
    pickle.dump(final_pipeline, f)

print("Model saved as best_price_model.pkl")

Model saved as best_price_model.pkl
