In [2]:
!pip install optuna
!pip install lazypredict

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0
Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl

In [1]:
import pandas as pd
import numpy as np
import kagglehub
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import optuna

In [19]:
# Download latest version of the dataset.
path = kagglehub.dataset_download("talhabarkaatahmad/pakistan-used-car-prices-2023")
print("Path to dataset files:", path)

data_path = os.path.join(path, os.listdir(path)[1])
data = pd.read_csv(data_path)
data.head()

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/talhabarkaatahmad/pakistan-used-car-prices-2023/versions/4


Unnamed: 0,addref,city,assembly,body,make,model,year,engine,transmission,fuel,color,registered,mileage,price
0,7943732,Peshawar,,Sedan,Toyota,Corolla,2013.0,1300.0,Manual,Petrol,Silver Metallic,Lahore,145000,2870000.0
1,7730314,Lahore,,Sedan,Honda,City,2000.0,1300.0,Manual,Petrol,Blue,Lahore,230000,995000.0
2,7943737,Lahore,,Sedan,Toyota,Yaris,2021.0,1300.0,Manual,Petrol,Super White,Punjab,60500,3585000.0
3,7943733,Lahore,,Hatchback,Suzuki,Swift,2017.0,1300.0,Manual,Petrol,Grey,Islamabad,87000,2250000.0
4,7923484,Lahore,,Sedan,Honda,Civic,2017.0,1800.0,Automatic,Petrol,Grey,Lahore,86000,4850000.0


## Data Wrangling

- handle missing values
- convert year column to age
- generate a column of current_city/registration city comparism
- drop redundant columns
- encode the categorical variables
- scale the dataset.

In [20]:
data.shape

(77878, 14)

In [21]:
data.isnull().sum()

addref              0
city                0
assembly        53689
body             8904
make                0
model               0
year             4779
engine              3
transmission        0
fuel              906
color            1480
registered          0
mileage             0
price             583
dtype: int64

In [22]:
data['assembly'].unique()

array([nan, 'Imported'], dtype=object)

In [23]:
data['assembly'].fillna(value="not imported",inplace=True)

In [24]:
data['body'] = data['body'].fillna(value="unknown")

In [25]:
data['body'].unique()

array(['Sedan', 'Hatchback', 'SUV', 'MPV', 'unknown', 'Crossover',
       'Micro Van', 'Mini Van', 'Double Cabin', 'Compact sedan',
       'High Roof', 'Van', 'Compact SUV', 'Pick Up', 'Coupe',
       'Station Wagon', 'Convertible', 'Truck', 'Off-Road Vehicles',
       'Mini Vehicles', 'Single Cabin', 'Compact hatchback'], dtype=object)

In [26]:
data.isnull().sum()

addref             0
city               0
assembly           0
body               0
make               0
model              0
year            4779
engine             3
transmission       0
fuel             906
color           1480
registered         0
mileage            0
price            583
dtype: int64

In [27]:
data.dropna(inplace=True)

In [28]:
data.isnull().sum()

addref          0
city            0
assembly        0
body            0
make            0
model           0
year            0
engine          0
transmission    0
fuel            0
color           0
registered      0
mileage         0
price           0
dtype: int64

In [29]:
data.shape

(70577, 14)

In [30]:
data.reset_index(drop=True, inplace=True)

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70577 entries, 0 to 70576
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   addref        70577 non-null  int64  
 1   city          70577 non-null  object 
 2   assembly      70577 non-null  object 
 3   body          70577 non-null  object 
 4   make          70577 non-null  object 
 5   model         70577 non-null  object 
 6   year          70577 non-null  float64
 7   engine        70577 non-null  float64
 8   transmission  70577 non-null  object 
 9   fuel          70577 non-null  object 
 10  color         70577 non-null  object 
 11  registered    70577 non-null  object 
 12  mileage       70577 non-null  int64  
 13  price         70577 non-null  float64
dtypes: float64(3), int64(2), object(9)
memory usage: 7.5+ MB


In [32]:
data.head(3)

Unnamed: 0,addref,city,assembly,body,make,model,year,engine,transmission,fuel,color,registered,mileage,price
0,7943732,Peshawar,not imported,Sedan,Toyota,Corolla,2013.0,1300.0,Manual,Petrol,Silver Metallic,Lahore,145000,2870000.0
1,7730314,Lahore,not imported,Sedan,Honda,City,2000.0,1300.0,Manual,Petrol,Blue,Lahore,230000,995000.0
2,7943737,Lahore,not imported,Sedan,Toyota,Yaris,2021.0,1300.0,Manual,Petrol,Super White,Punjab,60500,3585000.0


In [33]:
"""
handle missing values
convert year column to age
generate a column of current_city/registration city comparism
drop redundant columns
encode the categorical variables
scale the dataset.

"""

# convert year to age
data['age'] = 2025 - data['year']
# generate was_registered_in_city
data['was_reg'] = [0 if x == y else 1 for x,y in zip(data['city'].tolist(),data['registered'].tolist())]

# drop redundant columns
data.drop(columns=['addref','year'], inplace=True)

# encode the categorical variables
encoder = LabelEncoder()
cat_cols = list(data.select_dtypes(include="object").columns)
for col in cat_cols:
  data[col] = encoder.fit_transform(data[col])

# split the data set
X = data.drop(columns=['price'])
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=23)

X_train, X_test, y_train, y_test = train_test_split(X_test, y_test, test_size=0.2,
                                                    random_state=23)

# scale the dataset
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### AUTO-ML FOR THE BASE MODEL

In [34]:
from lazypredict.Supervised import LazyRegressor


reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None )
models,predictions = reg.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 823
[LightGBM] [Info] Number of data points in the train set: 5646, number of used features: 13
[LightGBM] [Info] Start training from score 3764765.674814


In [35]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.95,0.95,1239881.3,1.38
XGBRegressor,0.93,0.93,1456006.47,0.18
HistGradientBoostingRegressor,0.92,0.92,1547390.14,0.2
BaggingRegressor,0.91,0.91,1571441.23,0.21
LGBMRegressor,0.91,0.91,1573455.65,0.07
RandomForestRegressor,0.9,0.91,1646280.34,2.01
GradientBoostingRegressor,0.88,0.88,1817581.59,0.61
ExtraTreeRegressor,0.87,0.87,1896047.8,0.03
DecisionTreeRegressor,0.86,0.86,1994633.25,0.05
KNeighborsRegressor,0.77,0.77,2550661.69,0.11


### HYPER PARAMETER OPTIMIZATION

In [36]:
objectives = [
    'reg:squarederror',
    'reg:squaredlogerror',
    'reg:pseudohubererror',
    'reg:absoluteerror']

In [37]:
# define objective function

from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score,KFold


reg_kfold = KFold(n_splits = 5, shuffle=True, random_state=23)

def objective(trial):
  n_estimators = trial.suggest_int('n_estimators', 3, 1000)
  learning_rate = trial.suggest_float('learning_rate', 0.005, 0.07, log=True)
  max_depth = trial.suggest_int('max_depth', 3, 12)
  min_child_weight = trial.suggest_int('min_child_weight', 1, 7)
  gamma = trial.suggest_float('gamma', 0, 1)
  subsample = trial.suggest_float('subsample', 0.5, 1.0)
  colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
  reg_lambda = trial.suggest_float('reg_lambda', 0.5, 10, log=True)
  colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.5, 1.0)
  colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 1.0)

  regressor = XGBRegressor(n_estimators=n_estimators,objective='reg:squarederror',
                           learning_rate=learning_rate,max_depth = max_depth,
                           min_child_weight=min_child_weight,gamma=gamma, subsample=subsample,
                           colsample_bytree=colsample_bytree,reg_lambda=reg_lambda,
                           colsample_bylevel=colsample_bylevel, colsample_bynode=colsample_bynode,
                           random_state=23)
  rmse = -cross_val_score(estimator=regressor, X=X_train, y=y_train,cv=reg_kfold.split(X=X_train),
                          scoring = "neg_root_mean_squared_error").mean()
  return rmse

study = optuna.create_study(sampler = optuna.samplers.RandomSampler(seed = 23), direction= 'minimize')
study.optimize(objective, n_trials=100)

[I 2025-06-05 12:09:27,381] A new study created in memory with name: no-name-48987e8c-c4bd-4f76-b950-3881bffc515c
[I 2025-06-05 12:09:31,031] Trial 0 finished with value: 2037544.3932466567 and parameters: {'n_estimators': 519, 'learning_rate': 0.06085697949786892, 'max_depth': 10, 'min_child_weight': 2, 'gamma': 0.22104536326165258, 'subsample': 0.8431110426187334, 'colsample_bytree': 0.5835696015500281, 'reg_lambda': 1.6201283617322018, 'colsample_bylevel': 0.8090261736264046, 'colsample_bynode': 0.705965047429373}. Best is trial 0 with value: 2037544.3932466567.
[I 2025-06-05 12:09:31,119] Trial 1 finished with value: 4470133.520232599 and parameters: {'n_estimators': 5, 'learning_rate': 0.05154470672991925, 'max_depth': 11, 'min_child_weight': 3, 'gamma': 0.5895818652147146, 'subsample': 0.9892134580074154, 'colsample_bytree': 0.9225469110631556, 'reg_lambda': 0.6076242563014229, 'colsample_bylevel': 0.6473722323586828, 'colsample_bynode': 0.6439672204836232}. Best is trial 0 with 

In [38]:
study.best_params

{'n_estimators': 696,
 'learning_rate': 0.04006587613221814,
 'max_depth': 8,
 'min_child_weight': 1,
 'gamma': 0.49078881889177617,
 'subsample': 0.5360244590347243,
 'colsample_bytree': 0.823678872134052,
 'reg_lambda': 0.8821236538626824,
 'colsample_bylevel': 0.7566058640374164,
 'colsample_bynode': 0.678747093974772}

In [None]:
optuna.visualization.plot_param_importances(study)