### Load model
- Catboost
loaded_cat = CatBoostRegressor()
loaded_cat.load_model("catboost_model.cbm")

- XGBoost
loaded_xgbr = XGBRegressor()
loaded_xgbr.load_model("xgboost_model.json")

- LightGBM
import lightgbm as lgb
loaded_lgbmr = lgb.Booster(model_file="lightgbm_model.txt")



In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import regex as re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
#import pygwalker as pyg

train = pd.read_csv("Data/processed_data.csv")
train.head()

Unnamed: 0,brand,price,old,new,cpu,cpu_brand,ram_capacity,ram_brand,hard_drive_type,hard_drive_capacity,card,card_brand,screen_size,screen_type
0,Asus,26990000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,OLED
1,Lenovo,16390000.0,0,1,AMD Ryzen 77730U,AMD,16.0,DDR4,SSD,512GB,AMD Radeon Graphics,AMD,15.6,HD
2,Asus,28990000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,OLED
3,LG,36290000.0,0,1,Intel Core Ultra 5 125H,Intel,16.0,DDR5,SSD,512GB,Intel Arc Graphics,Intel,14.0,IPS
4,Dell,18990000.0,0,1,Intel Core 5 processor 120U,Intel,8.0,DDR5,SSD,512GB,Intel Graphics,Intel,14.0,HD


In [173]:
unique_brands = train['cpu_brand'].unique()
print(unique_brands)

['Intel' 'AMD' 'Qualcomm' 'Apple' nan]


In [174]:
brand_counts = train['cpu_brand'].value_counts()
print(brand_counts)

cpu_brand
Intel       1833
AMD          293
Apple         35
Qualcomm      16
Name: count, dtype: int64


In [175]:
train.columns

Index(['brand', 'price', 'old', 'new', 'cpu', 'cpu_brand', 'ram_capacity',
       'ram_brand', 'hard_drive_type', 'hard_drive_capacity', 'card',
       'card_brand', 'screen_size', 'screen_type'],
      dtype='object')

In [176]:
cat_cols = ['brand', 'cpu', 'cpu_brand', 
       'ram_brand', 'hard_drive_type', 'hard_drive_capacity', 'card',
       'card_brand', 'screen_size', 'screen_type']

In [177]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2179 entries, 0 to 2178
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                2176 non-null   object 
 1   price                2108 non-null   float64
 2   old                  2179 non-null   int64  
 3   new                  2179 non-null   int64  
 4   cpu                  2170 non-null   object 
 5   cpu_brand            2177 non-null   object 
 6   ram_capacity         2175 non-null   float64
 7   ram_brand            2063 non-null   object 
 8   hard_drive_type      2179 non-null   object 
 9   hard_drive_capacity  2121 non-null   object 
 10  card                 2169 non-null   object 
 11  card_brand           2115 non-null   object 
 12  screen_size          2148 non-null   object 
 13  screen_type          2064 non-null   object 
dtypes: float64(2), int64(2), object(10)
memory usage: 238.5+ KB


In [178]:
ram_counts = train['ram_brand'].value_counts()
print(ram_counts)

ram_brand
DDR5       1131
DDR4        591
LPDDR5      161
LPDDR5X     121
LPDDR4X      36
LPDDR4       11
LPDDR3       11
LDDR4         1
Name: count, dtype: int64


In [179]:
print('trainframe encoded by OHE dimension : ', pd.get_dummies(train, columns = cat_cols, drop_first = True).shape)

trainframe encoded by OHE dimension :  (2179, 620)


In [180]:
# train[cat_cols] = train[cat_cols].fillna('missing')
train.head()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2179 entries, 0 to 2178
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                2176 non-null   object 
 1   price                2108 non-null   float64
 2   old                  2179 non-null   int64  
 3   new                  2179 non-null   int64  
 4   cpu                  2170 non-null   object 
 5   cpu_brand            2177 non-null   object 
 6   ram_capacity         2175 non-null   float64
 7   ram_brand            2063 non-null   object 
 8   hard_drive_type      2179 non-null   object 
 9   hard_drive_capacity  2121 non-null   object 
 10  card                 2169 non-null   object 
 11  card_brand           2115 non-null   object 
 12  screen_size          2148 non-null   object 
 13  screen_type          2064 non-null   object 
dtypes: float64(2), int64(2), object(10)
memory usage: 238.5+ KB


In [181]:
en = LabelEncoder()

for cols in cat_cols:
    train[cols] = en.fit_transform(train[cols])

print('Dataframe encoded by Label encoding dimension : ', train.shape)

Dataframe encoded by Label encoding dimension :  (2179, 14)


In [182]:
train.head()

Unnamed: 0,brand,price,old,new,cpu,cpu_brand,ram_capacity,ram_brand,hard_drive_type,hard_drive_capacity,card,card_brand,screen_size,screen_type
0,1,26990000.0,0,1,104,2,16.0,1,0,5,44,2,9,14
1,6,16390000.0,0,1,46,0,16.0,0,0,5,13,0,16,12
2,1,28990000.0,0,1,104,2,16.0,1,0,5,44,2,9,14
3,5,36290000.0,0,1,104,2,16.0,1,0,5,44,2,9,13
4,2,18990000.0,0,1,97,2,8.0,1,0,5,47,2,9,12


In [183]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2179 entries, 0 to 2178
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   brand                2179 non-null   int64  
 1   price                2108 non-null   float64
 2   old                  2179 non-null   int64  
 3   new                  2179 non-null   int64  
 4   cpu                  2179 non-null   int64  
 5   cpu_brand            2179 non-null   int64  
 6   ram_capacity         2175 non-null   float64
 7   ram_brand            2179 non-null   int64  
 8   hard_drive_type      2179 non-null   int64  
 9   hard_drive_capacity  2179 non-null   int64  
 10  card                 2179 non-null   int64  
 11  card_brand           2179 non-null   int64  
 12  screen_size          2179 non-null   int64  
 13  screen_type          2179 non-null   int64  
dtypes: float64(2), int64(12)
memory usage: 238.5 KB


In [184]:
print(train['price'].describe())

count    2.108000e+03
mean     3.013708e+07
std      1.741148e+07
min      6.990000e+06
25%      1.876500e+07
50%      2.499000e+07
75%      3.556500e+07
max      1.489900e+08
Name: price, dtype: float64


In [185]:
print("Max value in y_train:", train.max())
print("Min value in y_train:", train.min())


Max value in y_train: brand                         10.0
price                  148990000.0
old                            1.0
new                            1.0
cpu                          344.0
cpu_brand                      4.0
ram_capacity                 512.0
ram_brand                      8.0
hard_drive_type                0.0
hard_drive_capacity            6.0
card                         186.0
card_brand                     5.0
screen_size                   42.0
screen_type                   20.0
dtype: float64
Min value in y_train: brand                        0.0
price                  6990000.0
old                          0.0
new                          0.0
cpu                          0.0
cpu_brand                    0.0
ram_capacity                 4.0
ram_brand                    0.0
hard_drive_type              0.0
hard_drive_capacity          0.0
card                         0.0
card_brand                   0.0
screen_size                  0.0
screen_type           

In [186]:
def clean_price_data(df):
    # Remove any potential negative prices
    df = df[df['price'] > 0]
    
    # Remove extreme outliers using IQR method
    Q1 = df['price'].quantile(0.25)
    Q3 = df['price'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR
    
    df = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
    
    return df

In [187]:
train = clean_price_data(train)

In [188]:
X = train.drop('price', axis = 1)
y = train['price']

In [189]:
print(X.shape)
print(y.shape)

(2068, 13)
(2068,)


In [190]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.05,random_state=0)

In [191]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1964, 13)
(1964,)
(104, 13)
(104,)


## Training


In [192]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

In [193]:
xgboost_model = XGBRegressor()
lgbm_model = LGBMRegressor()
catboost_model = CatBoostRegressor(cat_features=cat_cols, verbose=0)

In [194]:
param_grid_catboost = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7]
}

# Perform grid search for CatBoost
grid_search_catboost = GridSearchCV(estimator=catboost_model, param_grid=param_grid_catboost, cv=3, scoring='r2', verbose=1)
grid_search_catboost.fit(X_train, y_train)
best_params_catboost = grid_search_catboost.best_params_
print(f"Best parameters for CatBoost: {best_params_catboost}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits


Best parameters for CatBoost: {'depth': 6, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.1}


In [195]:
param_grid_xgboost = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'reg_alpha': [0, 0.1, 0.5, 1]
}

# Perform grid search for XGBoost
grid_search_xgboost = GridSearchCV(estimator=xgboost_model, param_grid=param_grid_xgboost, cv=3, scoring='r2', verbose=1)
grid_search_xgboost.fit(X_train, y_train)
best_params_xgboost = grid_search_xgboost.best_params_
print(f"Best parameters for XGBoost: {best_params_xgboost}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters for XGBoost: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300, 'reg_alpha': 0.5}


In [196]:
param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'reg_alpha': [0, 0.1, 0.5, 1]
}

# Perform grid search for LightGBM
grid_search_lgbm = GridSearchCV(estimator=lgbm_model, param_grid=param_grid_lgbm, cv=3, scoring='r2', verbose=1)
grid_search_lgbm.fit(X_train, y_train)
best_params_lgbm = grid_search_lgbm.best_params_
print(f"Best parameters for LightGBM: {best_params_lgbm}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 394
[LightGBM] [Info] Number of data points in the train set: 1309, number of used features: 12
[LightGBM] [Info] Start training from score 28837797.555386
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 1309, number of used features: 12
[LightGBM] [Info] Start training from score 28782325.439267
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000515 seconds.
You can set

In [197]:
catboost_model.set_params(**best_params_catboost)
xgboost_model.set_params(**best_params_xgboost)
lgbm_model.set_params(**best_params_lgbm)

In [198]:
catboost_model.fit(X_train, y_train)
xgboost_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 416
[LightGBM] [Info] Number of data points in the train set: 1964, number of used features: 12
[LightGBM] [Info] Start training from score 28711217.922607


In [199]:
catboost_score = catboost_model.score(X_test, y_test)
print(f"CatBoost R2 score: {catboost_score}")

CatBoost R2 score: 0.8190740191277031


In [200]:
xgboost_score = xgboost_model.score(X_test, y_test)
print(f"XGBoost R2 score: {xgboost_score}")

XGBoost R2 score: 0.8559209992188671


In [201]:
lgbm_score = lgbm_model.score(X_test, y_test)
print(f"LightGBM R2 score: {lgbm_score}")

LightGBM R2 score: 0.8555337677826931


In [202]:
catboost_train_score = catboost_model.score(X_train, y_train)
xgboost_train_score = xgboost_model.score(X_train, y_train)
lgbm_train_score = lgbm_model.score(X_train, y_train)
print(f"CatBoost R2 score on training set: {catboost_train_score}")
print(f"XGBoost R2 score on training set: {xgboost_train_score}")
print(f"LightGBM R2 score on training set: {lgbm_train_score}")

CatBoost R2 score on training set: 0.8856808737261415
XGBoost R2 score on training set: 0.9525404440315418
LightGBM R2 score on training set: 0.9392219564533828


In [203]:
catboost_model.save_model("model/catboost_model.cbm")
lgbm_model.booster_.save_model("model/lightgbm_model.txt")
xgboost_model.save_model("model/xgboost_model.json")

In [204]:
loaded_cat = CatBoostRegressor()
loaded_cat.load_model("model/catboost_model.cbm")

loaded_xgbr = XGBRegressor()
loaded_xgbr.load_model("model/xgboost_model.json")

import lightgbm as lgb
loaded_lgbmr = lgb.Booster(model_file="model/lightgbm_model.txt")