# Model 1 - Z.K.

Import packages

In [18]:
import pandas as pd

Read data

In [28]:
df = pd.read_csv("data/cleaned_data.csv.gz", compression="gzip")

In [20]:
df

Unnamed: 0,year_production,mileage,engine_capacity,power,gearbox,price,Abarth,Acura,Aixam,Alfa Romeo,...,Navy_blue,Orange,Other_color,Purple,Red,Silver,White,Yellow,New_cars,Used_cars
0,2018,48000,1368,145,0,82999,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,2016,54500,1368,145,0,59900,True,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,2023,5578,1368,165,1,135000,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,2019,116000,1368,165,0,79900,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,2022,31190,1368,145,0,82000,True,False,False,False,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184247,2017,85000,1499,105,0,39900,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
184248,2015,122300,1998,165,0,68000,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
184249,2019,97711,2488,194,1,119900,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
184250,2015,189300,2191,175,1,65500,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True


Target variable - "price".

## 1. Prepare for modeling

#### 1.1 StandardScaler

In [29]:
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
numeric_cols.remove('price')
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['year_production', 'mileage', 'engine_capacity', 'power', 'gearbox']
Categorical columns: ['Abarth', 'Acura', 'Aixam', 'Alfa Romeo', 'Alpine', 'Aston Martin', 'Audi', 'Austin', 'BMW', 'BMW-ALPINA', 'BYD', 'Baic', 'Bentley', 'Brilliance', 'Buick', 'Cadillac', 'Casalini', 'Chatenet', 'Chevrolet', 'Chrysler', 'Citroën', 'Cupra', 'DFSK', 'DKW', 'DS Automobiles', 'Dacia', 'Daewoo', 'Daihatsu', 'Dodge', 'Ferrari', 'Fiat', 'Ford', 'GMC', 'Gaz', 'Honda', 'Hummer', 'Hyundai', 'Ineos', 'Infiniti', 'Inny', 'Isuzu', 'Iveco', 'Jaguar', 'Jeep', 'Jetour', 'Kia', 'Lada', 'Lamborghini', 'Lancia', 'Land Rover', 'Lexus', 'Ligier', 'Lincoln', 'Lotus', 'MAN', 'MG', 'MINI', 'Maserati', 'Maybach', 'Mazda', 'McLaren', 'Mercedes-Benz', 'Mercury', 'Microcar', 'Mitsubishi', 'Nissan', 'Nysa', 'Oldsmobile', 'Omoda', 'Opel', 'Peugeot', 'Plymouth', 'Polonez', 'Pontiac', 'Porsche', 'RAM', 'Renault', 'Rolls-Royce', 'Rover', 'Saab', 'Seat', 'Skoda', 'Skywell', 'Smart', 'SsangYong', 'Subaru', 'Suzuki',

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [32]:
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

df = pd.concat([df[numeric_cols + ['price']], df[categorical_cols]], axis=1)

In [33]:
df = df*1
df.head(5)

Unnamed: 0,year_production,mileage,engine_capacity,power,gearbox,price,Abarth,Acura,Aixam,Alfa Romeo,...,Navy_blue,Orange,Other_color,Purple,Red,Silver,White,Yellow,New_cars,Used_cars
0,0.480689,-0.968906,-0.699431,-0.254407,-0.944106,82999,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.161685,-0.899704,-0.699431,-0.254407,-0.944106,59900,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,1.2782,-1.420546,-0.699431,-0.016959,1.059204,135000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.640191,-0.244952,-0.699431,-0.016959,-0.944106,79900,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.118698,-1.147871,-0.699431,-0.254407,-0.944106,82000,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1


#### 1.2 Train test split

In [34]:
from sklearn.model_selection import train_test_split

X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 2. Lasso regressor

#### 2.1 Estimate lasso

In [35]:
from sklearn.linear_model import LassoCV

# Set up the LassoCV regressor
lasso_cv = LassoCV(cv=5, random_state=42, max_iter=10000)

# Fit the model on the training data
lasso_cv.fit(X_train, y_train)

# Print the best alpha found
print("Best alpha using built-in LassoCV: ", lasso_cv.alpha_)

Best alpha using built-in LassoCV:  65.02589543828756


Best alpha using built-in LassoCV:  65.02589543828756

#### 2.2 Evaluate the model

In [36]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on training and test data
y_train_pred = lasso_cv.predict(X_train)
y_test_pred = lasso_cv.predict(X_test)

# Calculate metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Test MSE:", test_mse)
print("Train R²:", train_r2)
print("Test R²:", test_r2)

Train MSE: 2782834729.7345786
Test MSE: 2926085206.831737
Train R²: 0.7243004177761863
Test R²: 0.7089960470467618


## 3. XGBoost

In [37]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RepeatedKFold, RandomizedSearchCV

In [38]:
XGBR = xgb.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

In [None]:
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

In [39]:
param_grid = {
    'n_estimators': [700, 800, 1000, 1100, 1200],  # Number of trees
    'max_depth': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15],  # Maximum depth of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Learning rate
    'subsample': [0.6, 0.7, 0.8, 0.85],  # Fraction of samples used for tree building
    'colsample_bytree': [0.7, 0.8, 0.9],  # Fraction of features used for tree building
    'min_child_weight': [2, 3, 4],  # Minimum sum of instance weight needed in a child
}

In [42]:
random_search = RandomizedSearchCV(
    estimator = XGBR,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_iter=30,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [43]:
random_search.fit(X_train, y_train)
best_params_noloss_random = random_search.best_params_
best_params_noloss_random

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=3, n_estimators=700, subsample=0.6; total time=  25.8s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=3, n_estimators=700, subsample=0.6; total time=  25.9s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=3, n_estimators=700, subsample=0.6; total time=  26.2s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=3, n_estimators=700, subsample=0.6; total time=  26.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=10, min_child_weight=3, n_estimators=700, subsample=0.6; total time=  26.5s
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=15, min_child_weight=4, n_estimators=1000, subsample=0.8; total time= 1.2min
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=15, min_child_weight=4, n_estimators=1000, subsample=0



[CV] END colsample_bytree=0.7, learning_rate=0.2, max_depth=15, min_child_weight=4, n_estimators=1000, subsample=0.85; total time= 1.7min
[CV] END colsample_bytree=0.7, learning_rate=0.2, max_depth=15, min_child_weight=4, n_estimators=1000, subsample=0.85; total time= 1.7min
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=9, min_child_weight=3, n_estimators=1000, subsample=0.85; total time=  52.5s
[CV] END colsample_bytree=0.7, learning_rate=0.2, max_depth=15, min_child_weight=4, n_estimators=1000, subsample=0.85; total time= 1.7min
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=9, min_child_weight=3, n_estimators=1000, subsample=0.85; total time=  49.8s
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=11, min_child_weight=4, n_estimators=1000, subsample=0.7; total time= 1.1min
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=11, min_child_weight=4, n_estimators=1000, subsample=0.7; total time= 1.1min
[CV] END colsample_bytree=0.8, learn

{'subsample': 0.6,
 'n_estimators': 800,
 'min_child_weight': 4,
 'max_depth': 15,
 'learning_rate': 0.05,
 'colsample_bytree': 0.7}