In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pickle
from catboost import CatBoostRegressor

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the car price CSV, discard the 'saledate' and 'vin' columns, and keep
# only the rows that actually have a selling price (the prediction target).
df = (
    pd.read_csv('/content/drive/MyDrive/car_prices.csv')
    .drop(columns=['saledate', 'vin'])
    .dropna(subset=['sellingprice'])
)

In [None]:
# Display the loaded frame; pandas renders a truncated preview (first and last rows).
df

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,seller,mmr,sellingprice
0,2015,Kia,Sorento,LX,SUV,automatic,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0
1,2015,Kia,Sorento,LX,SUV,automatic,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0
3,2015,Volvo,S60,T5,Sedan,automatic,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558832,2015,Kia,K900,Luxury,Sedan,,in,45.0,18255.0,silver,black,avis corporation,35300.0,33000.0
558833,2012,Ram,2500,Power Wagon,Crew Cab,automatic,wa,5.0,54393.0,white,black,i -5 uhlmann rv,30200.0,30800.0
558834,2012,BMW,X5,xDrive35d,SUV,automatic,ca,48.0,50561.0,black,black,financial services remarketing (lease),29800.0,34000.0
558835,2015,Nissan,Altima,2.5 S,sedan,automatic,ga,38.0,16658.0,white,black,enterprise vehicle exchange / tra / rental / t...,15100.0,11100.0


In [None]:
# Text-valued columns; preprocess_data replaces each of them with integer codes.
cat_cols = [
    'make',
    'model',
    'trim',
    'body',
    'transmission',
    'state',
    'color',
    'interior',
    'seller',
]
# Filled in by preprocess_data: column name -> {original label: integer code}.
label_encoders = dict()

def preprocess_data(data):
    """Drop incomplete rows and integer-encode the categorical columns.

    The work is done on a copy, so the frame passed in is left untouched.
    That keeps the calling cell safe to re-run: the encoders are always
    fitted on the original labels rather than on codes produced by an
    earlier run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in the module-level ``cat_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every row with a missing value has been removed
        and each ``cat_cols`` column holds the integer code of its label.

    Side effects
    ------------
    Sets ``label_encoders[col]`` to a ``{label: code}`` dict for each encoded
    column, replacing any mapping stored there by a previous call.
    """
    # dropna() already returns a new object; the explicit copy guarantees the
    # column assignments below can never write through to the caller's frame.
    encoded = data.dropna().copy()
    for col in cat_cols:
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col])
        # Store a plain label -> code dict (the position of each label in
        # classes_), not the fitted encoder object itself.
        label_encoders[col] = {label: index for index, label in enumerate(le.classes_)}
    return encoded

# Run the preprocessing; `data` is the encoded frame that the modelling cell splits into X and y.
data = preprocess_data(df)

In [None]:
# Separate the target column from the predictors.
X = data.drop('sellingprice', axis=1)
y = data['sellingprice']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# verbose=0 stops CatBoost from printing one line per boosting iteration.
# With 5 x 4 x 5 = 100 candidates and 2 folds (200 fits) that log floods the
# cell output; GridSearchCV's own verbose=2 still reports progress per fit.
catboost_regressor = CatBoostRegressor(verbose=0)
param_grid = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
    'depth': [4, 6, 8, 10],
    'iterations': [100, 200, 300, 400, 500]
}

# Exhaustive search scored by (negated) mean absolute error, using 2-fold
# cross-validation on the training split only.
grid_search = GridSearchCV(estimator=catboost_regressor, param_grid=param_grid, cv=2, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the estimator exposed by the search for it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predictions on the held-out rows, consumed by the metrics cell.
y_pred = best_model.predict(X_test)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
0:	learn: 7879.1185352	total: 185ms	remaining: 1m 32s
1:	learn: 6469.2150803	total: 307ms	remaining: 1m 16s
2:	learn: 5361.3453154	total: 436ms	remaining: 1m 12s
3:	learn: 4486.5516404	total: 558ms	remaining: 1m 9s
4:	learn: 3807.9243684	total: 686ms	remaining: 1m 7s
5:	learn: 3299.2692208	total: 813ms	remaining: 1m 6s
6:	learn: 2920.3731568	total: 950ms	remaining: 1m 6s
7:	learn: 2638.1031888	total: 1.07s	remaining: 1m 5s
8:	learn: 2433.6322721	total: 1.22s	remaining: 1m 6s
9:	learn: 2280.5288715	total: 1.43s	remaining: 1m 10s
10:	learn: 2163.2249182	total: 1.67s	remaining: 1m 14s
11:	learn: 2081.0886557	total: 1.9s	remaining: 1m 17s
12:	learn: 2019.1589878	total: 2.14s	remaining: 1m 20s
13:	learn: 1977.3371095	total: 2.38s	remaining: 1m 22s
14:	learn: 1944.5636561	total: 2.6s	remaining: 1m 24s
15:	learn: 1923.8345809	total: 2.84s	remaining: 1m 25s
16:	learn: 1901.2883501	total: 3.08s	remaining: 1m 27s
17:	learn: 1887.0876

In [None]:
# Write the estimator selected by the grid search to 'catboost_model.bin'
# (a relative path, so the file is created in the current working directory).
best_model.save_model('catboost_model.bin')

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic,
    so values are paired strictly by position. Without that, two pandas
    Series carrying different indexes would be aligned by label and could
    silently yield NaN; plain Python lists are accepted as well.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    Zeros in ``y_true`` are not filtered out: they divide by zero and make the
    result ``inf`` or ``nan`` (NumPy emits a RuntimeWarning).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100
# Score the held-out predictions with all three metrics first...
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# ...then report them together.
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)

Mean Absolute Percentage Error (MAPE): 14.10549883111998
Mean Absolute Error (MAE): 879.4342044727447
Mean Squared Error (MSE): 2166132.1264083493


In [None]:
# Pickle the per-column label -> code dictionaries to 'label_encoders.pkl'
# in the current working directory.
with open('label_encoders.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoders, encoder_file)