In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
raw_df = pd.read_csv('car_prices_processed.csv')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546325 entries, 0 to 546324
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           546325 non-null  int64  
 1   year                 546325 non-null  int64  
 2   make                 546325 non-null  int64  
 3   model                546325 non-null  int64  
 4   body                 546325 non-null  int64  
 5   state                546325 non-null  int64  
 6   odometer             546325 non-null  float64
 7   color                546325 non-null  int64  
 8   interior             546325 non-null  int64  
 9   sellingprice         546325 non-null  float64
 10  newcondition         546325 non-null  float64
 11  profit               546325 non-null  float64
 12  transmission_manual  546325 non-null  bool   
dtypes: bool(1), float64(4), int64(8)
memory usage: 50.5 MB


In [3]:
df = raw_df.drop(columns=['Unnamed: 0'], axis=1)

#### Split Data into Training & Testing Sets

In [4]:
# Define features (X) and target (y)
X = df.drop(columns=['sellingprice'])
y = df['sellingprice']

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (437060, 11) (437060,)
Test set: (109265, 11) (109265,)


#### Hyperparameter Tuning

In [None]:
param_grid = {
    'n_estimators': [20, 100, 150],
    'max_depth': [20, 30, 50],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [3, 4]
}
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

### Random Forest Regressor

In [None]:
model = RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_leaf=3, min_samples_split=2, 
                              random_state=23, verbose=2)
model.fit(X_train, y_train)

### XGBoost Regressor

In [5]:
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=20, 
                     random_state=23, verbosity=2)
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=20, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=23, ...)

#### Evaluating Model's Performance

##### Mean Absolute Error (MAE):
tells us the average absolute difference between predicted and actual prices. Lower MAE = better accuracy of the model.
Need to remember, MAE treats all errors equally and does not penalize large errors more than small ones.
##### Root Mean Squared Error (RMSE):
similarly to MAE but RMSE penalizes larger errors because it squares the differences before averaging then. Lower RMSE = better accuracy of the model.

If large errors are unacceptable, RMSE is preferred because it gives more weight to large errors.

If all errors should be treated equally, MAE is a better choice.
##### R² Score (Coefficient of Determination):
measures how well the model explains the variance in the target variable.

R² = 1 → Perfect model (explains 100% of the variance).

##### Mean Absolute Percentage Error (MAPE):
Shows how large the errors are in percentage terms.
MAPE = 10% → On average, the model's predictions are 10% off from actual values.

In [6]:
# Make predictions
y_pred = model.predict(X_test)

# Compute evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Mean Absolute Error (MAE): 1366.68
Root Mean Squared Error (RMSE): 2645.71
R² Score: 0.93
Mean Absolute Percentage Error (MAPE): 14.40%


## Perfomance evaluation of the models
### Random Forest Regressor
* Mean Absolute Error (MAE): 1424.78
* Root Mean Squared Error (RMSE): 2611.95
* R² Score: 0.93
* Mean Absolute Percentage Error (MAPE): 16.78%

### XGBoost Regressor
* Mean Absolute Error (MAE): 1366.68
* Root Mean Squared Error (RMSE): 2645.71
* R² Score: 0.93
* Mean Absolute Percentage Error (MAPE): 14.40%


In [None]:
new_car = pd.DataFrame([{
    'year': 2006,
    'make': 'Toyota',
    'model': 'Land Cruiser',
    'body': 'SUV',
    'state': 'cl',
    'odometer': 120000,
    'color': 'Black',
    'interior': 'Black',
    'newcondition': 2,
    'transmission_manual': 0
}])

# Load each label encoder separately
with open("make_encoder.pkl", "rb") as file:
    make_encoder = pickle.load(file)

with open("model_encoder.pkl", "rb") as file:
    model_encoder = pickle.load(file)

with open("body_encoder.pkl", "rb") as file:
    body_encoder = pickle.load(file)

with open("state_encoder.pkl", "rb") as file:
    state_encoder = pickle.load(file)

with open("color_encoder.pkl", "rb") as file:
    color_encoder = pickle.load(file)

with open("interior_encoder.pkl", "rb") as file:
    interior_encoder = pickle.load(file)

try:
    new_car['make'] = make_encoder.fit_transform(new_car['make'])
    new_car['model'] = model_encoder.fit_transform(new_car['model'])
    new_car['body'] = body_encoder.fit_transform(new_car['body'])
    new_car['state'] = state_encoder.fit_transform(new_car['state'])
    new_car['color'] = color_encoder.fit_transform(new_car['color'])
    new_car['interior'] = interior_encoder.fit_transform(new_car['interior'])
except ValueError as e:
    print(f"Encoding Error: {e}")

new_car["transmission_manual"] = new_car["transmission_manual"].map({"Automatic": 0, "Manual": 1})
new_car["transmission_manual"] = new_car["transmission_manual"].fillna(0)  # Default to 0 if missing

# Make the prediction
predicted_price = model.predict(new_car)

print(f"Predicted Selling Price: ${predicted_price[0]:,.2f}")

In [None]:
with open('model.pkl', 'wb') as reg:
    pickle.dump(model, reg)