In [20]:
# Load data & split

import pandas as pd
from sklearn.model_selection import train_test_split

# Load your cleaned data
df = pd.read_csv('../data/preprocessed_data.csv')

# Drop raw 'running' only from features, keep it for reference
X = df.drop(['price', 'running'], axis=1)
y = df['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)


(1313, 36) (329, 36)


In [22]:
# Train Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = mse_rf ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest - MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}, R2: {r2_rf:.2f}")


Random Forest - MAE: 2168.30, RMSE: 3677.83, R2: 0.69


In [None]:
# Train SVR

from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

mae_svr = mean_absolute_error(y_test, y_pred_svr)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = mse_svr ** 0.5
r2_svr = r2_score(y_test, y_pred_svr)

print(f"SVR - MAE: {mae_svr:.2f}, RMSE: {rmse_svr:.2f}, R2: {r2_svr:.2f}")


SVR - MAE: 4719.59, RMSE: 6634.09, R2: -0.00


In [24]:
# Train MLP Regressor
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
rmse_mlp = mse_mlp ** 0.5
r2_mlp = r2_score(y_test, y_pred_mlp)

print(f"MLP Regressor - MAE: {mae_mlp:.2f}, RMSE: {rmse_mlp:.2f}, R2: {r2_mlp:.2f}")


MLP Regressor - MAE: 4241.46, RMSE: 5770.80, R2: 0.24


In [26]:
# Compare all
results = pd.DataFrame({
    'Model': ['Random Forest', 'SVR', 'MLP Regressor'],
    'MAE': [mae_rf, mae_svr, mae_mlp],
    'RMSE': [rmse_rf, rmse_svr, rmse_mlp],
    'R2': [r2_rf, r2_svr, r2_mlp]
})

print(results)


           Model          MAE         RMSE        R2
0  Random Forest  2168.297984  3677.831187  0.692632
1            SVR  4719.591162  6634.094425 -0.000087
2  MLP Regressor  4241.462144  5770.804249  0.243259


In [28]:
# Tune Random Forest

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Initialize base model
rf = RandomForestRegressor(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    verbose=2
)

# Fit
grid_search.fit(X_train, y_train)

# Best params
print("Best Hyperparameters:", grid_search.best_params_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=

In [29]:
# Evaluate the tuned Random Forest

# Predict with best estimator
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)

# Metrics
mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
rmse_best_rf = mse_best_rf ** 0.5
r2_best_rf = r2_score(y_test, y_pred_best_rf)

print(f"Tuned Random Forest - MAE: {mae_best_rf:.2f}, RMSE: {rmse_best_rf:.2f}, R2: {r2_best_rf:.2f}")


Tuned Random Forest - MAE: 2171.40, RMSE: 3331.38, R2: 0.75


In [30]:
# scaling version

from sklearn.preprocessing import StandardScaler

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit SVR
svr_scaled = SVR()
svr_scaled.fit(X_train_scaled, y_train)
y_pred_svr_scaled = svr_scaled.predict(X_test_scaled)

mae_svr_scaled = mean_absolute_error(y_test, y_pred_svr_scaled)
rmse_svr_scaled = mean_squared_error(y_test, y_pred_svr_scaled) ** 0.5
r2_svr_scaled = r2_score(y_test, y_pred_svr_scaled)

print(f"Scaled SVR - MAE: {mae_svr_scaled:.2f}, RMSE: {rmse_svr_scaled:.2f}, R2: {r2_svr_scaled:.2f}")

# Refit MLP
mlp_scaled = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
mlp_scaled.fit(X_train_scaled, y_train)
y_pred_mlp_scaled = mlp_scaled.predict(X_test_scaled)

mae_mlp_scaled = mean_absolute_error(y_test, y_pred_mlp_scaled)
rmse_mlp_scaled = mean_squared_error(y_test, y_pred_mlp_scaled) ** 0.5
r2_mlp_scaled = r2_score(y_test, y_pred_mlp_scaled)

print(f"Scaled MLP - MAE: {mae_mlp_scaled:.2f}, RMSE: {rmse_mlp_scaled:.2f}, R2: {r2_mlp_scaled:.2f}")


Scaled SVR - MAE: 4702.88, RMSE: 6616.22, R2: 0.01
Scaled MLP - MAE: 2367.17, RMSE: 3518.00, R2: 0.72


In [31]:
# Compare all 

results = pd.DataFrame({
    'Model': [
        'Random Forest (Base)', 'Random Forest (Tuned)',
        'SVR (Base)', 'SVR (Scaled)',
        'MLP Regressor (Base)', 'MLP Regressor (Scaled)'
    ],
    'MAE': [
        mae_rf, mae_best_rf,
        mae_svr, mae_svr_scaled,
        mae_mlp, mae_mlp_scaled
    ],
    'RMSE': [
        rmse_rf, rmse_best_rf,
        rmse_svr, rmse_svr_scaled,
        rmse_mlp, rmse_mlp_scaled
    ],
    'R2': [
        r2_rf, r2_best_rf,
        r2_svr, r2_svr_scaled,
        r2_mlp, r2_mlp_scaled
    ]
})

print(results)


                    Model          MAE         RMSE        R2
0    Random Forest (Base)  2168.297984  3677.831187  0.692632
1   Random Forest (Tuned)  2171.398797  3331.375138  0.747814
2              SVR (Base)  4719.591162  6634.094425 -0.000087
3            SVR (Scaled)  4702.883662  6616.215683  0.005296
4    MLP Regressor (Base)  4241.462144  5770.804249  0.243259
5  MLP Regressor (Scaled)  2367.173650  3517.998280  0.718767


In [32]:
# Save model

import joblib

# save
joblib.dump(best_rf, '../models/tuned_random_forest.pkl')

# If you used scaling for final, also save the scaler:
joblib.dump(scaler, '../models/scaler.pkl')


['../models/scaler.pkl']

In [None]:
import joblib

print(X_train.columns)  

# Save final feature columns:
joblib.dump(list(X_train.columns), 'X_columns.joblib')
joblib.dump(best_rf, 'model.joblib')
joblib.dump(scaler, 'scaler.joblib')


Index(['year', 'motor_volume', 'model_kia', 'model_mercedes-benz',
       'model_nissan', 'model_toyota', 'motor_type_gas', 'motor_type_hybrid',
       'motor_type_petrol', 'motor_type_petrol and gas', 'color_black',
       'color_blue', 'color_brown', 'color_cherry', 'color_clove',
       'color_golden', 'color_gray', 'color_green', 'color_orange',
       'color_other', 'color_pink', 'color_purple', 'color_red',
       'color_silver', 'color_skyblue', 'color_white', 'type_Universal',
       'type_hatchback', 'type_minivan / minibus', 'type_pickup', 'type_sedan',
       'type_suv', 'status_excellent', 'status_good', 'status_new',
       'status_normal'],
      dtype='object')


['scaler.joblib']

In [None]:
# Example from training notebook
one_row = X_train.iloc[[0]]
one_row_encoded = pd.get_dummies(one_row)

print(one_row_encoded.columns)

# Save that exact list
joblib.dump(list(one_row_encoded.columns), 'X_columns.joblib')


Index(['year', 'motor_volume', 'model_kia', 'model_mercedes-benz',
       'model_nissan', 'model_toyota', 'motor_type_gas', 'motor_type_hybrid',
       'motor_type_petrol', 'motor_type_petrol and gas', 'color_black',
       'color_blue', 'color_brown', 'color_cherry', 'color_clove',
       'color_golden', 'color_gray', 'color_green', 'color_orange',
       'color_other', 'color_pink', 'color_purple', 'color_red',
       'color_silver', 'color_skyblue', 'color_white', 'type_Universal',
       'type_hatchback', 'type_minivan / minibus', 'type_pickup', 'type_sedan',
       'type_suv', 'status_excellent', 'status_good', 'status_new',
       'status_normal'],
      dtype='object')


['X_columns.joblib']

In [36]:
print(X_train.columns)


Index(['year', 'motor_volume', 'model_kia', 'model_mercedes-benz',
       'model_nissan', 'model_toyota', 'motor_type_gas', 'motor_type_hybrid',
       'motor_type_petrol', 'motor_type_petrol and gas', 'color_black',
       'color_blue', 'color_brown', 'color_cherry', 'color_clove',
       'color_golden', 'color_gray', 'color_green', 'color_orange',
       'color_other', 'color_pink', 'color_purple', 'color_red',
       'color_silver', 'color_skyblue', 'color_white', 'type_Universal',
       'type_hatchback', 'type_minivan / minibus', 'type_pickup', 'type_sedan',
       'type_suv', 'status_excellent', 'status_good', 'status_new',
       'status_normal'],
      dtype='object')


In [41]:
import pandas as pd

df = pd.read_csv('../data/preprocessed_data.csv')
print(df.head())
print(df.info())


   year  motor_volume  price  running_km  model_kia  model_mercedes-benz  \
0  2022           2.0  24500     3000.00      False                False   
1  2014           2.0  25500   132000.00      False                 True   
2  2018           2.0  11700   152887.30       True                False   
3  2002           3.2  12000   220479.58      False                 True   
4  2017           2.0  26000   130000.00      False                 True   

   model_nissan  model_toyota  motor_type_gas  motor_type_hybrid  ...  \
0         False          True           False              False  ...   
1         False         False           False              False  ...   
2         False         False           False              False  ...   
3         False         False           False              False  ...   
4         False         False           False              False  ...   

   type_Universal  type_hatchback  type_minivan / minibus  type_pickup  \
0           False           Fa

In [42]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['running_km_scaled'] = scaler.fit_transform(df[['running_km']])


In [43]:
# Features
X = df[['year', 'motor_volume', 'running_km_scaled']]  # use scaled version!
# Add your dummies
X = X.join(df.drop(columns=['year', 'motor_volume', 'running_km', 'running_km_scaled', 'price']))

y = df['price']

print(X.columns)
print(X.head())


Index(['year', 'motor_volume', 'running_km_scaled', 'model_kia',
       'model_mercedes-benz', 'model_nissan', 'model_toyota', 'motor_type_gas',
       'motor_type_hybrid', 'motor_type_petrol', 'motor_type_petrol and gas',
       'color_black', 'color_blue', 'color_brown', 'color_cherry',
       'color_clove', 'color_golden', 'color_gray', 'color_green',
       'color_orange', 'color_other', 'color_pink', 'color_purple',
       'color_red', 'color_silver', 'color_skyblue', 'color_white',
       'type_Universal', 'type_hatchback', 'type_minivan / minibus',
       'type_pickup', 'type_sedan', 'type_suv', 'status_excellent',
       'status_good', 'status_new', 'status_normal'],
      dtype='object')
   year  motor_volume  running_km_scaled  model_kia  model_mercedes-benz  \
0  2022           2.0          -1.201305      False                False   
1  2014           2.0           0.132210      False                 True   
2  2018           2.0           0.348129       True               

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()
model.fit(X_train, y_train)

print(X_train.columns)


Index(['year', 'motor_volume', 'running_km_scaled', 'model_kia',
       'model_mercedes-benz', 'model_nissan', 'model_toyota', 'motor_type_gas',
       'motor_type_hybrid', 'motor_type_petrol', 'motor_type_petrol and gas',
       'color_black', 'color_blue', 'color_brown', 'color_cherry',
       'color_clove', 'color_golden', 'color_gray', 'color_green',
       'color_orange', 'color_other', 'color_pink', 'color_purple',
       'color_red', 'color_silver', 'color_skyblue', 'color_white',
       'type_Universal', 'type_hatchback', 'type_minivan / minibus',
       'type_pickup', 'type_sedan', 'type_suv', 'status_excellent',
       'status_good', 'status_new', 'status_normal'],
      dtype='object')


In [45]:
joblib.dump(model, 'model.joblib')
joblib.dump(list(X_train.columns), 'X_columns.joblib')
joblib.dump(scaler, 'scaler.joblib')  # if used


['scaler.joblib']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Check correlation
correlation = df[['price', 'running_km', 'year']].corr()
print(correlation)

# Visualize the relationship
sns.scatterplot(data=df, x='running_km', y='price')
plt.title("Running KM vs Price")
plt.show()
