In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from math import sqrt
import xgboost as xgb

In [2]:
df = pd.read_csv('/Users/mahmoud/Car_Price_Prediction/car_data_cleaned_updated.csv')

In [3]:
df = df.drop(columns=['Unnamed: 0'])

In [4]:
df.head()

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Review Count,Price,Year,Make,Model,Type
0,2015 Ford C-Max Hybrid SEL,"88,380 mi.",Capitol Kia,3.8,(180 reviews),10998.0,2015,Ford,C-Max,Hybrid SEL
1,2016 Kia Sorento LX,"100,837 mi.",Napleton Mazda of Naperville,3.9,(115 reviews),12577.0,2016,Kia,Sorento,LX
2,2016 Mitsubishi Lancer ES,"133,750 mi.",AVA Auto Sales,4.140821,(8 reviews),8495.0,2016,Mitsubishi,Lancer,ES
3,2011 Ford Escape XLT,"110,862 mi.",94 Nissan Of South Holland,4.7,"(2,682 reviews)",7869.0,2011,Ford,Escape,XLT
4,2017 Dodge Journey SXT,"58,595 mi.",Lithia Chrysler Jeep Dodge Ram of Wasilla,4.6,(543 reviews),14888.0,2017,Dodge,Journey,SXT


In [5]:
# Having the mileage column in float rather than string

df['Mileage'] = df['Mileage'].str.replace(' mi.', '').str.replace(',', '')
df['Mileage'] = pd.to_numeric(df['Mileage'], errors='coerce')
print(df['Mileage'])

0         88380
1        100837
2        133750
3        110862
4         58595
          ...  
10033    158598
10034    113581
10035    197496
10036     85384
10037    133800
Name: Mileage, Length: 10038, dtype: int64


In [6]:
# Having Review count to float
df['Review Count'] = df['Review Count'].str.replace(r'\D', '', regex=True).astype(float)
print(df['Review Count'])

0         180.0
1         115.0
2           8.0
3        2682.0
4         543.0
          ...  
10033       1.0
10034    1074.0
10035      29.0
10036       7.0
10037      25.0
Name: Review Count, Length: 10038, dtype: float64


In [7]:
X = df[['Mileage', 'Rating', 'Review Count', 'Year']]
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
rf_reg_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', RandomForestRegressor())])
rf_reg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', RandomForestRegressor())])

In [11]:
y_pred_rf = rf_reg_pipeline.predict(X_test)

In [12]:
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2_rf = r2_score(y_test, y_pred_rf)
print(rmse_rf)
print(r2_rf)

2545.087730698955
0.22724433007707556


- Very similar to the one without the updated cleaned data

# Tuned

In [13]:
X = df[['Mileage', 'Rating', 'Review Count', 'Year']]
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
rf_reg_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', RandomForestRegressor())])

param_dist = {
    'model__n_estimators': [100, 200, 300, 400, 500],  # Number of trees in the forest
    'model__max_features': ['auto', 'sqrt'],  # Number of features to consider at each split
    'model__max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'model__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'model__min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

In [17]:
# Perform Randomized Search Cross-Validation
random_search = RandomizedSearchCV(rf_reg_pipeline, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('model',
                                              RandomForestRegressor())]),
                   param_distributions={'model__max_depth': [None, 10, 20, 30,
                                                             40, 50],
                                        'model__max_features': ['auto', 'sqrt'],
                                        'model__min_samples_leaf': [1, 2, 4],
                                        'model__min_samples_split': [2, 5, 10],
                                        'model__n_estimators': [100, 200, 300,
                                                                400, 500]},
                   random_state=42)

In [18]:
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'model__n_estimators': 200, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_features': 'sqrt', 'model__max_depth': None}


In [19]:
best_model = random_search.best_estimator_
y_pred_rf = best_model.predict(X_test)

In [20]:
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2_rf = r2_score(y_test, y_pred_rf)
print("RMSE:", rmse_rf)
print("R-squared:", r2_rf)

RMSE: 2463.3452599088214
R-squared: 0.27608552875511805


In [21]:
X = df[['Mileage', 'Rating', 'Review Count', 'Year']]
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
gb_reg_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', GradientBoostingRegressor())])
param_grid = {
    'model__n_estimators': [100, 200, 300, 400, 500],  
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2], 
    'model__max_depth': [3, 4, 5, 6],  
}

In [23]:
grid_search = GridSearchCV(gb_reg_pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', GradientBoostingRegressor())]),
             param_grid={'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
                         'model__max_depth': [3, 4, 5, 6],
                         'model__n_estimators': [100, 200, 300, 400, 500]})

In [24]:
best_params_gb = grid_search.best_params_
print("Best Hyperparameters (Gradient Boosting):", best_params_gb)

Best Hyperparameters (Gradient Boosting): {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 100}


In [25]:
best_model_gb = grid_search.best_estimator_
y_pred_gb = best_model_gb.predict(X_test)

In [26]:
rmse_gb = mean_squared_error(y_test, y_pred_gb, squared=False)
r2_gb = r2_score(y_test, y_pred_gb)

print("Root Mean Squared Error:", rmse_gb)
print("R-squared:", r2_gb)

Root Mean Squared Error: 2444.200368748819
R-squared: 0.2872941948960699


In [27]:
# Age column
current_year = 2023
df['Age'] = current_year - df['Year']

In [28]:
df.head()

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Review Count,Price,Year,Make,Model,Type,Age
0,2015 Ford C-Max Hybrid SEL,88380,Capitol Kia,3.8,180.0,10998.0,2015,Ford,C-Max,Hybrid SEL,8
1,2016 Kia Sorento LX,100837,Napleton Mazda of Naperville,3.9,115.0,12577.0,2016,Kia,Sorento,LX,7
2,2016 Mitsubishi Lancer ES,133750,AVA Auto Sales,4.140821,8.0,8495.0,2016,Mitsubishi,Lancer,ES,7
3,2011 Ford Escape XLT,110862,94 Nissan Of South Holland,4.7,2682.0,7869.0,2011,Ford,Escape,XLT,12
4,2017 Dodge Journey SXT,58595,Lithia Chrysler Jeep Dodge Ram of Wasilla,4.6,543.0,14888.0,2017,Dodge,Journey,SXT,6


# Grid search after adding 'Age column'

In [34]:
X = df[['Mileage', 'Rating', 'Review Count', 'Year', 'Age']]
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
rf = RandomForestRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [45]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]})

In [46]:
best_params = grid_search.best_params_
gb_reg = grid_search.best_estimator_
y_pred = gb_reg.predict(X_test)

In [47]:
rmse = sqrt(mean_squared_error(y_test, y_pred))
r_squared = gb_reg.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r_squared)

Best Hyperparameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Root Mean Squared Error: 2427.3575929656877
R-squared: 0.2970827418183767


# Best one yet

In [49]:

X = df[['Mileage', 'Rating', 'Review Count', 'Year', 'Age']]
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_reg = XGBRegressor()
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0, 0.01, 0.1],
}

grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_xgb_reg = grid_search.best_estimator_
y_pred_xgb = best_xgb_reg.predict(X_test)

rmse_xgb = sqrt(mean_squared_error(y_test, y_pred_xgb))
r_squared_xgb = r2_score(y_test, y_pred_xgb)

print("RMSE:", rmse_xgb)
print("R-squared:", r_squared_xgb)


Best XGBoost Regressor (with Hyperparameter Tuning):
Root Mean Squared Error: 2445.077051842308
R-squared: 0.28678283814126715


In [50]:
df.head()

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Review Count,Price,Year,Make,Model,Type,Age
0,2015 Ford C-Max Hybrid SEL,88380,Capitol Kia,3.8,180.0,10998.0,2015,Ford,C-Max,Hybrid SEL,8
1,2016 Kia Sorento LX,100837,Napleton Mazda of Naperville,3.9,115.0,12577.0,2016,Kia,Sorento,LX,7
2,2016 Mitsubishi Lancer ES,133750,AVA Auto Sales,4.140821,8.0,8495.0,2016,Mitsubishi,Lancer,ES,7
3,2011 Ford Escape XLT,110862,94 Nissan Of South Holland,4.7,2682.0,7869.0,2011,Ford,Escape,XLT,12
4,2017 Dodge Journey SXT,58595,Lithia Chrysler Jeep Dodge Ram of Wasilla,4.6,543.0,14888.0,2017,Dodge,Journey,SXT,6


In [51]:
pd.get_dummies(df, columns=['Make', 'Model'])

Unnamed: 0,Name,Mileage,Dealer Name,Rating,Review Count,Price,Year,Type,Age,Make_Acura,...,Model_Zephyr,Model_allroad,Model_e-Golf,Model_i3,Model_iM,Model_iQ,Model_tC,Model_xA,Model_xB,Model_xD
0,2015 Ford C-Max Hybrid SEL,88380,Capitol Kia,3.800000,180.0,10998.0,2015,Hybrid SEL,8,0,...,0,0,0,0,0,0,0,0,0,0
1,2016 Kia Sorento LX,100837,Napleton Mazda of Naperville,3.900000,115.0,12577.0,2016,LX,7,0,...,0,0,0,0,0,0,0,0,0,0
2,2016 Mitsubishi Lancer ES,133750,AVA Auto Sales,4.140821,8.0,8495.0,2016,ES,7,0,...,0,0,0,0,0,0,0,0,0,0
3,2011 Ford Escape XLT,110862,94 Nissan Of South Holland,4.700000,2682.0,7869.0,2011,XLT,12,0,...,0,0,0,0,0,0,0,0,0,0
4,2017 Dodge Journey SXT,58595,Lithia Chrysler Jeep Dodge Ram of Wasilla,4.600000,543.0,14888.0,2017,SXT,6,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10033,2014 Chevrolet Silverado 1500 LT,158598,City Auto Sales Corp,4.140821,1.0,13995.0,2014,1500 LT,9,0,...,0,0,0,0,0,0,0,0,0,0
10034,2019 Hyundai Sonata SE,113581,Auto Boutique,4.800000,1074.0,13500.0,2019,SE,4,0,...,0,0,0,0,0,0,0,0,0,0
10035,2013 Dodge Journey SXT,197496,Ryan Auto Mall Chrysler Dodge Jeep Ram Of Mont...,4.140821,29.0,7425.0,2013,SXT,10,0,...,0,0,0,0,0,0,0,0,0,0
10036,2007 Lexus ES 350,85384,"Mountain Motors, Inc",4.140821,7.0,11995.0,2007,350,16,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# Check for columns with non-numeric data types
non_numeric_columns = df.select_dtypes(exclude=['int64', 'float64', 'uint8']).columns
print(non_numeric_columns)

Index(['Name', 'Dealer Name', 'Make', 'Model', 'Type'], dtype='object')


In [57]:
# Drop the non-numeric columns 'Name', 'Dealer Name', and 'Type'
df_encoded = df_encoded.drop(['Name', 'Dealer Name'], axis=1)

# Now df_encoded contains only numeric features and the target variable 'Price'

In [61]:
# Check for columns with non-numeric data types
non_numeric_columns = df_encoded.select_dtypes(exclude=['int64', 'float64', 'uint8']).columns
print(non_numeric_columns)

Index([], dtype='object')


In [60]:
df_encoded = df_encoded.drop(['Make', 'Model'], axis=1)

In [62]:
X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

rf_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R-squared:", r2)


Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 14.0min finished


Best Hyperparameters: {'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Root Mean Squared Error: 2344.3445967532234
R-squared: 0.3443386490919993


# The other model without Age and with 0's in the Review column is better

In [67]:
importances_rf = best_rf_model.feature_importances_
feature_importance_df_rf = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances_rf})
feature_importance_df_rf = feature_importance_df_rf.sort_values(by='Importance', ascending=False)
print(feature_importance_df_rf.head(10))

           Feature  Importance
0          Mileage    0.270502
3             Year    0.169866
4              Age    0.161194
2     Review Count    0.072865
1           Rating    0.049735
1019       Type_SE    0.019680
1000        Type_S    0.007378
456      Type_Base    0.006860
824        Type_LS    0.006659
726       Type_GLS    0.006181


In [65]:
X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_reg = xgb.XGBRegressor(random_state=42)
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2]
}

grid_search_xgb = GridSearchCV(estimator=xgb_reg, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2)

grid_search_xgb.fit(X_train, y_train)

best_xgb_model = grid_search_xgb.best_estimator_

y_pred_xgb = best_xgb_model.predict(X_test)

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print("RMSE:", rmse_xgb)
print("R-squared for XGBoost:", r2_xgb)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 39.8min finished


Best Hyperparameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 300}
Root Mean Squared Error for XGBoost: 2201.0672569148483
R-squared for XGBoost: 0.422032640889507


In [66]:
# Feature Importance for XGBoost
importances_xgb = best_xgb_model.feature_importances_
feature_importance_df_xgb = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances_xgb})
feature_importance_df_xgb = feature_importance_df_xgb.sort_values(by='Importance', ascending=False)
print(feature_importance_df_xgb.head(10))

                   Feature  Importance
3                     Year    0.007695
608   Type_Cruiser Touring    0.006932
1019               Type_SE    0.006632
56            Type_1500 LT    0.006041
1047              Type_SLT    0.005252
463         Type_Base (M5)    0.005059
726               Type_GLS    0.005023
1000                Type_S    0.005020
824                Type_LS    0.004969
153             Type_2.5 S    0.004883
