In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder , LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from scipy.stats import uniform, randint


In [2]:
Data1 = pd.read_csv("Jayed Prediction.csv", encoding='ISO-8859-1')
Data2 = pd.read_csv("Jayed Prediction 2.csv", encoding='ISO-8859-1')

Data = pd.concat([Data1, Data2], ignore_index=True)
#Data.to_csv(r"C:\Users\user\OneDrive\Desktop\Car Price Prediction\Full Data.csv", index=False)

X = Data.drop(columns=['Price'])
y = Data['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
categorical_features = ['Make', 'Model', 'Year', 'Transmission', 'Fuel', 'Condition', 'Paint', 'Jayed']
numerical_features = ['Avarege Kilometars']

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_features)
    ])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [5]:
def evaluate_model(model, X_test_processed, y_test):
    y_pred = model.predict(X_test_processed)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

In [7]:
# XGBoost Regressor with RandomizedSearchCV
xgb_param_distributions = {
    'n_estimators': randint(100, 250),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 20),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
}

xgb_random_search = RandomizedSearchCV(XGBRegressor(random_state=42), xgb_param_distributions, n_iter=25, cv=5, random_state=42, n_jobs=-1)
xgb_random_search.fit(X_train_processed, y_train)

best_xgb_model = xgb_random_search.best_estimator_
evaluate_model(best_xgb_model, X_test_processed, y_test)
cross_val_scores = cross_val_score(best_xgb_model, X_train_processed, y_train, cv=5, scoring='neg_mean_squared_error')

print(f"Best Hyperparameters: {xgb_random_search.best_params_}")
print(f"Cross-Validated Mean Squared Error: {-cross_val_scores.mean()}")

Mean Squared Error: 13111162.169591468
R-squared: 0.831535590178845
Best Hyperparameters: {'learning_rate': 0.10507404463642235, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 242, 'subsample': 0.8777755692715243}
Cross-Validated Mean Squared Error: 12125587.623873685


In [9]:
dt_param_grid = {
    'max_depth': [3, 7,  15],
    'min_samples_split': [2, 7, 10],
    'min_samples_leaf': [ 2, 4, 6],
    'criterion': ['squared_error', 'absolute_error']
}

dt_grid_search = GridSearchCV(DecisionTreeRegressor(), dt_param_grid, cv=5, n_jobs=-1)
dt_grid_search.fit(X_train_processed, y_train)

best_dt_model = dt_grid_search.best_estimator_
evaluate_model(best_dt_model,X_test_processed, y_test)
print("Best Parameters:")
print(dt_grid_search.best_params_)

Mean Squared Error: 17246051.59005933
R-squared: 0.7784066839167878
Best Parameters:
{'criterion': 'squared_error', 'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [10]:
rf_param_distributions = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(1, 15),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

rf_random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), rf_param_distributions, n_iter=25, cv=5, random_state=42, n_jobs=-1)
rf_random_search.fit(X_train_processed, y_train)

best_rf_model = rf_random_search.best_estimator_
evaluate_model(best_rf_model, X_test_processed , y_test)
cross_val_scores = cross_val_score(best_rf_model, X_train_processed, y_train, cv=5, scoring='neg_mean_squared_error')

print(f"Best Hyperparameters: {rf_random_search.best_params_}")
print(f"Cross-Validated Mean Squared Error: {-cross_val_scores.mean()}")

10 fits failed out of a total of 125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\user\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterErr

Mean Squared Error: 13507987.478159962
R-squared: 0.8264368094189567
Best Hyperparameters: {'bootstrap': True, 'max_depth': 16, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 108}
Cross-Validated Mean Squared Error: 13019213.581865495
