In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.base import RegressorMixin
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,mean_absolute_error, r2_score

In [2]:
# Load the preprocessed dataset
data = pd.read_csv(r'DataSets\Model_TrainTest_data.csv')

In [3]:
data.head()

Unnamed: 0,oem,model,modelYear,Registration Year,Mileage,Fuel Type,Transmission,ownerNo,price,Gear Box,city,km,Safety,Interior,Exterior,Insurance Validity,bt
0,13,77,2015,2015.0,23.1,3,1,3,400000.0,5.0,0,120000.0,2.553191,3.529412,1.538462,2,0
1,6,22,2018,2018.0,17.0,3,1,2,811000.0,5.0,0,32706.0,5.531915,6.470588,4.230769,0,4
2,21,133,2018,2018.0,23.84,3,1,1,585000.0,5.0,0,11949.0,4.893617,6.470588,4.615385,0,0
3,8,51,2014,2014.0,19.1,3,1,1,462000.0,5.0,0,17794.0,3.617021,5.294118,3.461538,0,3
4,13,88,2015,2015.0,23.65,1,1,1,790000.0,5.0,0,60000.0,4.468085,5.882353,4.615385,2,4


In [4]:
column_to_round = ['Safety','Interior','Exterior']
data[column_to_round] = data[column_to_round].round(2)

In [5]:
data.head()

Unnamed: 0,oem,model,modelYear,Registration Year,Mileage,Fuel Type,Transmission,ownerNo,price,Gear Box,city,km,Safety,Interior,Exterior,Insurance Validity,bt
0,13,77,2015,2015.0,23.1,3,1,3,400000.0,5.0,0,120000.0,2.55,3.53,1.54,2,0
1,6,22,2018,2018.0,17.0,3,1,2,811000.0,5.0,0,32706.0,5.53,6.47,4.23,0,4
2,21,133,2018,2018.0,23.84,3,1,1,585000.0,5.0,0,11949.0,4.89,6.47,4.62,0,0
3,8,51,2014,2014.0,19.1,3,1,1,462000.0,5.0,0,17794.0,3.62,5.29,3.46,0,3
4,13,88,2015,2015.0,23.65,1,1,1,790000.0,5.0,0,60000.0,4.47,5.88,4.62,2,4


In [6]:
# Define the target variable
target = 'price'

# Split the data into features (X) and target (y)
X = data.drop(columns=[target])
y = data[target]

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models (same as before)
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Support Vector Regression (SVR)': SVR(),
    'K-Nearest Neighbors (KNN)': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Store evaluation metrics for each model
evaluation_results = []

# Iterate over models
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Compute metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Append metrics to results
    evaluation_results.append({
        'Model': name,
        'MAE': mae,
        'R² Score': r2
    })

# Convert results to a DataFrame for better readability
evaluation_df = pd.DataFrame(evaluation_results)

  model = cd_fast.enet_coordinate_descent(


# Model Evaluation Summary
- Based on the evaluation metrics (Mean Absolute Error (MAE) and R² Score):
  - **Random Forest** and **XGBoost** outperformed other models:
    - Random Forest: MAE = 70,561.81, R² = 0.9017
    - XGBoost: MAE = 60,683.88, R² = 0.9420
  - Both models demonstrated significantly better accuracy and lower error compared to other algorithms.

# Final Decision
- The **Random Forest** and **XGBoost** models will be taken to the next stage of refinement.
- **Hyperparameter Tuning**:
  - Utilize `RandomizedSearchCV` to optimize the performance of these two models.
  - This approach ensures efficient search across a wide range of hyperparameters while saving computational time compared to `GridSearchCV`.

In [8]:
evaluation_df

Unnamed: 0,Model,MAE,R² Score
0,Linear Regression,137460.535913,0.70405
1,Ridge Regression,137452.899264,0.704046
2,Lasso Regression,137458.75995,0.704052
3,ElasticNet,141310.461867,0.667054
4,Support Vector Regression (SVR),283890.142082,-0.069013
5,K-Nearest Neighbors (KNN),250998.111723,0.185288
6,Random Forest,70959.765177,0.904223
7,Gradient Boosting,87306.943429,0.892314
8,XGBoost,60683.883507,0.942015


# HyperParameter Tuning

In [9]:
# Define parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Randomized Search CV
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_grid,
    n_iter=50,  # Number of random combinations to try
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV
rf_random_search.fit(X_train, y_train)

# Best parameters and model
rf_best_model = rf_random_search.best_estimator_
print("Best Parameters for Random Forest:", rf_random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Random Forest: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': True}


In [10]:
# Random Forest Evaluation
rf_y_pred = rf_best_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = rf_best_model.score(X_test, y_test)
print(f"Random Forest - MAE: {rf_mae:.2f}, R² Score: {rf_r2}")

Random Forest - MAE: 69599.30, R² Score: 0.9085813651910772


In [11]:
# Define parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 2, 5]
}

# Initialize XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized Search CV
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_grid,
    n_iter=50,  # Number of random combinations to try
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV
xgb_random_search.fit(X_train, y_train)

# Best parameters and model
xgb_best_model = xgb_random_search.best_estimator_
print("Best Parameters for XGBoost:", xgb_random_search.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters for XGBoost: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}


In [12]:
# XGBoosting Evaluation
xgb_y_pred = xgb_best_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_y_pred)
xgb_r2 = xgb_best_model.score(X_test, y_test)
print(f"XGBoosting - MAE: {xgb_mae:.2f}, R² Score: {xgb_r2}")

XGBoosting - MAE: 56519.18, R² Score: 0.949466415285093


# Model Performance After Hyperparameter Tuning
- Based on the evaluation metrics, **XGBoost** is the better model for predicting car prices. It will be selected as the final model for deployment.

In [13]:
results = {
    'Model': ['Random Forest', 'XGBoost'],
    'MAE': [rf_mae, xgb_mae],
    'R² Score': [rf_r2, xgb_r2]
}
results_df = pd.DataFrame(results)
print(results_df)

           Model           MAE  R² Score
0  Random Forest  69599.298373  0.908581
1        XGBoost  56519.176737  0.949466


In [14]:
# Save the best model (XGBoost)
joblib.dump(xgb_best_model, 'xgboost_best_model.pkl')
print("Model saved successfully as 'xgboost_best_model.pkl'")

Model saved successfully as 'xgboost_best_model.pkl'


In [None]:
# Load dataset
df_cars = pd.read_csv(r'DataSets\Temp_preprocessed_data.csv')

# Create LabelEncoders for the specified categorical columns
encoders = {
    'oem': LabelEncoder().fit(df_cars['oem'].dropna().unique()),
    'model': LabelEncoder().fit(df_cars['model'].dropna().unique()),
    'Fuel Type': LabelEncoder().fit(df_cars['Fuel Type'].dropna().unique()),
    'Transmission': LabelEncoder().fit(df_cars['Transmission'].dropna().unique()),
    'Insurance Validity': LabelEncoder().fit(df_cars['Insurance Validity'].dropna().unique()),
    'bt': LabelEncoder().fit(df_cars['bt'].dropna().unique()),
    'city': LabelEncoder().fit(df_cars['city'].dropna().unique())
}

# Save the encoders to a .pkl file
joblib.dump(encoders, 'categorical_encoders.pkl')

print("Categorical encoders saved successfully as 'categorical_encoders.pkl'")

Categorical encoders saved successfully as 'categorical_encoders.pkl'
