In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,mean_absolute_error, r2_score

In [3]:
# Load the preprocessed dataset
data = pd.read_csv(r'DataSets\New_Structured_Data\TechnicalBased_Handled_data.csv')

In [4]:
data.head()

Unnamed: 0,oem,model,modelYear,Registration Year,Mileage,Fuel Type,Transmission,ownerNo,price,Gear Box,city,km,Safety,Interior,Exterior,Insurance Validity,bt
0,17,160,2015.0,2015.0,23.1,4,1,3.0,400000.0,5.0,0,120000.0,13.0,7.0,5.0,2,2
1,6,50,2018.0,2018.0,17.0,4,1,2.0,811000.0,5.0,0,32706.0,27.0,12.0,12.0,0,8
2,27,273,2018.0,2018.0,23.84,4,1,1.0,585000.0,5.0,0,11949.0,24.0,12.0,13.0,0,2
3,9,87,2014.0,2014.0,19.1,4,1,1.0,462000.0,5.0,0,17794.0,18.0,10.0,10.0,0,7
4,17,178,2015.0,2015.0,23.65,1,1,1.0,790000.0,5.0,0,60000.0,22.0,11.0,13.0,2,8


In [5]:
# Define the target variable
target = 'price'

# Split the data into features (X) and target (y)
X = data.drop(columns=[target])
y = data[target]

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models (same as before)
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Support Vector Regression (SVR)': SVR(),
    'K-Nearest Neighbors (KNN)': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Store evaluation metrics for each model
evaluation_results = []

# Iterate over models
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Compute metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Append metrics to results
    evaluation_results.append({
        'Model': name,
        'MAE': mae,
        'R² Score': r2
    })

# Convert results to a DataFrame for better readability
evaluation_df = pd.DataFrame(evaluation_results)

  model = cd_fast.enet_coordinate_descent(


In [9]:
evaluation_df

Unnamed: 0,Model,MAE,R² Score
0,Linear Regression,180329.240302,0.677519
1,Ridge Regression,180333.192392,0.677516
2,Lasso Regression,180329.266936,0.677519
3,ElasticNet,194310.078492,0.653429
4,Support Vector Regression (SVR),382350.289079,-0.082031
5,K-Nearest Neighbors (KNN),325581.038647,0.232636
6,Random Forest,99819.541465,0.848735
7,Gradient Boosting,130617.434372,0.81789
8,XGBoost,96647.425171,0.840743


# HyperParameter Tuning

In [11]:
# Hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Random Forest Tuning
rf_model = RandomForestRegressor(random_state=42)

rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Best parameters and best model for Random Forest
rf_best_model = rf_grid_search.best_estimator_
print("Best Parameters for Random Forest:", rf_grid_search.best_params_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters for Random Forest: {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [12]:
# Random Forest Evaluation
rf_y_pred = rf_best_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = rf_best_model.score(X_test, y_test)
print(f"Random Forest - MAE: {rf_mae:.2f}, R² Score: {rf_r2:.2f}")

Random Forest - MAE: 99677.83, R² Score: 0.85


In [None]:
# Hyperparameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 2, 5]
}

# # XGBoost Tuning
# xgb_model = XGBRegressor(objective='reg:squarederror',random_state=42)

class SklearnCompatibleXGBRegressor(XGBRegressor, RegressorMixin):
    pass

# Use the compatible wrapper
xgb_model = SklearnCompatibleXGBRegressor(objective='reg:squarederror', random_state=42)

xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

# Best parameters and best model for XGBoost
xgb_best_model = xgb_grid_search.best_estimator_
print("Best Parameters for XGBoost:", xgb_grid_search.best_params_)


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [15]:
import xgboost
import sklearn

print(f"XGBoost Version: {xgboost.__version__}")
print(f"scikit-learn Version: {sklearn.__version__}")


XGBoost Version: 2.1.3
scikit-learn Version: 1.6.0
