In [3]:
# 📌 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# 📌 2. Load Dataset
df = pd.read_csv("cardekho_imputated.csv")

# 📌 3. Clean Column Names
df.columns = df.columns.str.strip()

# 📌 4. Drop Unnecessary Columns
df.drop(columns=['car_name'], inplace=True)

# 📌 5. Define Feature and Target Variables
X = df.drop(columns=['selling_price'])
y = df['selling_price']

# 📌 6. Specify Numeric and Categorical Features
numeric_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']
categorical_features = ['brand', 'model', 'fuel_type', 'transmission_type', 'seller_type']

# 📌 7. Define Preprocessing Pipelines

# Numerical pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# 📌 8. Apply Preprocessing
X_encoded = preprocessor.fit_transform(X)

# Convert sparse matrix to dense if needed
if hasattr(X_encoded, "toarray"):
    X_encoded = X_encoded.toarray()

# 📌 9. Extract Feature Names
cat_ohe_features = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features)
all_features = numeric_features + list(cat_ohe_features)

# 📌 10. Create Encoded DataFrame
df_encoded = pd.DataFrame(X_encoded, columns=all_features)

# Add target variable
df_encoded['selling_price'] = y.reset_index(drop=True)

# 📌 11. Create New Features
df_encoded['power_per_engine'] = df_encoded['max_power'] / df_encoded['engine']
df_encoded['log_km_driven'] = (df_encoded['km_driven'] + 1).apply(np.log)

# 📌 12. Handle Infinite/NaN in New Features
df_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
df_encoded.fillna(df_encoded.median(numeric_only=True), inplace=True)

# 📌 13. Rescale All Numerical Features Again (optional but keeps consistency)
num_cols = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'power_per_engine', 'log_km_driven']
scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

# 📌 14. Final Preprocessed Data
df_final = df_encoded.copy()

# 📌 15. Preview the Final Data
df_final.head()
# | vehicle_age | km\_driven | mileage   | engine    | max\_power | seats     | brand\_ Audi | brand\_ BMW | brand\_ Bentley | brand\_ Datsun | ... | fuel\_type\_ LPG | fuel\_type\_ Petrol | transmission\_type\_ Automatic | transmission\_type\_ Manual | seller\_type\_ Dealer | seller\_type\_ Individual | seller\_type\_ Trustmark Dealer | selling\_price | power\_per\_engine | log\_km\_driven |
# | ------------ | ---------- | --------- | --------- | ---------- | --------- | ------------ | ----------- | --------------- | -------------- | --- | ---------------- | ------------------- | ------------------------------ | --------------------------- | --------------------- | ------------------------- | ------------------------------- | -------------- | ------------------ | --------------- |
# | 0.983562     | 1.247335   | -0.000276 | -1.324259 | -1.263352  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 120000         | -0.087619          | 1.101534        |
# | -0.343933    | -0.690016  | -0.192071 | -0.554718 | -0.432571  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 550000         | -0.121077          | -0.847208       |
# | 1.647309     | 0.084924   | -0.647583 | -0.554718 | -0.479113  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 215000         | -0.104962          | 0.385150        |
# | 0.983562     | -0.360667  | 0.292211  | -0.936610 | -0.779312  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 226000         | -0.111041          | -0.135083       |
# | -0.012060    | -0.496281  | 0.735736  | 0.022918  | -0.046502  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 0.0                 | 0.0                            | 1.0                         | 1.0                   | 0.0                       | 0.0                             | 570000         | -0.660555          | -0.369611       |



Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,brand_Audi,brand_BMW,brand_Bentley,brand_Datsun,...,fuel_type_LPG,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,selling_price,power_per_engine,log_km_driven
0,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,120000,-0.087619,1.101534
1,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,550000,-0.121077,-0.847208
2,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,215000,-0.104962,0.38515
3,0.983562,-0.360667,0.292211,-0.93661,-0.779312,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,226000,-0.111041,-0.135083
4,-0.01206,-0.496281,0.735736,0.022918,-0.046502,-0.403022,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,570000,-0.660555,-0.369611


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

##Create a Function to Evaluate Model
## Beginning Model Training
#Initialize few parameter for Hyperparamter tuning
# Models list for Hyperparameter tuning
##Hyperparameter Tuning
## Retraining the models with best parameters

# 📌 Step 1: Define Evaluation Function
def evaluate_model(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return {"R2 Score": r2, "MAE": mae, "RMSE": rmse}

# 📌 Step 2: Train and Evaluate Multiple Models
# Split the data
from sklearn.model_selection import train_test_split

X = df_final.drop(columns=['selling_price'])
y = df_final['selling_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(objective='reg:squarederror', verbosity=0)
}

# Train and evaluate
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = evaluate_model(y_test, y_pred)

# Convert to DataFrame for easy viewing
results_df = pd.DataFrame(results).T.sort_values(by="R2 Score", ascending=False)
results_df

# | **Model**             | **R² Score** | **MAE**    | **RMSE**   |
# | --------------------- | ------------ | ---------- | ---------- |
# | **Random Forest**     | 0.9356       | 99,744.67  | 220,252.61 |
# | **XGBoost**           | 0.9270       | 98,304.01  | 234,449.80 |
# | **KNN**               | 0.9245       | 106,754.31 | 238,331.16 |
# | **Gradient Boosting** | 0.9219       | 126,044.43 | 242,494.25 |
# | **Decision Tree**     | 0.8868       | 124,594.81 | 291,880.80 |
# | **Linear Regression** | 0.8081       | 176,791.07 | 380,119.46 |
# | **Ridge**             | 0.8036       | 184,268.34 | 384,526.62 |
# | **Lasso**             | 0.8034       | 177,785.48 | 384,672.61 |
# | **AdaBoost**          | 0.6079       | 379,166.57 | 543,295.11 |

# Insights:
# ✅ Best performer: Random Forest (highest R², lowest MAE & RMSE)
# ⚖️ XGBoost and KNN follow closely behind
# ❌ AdaBoost performed the worst in all metrics



Unnamed: 0,R2 Score,MAE,RMSE
Random Forest,0.935557,99744.66741,220252.610395
XGBoost,0.926982,98304.007812,234449.80331
KNN,0.924544,106754.305871,238331.156436
Gradient Boosting,0.921885,126044.428496,242494.24633
Decision Tree,0.886827,124594.812953,291880.796806
Linear Regression,0.808058,176791.071708,380119.456972
Ridge,0.803581,184268.338549,384526.618574
Lasso,0.803432,177785.478956,384672.612509
AdaBoost,0.607895,379166.569036,543295.108571


In [5]:
pip install xgboost


Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.2
Note: you may need to restart the kernel to use updated packages.
