In [None]:
# 📌 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# 📌 2. Load Dataset
df = pd.read_csv("car_data.csv")

# 📌 3. Clean Column Names
# Strip stray whitespace from the headers so the column lookups below match.
df.columns = df.columns.str.strip()

# 📌 4. Drop Unnecessary Columns
df.drop(columns=['car_name'], inplace=True)

# 📌 5. Define Feature and Target Variables
# `drop` returns a new frame, so adding columns to X below does not touch df.
X = df.drop(columns=['selling_price'])
y = df['selling_price']

# 📌 6. Specify Numeric and Categorical Features
numeric_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']
# NOTE(review): the recorded output shows names such as "brand_ Audi", which
# suggests the category values carry a leading space — confirm against the CSV.
categorical_features = ['brand', 'model', 'fuel_type', 'transmission_type', 'seller_type']

# 📌 7. Create New Features (from RAW values, before any scaling)
# These features must be derived in the original units. Computing them after
# StandardScaler divides one z-score by another (denominator ~0 near the mean)
# and takes log(z + 1), which is NaN/-inf for every row with z <= -1.
engineered_features = ['power_per_engine', 'log_km_driven']
X['power_per_engine'] = X['max_power'] / X['engine']
X['log_km_driven'] = np.log1p(X['km_driven'])

# 📌 8. Handle Infinite Values in New Features
# A zero `engine` yields +/-inf; turn those into NaN so the median imputer in
# the numeric pipeline fills them together with any other missing values.
X[engineered_features] = X[engineered_features].replace([np.inf, -np.inf], np.nan)

# Every numeric column that gets imputed and standardised (raw + engineered).
num_cols = numeric_features + engineered_features

# 📌 9. Define Preprocessing Pipelines

# Numerical pipeline: median-impute, then standardise exactly once.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline: fill gaps with the mode, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing steps. Columns not listed here are dropped
# (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, categorical_features)
])

# 📌 10. Apply Preprocessing
# NOTE: the preprocessor is fit on the full dataset, while the train/test split
# happens later on `df_final`; test-set statistics therefore leak into the
# imputation/scaling. Fit on the training rows only if unbiased test metrics
# are required.
X_encoded = preprocessor.fit_transform(X)

# Convert sparse matrix to dense if needed
if hasattr(X_encoded, "toarray"):
    X_encoded = X_encoded.toarray()

# 📌 11. Extract Feature Names
cat_ohe_features = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features)
# Raw numeric + one-hot names (engineered features are listed separately).
all_features = numeric_features + list(cat_ohe_features)

# 📌 12. Create Encoded DataFrame
# The transformer emits the "num" block first, then the "cat" block.
df_encoded = pd.DataFrame(X_encoded, columns=num_cols + list(cat_ohe_features))

# Add target variable (positional alignment with the encoded rows)
df_encoded['selling_price'] = y.reset_index(drop=True)

# 📌 13. Restore the Established Column Order
# Keep the layout downstream code has always seen:
# raw numeric, one-hot, target, engineered.
df_encoded = df_encoded[all_features + ['selling_price'] + engineered_features]

# Features are already NaN-free after imputation, so this can only affect a
# missing target value; kept so that case behaves as it did before.
df_encoded.fillna(df_encoded.median(numeric_only=True), inplace=True)

# The fitted scaler for `num_cols` (no second scaling pass is needed any more).
scaler = preprocessor.named_transformers_['num'].named_steps['scaler']

# 📌 14. Final Preprocessed Data
df_final = df_encoded.copy()

# 📌 15. Preview the Final Data
# NOTE: any output tables recorded after this cell were captured before the
# engineered features were moved ahead of scaling; re-run to refresh them.
df_final.head()
# | vehicle_age | km\_driven | mileage   | engine    | max\_power | seats     | brand\_ Audi | brand\_ BMW | brand\_ Bentley | brand\_ Datsun | ... | fuel\_type\_ LPG | fuel\_type\_ Petrol | transmission\_type\_ Automatic | transmission\_type\_ Manual | seller\_type\_ Dealer | seller\_type\_ Individual | seller\_type\_ Trustmark Dealer | selling\_price | power\_per\_engine | log\_km\_driven |
# | ------------ | ---------- | --------- | --------- | ---------- | --------- | ------------ | ----------- | --------------- | -------------- | --- | ---------------- | ------------------- | ------------------------------ | --------------------------- | --------------------- | ------------------------- | ------------------------------- | -------------- | ------------------ | --------------- |
# | 0.983562     | 1.247335   | -0.000276 | -1.324259 | -1.263352  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 120000         | -0.087619          | 1.101534        |
# | -0.343933    | -0.690016  | -0.192071 | -0.554718 | -0.432571  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 550000         | -0.121077          | -0.847208       |
# | 1.647309     | 0.084924   | -0.647583 | -0.554718 | -0.479113  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 215000         | -0.104962          | 0.385150        |
# | 0.983562     | -0.360667  | 0.292211  | -0.936610 | -0.779312  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 1.0                 | 0.0                            | 1.0                         | 0.0                   | 1.0                       | 0.0                             | 226000         | -0.111041          | -0.135083       |
# | -0.012060    | -0.496281  | 0.735736  | 0.022918  | -0.046502  | -0.403022 | 0.0          | 0.0         | 0.0             | 0.0            | ... | 0.0              | 0.0                 | 0.0                            | 1.0                         | 1.0                   | 0.0                       | 0.0                             | 570000         | -0.660555          | -0.369611       |



Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,brand_ Audi,brand_ BMW,brand_ Bentley,brand_ Datsun,...,fuel_type_ LPG,fuel_type_ Petrol,transmission_type_ Automatic,transmission_type_ Manual,seller_type_ Dealer,seller_type_ Individual,seller_type_ Trustmark Dealer,selling_price,power_per_engine,log_km_driven
0,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,120000,-0.087619,1.101534
1,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,550000,-0.121077,-0.847208
2,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,215000,-0.104962,0.38515
3,0.983562,-0.360667,0.292211,-0.93661,-0.779312,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,226000,-0.111041,-0.135083
4,-0.01206,-0.496281,0.735736,0.022918,-0.046502,-0.403022,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,570000,-0.660555,-0.369611


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Create a function to evaluate a model
## Begin model training
# Initialize a few parameters for hyperparameter tuning
# List of models for hyperparameter tuning
## Hyperparameter tuning
## Retrain the models with the best parameters

# ✅ 16. Define Evaluation & Tuning Function
from sklearn.model_selection import train_test_split, GridSearchCV

# 📌 Split Data
# Separate the target from the predictors, then hold out 20% of the rows
# for testing (fixed seed so the split is reproducible).
target_col = 'selling_price'
y = df_final[target_col]
X = df_final.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 📌 Evaluation Function
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place (any previous fit is overwritten) and
    then used to predict ``X_test``.

    Returns:
        dict with the keys "R2 Score", "MAE", "MSE" and "RMSE", where RMSE
        is the square root of the reported MSE.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    metrics = {
        "R2 Score": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
    metrics["MSE"] = mean_squared_error(y_test, y_pred)
    # Derived from the MSE above rather than recomputed from the predictions.
    metrics["RMSE"] = np.sqrt(metrics["MSE"])
    return metrics

# ✅ 17. Train Baseline Models
# Baseline estimators with default hyperparameters; the seed is fixed wherever
# the estimator accepts one so repeated runs give the same scores.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(random_state=42),
    KNN=KNeighborsRegressor(),
    AdaBoost=AdaBoostRegressor(random_state=42),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
)

# Fit and score every baseline on the same train/test split.
results = {}
for name, model in models.items():
    print(f"Training: {name}")
    results[name] = evaluate_model(model, X_train, X_test, y_train, y_test)

# 📌 Display Results
# One row per model, one column per metric, best R2 first.
metrics_table = pd.DataFrame(results).transpose()
results_df = metrics_table.sort_values(by="R2 Score", ascending=False)
print(results_df)

#✅ 18. Hyperparameter Tuning (Example: RandomForest, GradientBoosting)
# 📌 RandomForest Tuning
# Search space for the random forest (2 x 3 x 2 = 12 candidates).
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_params,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# 📌 GradientBoosting Tuning
# Search space for gradient boosting (2 x 2 x 2 = 8 candidates).
gb_params = dict(
    n_estimators=[100, 150],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
gb_grid = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=gb_params,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
gb_grid.fit(X_train, y_train)

# 📌 Evaluate Tuned Models
# evaluate_model refits each winning estimator on the training split before
# scoring it on the held-out test split.
split = (X_train, X_test, y_train, y_test)
tuned_rf_result = evaluate_model(rf_grid.best_estimator_, *split)
tuned_gb_result = evaluate_model(gb_grid.best_estimator_, *split)

print("\nBest Tuned RandomForest:", tuned_rf_result)
print("\nBest Tuned GradientBoosting:", tuned_gb_result)

# Training: LinearRegression
# Training: Ridge
# Training: Lasso
# Training: DecisionTree
# Training: RandomForest
# Training: KNN
# Training: AdaBoost
# Training: GradientBoosting
#                   R2 Score            MAE           MSE           RMSE
# RandomForest      0.935600  100023.336466  4.847921e+10  220179.958362
# KNN               0.924544  106754.305871  5.680174e+10  238331.156436
# GradientBoosting  0.921087  126285.646576  5.940442e+10  243730.212881
# DecisionTree      0.883265  125631.735863  8.787582e+10  296438.568656
# LinearRegression  0.808058  176791.071708  1.444908e+11  380119.456972
# Ridge             0.803581  184268.338549  1.478607e+11  384526.618574
# Lasso             0.803432  177785.478956  1.479730e+11  384672.612509
# AdaBoost          0.749008  291990.391919  1.889426e+11  434675.246631

# Best Tuned RandomForest: {'R2 Score': 0.9359394279534883, 'MAE': 99257.3578895661, 'MSE': 48223632124.55165, 'RMSE': 219598.79809450608}

# Best Tuned GradientBoosting: {'R2 Score': 0.9335799329690326, 'MAE': 109401.3252113319, 'MSE': 49999816983.58635, 'RMSE': 223606.38851246255}


Training: LinearRegression
Training: Ridge
Training: Lasso
Training: DecisionTree
Training: RandomForest
Training: KNN
Training: AdaBoost
Training: GradientBoosting
                  R2 Score            MAE           MSE           RMSE
RandomForest      0.935600  100023.336466  4.847921e+10  220179.958362
KNN               0.924544  106754.305871  5.680174e+10  238331.156436
GradientBoosting  0.921087  126285.646576  5.940442e+10  243730.212881
DecisionTree      0.883265  125631.735863  8.787582e+10  296438.568656
LinearRegression  0.808058  176791.071708  1.444908e+11  380119.456972
Ridge             0.803581  184268.338549  1.478607e+11  384526.618574
Lasso             0.803432  177785.478956  1.479730e+11  384672.612509
AdaBoost          0.749008  291990.391919  1.889426e+11  434675.246631

Best Tuned RandomForest: {'R2 Score': 0.9359394279534883, 'MAE': 99257.3578895661, 'MSE': 48223632124.55165, 'RMSE': 219598.79809450608}

Best Tuned GradientBoosting: {'R2 Score': 0.9335799329690