In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

# scikit-learn imports for preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

%matplotlib inline


# Read the car listings CSV (relative path, resolved against the working
# directory) into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="cardekho_imputated.csv")
df.head(5)

# | Unnamed: 0 | car_name      | brand   | model    | vehicle_age | km_driven   | seller_type  | fuel_type  | transmission_type  | mileage | engine | max_power  | seats | selling_price  |
# | ---------- | ------------- | ------- | -------- | ------------ | ---------- | ------------ | ---------- | ------------------ | ------- | ------ | ---------- | ----- | -------------- |
# | 0          | Maruti Alto   | Maruti  | Alto     | 9            | 120000     | Individual   | Petrol     | Manual             | 19.70   | 796    | 46.30      | 5     | 120000         |
# | 1          | Hyundai Grand | Hyundai | Grand    | 5            | 20000      | Individual   | Petrol     | Manual             | 18.90   | 1197   | 82.00      | 5     | 550000         |
# | 2          | Hyundai i20   | Hyundai | i20      | 11           | 60000      | Individual   | Petrol     | Manual             | 17.00   | 1197   | 80.00      | 5     | 215000         |
# | 3          | Maruti Alto   | Maruti  | Alto     | 9            | 37000      | Individual   | Petrol     | Manual             | 20.92   | 998    | 67.10      | 5     | 226000         |
# | 4          | Ford Ecosport | Ford    | Ecosport | 6            | 30000      | Dealer       | Diesel     | Manual             | 22.77   | 1498   | 98.59      | 5     | 570000         |

# Count missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()
# Unnamed: 0           0
# car_name             0
# brand                0
# model                0
# vehicle_age          0
# km_driven            0
# seller_type          0
# fuel_type            0
# transmission_type    0
# mileage              0
# engine               0
# max_power            0
# seats                0
# selling_price        0
# dtype: int64

# Print each column's dtype and non-null count.
df.info()
 #   Column             Non-Null Count  Dtype  
# ---  ------             --------------  -----  
#  0   Unnamed: 0         15411 non-null  int64  
#  1   car_name           15411 non-null  object 
#  2   brand              15411 non-null  object 
#  3   model              15411 non-null  object 
#  4   vehicle_age        15411 non-null  int64  
#  5   km_driven          15411 non-null  int64  
#  6   seller_type        15411 non-null  object 
#  7   fuel_type          15411 non-null  object 
#  8   transmission_type  15411 non-null  object 
#  9   mileage            15411 non-null  float64
#  10  engine             15411 non-null  int64  
#  11  max_power          15411 non-null  float64
#  12  seats              15411 non-null  int64  
#  13  selling_price      15411 non-null  int64  


# Inspect the distinct values of the columns of interest.
# A bare expression that is not the last line of a notebook cell is evaluated
# and then discarded, so every result is printed explicitly here — the same way
# the column list already was.
print(df.columns.tolist())
# ['Unnamed: 0', 'car_name', 'brand', 'model', 'vehicle_age', 'km_driven', 'seller_type', 'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']

columns_to_inspect = [
    'brand', 'model', 'vehicle_age', 'km_driven',
    'seller_type', 'fuel_type', 'transmission_type', 'seats',
]
for column in columns_to_inspect:
    print(f"{column}: {df[column].unique()}")

# Reference output from an earlier run --------------------------------------
# brand:
# array(['Maruti', 'Hyundai', 'Ford', 'Renault', 'Mini', 'Mercedes-Benz',
#        'Toyota', 'Volkswagen', 'Honda', 'Mahindra', 'Datsun', 'Tata',
#        'Kia', 'BMW', 'Audi', 'Land Rover', 'Jaguar', 'MG', 'Isuzu',
#        'Porsche', 'Skoda', 'Volvo', 'Lexus', 'Jeep', 'Maserati',
#        'Bentley', 'Nissan', 'ISUZU', 'Ferrari', 'Mercedes-AMG',
#        'Rolls-Royce', 'Force'], dtype=object)
#
# model:
# array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
#        'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
#        'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
#        'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
#        'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
#        'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
#        'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
#        'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
#        'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
#        'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
#        'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
#        'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
#        'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
#        'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
#        'Macan', 'X4', 'Dzire ZXI', 'XC90', 'F-PACE', 'A8', 'MUX',
#        'GTC4Lusso', 'GLS', 'X-Trail', 'XE', 'XC60', 'Panamera', 'Alturas',
#        'Altroz', 'NX', 'Carnival', 'C', 'RX', 'Ghost', 'Quattroporte',
#        'Gurkha'], dtype=object)
#
# vehicle_age:
# array([ 9,  5, 11,  6,  8,  3,  2,  4,  7, 10, 14, 12,  1, 15, 13, 16, 17,
#        18, 29, 19,  0, 21, 22, 25], dtype=int64)
#
# km_driven:
# array([ 120000,   20000,   60000, ...,    9229,   10723, 3800000],dtype=int64)
#
# seller_type:       array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)
# fuel_type:         array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object)
# transmission_type: array(['Manual', 'Automatic'], dtype=object)
# seats:             array([5, 8, 7, 6, 4, 2, 9, 0], dtype=int64)

# NOTE(review): the reference output lists both 'Isuzu' and 'ISUZU' under brand
# and both 'RediGO' and 'redi-GO' under model — presumably the same brand/model
# spelled differently; if so they will become separate one-hot columns unless
# normalised first. `seats` also contains 0, which looks like a placeholder
# rather than a real seat count — TODO confirm with the data source.

# 1️⃣ Discard the columns the analysis does not use: the row index saved in the
# CSV and the combined car name (brand and model stay as separate columns).
df = df.drop(columns=['Unnamed: 0', 'car_name'])
df.head()
# | brand   | model    | vehicle_age  | km_driven  | seller_type  | fuel_type  | transmission_type  | mileage | engine | max_power  | seats | selling_price  |
# | ------- | -------- | ------------ | ---------- | ------------ | ---------- | ------------------ | ------- | ------ | ---------- | ----- | -------------- |
# | Maruti  | Alto     | 9            | 120000     | Individual   | Petrol     | Manual             | 19.70   | 796    | 46.30      | 5     | 120000         |
# | Hyundai | Grand    | 5            | 20000      | Individual   | Petrol     | Manual             | 18.90   | 1197   | 82.00      | 5     | 550000         |
# | Hyundai | i20      | 11           | 60000      | Individual   | Petrol     | Manual             | 17.00   | 1197   | 80.00      | 5     | 215000         |
# | Maruti  | Alto     | 9            | 37000      | Individual   | Petrol     | Manual             | 20.92   | 998    | 67.10      | 5     | 226000         |
# | Ford    | Ecosport | 6            | 30000      | Dealer       | Diesel     | Manual             | 22.77   | 1498   | 98.59      | 5     | 570000         |

# Column groups routed to the two preprocessing branches below.
numeric_features = [
    'vehicle_age', 'km_driven', 'mileage',
    'engine', 'max_power', 'seats',
]
categorical_features = [
    'brand', 'model', 'fuel_type',
    'transmission_type', 'seller_type',
]

# 2️⃣ Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# 3️⃣ Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# 4️⃣ Apply each branch to its own column group. The step and transformer names
# ("num", "cat", "encoder") are looked up later when feature names are rebuilt.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Preview the frame after the column drop.
df.head()
# | #   | brand   | model    | vehicle_age  | km_driven  | seller_type  | fuel_type  | transmission_type  | mileage | engine       | max_power        | seats  | selling_price       |
# |-----|---------|----------|--------------|------------|--------------|------------|--------------------|---------|--------------|------------------|--------|---------------------|
# | 0   | Maruti  | Alto     | 9            | 120000     | Individual   | Petrol     | Manual             | 19.70   | 796          | 46.30            | 5      | 120000              |
# | 1   | Hyundai | Grand    | 5            | 20000      | Individual   | Petrol     | Manual             | 18.90   | 1197         | 82.00            | 5      | 550000              |
# | 2   | Hyundai | i20      | 11           | 60000      | Individual   | Petrol     | Manual             | 17.00   | 1197         | 80.00            | 5      | 215000              |
# | 3   | Maruti  | Alto     | 9            | 37000      | Individual   | Petrol     | Manual             | 20.92   | 998          | 67.10            | 5      | 226000              |
# | 4   | Ford    | Ecosport | 6            | 30000      | Dealer       | Diesel     | Manual             | 22.77   | 1498         | 98.59            | 5      | 570000              |

# Dtypes and non-null counts after the column drop.
df.info()
 #   Column             Non-Null Count  Dtype  
# ---  ------             --------------  -----  
#  0   brand              15411 non-null  object 
#  1   model              15411 non-null  object 
#  2   vehicle_age        15411 non-null  int64  
#  3   km_driven          15411 non-null  int64  
#  4   seller_type        15411 non-null  object 
#  5   fuel_type          15411 non-null  object 
#  6   transmission_type  15411 non-null  object 
#  7   mileage            15411 non-null  float64
#  8   engine             15411 non-null  int64  
#  9   max_power          15411 non-null  float64
#  10  seats              15411 non-null  int64  
#  11  selling_price      15411 non-null  int64  
# dtypes: float64(2), int64(5), object(5)
# memory usage: 1.4+ MB

# --- Encode features, engineer extra predictors, assemble df_final ----------
#
# Split the cleaned frame into predictors and target. This cell previously used
# `X` and `y` without assigning them, so it only ran when those names were left
# over from an earlier kernel session (NameError on Restart & Run All).
X = df.drop(columns=["selling_price"])
y = df["selling_price"]

# NOTE(review): the preprocessor (and the scaler further down) is fitted on the
# full dataset before any train/test split, so statistics of the future test
# rows influence the transformed features. Left unchanged because the following
# cell consumes `df_final` as built here.
X_encoded = preprocessor.fit_transform(X)

# The transformer output may be a sparse matrix; densify it so it can be
# wrapped in a DataFrame with named columns.
if hasattr(X_encoded, "toarray"):
    X_encoded = X_encoded.toarray()

# Names of the one-hot columns produced by the fitted encoder
cat_ohe_features = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_features)

# Output column order: numeric block first, then the one-hot block
all_features = numeric_features + list(cat_ohe_features)

# Sanity check: one name per transformed column
print("Shape of transformed features:", X_encoded.shape)
print("Number of combined features:", len(all_features))
assert X_encoded.shape[1] == len(all_features), "Mismatch between transformed features and feature names length!"

# Wrap the encoded matrix in a DataFrame
df_encoded = pd.DataFrame(X_encoded, columns=all_features)

# Add target variable back (index reset so rows line up positionally)
df_encoded['selling_price'] = y.reset_index(drop=True)

# Engineered features are derived from the RAW values in `X`, not from the
# standardised columns of `df_encoded`: a ratio of two z-scores has
# near-zero denominators, and log(z + 1) is undefined wherever z < -1.
# `.to_numpy()` assigns by position, independent of the index of `X`.
df_encoded['power_per_engine'] = (X['max_power'] / X['engine']).to_numpy()
df_encoded['log_km_driven'] = np.log1p(X['km_driven']).to_numpy()

# A zero engine size yields inf/NaN in the ratio; treat those as missing and
# fall back to the column median.
df_encoded = df_encoded.replace([np.inf, -np.inf], np.nan)
df_encoded = df_encoded.fillna(df_encoded.median())

# Standardise the numeric columns including the two new ones. The six base
# columns were already standardised by the preprocessor, so re-fitting on them
# leaves them essentially unchanged; only the engineered columns really move.
num_cols = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'power_per_engine', 'log_km_driven']
scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

# Final DataFrame ready for modeling
df_final = df_encoded.copy()

# Check the final DataFrame
df_final.head()
# NOTE: the table below was captured before the engineered features were
# derived from raw values; its power_per_engine / log_km_driven columns (and
# any downstream metrics) will differ after re-running.
# | # | vehicle_age  | km_driven  | mileage   | engine    | max_power  | seats    | brand_Audi  | brand_BMW  | fuel_type_Petrol   | transmission_type_Manual   | seller_type_Dealer   | selling_price  | power_per_engine   | log_km_driven |
# | - | ------------ | ---------- | --------- | --------- | ---------- | -------- | ----------- | ---------- | ------------------ | -------------------------- | -------------------- | -------------- | ------------------ | --------------- |
# | 0 | 0.983562     | 1.247335   | -0.000276 | -1.324259 | -1.263352  | -0.40302 | 0.0         | 0.0        | 1.0                | 1.0                        | 0.0                  | 120000         | -0.087619          | 1.101534        |
# | 1 | -0.343933    | -0.690016  | -0.192071 | -0.554718 | -0.432571  | -0.40302 | 0.0         | 0.0        | 1.0                | 1.0                        | 0.0                  | 550000         | -0.121077          | -0.847208       |
# | 2 | 1.647309     | 0.084924   | -0.647583 | -0.554718 | -0.479113  | -0.40302 | 0.0         | 0.0        | 1.0                | 1.0                        | 0.0                  | 215000         | -0.104962          | 0.385150        |
# | 3 | 0.983562     | -0.360667  | 0.292211  | -0.936610 | -0.779312  | -0.40302 | 0.0         | 0.0        | 1.0                | 1.0                        | 0.0                  | 226000         | -0.111041          | -0.135083       |
# | 4 | -0.012060    | -0.496281  | 0.735736  | 0.022918  | -0.046502  | -0.40302 | 0.0         | 0.0        | 0.0                | 1.0                        | 1.0                  | 570000         | -0.660555          | -0.369611       |


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         15411 non-null  int64  
 1   car_name           15411 non-null  object 
 2   brand              15411 non-null  object 
 3   model              15411 non-null  object 
 4   vehicle_age        15411 non-null  int64  
 5   km_driven          15411 non-null  int64  
 6   seller_type        15411 non-null  object 
 7   fuel_type          15411 non-null  object 
 8   transmission_type  15411 non-null  object 
 9   mileage            15411 non-null  float64
 10  engine             15411 non-null  int64  
 11  max_power          15411 non-null  float64
 12  seats              15411 non-null  int64  
 13  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 1.6+ MB
['Unnamed: 0', 'car_name', 'brand', 'model', 'vehicle_age', '

Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,brand_Audi,brand_BMW,brand_Bentley,brand_Datsun,...,fuel_type_LPG,fuel_type_Petrol,transmission_type_Automatic,transmission_type_Manual,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,selling_price,power_per_engine,log_km_driven
0,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,120000,-0.087619,1.101534
1,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,550000,-0.121077,-0.847208
2,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,215000,-0.104962,0.38515
3,0.983562,-0.360667,0.292211,-0.93661,-0.779312,-0.403022,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,226000,-0.111041,-0.135083
4,-0.01206,-0.496281,0.735736,0.022918,-0.046502,-0.403022,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,570000,-0.660555,-0.369611


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Target vector and feature matrix taken from the prepared frame
y = df_final["selling_price"]
X = df_final.drop(columns=["selling_price"])

X.head()
# | # | vehicle_age  | km_driven  | mileage   | engine    | max_power  | seats     | fuel_type_Petrol   | transmission_type_Manual   | seller_type_Individual   | power_per_engine   | log_km_driven   |
# | - | ------------ | ---------- | --------- | --------- | ---------- | --------- | ------------------ | -------------------------- | ------------------------ | ------------------ | --------------- |
# | 0 | 0.983562     | 1.247335   | -0.000276 | -1.324259 | -1.263352  | -0.403022 | 1.0                | 1.0                        | 1.0                      | -0.087619          | 1.101534        |
# | 1 | -0.343933    | -0.690016  | -0.192071 | -0.554718 | -0.432571  | -0.403022 | 1.0                | 1.0                        | 1.0                      | -0.121077          | -0.847208       |
# | 2 | 1.647309     | 0.084924   | -0.647583 | -0.554718 | -0.479113  | -0.403022 | 1.0                | 1.0                        | 1.0                      | -0.104962          | 0.385150        |
# | 3 | 0.983562     | -0.360667  | 0.292211  | -0.936610 | -0.779312  | -0.403022 | 1.0                | 1.0                        | 1.0                      | -0.111041          | -0.135083       |
# | 4 | -0.012060    | -0.496281  | 0.735736  | 0.022918  | -0.046502  | -0.403022 | 0.0                | 1.0                        | 0.0                      | -0.660555          | -0.369611       |

y.head()
# 0    120000
# 1    550000
# 2    215000
# 3    226000
# 4    570000
# Name: selling_price, dtype: int64

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)
# ((12328, 170), (3083, 170))

(y_train.shape, y_test.shape)
# ((12328,), (3083,))

# Candidate regressors keyed by display name, in the order they are evaluated.
# Estimators that accept a seed are pinned to 42.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("KNN", KNeighborsRegressor()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("AdaBoost", AdaBoostRegressor(random_state=42)),
])

# Baseline pass: fit every candidate with its default settings on the training
# split and report its error metrics on the held-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # MSE is computed once and reused for the RMSE line.
    mse = mean_squared_error(y_test, y_pred)

    print(f"--- {name} ---")
    print("R² Score:", r2_score(y_test, y_pred))
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))
    print()

#     --- Linear Regression ---
# R² Score: 0.8080575229163514
# MAE: 176791.07170792087
# MSE: 144490801568.8113
# RMSE: 380119.456972162

# --- Ridge ---
# R² Score: 0.8035809018480006
# MAE: 184268.3385487383
# MSE: 147860720391.9593
# RMSE: 384526.61857400625

# --- Lasso ---
# R² Score: 0.803431723926398
# MAE: 177785.47895627984
# MSE: 147973018814.8839
# RMSE: 384672.61250949994

# --- KNN ---
# R² Score: 0.9245442160710825
# MAE: 106754.30587090497
# MSE: 56801740128.324684
# RMSE: 238331.1564364271

# --- Decision Tree ---
# R² Score: 0.8832652089919969
# MAE: 125631.73586333657
# MSE: 87875824986.68774
# RMSE: 296438.5686557803

# --- Random Forest ---
# R² Score: 0.9355999113192877
# MAE: 100023.33646580111
# MSE: 48479214064.34408
# RMSE: 220179.95836211814

# --- AdaBoost ---
# R² Score: 0.7490075179519298
# MAE: 291990.39191872283
# MSE: 188942570033.97092
# RMSE: 434675.24663128785

# ✅ Hyperparameter search spaces, keyed by the same names used in `models`.
# A model without an entry here (Linear Regression) is not tuned.
param_grids = {
    "Ridge": dict(alpha=[0.01, 0.1, 1, 10, 100]),
    "Lasso": dict(alpha=[0.01, 0.1, 1, 10, 100]),
    "KNN": dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    ),
    "Decision Tree": dict(
        max_depth=[5, 10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "Random Forest": dict(
        n_estimators=[50, 100, 200],
        max_depth=[5, 10, 20, None],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    "AdaBoost": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.5, 1],
    ),
}
# 🧪 Hyperparameter tuning
# For every model with a search space, run a randomised search (5-fold CV,
# scored by R²) on the training split and keep the best estimator. Models
# without a search space are simply fitted with their defaults. Each winner is
# then scored on the held-out split.
best_models = {}
tuning_results = []

# Settings shared by every randomised search
search_settings = dict(
    n_iter=10,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

for name, model in models.items():
    print(f"\n🔧 Tuning: {name}")

    if name not in param_grids:
        # Nothing to tune: fit as-is.
        model.fit(X_train, y_train)
        best_model, best_params = model, "Default Parameters"
    else:
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grids[name],
            **search_settings,
        )
        search.fit(X_train, y_train)
        best_model, best_params = search.best_estimator_, search.best_params_

    # Held-out performance of the selected estimator
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    best_models[name] = best_model
    scores = {
        "R2 Score": round(r2, 4),
        "MAE": round(mae, 2),
        "RMSE": round(rmse, 2),
    }
    tuning_results.append({"Model": name, "Best Params": best_params, **scores})

# 📊 Results summary, best R² first
results_df = pd.DataFrame(tuning_results)
results_df = results_df.sort_values(by="R2 Score", ascending=False)
print("\n📈 Final Model Performance After Hyperparameter Tuning:\n")
print(results_df)

# 📈 Final Model Performance After Hyperparameter Tuning:

#                Model                                        Best Params  \
# 5      Random Forest  {'n_estimators': 200, 'min_samples_split': 2, ...   
# 3                KNN  {'weights': 'distance', 'n_neighbors': 3, 'met...   
# 4      Decision Tree  {'min_samples_split': 10, 'min_samples_leaf': ...   
# 0  Linear Regression                                 Default Parameters   
# 1              Ridge                                       {'alpha': 1}   
# 2              Lasso                                     {'alpha': 100}   
# 6           AdaBoost        {'n_estimators': 200, 'learning_rate': 0.1}   

#    R2 Score        MAE       RMSE  
# 5    0.9359   99257.36  219598.80  
# 3    0.9244  103362.38  238483.13  
# 4    0.8965  121417.43  279141.00  
# 0    0.8081  176791.07  380119.46  
# 1    0.8036  184268.34  384526.62  
# 2    0.8013  180761.61  386738.15  
# 6    0.7134  286893.77  464487.16  


--- Linear Regression ---
R² Score: 0.8080575229163514
MAE: 176791.07170792087
MSE: 144490801568.8113
RMSE: 380119.456972162

--- Ridge ---
R² Score: 0.8035809018480006
MAE: 184268.3385487383
MSE: 147860720391.9593
RMSE: 384526.61857400625

--- Lasso ---
R² Score: 0.803431723926398
MAE: 177785.47895627984
MSE: 147973018814.8839
RMSE: 384672.61250949994

--- KNN ---
R² Score: 0.9245442160710825
MAE: 106754.30587090497
MSE: 56801740128.324684
RMSE: 238331.1564364271

--- Decision Tree ---
R² Score: 0.8832652089919969
MAE: 125631.73586333657
MSE: 87875824986.68774
RMSE: 296438.5686557803

--- Random Forest ---
R² Score: 0.9355999113192877
MAE: 100023.33646580111
MSE: 48479214064.34408
RMSE: 220179.95836211814

--- AdaBoost ---
R² Score: 0.7490075179519298
MAE: 291990.39191872283
MSE: 188942570033.97092
RMSE: 434675.24663128785


🔧 Tuning: Linear Regression

🔧 Tuning: Ridge

🔧 Tuning: Lasso

🔧 Tuning: KNN

🔧 Tuning: Decision Tree

🔧 Tuning: Random Forest

🔧 Tuning: AdaBoost

📈 Final Model 

In [None]:


### ✅ **Random Forest Regressor**

#### 📊 Metrics:

# | Metric          | Value                                                                                                         |
# | --------------- | ------------------------------------------------------------------------------------------------------------- |
# | **R² Score**    | **0.9359**                                                                                                    |
# | **MAE**         | 99,257.36                                                                                                     |
# | **RMSE**        | 219,598.80                                                                                                    |
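# | **Best Params** | `{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}` — only `n_estimators` and `min_samples_split` are visible in the truncated printout; `min_samples_leaf` and `max_depth` are assumed and should be confirmed against the full `Best Params` entry in `results_df` |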

# ---

### 🔍 Why Random Forest is Best:

# * **Highest R² Score** → Indicates best fit to actual selling prices.
# * **Lowest RMSE** → Indicates more accurate predictions.
# * **Low MAE** → Errors are smaller on average compared to others.

### 🥈 Runner-up:

# * **K-Nearest Neighbors (KNN)**: Very close in performance but slightly higher RMSE.


