In [222]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [255]:
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import uniform, randint, loguniform

In [256]:
df = pd.read_csv('outliers_removed_dataset.csv')

In [257]:
df.head()

Unnamed: 0,area_type,bath,balcony,price,bhk,has_society,society_freq,availability_group,location_grouped,location_target_enc,total_sqft
0,Super built-up Area,2.0,1.0,39.07,2.0,1,5.0,Q4 Possession,electronic city phase ii,48.683906,1056.0
1,Plot Area,5.0,3.0,120.0,4.0,1,12.0,Ready To Move,Other,114.704375,2600.0
2,Built-up Area,2.0,3.0,62.0,3.0,0,0.0,Ready To Move,uttarahalli,63.268535,1440.0
3,Super built-up Area,3.0,1.0,95.0,3.0,1,22.0,Ready To Move,lingadheeranahalli,115.348261,1521.0
4,Super built-up Area,2.0,1.0,51.0,2.0,0,0.0,Ready To Move,kothanur,95.790625,1200.0


In [258]:
print(f"The total number of columns are : {df.shape[1]}")

The total number of columns are : 11


In [259]:
df.columns

Index(['area_type', 'bath', 'balcony', 'price', 'bhk', 'has_society',
       'society_freq', 'availability_group', 'location_grouped',
       'location_target_enc', 'total_sqft'],
      dtype='object')

In [260]:
numeric_features = [
    'total_sqft',
    'bath',
    'bhk',
    'balcony',
    'location_target_enc'
]
categorical_features = [
    'area_type',
    'has_society',
    'location_grouped'
]

In [261]:
preprocessor = ColumnTransformer( 
    transformers=[ 
        ('num', StandardScaler(), numeric_features), 
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) 
    ] 
)

In [262]:
pipeline = Pipeline([ ('preprocessor', preprocessor), ('model', LinearRegression()) ])

In [263]:
# Split data 
X = df[[
    'total_sqft',
    'bath',
    'bhk',
    'balcony',
    'location_target_enc',          # ← comma added here
    'area_type',
    'has_society',
    'location_grouped'
]]
y = df['price'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [264]:
# Fit pipeline 
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [265]:
# Predictions 
y_pred = pipeline.predict(X_test)

In [266]:
y_train_pred = pipeline.predict(X_train)

In [267]:
# Training performance 
train_r2 = r2_score(y_train, y_train_pred) 
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

In [268]:
print(f"Performance on train using r2 score : {train_r2}")
print(f"Performance on train using rmse score : {train_rmse}")

Performance on train using r2 score : 0.668923893003309
Performance on train using rmse score : 85.0343853891198


In [269]:
# Testing performance 
test_r2 = r2_score(y_test, y_pred) 
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [270]:
print(f"Performance on test using r2 score : {test_r2}")
print(f"Performance on test using rmse score : {test_rmse}")

Performance on test using r2 score : 0.6475834954452333
Performance on test using rmse score : 93.1733510784409


In [271]:
## The gap between training and testing is not huge, which means the model is not strongly overfitting.
## Both R² values are below 0.6, so the model is underfitting — it’s too simple to capture the complexity of housing prices.

In [272]:
# Ridge pipeline 
ridge_pipeline = Pipeline([ ('preprocessor', preprocessor), ('model', Ridge(alpha=1.0)) ])
                            # alpha controls regularization strength 
# Lasso pipeline 
lasso_pipeline = Pipeline([ ('preprocessor', preprocessor), ('model', Lasso(alpha=0.1)) ])
                            # smaller alpha for Lasso to avoid too much shrinkage 
# Fit models 
ridge_pipeline.fit(X_train, y_train) 
lasso_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [273]:
# Predictions
ridge_train_pred = ridge_pipeline.predict(X_train) 
ridge_test_pred = ridge_pipeline.predict(X_test) 
lasso_train_pred = lasso_pipeline.predict(X_train) 
lasso_test_pred = lasso_pipeline.predict(X_test)

In [274]:
# Evaluate Ridge 
print("Ridge Training R²:", r2_score(y_train, ridge_train_pred)) 
print("Ridge Training RMSE:", np.sqrt(mean_squared_error(y_train, ridge_train_pred))) 
print("Ridge Testing R²:", r2_score(y_test, ridge_test_pred)) 
print("Ridge Testing RMSE:", np.sqrt(mean_squared_error(y_test, ridge_test_pred)))

Ridge Training R²: 0.6689138862573287
Ridge Training RMSE: 85.03567045778917
Ridge Testing R²: 0.6476410260998322
Ridge Testing RMSE: 93.16574567036493


In [275]:
# Evaluate Lasso 
print("\nLasso Training R²:", r2_score(y_train, lasso_train_pred))
print("Lasso Training RMSE:", np.sqrt(mean_squared_error(y_train, lasso_train_pred))) 
print("Lasso Testing R²:", r2_score(y_test, lasso_test_pred)) 
print("Lasso Testing RMSE:", np.sqrt(mean_squared_error(y_test, lasso_test_pred)))


Lasso Training R²: 0.6641279136627143
Lasso Training RMSE: 85.64807634062772
Lasso Testing R²: 0.644717287537247
Lasso Testing RMSE: 93.55147383992579


In [276]:
# Build pipeline 
dt_pipeline = Pipeline([ ('preprocessor', preprocessor), 
                         ('model', DecisionTreeRegressor( max_depth=None, random_state=42 )) ])  # let tree grow fully max_depth = None

In [277]:
# Fit pipeline 
dt_pipeline.fit(X_train, y_train) 
# Predictions 
y_train_pred = dt_pipeline.predict(X_train) 
y_test_pred = dt_pipeline.predict(X_test) 
# Evaluate 
print("Decision Tree Training R²:", r2_score(y_train, y_train_pred)) 
print("Decision Tree Training RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred))) 
print("Decision Tree Testing R²:", r2_score(y_test, y_test_pred)) 
print("Decision Tree Testing RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))

Decision Tree Training R²: 0.9978883094693876
Decision Tree Training RMSE: 6.79118862836203
Decision Tree Testing R²: 0.6047012098610431
Decision Tree Testing RMSE: 98.67936327022333


Training R² = 0.9875, RMSE ≈ 16.5  
→ The model fits the training data almost perfectly. Very low error, very high R².

Testing R² = 0.4043, RMSE ≈ 121.1  
→ On unseen data, performance drops drastically. The model explains only ~40% of variance, and the error is much higher.

This is a textbook case of overfitting: the tree has memorized the training set instead of learning generalizable patterns.

In [278]:
# Hyperparameter grid 
param_grid = { 'model__max_depth': [5, 10, 15, 20, None], 
               'model__min_samples_leaf': [1, 2, 5, 10], 
               'model__min_samples_split': [2, 5, 10] }

In [279]:
# GridSearchCV 
grid_search = GridSearchCV( dt_pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1 )  # 5-fold cross-validation

In [280]:
grid_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [5, 10, ...], 'model__min_samples_leaf': [1, 2, ...], 'model__min_samples_split': [2, 5, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [282]:
# Best parameters 
print("Best Parameters:", grid_search.best_params_) 
print("Best CV R²:", grid_search.best_score_) 
# Evaluate on test set
best_model = grid_search.best_estimator_ 
y_test_pred = best_model.predict(X_test) 
print("Test R²:", r2_score(y_test, y_test_pred)) 
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))

Best Parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 5, 'model__min_samples_split': 2}
Best CV R²: 0.7111414474024046
Test R²: 0.7404026277673904
Test RMSE: 79.96754105536436


In [283]:
## Much better than previous, with the current best parameter it no longer overfit the model

In [284]:
# Get feature names from the fitted preprocessor
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Get importances from the decision tree
importances = best_model.named_steps['model'].feature_importances_

# Build DataFrame
importances_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)
importances_df['importance'] = importances_df['importance'].round(6)
print(importances_df.head(20))  # top 20 features


                                      feature  importance
0                             num__total_sqft    0.601023
4                    num__location_target_enc    0.293576
1                                   num__bath    0.046370
18                cat__location_grouped_Other    0.023740
7                   cat__area_type_Plot  Area    0.021596
2                                    num__bhk    0.008118
3                                num__balcony    0.001280
33    cat__location_grouped_bannerghatta road    0.001071
9                          cat__has_society_0    0.000766
127        cat__location_grouped_rajaji nagar    0.000526
119          cat__location_grouped_nagarbhavi    0.000442
134      cat__location_grouped_sarjapur  road    0.000325
8         cat__area_type_Super built-up  Area    0.000296
5               cat__area_type_Built-up  Area    0.000222
10                         cat__has_society_1    0.000213
152          cat__location_grouped_whitefield    0.000141
15   cat__loca

In [286]:
numeric_features = [
    'total_sqft',
    'bath',
    'bhk',
    'balcony',               # keep if you want, or remove
    'location_target_enc'
]

categorical_features = [
    'area_type',
    'has_society',
    'location_grouped'
]

# Re-create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ]
)

# Re-create pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
])

In [287]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist_rf = {
    'model__n_estimators': randint(100, 501),
    'model__max_depth': [5, 8, 10, 12, 15, None],
    'model__min_samples_split': randint(2, 21),
    'model__min_samples_leaf': randint(1, 11),
    'model__max_features': ['sqrt', 'log2', 0.5, None]
}

random_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_dist_rf,
    n_iter=60,                # try 40–80 combinations
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'model__max_depth': [5, 8, ...], 'model__max_features': ['sqrt', 'log2', ...], 'model__min_samples_leaf': <scipy.stats....0021F96C7CA60>, 'model__min_samples_split': <scipy.stats....0021F96A165F0>, ...}"
,n_iter,60
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,215
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [253]:
X_train.head()

Unnamed: 0,total_sqft,bath,bhk,balcony,location_target_enc,area_type,has_society,location_grouped
5206,1200.0,2.0,2.0,1.0,102.429704,Built-up Area,1,marathahalli
8796,1158.0,2.0,2.0,2.0,116.526076,Super built-up Area,1,kasavanhalli
8451,1280.0,2.0,3.0,3.0,238.375,Super built-up Area,1,Other
765,1750.0,3.0,3.0,2.0,205.215,Super built-up Area,0,koramangala
2688,1360.0,3.0,3.0,3.0,74.0,Super built-up Area,0,Other


In [288]:
# Results
print("Best Parameters:", random_search.best_params_)
print("Best CV R²:", random_search.best_score_)

y_pred_rf = random_search.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
print("Test R²:", r2_score(y_test, y_pred_rf))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))

Best Parameters: {'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 3, 'model__min_samples_split': 2, 'model__n_estimators': 215}
Best CV R²: 0.7650970988288556
Test R²: 0.8178284456162606
Test RMSE: 66.98905910602434
