In [620]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [621]:
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor

In [622]:
df = pd.read_csv('Bengaluru_House_Data.csv')

## Remove the duplicates

In [623]:
df = df.drop_duplicates()

## Handling missing values

In [624]:
## filling the missing value in location to 'other'
df = df.dropna(subset= ['location'])

In [625]:
def extract_bhk(x):
    try:
        x = str(x).lower()
        if 'rk' in x:
            return 1
        return int(x.split()[0])
    except:
        return None

df['bhk'] = df['size'].apply(extract_bhk)

In [626]:
## Filling the missing values in 'bhk' with median, since most of the values lies in between 1 to 3
median_bhk = df['bhk'].median()
df['bhk'] = df['bhk'].fillna(median_bhk)

In [627]:
## Since missingness is very high, don’t try to impute with median/mode
## Capture whether society info is present at all
df['has_society'] = df['society'].notnull().astype(int)

In [628]:
## Bathrooms are usually proportional to BHK.
## You can impute missing values based on bhk (e.g., 2 BHK → 2 baths, 3 BHK → 3 baths).
df.loc[df['bath'].isnull(), 'bath'] = df.loc[df['bath'].isnull(), 'bhk']

In [629]:
## Balcony count is numeric but categorical-like (most homes have 0–2 balconies)
df['balcony'] = df['balcony'].fillna(df['balcony'].median())

In [630]:
df = df.drop( ['size', 'society'], axis = 1)

## Feature engineering

In [631]:
def group_availability(x):
    if x in ['Ready To Move', 'Immediate Possession']:
        return 'Ready To Move'
    else:
        return 'Not Ready'
        
df['availability_group'] = df['availability'].apply(group_availability)

In [632]:
df = df.drop('availability', axis = 1)

In [633]:
## Spelling/formatting issues: "Whitefield" vs "White Field", "Electronic City" vs "ElectronicCity"
df['location'] = df['location'].str.strip().str.lower()

In [634]:
threshold = 20
location_counts = df['location'].value_counts()
top_locations = location_counts[location_counts >= threshold].index.tolist()

In [635]:
df['location_grouped'] = df['location'].apply(
    lambda x: x if x in top_locations else 'Other'
)

In [636]:
def convert_total_sqft(x):
    """
    Convert total_sqft string to float in square feet.
    Handles:
    - ranges "1200 - 1400" → average
    - units: Sq. Meter, Sq. Yards, Acres, Perch, Guntha, Cents, etc.
    Returns float or None (later to NaN)
    """
    if isinstance(x, float):  # already numeric
        return x
    
    x = str(x).strip()
    # Case 1: range like "1133 - 1384"
    if '-' in x:
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except:
                return None
    
    # Case 2: single number with unit
    # Remove commas if any
    x = x.replace(',', '')
    
    # Common unit conversions (approximate standard factors)
    conversion_factors = {
        'sq. meter': 10.7639,       # 1 sq.m = 10.7639 sqft
        'sq. meter': 10.7639,
        'square meter': 10.7639,
        'sq.meter': 10.7639,
        'sq meter': 10.7639,
        
        'sq. yard': 9.0,            # 1 sq.yd = 9 sqft
        'sq.yard': 9.0,
        'square yard': 9.0,    
        'sq yard': 9.0,
        
        'acre': 43560,              # 1 acre = 43560 sqft
        'acres': 43560,
        
        'perch': 272.25,            # 1 perch ≈ 272.25 sqft
        'guntha': 1089,             # 1 guntha = 1089 sqft
        'cents': 435.6,             # 1 cent = 435.6 sqft
        'ground': 2400,             # 1 ground ≈ 2400 sqft (South India)
    }
    
    for unit, factor in conversion_factors.items():
        if unit in x.lower():
            num_part = x.lower().replace(unit, '').strip()
            try:
                return float(num_part) * factor
            except:
                return None
    
    # Plain number (most common case)
    try:   
        return float(x)
    except:
        return None

# Apply conversion
df['total_sqft_num'] = df['total_sqft'].apply(convert_total_sqft)

# Drop original and rows where conversion failed
df = df.dropna(subset=['total_sqft_num'])
df = df.drop(columns=['total_sqft'])
df = df.rename(columns={'total_sqft_num': 'total_sqft'})

print("After total_sqft cleaning:", df.shape)        

After total_sqft cleaning: (12766, 10)


In [637]:
df.head()

Unnamed: 0,area_type,location,bath,balcony,price,bhk,has_society,availability_group,location_grouped,total_sqft
0,Super built-up Area,electronic city phase ii,2.0,1.0,39.07,2.0,1,Not Ready,electronic city phase ii,1056.0
1,Plot Area,chikka tirupathi,5.0,3.0,120.0,4.0,1,Ready To Move,Other,2600.0
2,Built-up Area,uttarahalli,2.0,3.0,62.0,3.0,0,Ready To Move,uttarahalli,1440.0
3,Super built-up Area,lingadheeranahalli,3.0,1.0,95.0,3.0,1,Ready To Move,lingadheeranahalli,1521.0
4,Super built-up Area,kothanur,2.0,1.0,51.0,2.0,0,Ready To Move,kothanur,1200.0


## Remove outliers

In [638]:
## Handling outliers in bathrooms
df['bath'] = df.apply( lambda row: min(row['bath'], row['bhk']) if row['bath'] > 8 else row['bath'], axis=1 )

In [639]:
df = df[df['total_sqft'] <= 20000]

In [640]:
df = df.drop(['availability_group'], axis = 1)

In [641]:
df['price_per_sqft'] = df['price'] * 1e5 / df['total_sqft']   # lakhs → rupees/sqft

print(df['price_per_sqft'].describe(percentiles=[0.01, 0.05, 0.95, 0.99]))

count    1.275400e+04
mean     8.023575e+03
std      1.085753e+05
min      4.053955e+02
1%       2.500436e+03
5%       3.119674e+03
50%      5.483044e+03
95%      1.548272e+04
99%      2.407833e+04
max      1.200000e+07
Name: price_per_sqft, dtype: float64


In [642]:
df = df[df['price_per_sqft'] <= 25000]
df = df[df['price_per_sqft'] >= 2000]

In [643]:
df['sqft_per_bhk'] = df['total_sqft'] / df['bhk']

In [644]:
# After price_per_sqft filtering
df['bath_per_bhk']     = df['bath'] / df['bhk']
df['extra_bath']       = (df['bath'] > df['bhk'] + 1).astype(int)
df['total_sqft_log']   = np.log1p(df['total_sqft'])

In [645]:
def remove_bhk_outliers(df):
    exclude_indices = []
    for location, loc_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, sub_df in loc_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': sub_df['price'].mean(),
                'std': sub_df['price'].std(),
                'count': sub_df.shape[0]
            }
        for bhk, sub_df in loc_df.groupby('bhk'):
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                exclude = sub_df[(sub_df['price'] < (stats['mean'] - 0.5 * stats['std']))]
                exclude_indices.extend(exclude.index.tolist())
    return df.drop(exclude_indices, axis=0)

df = remove_bhk_outliers(df)
print("Rows after BHK outlier removal:", df.shape[0])

Rows after BHK outlier removal: 12342


In [646]:
df = df.drop(['price_per_sqft'], axis = 1)

In [647]:
df.head()

Unnamed: 0,area_type,location,bath,balcony,price,bhk,has_society,location_grouped,total_sqft,sqft_per_bhk,bath_per_bhk,extra_bath,total_sqft_log
0,Super built-up Area,electronic city phase ii,2.0,1.0,39.07,2.0,1,electronic city phase ii,1056.0,528.0,1.0,0,6.96319
1,Plot Area,chikka tirupathi,5.0,3.0,120.0,4.0,1,Other,2600.0,650.0,1.25,0,7.863651
2,Built-up Area,uttarahalli,2.0,3.0,62.0,3.0,0,uttarahalli,1440.0,480.0,0.666667,0,7.273093
3,Super built-up Area,lingadheeranahalli,3.0,1.0,95.0,3.0,1,lingadheeranahalli,1521.0,507.0,1.0,0,7.327781
4,Super built-up Area,kothanur,2.0,1.0,51.0,2.0,0,kothanur,1200.0,600.0,1.0,0,7.09091


In [648]:
# Split data 
X = df.drop(['price'], axis = 1)
y = df['price'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [649]:
X_train.head()

Unnamed: 0,area_type,location,bath,balcony,bhk,has_society,location_grouped,total_sqft,sqft_per_bhk,bath_per_bhk,extra_bath,total_sqft_log
9994,Super built-up Area,marathahalli,3.0,2.0,3.0,1,marathahalli,1550.0,516.666667,1.0,0,7.346655
7575,Built-up Area,kallumantapa,2.0,1.0,2.0,1,Other,1050.0,525.0,1.0,0,6.957497
306,Super built-up Area,kanakpura road,1.0,1.0,1.0,1,kanakpura road,525.0,525.0,1.0,0,6.265301
3889,Super built-up Area,whitefield,2.0,2.0,2.0,1,whitefield,1140.0,570.0,1.0,0,7.03966
5798,Super built-up Area,rayasandra,2.0,2.0,2.0,1,rayasandra,1253.0,626.5,1.0,0,7.134094


In [650]:
import pandas as pd

def target_encode(train_X, train_y, test_X, col, smoothing=10):
    """
    Target encoding with additive smoothing for regularization.
    - smoothing=10: Good default (tune 5-20 based on your data).
    """
    # Group stats from TRAIN only
    stats = (pd.DataFrame({col: train_X[col], 'target': train_y})
             .groupby(col)
             .agg(count=('target', 'size'), mean=('target', 'mean'))
             .reset_index())
    
    global_mean = train_y.mean()
    
    # Smoothed encoding: (count * mean + smoothing * global) / (count + smoothing)
    stats['smoothed'] = (stats['count'] * stats['mean'] + smoothing * global_mean) / (stats['count'] + smoothing)
    
    # Create mapping dict
    mapping = dict(zip(stats[col], stats['smoothed']))
    
    # Apply to train/test
    train_X[col + "_target_enc"] = train_X[col].map(mapping)
    test_X[col + "_target_enc"] = test_X[col].map(mapping).fillna(global_mean)  # Unseen → global
    
    # Drop original
    train_X = train_X.drop(columns=[col])
    test_X = test_X.drop(columns=[col])
    
    return train_X, test_X

In [651]:
X_train, X_test = target_encode(X_train, y_train, X_test, col='location')

In [652]:
X_train.columns

Index(['area_type', 'bath', 'balcony', 'bhk', 'has_society',
       'location_grouped', 'total_sqft', 'sqft_per_bhk', 'bath_per_bhk',
       'extra_bath', 'total_sqft_log', 'location_target_enc'],
      dtype='object')

In [653]:
numeric_features = [
    'total_sqft',
    'bath',
    'bhk',
    'balcony',
    'location_target_enc',
    'sqft_per_bhk',
    'bath_per_bhk',
    'total_sqft_log'
]

categorical_features = [
    'area_type',
    'has_society',
    'location_grouped',
    'extra_bath'
]

# Re-create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ]
)

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        random_state=42,
        max_depth=15,
        max_features= 0.5,
        min_samples_leaf=3,
        min_samples_split= 2,
        n_estimators= 104,
        n_jobs=-1
    ))
])


In [654]:
rf_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,104
,criterion,'squared_error'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,0.5
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [655]:
y_train_pred = rf_pipeline.predict(X_train)

In [656]:
y_pred_rf = rf_pipeline.predict(X_test)

In [657]:
from sklearn.metrics import r2_score, mean_squared_error
print("Test R²:", r2_score(y_test, y_pred_rf))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))

Test R²: 0.8172675168901694
Test RMSE: 55.99485972470895


In [658]:
print("Train R²:", r2_score(y_train, y_train_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))

Train R²: 0.9323928957563309
Test RMSE: 31.518383187892024
