In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [6]:
for column in df.columns:
    print(df[column].value_counts())
    print()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64

location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4

In [7]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
# We are dropping Society column as in total 13000 entries 5000 are null.
# The other columns are also not much useful to us so we are dropping them

df.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [9]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [10]:
print(df.info())
print()
print("Null values count:")
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB
None

Null values count:
location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64


In [11]:
print(df['location'].value_counts())

# As there is only one missing value in Location we can just fill it with any one of the most occuring location
df['location'] = df['location'].fillna('Whitefield')

location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64


In [12]:
df['size'].value_counts()

# As most occuring is 2 BHK we will fill with it 

df['size'] = df['size'].fillna('2 BHK')

In [13]:
df['bath'].value_counts()

# As most occuring is 2 Bathrooms we will fill with it 

df['bath'] = df['bath'].fillna(df['bath'].median())
# We have filled all the null values

In [14]:
# In Size column we can see that the entries are not same some places it is like '2 BHK' and in some places it is like '2 Bedroom'
df['BHK'] = df['size'].str.split().str.get(0).astype(int)


In [15]:
# Outliers
df[df.BHK > 20]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [16]:
# In the column total_sqft
df['total_sqft'].unique()

# Function for changing the entries to float and also to remove the ranges
def convert_sqft(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1])) / 2
    try:
        return float(temp[0])
    except:
        return None
    
df['total_sqft'] = df['total_sqft'].apply(convert_sqft)

In [17]:
# We are gonna make a new column for calculating the price per square feet as in the current data it is for the total sqft
df['price_per_sqft'] = df['price']*100000 / df['total_sqft']

df['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [18]:
df.describe()

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [19]:
# Now in the location column as there are 1305 entries if we encode it, it will be very huge and will negatively effect our model (The Curse of Dimensionality)

# First we remove the whitespaces if any

df['location'] = df['location'].apply(lambda x: x.strip())

# We reduce the size by clubbing all less frequent entries into Other
location_stats = df['location'].value_counts()
location_stats_less_than_10 = location_stats[location_stats <= 10]

print('Number of locations appearing less than 10: ',len(location_stats_less_than_10))

df['location'] = df['location'].apply(lambda x: "Other" if x in location_stats_less_than_10 else x)

Number of locations appearing less than 10:  1053


In [20]:
df['location'].value_counts()

location
Other                        2885
Whitefield                    542
Sarjapur  Road                399
Electronic City               304
Kanakpura Road                273
                             ... 
Tindlu                         11
Marsur                         11
2nd Phase Judicial Layout      11
Thyagaraja Nagar               11
HAL 2nd Stage                  11
Name: count, Length: 242, dtype: int64

### OUTLIER DETECTION

In [21]:
# There are some outliers like in the min(total_sqft) = 1.0000 which is an outlier
df.describe()

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [22]:
# This will give us the sqft of a single room
(df['total_sqft']/df['BHK']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [23]:
# We took a threshold like any house with sqft of a single room < 300 will be removed or else it is very small room not in real world
df = df[((df['total_sqft']/df['BHK']) >= 300)]
df.describe()

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [24]:
# For the price_per_sqft the max value is 176470 (which is nearly 28 times mean) so it is an outlier
def remove_outlier_sqft(df):
    df_output = pd.DataFrame()
    for key,subdf in df.groupby('location'):  # Grouping the entries by location, (subdf contains all the data of the same location (which is in key))
        m = np.mean(subdf.price_per_sqft)
        st = m = np.std(subdf.price_per_sqft)
        gen_df = subdf[((subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft < (m+st)))]
        df_output = pd.concat([df_output,gen_df],ignore_index=True)  # Joining the both Dataframes
    return df_output

df = remove_outlier_sqft(df)
df.describe()

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft
count,4903.0,4903.0,4903.0,4903.0,4903.0
mean,1533.987177,2.424434,81.649378,2.561493,5179.901849
std,1630.011986,1.028734,80.889654,1.005413,2088.614106
min,300.0,1.0,8.44,1.0,267.829813
25%,1100.0,2.0,44.5,2.0,3809.52381
50%,1247.0,2.0,59.0,2.0,4631.578947
75%,1567.0,3.0,89.0,3.0,5937.5
max,52272.0,16.0,2100.0,16.0,16500.0


In [25]:
df['BHK'].describe()

count    4903.000000
mean        2.561493
std         1.005413
min         1.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        16.000000
Name: BHK, dtype: float64

In [26]:
# Now removing the Outliers in BHK column
def bhk_outlier(df):
    exclude_indices = np.array([])

    # First grouping on basis of location, so we get all bhk's of a every single place
    for location,location_df in df.groupby('location'):
        bhk_stats = {}

        # Then we goup again on basis of BHK which will give data of each BHK type in each location and calculating the statistics
        for bhk,bhk_df in location_df.groupby('BHK'):
            bhk_stats[bhk] = {'mean' : np.mean(bhk_df.price_per_sqft),
                        'std': np.std(bhk_df.price_per_sqft),
                        'count' : bhk_df.shape[0]
                        }
            
        # Checking of the outlier like if the Price Per Sqft of 3 BHK is less than mean of 2 BHK's Price Per Sqft then it is an outlier
        for bhk,bhk_df in location_df.groupby('BHK'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices,bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index.values)
    
    # We drop those outliers
    return df.drop(exclude_indices,axis = 'index')
                

In [27]:
df = bhk_outlier(df)
print(df.shape)  # The shape of Data changed
df.head()

(3215, 7)


Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,1st Block Jayanagar,2 BHK,1000.0,3.0,60.0,2,6000.0
1,1st Block Jayanagar,3 BHK,1760.0,3.0,115.0,3,6534.090909
2,1st Phase JP Nagar,4 BHK,2825.0,4.0,250.0,4,8849.557522
3,1st Phase JP Nagar,3 BHK,1875.0,3.0,167.0,3,8906.666667
4,1st Phase JP Nagar,5 Bedroom,1500.0,5.0,85.0,5,5666.666667


In [28]:
# Now we will drop the size and Price per sqft columns too as Price per sqft column was only useful to detect the outliers
df.drop(columns = ['size','price_per_sqft'],inplace=True)
df.head()


Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,1000.0,3.0,60.0,2
1,1st Block Jayanagar,1760.0,3.0,115.0,3
2,1st Phase JP Nagar,2825.0,4.0,250.0,4
3,1st Phase JP Nagar,1875.0,3.0,167.0,3
4,1st Phase JP Nagar,1500.0,5.0,85.0,5


In [29]:
df.to_csv('Cleaned_data.csv')

In [30]:
X = df.drop(columns = ['price']) # Input 
Y = df['price'] # Output

In [31]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score,accuracy_score,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [32]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
print(f"X_test Shape: {X_test.shape}")
print(f"X_train Shape: {X_train.shape}")

X_test Shape: (643, 4)
X_train Shape: (2572, 4)


### Why use OHE of sklearn class instead of pd.get_dummies()?

We could have used Pandas built in function for OHE (called pd.get_dummies()) but it is not recommended to use becoz:  
&emsp;&emsp;    1. It doesn't maintain the same order of the columns changes based on what it first sees, and our model gets confused  
&emsp;&emsp;     2. If we have trained our model with 2 categories (A & B) and if a new category is introduced while testing then this function makes a &new extra  
&emsp;&emsp;         column for that new category, due to the mismatch of column numbers our model will give error.   <br><br>
So that's why we are using the OneHotEncoder of sklearn class

In [33]:
column_trans = make_column_transformer((OneHotEncoder(sparse_output=False,handle_unknown='ignore'),['location']),remainder='passthrough')

## Linear Regression

In [34]:
scaler = StandardScaler()
lr = LinearRegression()

In [35]:
pipe = make_pipeline(column_trans,scaler,lr)
pipe.fit(X_train,Y_train)

0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [36]:
Y_pred_lr = pipe.predict(X_test)
print(f"r2_score: {r2_score(Y_test,Y_pred_lr)}")


r2_score: 0.7136156595625918


## Lasso

In [37]:
lasso = Lasso()
pipe = make_pipeline(column_trans,scaler,lasso)
pipe.fit(X_train,Y_train)

0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [38]:
Y_pred_lasso = pipe.predict(X_test)
print(f"r2_score: {r2_score(Y_test,Y_pred_lasso)}")

r2_score: 0.7058789838070585


## Ridge

In [39]:
ridge = Ridge()
pipe = make_pipeline(column_trans,scaler,ridge)
pipe.fit(X_train,Y_train)

0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [40]:
Y_pred_ridge = pipe.predict(X_test)
print(f"r2_score: {r2_score(Y_test,Y_pred_ridge)}")

r2_score: 0.7136208085613944


As the performance is very low we will try to increase it 

In [44]:
import xgboost as xgb
from sklearn.pipeline import make_pipeline

xgb_simple = xgb.XGBRegressor(
    n_estimators=50,    # Much fewer trees
    max_depth=3,        # Very shallow trees (prevents complex memorization)
    learning_rate=0.1,
    n_jobs=-1
)
pipe_xgb = make_pipeline(column_trans,scaler,xgb_simple)

print("Training XGBoost... (This might take a few seconds)")
pipe_xgb.fit(X_train, Y_train)

y_pred_xgb = pipe_xgb.predict(X_test)
print(f"XGBoost R2 Score: {r2_score(Y_test, y_pred_xgb):.4f}")

Training XGBoost... (This might take a few seconds)
XGBoost R2 Score: 0.6681


In [45]:
from sklearn.ensemble import RandomForestRegressor

# 1. Initialize Random Forest
# n_estimators=100: Build 100 trees
# random_state=10: Keep results consistent
rf_model = RandomForestRegressor(n_estimators=100, random_state=10)

# 2. Make Pipeline
pipe_rf = make_pipeline(column_trans, scaler, rf_model)

# 3. Train
print("Training Random Forest... (This may take a minute)")
pipe_rf.fit(X_train, Y_train)

# 4. Predict & Score
y_pred_rf = pipe_rf.predict(X_test)
print(f"Random Forest R2 Score: {r2_score(Y_test, y_pred_rf):.4f}")

Training Random Forest... (This may take a minute)
Random Forest R2 Score: 0.7754


In [48]:
from sklearn.ensemble import VotingRegressor

# 1. Define the 3 Experts (Use the settings that worked best previously)
r1 = Ridge()
r2 = RandomForestRegressor(n_estimators=100, random_state=10) 
r3 = xgb.XGBRegressor(n_estimators=50, max_depth=3, learning_rate=0.1, n_jobs=-1)

# 2. Create the Voting Ensemble
# It bundles them all together
vote = VotingRegressor([('ridge', r1), ('rf', r2), ('xgb', r3)])

# 3. Make Pipeline
pipe_vote = make_pipeline(column_trans, scaler, vote)

# 4. Train
print("Training the Committee... 🗳️")
pipe_vote.fit(X_train, Y_train)

# 5. Predict
y_pred_vote = pipe_vote.predict(X_test)
print(f"Voting Regressor R2 Score: {r2_score(Y_test, y_pred_vote):.4f}")

Training the Committee... 🗳️
Voting Regressor R2 Score: 0.7769


In [50]:
import numpy as np
from sklearn.compose import TransformedTargetRegressor

# 1. Setup the Strategy
# We wrap our best model (Ridge or Voting) inside a "TransformedTargetRegressor"
# It automatically turns Price -> Log(Price) for training
# And turns Predicted Log(Price) -> Price for testing
log_regressor = TransformedTargetRegressor(
    regressor=pipe_vote,        # Our best model (The Committee)
    func=np.log1p,              # Convert Target to Log before training
    inverse_func=np.expm1       # Convert Prediction back to Real Price
)

# 2. Train
print("Training with Log-Transformation... 📉")
log_regressor.fit(X_train, Y_train)

# 3. Predict & Score
y_pred_log = log_regressor.predict(X_test)
print(f"Log-Transformed R2 Score: {r2_score(Y_test, y_pred_log):.4f}")

Training with Log-Transformation... 📉
Log-Transformed R2 Score: 0.8096


In [51]:
from catboost import CatBoostRegressor

# 1. Initialize CatBoost
# iterations=1000: It learns slowly but deeply
# depth=6: Standard depth
# l2_leaf_reg: Regularization strength
cat_model = CatBoostRegressor(
    iterations=2000, 
    learning_rate=0.03, 
    depth=6, 
    l2_leaf_reg=3,
    loss_function='RMSE',
    verbose=0  # Don't print 2000 lines
)

# 2. Pipeline
pipe_cat = make_pipeline(column_trans, scaler, cat_model)

# 3. Train
print("Training CatBoost... (This might take a moment) 🐱")
pipe_cat.fit(X_train, Y_train)

# 4. Score
y_pred_cat = pipe_cat.predict(X_test)
print(f"CatBoost R2 Score: {r2_score(Y_test, y_pred_cat):.4f}")

Training CatBoost... (This might take a moment) 🐱
CatBoost R2 Score: 0.8230


In [None]:
from sklearn.compose import TransformedTargetRegressor

# 1. Take the CatBoost model that worked best
# We use the same settings as before
cat_model_final = CatBoostRegressor(
    iterations=2000, 
    learning_rate=0.03, 
    depth=6, 
    l2_leaf_reg=3,
    loss_function='RMSE',
    verbose=0
)

# 2. Make the Pipeline (Encoding -> Scaling -> CatBoost)
pipe_cat_final = make_pipeline(column_trans, scaler, cat_model_final)

# 3. The Secret Sauce: Wrap the WHOLE pipeline in Log-Transform
# This forces CatBoost to predict Log(Price), which is mathematically easier for it
log_cat_model = TransformedTargetRegressor(
    regressor=pipe_cat_final, 
    func=np.log1p, 
    inverse_func=np.expm1
)

# 4. Train
print("Training Log-Transformed CatBoost... 🐱📈")
log_cat_model.fit(X_train, Y_train)

# 5. Predict & Score
y_pred_log_cat = log_cat_model.predict(X_test)
score = r2_score(Y_test, y_pred_log_cat)
print(f"Final 'God Mode' Score: {score:.4f}")