In [42]:
# Importing required libraries for data handling, visualization, and model building.
import pandas as pd
import numpy as np

In [43]:
# Loading the house price dataset into a pandas DataFrame.
data = pd.read_csv("Bengaluru_House_Data.csv")

In [44]:
# Showing the first few rows to understand the dataset structure.
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [45]:
# Checking for missing values and data types.
data.shape

(13320, 9)

In [46]:
# Generating summary statistics for numerical columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [47]:
# Visualizing correlations to identify important features.
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [48]:
# Selecting the most relevant features for model training.
# Check which columns exist
print("Columns in dataset:", data.columns.tolist())

# Drop only if present
cols_to_drop = ['area_type', 'availability', 'society', 'balcony']
existing_cols = [col for col in cols_to_drop if col in data.columns]

data.drop(columns=existing_cols, inplace=True)

print("Dropped columns:", existing_cols)


Columns in dataset: ['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony', 'price']
Dropped columns: ['area_type', 'availability', 'society', 'balcony']


In [49]:
# Splitting the dataset into training and testing sets.
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [51]:
data['location'].value_counts()

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [52]:
data['location'] = data['location'].fillna('Sarjapur Road')

In [53]:
data['size'].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [54]:
data['size'] = data['size'].fillna('2 BHK')

In [55]:
data['bath']= data['bath'].fillna(data['bath'].median())

In [56]:
data['size'] = data['size'].apply(lambda x: int(x.split()[0]))


In [57]:
def convert_sqft(x):
    try:
        if '-' in str(x):      # Example: "2100 - 2850"
            nums = x.split('-')
            return (float(nums[0]) + float(nums[1])) / 2
        return float(x)
    except:
        return None

data['total_sqft'] = data['total_sqft'].apply(convert_sqft)
data = data.dropna(subset=['total_sqft'])



In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13274 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13274 non-null  object 
 1   size        13274 non-null  int64  
 2   total_sqft  13274 non-null  float64
 3   bath        13274 non-null  float64
 4   price       13274 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 622.2+ KB


In [59]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

location
Whitefield               538
Sarjapur  Road           399
Electronic City          302
Kanakpura Road           271
Thanisandra              233
                        ... 
BEML Layout 5th stage      1
Kannur                     1
singapura paradise         1
Uvce Layout                1
Abshot Layout              1
Name: count, Length: 1300, dtype: int64
********************
size
2     5534
3     4843
4     1412
1      647
5      352
6      220
7      100
8       88
9       52
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: count, dtype: int64
********************
total_sqft
1200.0    843
1100.0    221
1500.0    206
2400.0    196
600.0     180
         ... 
3080.0      1
787.0       1
6613.0      1
502.0       1
4689.0      1
Name: count, Length: 1999, dtype: int64
********************
bath
2.0     6966
3.0     3275
4.0     1222
1.0      781
5.0      521
6.0      269
7.0      102
8.0       64
9.0       41
10

In [60]:
data.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [61]:
# Check which columns exist
print("Columns in dataset:", data.columns.tolist())

# Drop only if present
cols_to_drop = ['area_type', 'availability', 'society', 'balcony']
existing_cols = [col for col in cols_to_drop if col in data.columns]

data.drop(columns=existing_cols, inplace=True)

print("Dropped columns:", existing_cols)


Columns in dataset: ['location', 'size', 'total_sqft', 'bath', 'price']
Dropped columns: []


In [62]:
data.describe()

Unnamed: 0,size,total_sqft,bath,price
count,13274.0,13274.0,13274.0,13274.0
mean,2.801718,1559.626694,2.687359,112.453654
std,1.291591,1238.405258,1.336158,149.070368
min,1.0,1.0,1.0,8.0
25%,2.0,1100.0,2.0,50.0
50%,3.0,1276.0,2.0,72.0
75%,3.0,1680.0,3.0,120.0
max,43.0,52272.0,40.0,3600.0


In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13274 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13274 non-null  object 
 1   size        13274 non-null  int64  
 2   total_sqft  13274 non-null  float64
 3   bath        13274 non-null  float64
 4   price       13274 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 622.2+ KB


In [64]:
data['location'].value_counts()

location
Whitefield               538
Sarjapur  Road           399
Electronic City          302
Kanakpura Road           271
Thanisandra              233
                        ... 
BEML Layout 5th stage      1
Kannur                     1
singapura paradise         1
Uvce Layout                1
Abshot Layout              1
Name: count, Length: 1300, dtype: int64

In [65]:
data['location'] = data['location'].fillna ('Sarjapur Road')

In [66]:
data['size'].value_counts()

size
2     5534
3     4843
4     1412
1      647
5      352
6      220
7      100
8       88
9       52
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: count, dtype: int64

In [67]:
data['size']= data['size'].fillna ('2 BHK')

In [68]:
data['bath'] = data['bath'].fillna (data['bath'].median())


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13274 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13274 non-null  object 
 1   size        13274 non-null  int64  
 2   total_sqft  13274 non-null  float64
 3   bath        13274 non-null  float64
 4   price       13274 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 622.2+ KB


In [70]:
def extract_bhk(x):
    try:
        return int(str(x).split()[0])
    except:
        return None

data['bhk'] = data['size'].apply(extract_bhk)

# Drop rows where BHK could not be extracted
data = data.dropna(subset=['bhk'])


In [71]:
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27,8000.0,27.0,230.0,27
4684,Munnekollal,43,2400.0,40.0,660.0,43


In [72]:
data['total_sqft'].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [73]:
# ---- Fix for total_sqft ----

def convertRange(x):
    # If numeric → return directly
    if isinstance(x, float) or isinstance(x, int):
        return x
    
    # If range: e.g. '1000-1200'
    if '-' in x:
        temp = x.split('-')
        return (float(temp[0]) + float(temp[1])) / 2
    
    # Convert string number
    try:
        return float(x)
    except:
        return None

# Apply cleaning
data['total_sqft'] = data['total_sqft'].apply(convertRange)

# Remove rows where sqft was invalid
data = data.dropna(subset=['total_sqft'])


In [74]:
data['total_sqft']= data['total_sqft'].apply(convertRange)

In [75]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4,2600.0,5.0,120.0,4
2,Uttarahalli,3,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3,1521.0,3.0,95.0,3
4,Kothanur,2,1200.0,2.0,51.0,2


In [76]:
data['price_per_sqft'] = data['price'] *100000 /data['total_sqft']

In [77]:
data['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13274, dtype: float64

In [78]:
data.describe()

Unnamed: 0,size,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13274.0,13274.0,13274.0,13274.0,13274.0
mean,2.801718,1559.626694,2.687359,112.453654,2.801718,7907.501
std,1.291591,1238.405258,1.336158,149.070368,1.291591,106429.6
min,1.0,1.0,1.0,8.0,1.0,267.8298
25%,2.0,1100.0,2.0,50.0,2.0,4266.865
50%,3.0,1276.0,2.0,72.0,3.0,5434.306
75%,3.0,1680.0,3.0,120.0,3.0,7311.746
max,43.0,52272.0,40.0,3600.0,43.0,12000000.0


In [79]:
data['location']. value_counts()

location
Whitefield               538
Sarjapur  Road           399
Electronic City          302
Kanakpura Road           271
Thanisandra              233
                        ... 
BEML Layout 5th stage      1
Kannur                     1
singapura paradise         1
Uvce Layout                1
Abshot Layout              1
Name: count, Length: 1300, dtype: int64

In [80]:
data['location'] = data['location'].apply(lambda x: x.strip())
location_count= data['location'].value_counts()

In [81]:
location_count

location
Whitefield                            539
Sarjapur  Road                        399
Electronic City                       304
Kanakpura Road                        271
Thanisandra                           236
                                     ... 
1Channasandra                           1
Hosahalli                               1
Vijayabank bank layout                  1
near Ramanashree California resort      1
Abshot Layout                           1
Name: count, Length: 1289, dtype: int64

In [82]:
location_count_less_10 = location_count[location_count<=10]
location_count_less_10

location
Kalkere                               10
Thyagaraja Nagar                      10
BTM 1st Stage                         10
Basapura                              10
Dodsworth Layout                      10
                                      ..
1Channasandra                          1
Hosahalli                              1
Vijayabank bank layout                 1
near Ramanashree California resort     1
Abshot Layout                          1
Name: count, Length: 1049, dtype: int64

In [83]:
data['location']=data['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [84]:
data['location'].value_counts()

location
other              2877
Whitefield          539
Sarjapur  Road      399
Electronic City     304
Kanakpura Road      271
                   ... 
Doddaballapur        11
Tindlu               11
Marsur               11
HAL 2nd Stage        11
Kodigehalli          11
Name: count, Length: 241, dtype: int64

In [85]:
data.describe()

Unnamed: 0,size,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13274.0,13274.0,13274.0,13274.0,13274.0
mean,2.801718,1559.626694,2.687359,112.453654,2.801718,7907.501
std,1.291591,1238.405258,1.336158,149.070368,1.291591,106429.6
min,1.0,1.0,1.0,8.0,1.0,267.8298
25%,2.0,1100.0,2.0,50.0,2.0,4266.865
50%,3.0,1276.0,2.0,72.0,3.0,5434.306
75%,3.0,1680.0,3.0,120.0,3.0,7311.746
max,43.0,52272.0,40.0,3600.0,43.0,12000000.0


In [86]:
(data['total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [87]:
data= data[((data['total_sqft']/data['bhk']) >= 300)]
data.describe()

Unnamed: 0,size,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0,12530.0
mean,2.650838,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,0.976678,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,1.0,300.0,1.0,8.44,1.0,267.829813
25%,2.0,1116.0,2.0,49.0,2.0,4210.526316
50%,3.0,1300.0,2.0,70.0,3.0,5294.117647
75%,3.0,1700.0,3.0,115.0,3.0,6916.666667
max,16.0,52272.0,16.0,3600.0,16.0,176470.588235


In [88]:
data.shape

(12530, 7)

In [89]:
data.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [90]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m=np.mean(subdf.price_per_sqft)
        st=np.std(subdf.price_per_sqft)
        gen_df =subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output= pd.concat([df_output, gen_df], ignore_index =True)
    return df_output
data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,size,total_sqft,bath,price,bhk,price_per_sqft
count,10302.0,10302.0,10302.0,10302.0,10302.0,10302.0
mean,2.575325,1508.585392,2.472044,91.294983,2.575325,5659.09597
std,0.898664,880.774066,0.980018,86.343018,0.898664,2265.667258
min,1.0,300.0,1.0,10.0,1.0,1250.0
25%,2.0,1110.0,2.0,49.0,2.0,4244.994224
50%,2.0,1286.0,2.0,67.0,2.0,5175.792088
75%,3.0,1650.0,3.0,100.0,3.0,6428.571429
max,16.0,30400.0,16.0,2200.0,16.0,24509.803922


In [91]:
def bhk_outlier_remover(df):
    exclude_indices= np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats= {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats [bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }      
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats =bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices= np.append(exclude_indices, bhk_df [bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices, axis='index')

In [92]:
data= bhk_outlier_remover(data)

In [93]:
data.shape

(7348, 7)

In [94]:
data

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,3,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...
10293,other,2,1200.0,2.0,70.0,2,5833.333333
10294,other,1,1800.0,1.0,200.0,1,11111.111111
10297,other,2,1353.0,2.0,110.0,2,8130.081301
10298,other,1,812.0,1.0,26.0,1,3201.970443


In [95]:
data.drop(columns=['size', 'price_per_sqft'], inplace=True)

In [96]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [97]:
data.to_csv("Cleaned_data.csv")

In [98]:
X=data.drop(columns=['price'], axis=1)
y=data['price']

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Define features and target
X = data.drop(['price'], axis=1)
y = data['price']

#Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Preprocessing: OneHotEncode 'location'
column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough'
)

# Pipeline
lr = LinearRegression()

pipe = make_pipeline(
    column_trans,
    StandardScaler(),
    lr
)

# Fit model
pipe.fit(X_train, y_train)

# Predict & evaluate
y_pred_lr = pipe.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred_lr))



R2 Score: 0.8547120152888832


In [100]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# Define features and target
X = data.drop(['price'], axis=1)
y = data['price']

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Preprocessing
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=0
)

#  Final pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gbr', gbr)
])

#  Fit model
pipe.fit(X_train, y_train)

# Predict & Evaluate
y_pred = pipe.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))


R2 Score: 0.88247714255363
