# Banglore House Price Prediction


In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('Bengaluru_House_Data.csv')
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [3]:
# checking data types
print(df.dtypes)

area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object


In [4]:
### 1️⃣ Data Cleaning
df = df.drop(['area_type', 'society', 'availability'], axis=1)  # Drop unnecessary columns

In [5]:
# Handle missing values
df = df.dropna(subset=['size', 'total_sqft', 'price'])  # Drop rows with missing critical values
df['bath'].fillna(df['bath'].median(), inplace=True)
df['balcony'].fillna(df['balcony'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bath'].fillna(df['bath'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['balcony'].fillna(df['balcony'].median(), inplace=True)


In [6]:
# Convert 'size' (e.g., "2 BHK" → 2)
df['BHK'] = df['size'].apply(lambda x: int(x.split(' ')[0]))  # Extract first number
df = df.drop(['size'], axis=1)  # Drop old size column

In [7]:
# Convert 'total_sqft' to numeric (handling ranges like "600 - 1000")
def convert_sqft(value):
    try:
        if '-' in value:
            vals = list(map(float, value.split('-')))
            return (vals[0] + vals[1]) / 2
        return float(value)
    except:
        return None  # Return None for invalid values

In [8]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
df = df.dropna(subset=['total_sqft'])  # Drop rows where sqft couldn't be converted

# Using Feature Engineering

In [9]:
###  Feature Engineering
# Encode 'location' as numeric (Label Encoding)
le = LabelEncoder()
df['location'] = le.fit_transform(df['location'])

In [10]:
###  Train Linear Regression Model
# Define input (X) and target (y)
X = df[['total_sqft', 'bath', 'balcony', 'BHK', 'location']]
y = df['price']

In [11]:
# Split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluting Model

In [13]:
# Evaluate model
score = model.score(X_test, y_test)  # R² score
print(f"Model Accuracy (R² Score): {score:.2f}")

Model Accuracy (R² Score): 0.49


# Making prediction

In [16]:
# Make a prediction (example: 1000 sqft, 2 bath, 1 balcony, 2 BHK, location=5)
predicted_price = model.predict([[1000, 2, 1, 2, 5]])
print(f"Predicted Price: ₹{predicted_price[0]:.2f} Lakhs")

Predicted Price: ₹66.04 Lakhs




In [17]:
print(f"predicted_price: ₹{model.predict([[1000, 1, 1, 1, 5]])} Laks")

predicted_price: ₹[36.18102595] Laks


