In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
# Path to the raw Bengaluru housing CSV. Set the BENGALURU_HOUSE_DATA
# environment variable to read the file from another location; the default is
# the path this notebook was originally written against, so existing setups
# keep working unchanged.
DATA_PATH = os.environ.get(
    "BENGALURU_HOUSE_DATA",
    "C:/Users/Keert/pytoncode/Bengaluru_House_Data.csv",
)
house_dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the freshly loaded frame.
house_dataset.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
# Discard the three columns that are not used as model features further down.
house_dataset=house_dataset.drop(['area_type','availability','society'], axis=1)

In [5]:
# Preview again to confirm which columns remain after the drop.
house_dataset.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [6]:
# (rows, columns) of the frame after the column drop.
house_dataset.shape

(13320, 6)

In [7]:
# Number of missing values in each column.
house_dataset.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [8]:
# Remove every row that has a missing value in any remaining column, then
# re-count the nulls per column to verify none are left.
house_dataset = house_dataset.dropna()
house_dataset.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [9]:
# Summary statistics of the numeric columns. 'size' and 'total_sqft' are not
# listed in the recorded output because they are only converted to numbers in
# the cleaning cell that follows.
house_dataset.describe()

Unnamed: 0,bath,balcony,price
count,12710.0,12710.0,12710.0
mean,2.617309,1.584343,106.060778
std,1.226,0.817287,131.766089
min,1.0,0.0,8.0
25%,2.0,1.0,49.03
50%,2.0,2.0,70.0
75%,3.0,2.0,115.0
max,40.0,3.0,2912.0


In [10]:
# 1. Clean 'size' column: keep the first run of digits from entries such as
#    "2 BHK" or "4 Bedroom" and store it as a float.
#    The pattern is a raw string so that "\d" reaches the regex engine without
#    Python flagging it as an invalid escape sequence, and expand=False makes
#    str.extract return a Series, which is what a single-column assignment needs.
house_dataset['size'] = (
    house_dataset['size']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

# 2. Clean 'total_sqft' column
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or real number
        A plain number ("1056"), a range written as "low - high"
        ("2100 - 2850"), or a value that is already numeric.

    Returns
    -------
    float or None
        The parsed number, or the midpoint of the two bounds for a range.
        None is returned for anything that cannot be parsed (for example
        "34.46Sq. Meter", a missing value, or a non-string/non-numeric
        object) so the caller can discard those rows with ``dropna``.
    """
    import numbers  # local import keeps this cell runnable on its own

    # Values that are already numeric are passed through instead of being
    # rejected for not having a .split method; NaN is treated as missing.
    if isinstance(x, numbers.Real) and not isinstance(x, bool):
        value = float(x)
        return None if value != value else value  # NaN is the only value != itself
    if not isinstance(x, str):
        return None

    try:
        tokens = x.split('-')
        if len(tokens) == 2:
            # "low - high" range: use the midpoint.
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # float() could not parse the text (units, several dashes, empty bound, ...).
        return None

house_dataset['total_sqft'] = house_dataset['total_sqft'].apply(convert_sqft_to_num)

# 3. Drop rows with remaining nulls (values that could not be converted above).
#    Rebinding instead of inplace=True matches the earlier dropna cell.
house_dataset = house_dataset.dropna()

# A fillna('Unknown') on 'location' used to follow here. It ran after dropna,
# when no missing locations can remain, so it never changed anything and has
# been removed.

# Convert location names to integer labels. The fitted encoder is kept in
# `label_encoder` so the same mapping can be reused at prediction time.
label_encoder = LabelEncoder()
house_dataset['location'] = label_encoder.fit_transform(house_dataset['location'])


In [11]:
# Persist the fitted location encoder to disk so the same name -> integer
# mapping can be loaded again later (see the prediction cells below).
with open('location_label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [12]:
# Target: the 'price' column. Features: every other remaining column.
Y = house_dataset['price']
X = house_dataset.drop(columns=['price'])

In [13]:
# Plain-text dump of the feature frame and the target series for a quick
# sanity check of their contents and lengths.
print(X)
print(Y)

       location  size  total_sqft  bath  balcony
0           404   2.0      1056.0   2.0      1.0
1           303   4.0      2600.0   5.0      3.0
2          1136   3.0      1440.0   2.0      3.0
3           735   3.0      1521.0   3.0      1.0
4           694   2.0      1200.0   2.0      1.0
...         ...   ...         ...   ...      ...
13314       459   3.0      1715.0   3.0      3.0
13315      1208   5.0      3453.0   4.0      0.0
13317       938   2.0      1141.0   2.0      1.0
13318       877   4.0      4689.0   4.0      1.0
13319       381   1.0       550.0   1.0      1.0

[12668 rows x 5 columns]
0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
13314    112.00
13315    231.00
13317     60.00
13318    488.00
13319     17.00
Name: price, Length: 12668, dtype: float64


In [14]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,location,size,total_sqft,bath,balcony
0,404,2.0,1056.0,2.0,1.0
1,303,4.0,2600.0,5.0,3.0
2,1136,3.0,1440.0,2.0,3.0
3,735,3.0,1521.0,3.0,1.0
4,694,2.0,1200.0,2.0,1.0


In [15]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test =train_test_split(X,Y,test_size=0.2,random_state=3)

In [16]:
# Shapes of the full feature frame and of its train / test parts.
print(X.shape,X_train.shape, X_test.shape)

(12668, 5) (10134, 5) (2534, 5)


In [17]:
# Decision-tree regressor with a fixed seed so repeated fits are reproducible.
# DecisionTreeRegressor is imported in the notebook's import cell.
# NOTE(review): the tree is left at its default settings, and the metrics
# recorded further down show R² of about 0.997 on the training split versus
# about 0.361 on the test split, i.e. the model overfits heavily. Consider
# constraining it (e.g. max_depth / min_samples_leaf) or comparing with
# another regressor.
model = DecisionTreeRegressor(random_state=42)

In [18]:
# Fit the model on the training split only.
model.fit(X_train, Y_train)

In [19]:
# Predictions on the rows the model was fitted on (used for the training metrics below).
train_predict=model.predict(X_train)


In [20]:
# Show the predicted prices for the training rows.
print(train_predict)

[40.    45.    45.    ... 70.646 55.    25.   ]


In [21]:
# Training-set metrics.
# R² (coefficient of determination): a goodness-of-fit score, not an error,
# even though the printed label below says "error".
score1=metrics.r2_score(Y_train,train_predict)
# Mean absolute error, in the same units as the 'price' column.
score2=metrics.mean_absolute_error(Y_train,train_predict)
print("R square error:", score1)
print("Mean Absolute error:", score2)

R square error: 0.9966894358919306
Mean Absolute error: 1.2351326518853163


In [22]:
# Predictions on the held-out test split.
test_predict=model.predict(X_test)


In [23]:
# Test-set metrics: R² score and mean absolute error on the held-out rows.
# These reuse (and overwrite) the score1 / score2 names from the training
# metrics cell. The "R square error" label actually reports the R² score.
score1=metrics.r2_score(Y_test,test_predict)
score2=metrics.mean_absolute_error(Y_test,test_predict)
print("R square error:", score1)
print("Mean Absolute error:", score2)

R square error: 0.3612000667084696
Mean Absolute error: 37.666968943546586


In [24]:
# Load the location encoder that was pickled earlier in this notebook,
# replacing the in-memory `label_encoder`.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open('location_label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)



In [27]:
# Sample input for prediction. The keys mirror the training feature columns
# (location, size, total_sqft, bath, balcony) in the same order.
# NOTE(review): label_encoder.transform is expected to raise for a location
# name the encoder was not fitted on; confirm, and guard against it if this
# cell is reused with arbitrary input.
sample_data = {
    'location': label_encoder.transform(['Bisuvanahalli'])[0],
    'size': 3,
    'total_sqft': 1180,
    'bath': 3,
    'balcony': 2
}


# Wrap the single record in a one-row DataFrame so the column names line up
# with the frame the model was trained on.
sample_df = pd.DataFrame([sample_data])

# Predict the price and print the first (only) prediction.
# NOTE(review): the value is printed with a rupee sign exactly as returned by
# the model; the unit of the 'price' column is not established in this
# notebook. Confirm it before presenting this as an amount in rupees.
predicted_price = model.predict(sample_df)
print(f"Predicted house price: ₹{predicted_price[0]:,.2f}")


Predicted house price: ₹48.00
