In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("Bengaluru_House_Data.csv")

In [3]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.drop(columns=["area_type","availability","society"],inplace=True)

In [5]:
data.shape

(13320, 6)

In [6]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [7]:
data.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [8]:
data.dropna(inplace=True)

In [9]:
data.shape

(12710, 6)

In [10]:
data["location"].value_counts()

location
Whitefield            514
Sarjapur  Road        372
Electronic City       300
Kanakpura Road        261
Thanisandra           231
                     ... 
Milk Colony             1
Sundara Nagar           1
Jaladarsini Layout      1
Madanayakahalli         1
Abshot Layout           1
Name: count, Length: 1265, dtype: int64

In [11]:
data["location"] = data["location"].apply(lambda x: x.strip())

In [12]:
data["location"].value_counts()

location
Whitefield          515
Sarjapur  Road      372
Electronic City     302
Kanakpura Road      261
Thanisandra         234
                   ... 
Shirdi Sai Nagar      1
S R Layout            1
Meenakshi Layout      1
Vidyapeeta            1
Abshot Layout         1
Name: count, Length: 1254, dtype: int64

In [13]:
location_stats = data.groupby("location")["location"].agg("count").sort_values(ascending=False)

In [14]:
location_stats

location
Whitefield              515
Sarjapur  Road          372
Electronic City         302
Kanakpura Road          261
Thanisandra             234
                       ... 
Kanakapura  Rod           1
Kanakapura Main Road      1
Kanakapura Road           1
Kanakapura Road,          1
whitefiled                1
Name: location, Length: 1254, dtype: int64

In [15]:
locations_less_than_10_entries = location_stats[location_stats <= 10]

In [16]:
locations_less_than_10_entries

location
1st Block Koramangala    10
Kalkere                  10
Basapura                 10
Kodigehalli              10
Gunjur Palya             10
                         ..
Kanakapura  Rod           1
Kanakapura Main Road      1
Kanakapura Road           1
Kanakapura Road,          1
whitefiled                1
Name: location, Length: 1017, dtype: int64

In [17]:
data["location"] = data["location"].apply(lambda x: "other" if x in locations_less_than_10_entries else x)

In [18]:
data["location"].value_counts()

location
other                        2739
Whitefield                    515
Sarjapur  Road                372
Electronic City               302
Kanakpura Road                261
                             ... 
Marsur                         11
LB Shastri Nagar               11
2nd Phase Judicial Layout      11
ISRO Layout                    11
Vishveshwarya Layout           11
Name: count, Length: 238, dtype: int64

In [19]:
data.sample(5)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
8236,Budigere,1 BHK,664 - 722,1.0,0.0,27.375
8377,Badavala Nagar,3 BHK,1842,3.0,2.0,115.0
2793,Old Madras Road,2 BHK,1225,2.0,1.0,53.55
5179,Rachenahalli,1 BHK,660 - 700,1.0,0.0,32.64
9216,Yelahanka,3 BHK,1780,3.0,2.0,107.0


In [20]:
data["size"].value_counts()

size
2 BHK         5152
3 BHK         4128
4 Bedroom      749
1 BHK          530
3 Bedroom      527
4 BHK          489
2 Bedroom      328
5 Bedroom      263
6 Bedroom      169
1 Bedroom      105
7 Bedroom       69
8 Bedroom       65
5 BHK           36
9 Bedroom       29
6 BHK           23
7 BHK           16
1 RK            13
9 BHK            5
8 BHK            3
10 Bedroom       3
11 Bedroom       2
11 BHK           1
27 BHK           1
43 Bedroom       1
14 BHK           1
12 Bedroom       1
13 BHK           1
Name: count, dtype: int64

In [21]:
data["bedroom"] = data["size"].apply(lambda x: int(x.split(" ")[0]))

In [22]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedroom
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [23]:
data["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [24]:
def clean(sqft):
    tokens = sqft.split("-")
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2
    else:
        try:
            return float(sqft)
        except:
            return None

In [25]:
clean("100-200")

150.0

In [26]:
data["total_sqft"] = data["total_sqft"].apply(clean)

In [27]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedroom
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4


In [28]:
data["total_sqft"].value_counts()

total_sqft
1200.0    788
1100.0    218
1500.0    198
2400.0    177
600.0     172
         ... 
2435.0      1
2424.0      1
2863.0      1
3680.0      1
4689.0      1
Name: count, Length: 1886, dtype: int64

In [29]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bedroom
count,12668.0,12710.0,12710.0,12710.0,12710.0
mean,1511.835167,2.617309,1.584343,106.060778,2.737136
std,1162.097276,1.226,0.817287,131.766089,1.205097
min,5.0,1.0,0.0,8.0,1.0
25%,1100.0,2.0,1.0,49.03,2.0
50%,1260.0,2.0,2.0,70.0,3.0
75%,1640.0,3.0,2.0,115.0,3.0
max,52272.0,40.0,3.0,2912.0,43.0


In [30]:
data.dropna(inplace=True)

In [31]:
data.shape

(12668, 7)

In [32]:
data["sqft_per_bed"] = data["total_sqft"]/data["bedroom"]

In [33]:
data["sqft_per_bed"].describe()

count    12668.000000
mean       570.060291
std        380.298999
min          0.714286
25%        473.333333
50%        550.000000
75%        622.500000
max      26136.000000
Name: sqft_per_bed, dtype: float64

In [34]:
data2 = data[data["sqft_per_bed"] >= 300]

In [35]:
data2

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedroom,sqft_per_bed
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,528.000000
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4,650.000000
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3,480.000000
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3,507.000000
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2,600.000000
...,...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3,571.666667
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5,690.600000
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2,570.500000
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4,1172.250000


In [36]:
data2["price_per_sqft"] = round(data2["price"]*100000/data2["total_sqft"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2["price_per_sqft"] = round(data2["price"]*100000/data2["total_sqft"])


In [37]:
data2

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedroom,sqft_per_bed,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,528.000000,3700.0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4,650.000000,4615.0
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3,480.000000,4306.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3,507.000000,6246.0
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2,600.000000,4250.0
...,...,...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3,571.666667,6531.0
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5,690.600000,6690.0
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2,570.500000,5259.0
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4,1172.250000,10407.0


In [38]:
data2["price_per_sqft"].describe()

count     12013.000000
mean       6206.079081
std        3985.524622
min         268.000000
25%        4199.000000
50%        5253.000000
75%        6824.000000
max      176471.000000
Name: price_per_sqft, dtype: float64

In [39]:
data3 = data2[data2["price_per_sqft"] >= 2000]

In [40]:
data3

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedroom,sqft_per_bed,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,528.000000,3700.0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4,650.000000,4615.0
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3,480.000000,4306.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3,507.000000,6246.0
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2,600.000000,4250.0
...,...,...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3,571.666667,6531.0
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5,690.600000,6690.0
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2,570.500000,5259.0
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4,1172.250000,10407.0


In [41]:
data3.drop(columns=["size","sqft_per_bed","price_per_sqft"],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data3.drop(columns=["size","sqft_per_bed","price_per_sqft"],inplace=True)


In [42]:
data3

Unnamed: 0,location,total_sqft,bath,balcony,price,bedroom
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.00,4
2,Uttarahalli,1440.0,2.0,3.0,62.00,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.00,3
4,Kothanur,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...
13314,Green Glen Layout,1715.0,3.0,3.0,112.00,3
13315,Whitefield,3453.0,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,1141.0,2.0,1.0,60.00,2
13318,Padmanabhanagar,4689.0,4.0,1.0,488.00,4


In [43]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [44]:
col_trans = make_column_transformer((OneHotEncoder(sparse_output=False),["location"]),remainder="passthrough")

In [45]:
lr = LinearRegression()
# rf = RandomForestRegressor()
scalar = StandardScaler()

In [46]:
model = make_pipeline(col_trans,scalar,lr)

In [47]:
data_input = data3.drop(columns=["price"])
data_output = data3["price"]

In [48]:
X_train,X_test,y_train,y_test = train_test_split(data_input,data_output,test_size=0.2)

In [49]:
X_train.shape

(9588, 5)

In [50]:
model.fit(X_train,y_train)

In [51]:
model.score(X_test,y_test)

0.6482071340425763

In [52]:
input = pd.DataFrame([["Electronic City Phase II",1500.0,3.0,2.0,3]],columns=["location",	"total_sqft",	"bath",	"balcony","bedroom"])

In [53]:
model.predict(input)

array([78.50846629])

In [54]:
import pickle as pk

In [55]:
pk.dump(model,open("House_Price_Price_model.pkl","wb"))

In [56]:
data3.to_csv("Cleaned_data.csv")