In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [65]:
# Load the raw housing CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df1= pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [66]:
# Count the missing values in each column of the raw frame.
df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [67]:
# Build the working frame from the raw data:
#   1. drop the columns that are not used further (area_type, availability, society);
#   2. drop every row that still contains a missing value -- the author judged the
#      affected rows to be only about 3 to 5 percent of the data;
#   3. renumber the index so it is contiguous again.
# df1 itself is left untouched.

df2 = (
    df1
    .drop(columns=['area_type', 'availability', 'society'])
    .dropna()
    .reset_index(drop=True)
)
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0
5,Whitefield,2 BHK,1170,2.0,1.0,38.0
6,Marathahalli,3 BHK,1310,3.0,1.0,63.25
7,Whitefield,3 BHK,1800,2.0,2.0,70.0
8,Whitefield,4 Bedroom,2785,5.0,3.0,295.0
9,7th Phase JP Nagar,2 BHK,1000,2.0,1.0,38.0


In [68]:
# Preprocessing the size column:
# list the distinct raw labels to see which formats have to be parsed.

df2['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [69]:
# Convert the size column from labels such as "2 BHK" / "4 Bedroom" to the
# leading number, stored as a float.
#
# The values are first cast to str and split on any whitespace, which makes the
# cell safe to re-run: an already converted value such as 2.0 becomes "2.0" and
# parses back to 2.0 instead of failing on float.split. Splitting on arbitrary
# whitespace also tolerates labels with leading or repeated spaces.

df2['size'] = (
    df2['size']
    .astype(str)
    .str.split()
    .str[0]
    .astype(float)
)
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2.0,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4.0,2600,5.0,3.0,120.0
2,Uttarahalli,3.0,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3.0,1521,3.0,1.0,95.0
4,Kothanur,2.0,1200,2.0,1.0,51.0
5,Whitefield,2.0,1170,2.0,1.0,38.0
6,Marathahalli,3.0,1310,3.0,1.0,63.25
7,Whitefield,3.0,1800,2.0,2.0,70.0
8,Whitefield,4.0,2785,5.0,3.0,295.0
9,7th Phase JP Nagar,2.0,1000,2.0,1.0,38.0


In [70]:
# preprocessing total_sqft attribute
# analysing what types of values are there and displaying only range like values

def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Parameters
    ----------
    x : object
        Value to probe, typically a raw ``total_sqft`` string.

    Returns
    -------
    bool
        False when ``float(x)`` raises TypeError, ValueError or OverflowError
        (e.g. "2100 - 2850", "34.46Sq. Meter", None); True otherwise.
    """
    try:
        float(x)
    # Only the errors float() raises for unparseable input are treated as
    # "not a float"; KeyboardInterrupt / SystemExit must still propagate.
    except (TypeError, ValueError, OverflowError):
        return False

    return True

# `~` inverts the boolean mask returned by apply(is_float), so this selects only
# the rows whose total_sqft cannot be parsed by float() (e.g. "2100 - 2850").

df2[~df2['total_sqft'].apply(is_float)]

Unnamed: 0,location,size,total_sqft,bath,balcony,price
27,Yelahanka,4.0,2100 - 2850,4.0,0.0,186.000
114,Hebbal,4.0,3067 - 8156,4.0,0.0,477.000
129,8th Phase JP Nagar,2.0,1042 - 1105,2.0,0.0,54.005
153,Sarjapur,2.0,1145 - 1340,2.0,0.0,43.490
176,KR Puram,2.0,1015 - 1540,2.0,0.0,56.800
...,...,...,...,...,...,...
12384,Whitefield,2.0,850 - 1060,2.0,0.0,38.190
12399,Talaghattapura,3.0,1804 - 2273,3.0,0.0,122.000
12466,Harlur,2.0,1200 - 1470,2.0,0.0,72.760
12660,Hoodi,2.0,1133 - 1384,2.0,0.0,59.135


In [71]:
# converting range like values in string to numeric values
# range value is converted to mean value

# values with different units (eg: "50 sq meters") converted to none so that
# later it is removed from the dataframe

def convertSqftToNum(x):
    """Convert a raw ``total_sqft`` entry to a float, or None if it cannot be parsed.

    Parameters
    ----------
    x : object
        Raw cell value. Usually a string, but numeric values are accepted too.

    Returns
    -------
    float or None
        * a plain number ("1056", 1056, 1056.0) is returned as a float;
        * a range "low - high" is replaced by the mean of its two bounds;
        * anything else (e.g. a value carrying a unit such as "34.46Sq. Meter",
          a malformed range, or None) yields None so the row can be removed
          afterwards with dropna().
    """
    # Plain numbers first. This also covers values that are already numeric,
    # so applying the function to an already converted column does not fail.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can still be a "low - high" range.
    if not isinstance(x, str):
        return None

    bounds = x.split('-')
    if len(bounds) != 2:
        return None

    # A malformed range ("abc - def", "1200 - ") is unparseable, not an error.
    try:
        low, high = float(bounds[0]), float(bounds[1])
    except ValueError:
        return None
    return (low + high) / 2
    
# Parse every total_sqft entry; entries that cannot be parsed come back as None.
df2['total_sqft'] = df2['total_sqft'].apply(convertSqftToNum)

# Discard the rows left without a value and renumber the index.
df2 = df2.dropna().reset_index(drop=True)
df2.head(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2.0,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4.0,2600.0,5.0,3.0,120.0
2,Uttarahalli,3.0,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0
4,Kothanur,2.0,1200.0,2.0,1.0,51.0
5,Whitefield,2.0,1170.0,2.0,1.0,38.0
6,Marathahalli,3.0,1310.0,3.0,1.0,63.25
7,Whitefield,3.0,1800.0,2.0,2.0,70.0
8,Whitefield,4.0,2785.0,5.0,3.0,295.0
9,7th Phase JP Nagar,2.0,1000.0,2.0,1.0,38.0


In [72]:
# df2["price_per_sqft"]=df2["price"]*100000 / df2["total_sqft"]
# df2

In [75]:
# Group by location and count how many rows each location has.
# The result is a Series indexed by location name; check_location looks names
# up in it when rare locations are merged.
# NOTE(review): the counts are taken on the location strings exactly as they
# are stored here, so names that differ only by surrounding whitespace are
# counted as separate locations.

location_count=df2.groupby("location")["location"].count()
location_count

location
1 Annasandrapalya                                  1
1 Giri Nagar                                       1
1 Ramamurthy Nagar                                 1
12th cross srinivas nagar banshankari 3rd stage    1
1A Block Koramangala                               1
                                                  ..
south                                              2
t.c palya                                          1
tc.palya                                           4
vinayakanagar                                      1
whitefiled                                         1
Name: location, Length: 1248, dtype: int64

In [78]:
# There are too many unique values in location, which is impractical for
# encoding methods such as one-hot encoding.

# So the check_location function below reduces the number of categories: every
# location that has at most 10 data points is relabelled as 'others'.

def check_location(x, counts=None, threshold=10):
    """Return ``x`` if the location is frequent enough, otherwise 'others'.

    Parameters
    ----------
    x : str
        Location name.
    counts : dict or pandas.Series, optional
        Number of rows per location name. When omitted, the notebook-level
        ``location_count`` is used (looked up at call time).
    threshold : int, optional
        A location is kept only if its count is strictly greater than this
        value. Defaults to 10.

    Returns
    -------
    str
        ``x`` unchanged, or 'others' when the location has ``threshold`` or
        fewer rows. A name missing from ``counts`` is treated as having zero
        rows and therefore also becomes 'others'.
    """
    if counts is None:
        counts = location_count
    # .get exists on both dict and pandas.Series and avoids a KeyError for
    # names that are not present in the counts.
    if counts.get(x, 0) <= threshold:
        return 'others'
    return x
    
# Normalise the names first and only then count the rows per location, so that
# names differing only by surrounding whitespace are counted together and every
# name passed to check_location is present in location_count.
df2['location'] = df2['location'].str.strip()
location_count = df2.groupby("location")["location"].count()

# Merge the rare locations into 'others'.
df2['location'] = df2['location'].apply(check_location)

# Recount after merging to show the reduced set of categories.
location_count = df2.groupby("location")["price"].count()
location_count

location
1st Block Jayanagar            12
1st Phase JP Nagar             21
2nd Phase Judicial Layout      11
2nd Stage Nagarbhavi           23
5th Phase JP Nagar             37
                             ... 
Yelahanka                     206
Yelahanka New Town             40
Yelenahalli                    12
Yeshwanthpur                   78
others                       2741
Name: price, Length: 236, dtype: int64

In [80]:
# Preview the first rows of the frame after the location grouping step.
df2.head(20)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2.0,1056.0,2.0,1.0,39.07
1,Chikka Tirupathi,4.0,2600.0,5.0,3.0,120.0
2,Uttarahalli,3.0,1440.0,2.0,3.0,62.0
3,Lingadheeranahalli,3.0,1521.0,3.0,1.0,95.0
4,Kothanur,2.0,1200.0,2.0,1.0,51.0
5,Whitefield,2.0,1170.0,2.0,1.0,38.0
6,Marathahalli,3.0,1310.0,3.0,1.0,63.25
7,Whitefield,3.0,1800.0,2.0,2.0,70.0
8,Whitefield,4.0,2785.0,5.0,3.0,295.0
9,7th Phase JP Nagar,2.0,1000.0,2.0,1.0,38.0
