In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
# default figure size, as (width, height) in inches, for every plot drawn below
plt.rcParams["figure.figsize"] = (7, 3)

import warnings
# NOTE(review): this silences *every* warning for the whole session, including
# pandas deprecation and chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# read the Bengaluru house price listings into a dataframe
dataset_path = "./../datasets/bengaluru_house_prices.csv"
df = pd.read_csv(dataset_path)

# report the dimensions as (rows, columns)
print(df.shape)

# preview the first five records
df.head(5)

(13320, 9)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


## 1. Data Cleaning

In [3]:
# how many rows fall under each distinct area_type value
df.groupby("area_type")["area_type"].count()

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [4]:
df["area_type"].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [5]:
# keep only the features used further on; the dropped columns are left out of the new dataframe
unused_columns = ['area_type', 'availability', 'society', 'balcony']
new_df = df.drop(columns=unused_columns)

# confirm the reduced (rows, columns) shape
print(new_df.shape)

# preview the first five records
new_df.head(5)

(13320, 5)


Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [6]:
# number of missing values per column
new_df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [7]:
# drop the rows that contain null values; only a small number of rows are affected,
# so the effect on the dataset as a whole is small.
# Reassign instead of using inplace=True so the cell reads as "input -> output"
# and does not silently mutate a frame that an earlier cell displayed.
new_df = new_df.dropna()

# check the total rows after dropping rows with null values
new_df.shape

(13246, 5)

In [16]:
# distinct labels present in the 'size' column
pd.unique(new_df["size"])

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [23]:
# derive the numeric 'bhk' column from the leading token of 'size' (e.g. '4 Bedroom' -> 4)
leading_token = new_df['size'].str.split(' ').str[0]
new_df['bhk'] = leading_token.astype('int64')

# sorted distinct bhk values
np.sort(new_df['bhk'].unique())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 18, 19,
       27, 43], dtype=int64)

In [19]:
# inspect the rows that report more than 20 bedrooms
more_than_20_bhk = new_df['bhk'].gt(20)
new_df.loc[more_than_20_bhk]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [37]:
# function to check if value is float
def is_float(x):
    """Tell whether ``x`` can be converted by the built-in ``float()``.

    Parameters
    ----------
    x : object
        Value to test, e.g. a raw ``total_sqft`` entry such as ``"1056"``
        or ``"2100 - 2850"``.

    Returns
    -------
    bool
        ``True`` if ``float(x)`` succeeds, ``False`` otherwise.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: text that is not a single number (ranges, unit suffixes, ...).
        # TypeError: objects float() does not accept at all, such as None.
        return False
    return True

In [44]:
# preview the first ten rows whose total_sqft cannot be parsed as a single float
not_a_plain_number = ~new_df['total_sqft'].apply(is_float)
new_df[not_a_plain_number].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [43]:
# number of total_sqft entries that are not parseable as a single float
int((~new_df['total_sqft'].apply(is_float)).sum())

190

In [53]:
# to convert range value such as 2000-3100 to a floating point by taking average through function
def to_float(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : object
        A plain number (``"1056"``, ``1056``) or a two-sided range written
        with a dash (``"2100 - 2850"``).

    Returns
    -------
    float or None
        The number itself, the midpoint of the range, or ``None`` when the
        value fits neither form (e.g. ``"34.46Sq. Meter"``). This function
        does not raise for unparseable input.
    """
    # Try the plain-number form first so that values containing a dash which
    # are still valid floats ("-5", "1e-3") are not mistaken for ranges.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only text can describe a range; anything else is unparseable.
    if not isinstance(x, str):
        return None

    bounds = x.split("-")
    if len(bounds) != 2:
        return None
    try:
        return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
        # one side of the dash is not a number, e.g. "abc - def"
        return None

In [55]:
# new dataframe whose total_sqft is numeric (ranges averaged, unparseable entries left missing)
numeric_total_sqft = new_df['total_sqft'].map(to_float)
new_df1 = new_df.assign(total_sqft=numeric_total_sqft)

In [56]:
new_df1.head(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


## 2. Feature Engineering

In [66]:
# add a price-per-square-foot column on a fresh copy of the cleaned data
# NOTE(review): the 100000 factor presumably converts a price quoted in lakhs to rupees — confirm
new_df2 = new_df1.assign(
    price_per_sqft=new_df1['price'] * 100000 / new_df1['total_sqft']
)
new_df2.head(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [68]:
new_df2['location'].nunique(dropna=False)

1304

The output above shows that there are 1304 unique locations. That is quite a lot: transforming all of them into dummy variables would be unwieldy, so we must find a way to remove some of them, namely the locations that have only 1 or 2 rows, or otherwise very little data.

In [69]:
# remove any whitespace at the beginning and end of each location name, if present
new_df2['location'] = new_df2['location'].str.strip()

In [75]:
# number of rows per location, most frequent first
rows_per_location = new_df2.groupby('location')['location'].count()
statistics_locations = rows_per_location.sort_values(ascending=False)
statistics_locations

location
Whitefield               535
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           266
Thanisandra              236
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Karnataka Shabarimala      1
whitefiled                 1
Name: location, Length: 1293, dtype: int64

In [82]:
# locations that have fewer than 11 rows of data
# (the former bare `len(...)` line was removed: its value was discarded, because a
# cell only displays its last expression; the displayed Series already reports its length)
location_less_than_11 = statistics_locations[statistics_locations < 11]
location_less_than_11

location
Basapura                 10
1st Block Koramangala    10
Gunjur Palya             10
Kalkere                  10
Sector 1 HSR Layout      10
                         ..
1 Giri Nagar              1
Kanakapura Road,          1
Kanakapura main  Road     1
Karnataka Shabarimala     1
whitefiled                1
Name: location, Length: 1052, dtype: int64

In [84]:
new_df2['location'].nunique(dropna=False)

1293

In [85]:
# relabel every sparsely represented location as 'other'; the rare location names are the
# index labels of location_less_than_11 (its values are the per-location row counts)
is_rare_location = new_df2['location'].isin(location_less_than_11.index)
new_df2['location'] = new_df2['location'].where(~is_rare_location, 'other')
new_df2['location'].nunique(dropna=False)

242