## Data science regression technique: Home price prediction

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

In [2]:
# Suppress Warnings for clean notebook
import warnings
warnings.filterwarnings('ignore')

In [3]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df1.shape

(13320, 9)

In [5]:
df1.groupby("area_type")["area_type"].agg("count")

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

### Drop the columns that are not used to train the model

In [6]:
# Keep only the columns used for modelling; the four dropped here are not used later.
df2 = df1.drop(columns=["area_type", "society", "balcony", "availability"])
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


### Data cleansing : Handle the missing values

In [7]:
df2.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [8]:
df2.shape

(13320, 5)

##### Rows with missing values are dropped because their proportion is too small compared to the whole dataset

In [9]:
df3 = df2.dropna()
df3.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [10]:
df3.shape

(13246, 5)

### Feature engineering

#### Add a new integer feature for bhk (Bedroom, Hall, Kitchen)

In [11]:
df3["size"].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [12]:
# Extract the leading integer of "size" (e.g. "2 BHK" -> 2, "4 Bedroom" -> 4).
# assign() returns a new frame instead of writing a column into the result of
# dropna(), which avoids the chained-assignment (SettingWithCopy) warning and
# keeps the cell safe to re-run.
df3 = df3.assign(bhk=df3["size"].apply(lambda x: int(x.split(" ")[0])))
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [13]:
df3.bhk.unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [14]:
df3[df3.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


Some houses have more than 20 bedrooms. We will handle these later. Let's look at another feature.

In [15]:
df3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

Some rows of this feature contain a range (e.g. `1133 - 1384`), and the column's data type is object rather than float.

In [16]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as KeyboardInterrupt are allowed to propagate.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparseable text;
        # OverflowError: integer too large to be represented as a float.
        return False
    return True

In [17]:
df3[~df3["total_sqft"].apply(is_float)]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.000,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.000,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.490,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.800,2
...,...,...,...,...,...,...
12975,Whitefield,2 BHK,850 - 1060,2.0,38.190,2
12990,Talaghattapura,3 BHK,1804 - 2273,3.0,122.000,3
13059,Harlur,2 BHK,1200 - 1470,2.0,72.760,2
13265,Hoodi,2 BHK,1133 - 1384,2.0,59.135,2


After applying the `is_float` function row by row, 190 rows are not floats.

In [18]:
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        A plain number (``'2100'``), a range of two numbers separated by a
        hyphen (``'2100 - 2200'``), or anything else.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted (e.g. ``'300 sqft'``).
    """
    # Plain numbers (including already-numeric input) are the common case.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can hold a "low - high" range.
    if not isinstance(x, str):
        return None

    token = x.split("-")
    if len(token) == 2:
        try:
            return (float(token[0]) + float(token[1])) / 2
        except ValueError:
            # Range-like text whose parts are not numeric, e.g. with unit suffixes.
            return None
    return None

In [19]:
convert_sqft_to_num('2100')

2100.0

In [20]:
convert_sqft_to_num('2100 - 2200')

2150.0

In [21]:
convert_sqft_to_num('300 sqft')

This function converts the data as expected; unparseable input such as `'300 sqft'` yields `None`, so the cell above shows no output.

In [22]:
# Build df4 as a copy of df3 whose total_sqft entries are parsed by
# convert_sqft_to_num (ranges become their midpoint, unparseable entries become missing).
df4 = df3.assign(total_sqft=df3["total_sqft"].apply(convert_sqft_to_num))
df4.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [23]:
df4.isna().sum()

location       0
size           0
total_sqft    46
bath           0
price          0
bhk            0
dtype: int64

In [24]:
df4.loc[648]

location        Arekere
size          9 Bedroom
total_sqft          NaN
bath                9.0
price             265.0
bhk                   9
Name: 648, dtype: object

In [25]:
# Keep only the rows whose total_sqft was successfully converted.
df4 = df4.dropna(subset=["total_sqft"])

In [26]:
df4.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

The 46 rows with a missing `total_sqft` are dropped to make the data cleaner.

#### Create new feature called price per sqft 

In [27]:
df5 = df4.copy()
# Price per square foot: price scaled by a fixed factor of 10000, divided by area.
# NOTE(review): the original comment called this a rupee-to-dollar conversion, but a
# constant 10000 multiplier does not look like an exchange rate — confirm the unit of
# `price` and the intended unit of `price_per_sqft`.
df5["price_per_sqft"] = df5["price"]*10000/df5["total_sqft"]
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,369.981061
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,461.538462
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,430.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,624.589086
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,425.0


In [28]:
df5["price_per_sqft"].describe()

count    1.320000e+04
mean     7.920759e+02
std      1.067272e+04
min      2.678298e+01
25%      4.267701e+02
50%      5.438331e+02
75%      7.317073e+02
max      1.200000e+06
Name: price_per_sqft, dtype: float64

#### Now let's handle the location data

In [29]:
# Trim surrounding whitespace so that differently padded spellings of one
# location collapse into a single value, then count the distinct locations.
df5["location"] = df5["location"].str.strip()
len(df5["location"].unique())

1287

There are more than 1000 locations, which is too many. A dimensionality reduction technique is needed.

In [30]:
location_stats = df5['location'].value_counts(ascending=False)
location_stats

Whitefield                   533
Sarjapur  Road               392
Electronic City              304
Kanakpura Road               264
Thanisandra                  235
                            ... 
Rajanna Layout                 1
Subramanyanagar                1
Lakshmipura Vidyaanyapura      1
Malur Hosur Road               1
Abshot Layout                  1
Name: location, Length: 1287, dtype: int64

In [31]:
len(location_stats[location_stats <= 10])

1047

In [32]:
location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10

BTM 1st Stage                10
Gunjur Palya                 10
Nagappa Reddy Layout         10
Sector 1 HSR Layout          10
Thyagaraja Nagar             10
                             ..
Rajanna Layout                1
Subramanyanagar               1
Lakshmipura Vidyaanyapura     1
Malur Hosur Road              1
Abshot Layout                 1
Name: location, Length: 1047, dtype: int64

In [33]:
# Collapse every location listed in location_stats_less_than_10 (the Series index
# holds the location names) into a single "other" bucket, then count what remains.
df5["location"] = df5["location"].where(
    ~df5["location"].isin(location_stats_less_than_10.index), "other"
)
len(df5["location"].unique())

241

In [34]:
df5.sample(n=10, random_state=1)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
8399,Hennur,2 BHK,960.0,2.0,48.0,2,500.0
5779,Yelahanka,4 Bedroom,1800.0,4.0,180.0,4,1000.0
11025,Electronic City,2 BHK,1000.0,2.0,29.95,2,299.5
12722,Whitefield,3 BHK,1740.0,3.0,85.0,3,488.505747
8599,LB Shastri Nagar,2 BHK,1250.0,2.0,48.5,2,388.0
2895,Raja Rajeshwari Nagar,3 BHK,1580.0,3.0,53.56,3,338.987342
347,Bommasandra,3 BHK,1260.0,3.0,49.36,3,391.746032
12699,Yelahanka,1 BHK,602.0,2.0,30.0,1,498.33887
9833,other,3 BHK,3500.0,4.0,500.0,3,1428.571429
3939,Thigalarapalya,2 BHK,1418.0,2.0,103.0,2,726.375176


Here, some values in the location column are replaced by 'other' because those locations appear fewer than 11 times.