In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,30)

In [237]:
df = pd.read_csv('./Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [238]:
df.area_type.unique(), df.area_type.unique().shape

# area_type is a categorical feature

(array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
        'Carpet  Area'], dtype=object),
 (4,))

In [239]:
df.availability.unique().shape

(81,)

In [240]:
df.location.unique().shape

(1306,)

In [241]:
df['size'].unique(), df['size'].unique().shape

(array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
        '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
        '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
        '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
        '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
        '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object),
 (32,))

In [242]:
df.society.unique().shape

(2689,)

In [243]:
df.dtypes

area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object

In [244]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [245]:
(5502/df.shape[0])*100

41.306306306306304

In [246]:
list(df.balcony.unique()), df.balcony.unique().shape[0]

([1.0, 3.0, nan, 2.0, 0.0], 5)

**Fix: bath & balcony column missing values**

In [247]:
df.bath.unique()

array([ 2.,  5.,  3.,  4.,  6.,  1.,  9., nan,  8.,  7., 11., 10., 14.,
       27., 12., 16., 40., 15., 13., 18.])

In [248]:
df['balcony'] = df['balcony'].fillna(df['balcony'].mean())
df['bath'] = df['bath'].fillna(df['bath'].mean())

In [249]:
df['balcony'] = df['balcony'].apply(lambda x: round(x))
df['bath'] = df['bath'].apply(lambda x: round(x))

In [250]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2,1,51.0


In [251]:
df.society.unique()[0:20]

array(['Coomee ', 'Theanmp', nan, 'Soiewre', 'DuenaTa', 'Jaades ',
       'Brway G', 'Prrry M', 'Shncyes', 'Skityer', 'PrntaEn', 'Prityel',
       'GrrvaGr', 'PeBayle', 'She 2rk', 'Soitya ', 'Bhe 2ko', 'Itelaa ',
       'ViistLa', 'KBityo '], dtype=object)

**Dropping 'society' column as this column has >40% of missing values and unique values in this column are more**

In [252]:
df.drop('society', axis='columns', inplace=True)

In [253]:
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2,1,51.0


**Fix: total_sqft column**

In [254]:
df.total_sqft.unique()[50:100]

array(['1600', '3010 - 3410', '1500', '1407', '840', '4395', '845',
       '5700', '1160', '3000', '1140', '1220', '1350', '1005', '500',
       '1358', '1569', '1240', '2089', '1206', '1150', '2511', '460',
       '4400', '1660', '2957 - 3450', '1326', '1325', '1499', '1665',
       '708', '1060', '710', '1450', '2894', '1330', '2502', '650',
       '2400', '1007', '966', '1630', '1640', '782', '1260', '1413',
       '1116', '1530', '3700', '2497'], dtype=object)

In [255]:
df.total_sqft = [ (float(x[0])+float(x[1]))/2 if len(x)==2 else x[0] for x in df.total_sqft.str.split(' - ')]

In [256]:
df.total_sqft = [ float(str(x).split('Sq. Meter')[0])*10.7639 if 'Sq. Meter' in str(x) else x for x in df.total_sqft]
df.total_sqft = [ float(str(x).split('Perch')[0])*272.25 if 'Perch' in str(x) else x for x in df.total_sqft]
df.total_sqft = [ float(str(x).split('Sq. Yards')[0])*9 if 'Sq. Yards' in str(x) else x for x in df.total_sqft]
df.total_sqft = [ float(str(x).split('Acres')[0])*43560 if 'Acres' in str(x) else x for x in df.total_sqft]
df.total_sqft = [ float(str(x).split('Cents')[0])*435.6 if 'Cents' in str(x) else x for x in df.total_sqft]
df.total_sqft = [ float(str(x).split('Guntha')[0])*1089 if 'Guntha' in str(x) else x for x in df.total_sqft]
df.total_sqft = [ float(str(x).split('Grounds')[0])*2400 if 'Grounds' in str(x) else x for x in df.total_sqft]

In [257]:
df.total_sqft = df.total_sqft.apply(lambda x: float(x))

In [258]:
df.dtypes

area_type        object
availability     object
location         object
size             object
total_sqft      float64
bath              int64
balcony           int64
price           float64
dtype: object

**Fix: size column**

In [259]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [260]:
df_size_notnull = df[~df['size'].isnull()]

In [261]:
df_size_notnull['size'] = df_size_notnull['size'].apply(lambda x: int(str(x).split(' ')[0]))
df_size_notnull.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,1056.0,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,2600.0,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,1440.0,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,1521.0,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2,1200.0,2,1,51.0


In [262]:
df_size_notnull['size'].median()

3.0

In [263]:
df['size'] = df['size'].fillna(df_size_notnull['size'].median())

In [264]:
df['size'] = df['size'].apply(lambda x: str(x).split(' ')[0])

In [265]:
df['size'] = df['size'].apply(lambda x: int(float(x)))

In [266]:
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,1056.0,2,1,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,2600.0,5,3,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,1440.0,2,3,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,1521.0,3,1,95.0
4,Super built-up Area,Ready To Move,Kothanur,2,1200.0,2,1,51.0


In [267]:
df.dtypes

area_type        object
availability     object
location         object
size              int64
total_sqft      float64
bath              int64
balcony           int64
price           float64
dtype: object

In [268]:
df['size'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

**Fix: availability column**

In [269]:
df.availability.unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

###### i'm dropping the availability column as i'm not aware of how to deal with date columns

In [270]:
df.drop('availability', axis='columns', inplace=True)

In [271]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2,1056.0,2,1,39.07
1,Plot Area,Chikka Tirupathi,4,2600.0,5,3,120.0
2,Built-up Area,Uttarahalli,3,1440.0,2,3,62.0
3,Super built-up Area,Lingadheeranahalli,3,1521.0,3,1,95.0
4,Super built-up Area,Kothanur,2,1200.0,2,1,51.0


In [322]:
df.dtypes

area_type      object
location       object
size            int64
total_sqft    float64
bath            int64
balcony         int64
price         float64
dtype: object

**Fix: moving on to other columns**

In [273]:
df.isnull().sum()

area_type     0
location      1
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [274]:
df[df.location.isnull()]

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
568,Super built-up Area,,3,1600.0,3,2,86.0


In [275]:
df.groupby('location')['location'].count().sort_values(ascending=False)

location
Whitefield                              540
Sarjapur  Road                          399
Electronic City                         302
Kanakpura Road                          273
Thanisandra                             234
                                       ... 
Kodanda Reddy Layout                      1
Kirloskar layout, Basaveshwarnagar        1
Kengeri Satellite Town Stage II           1
Kengeri Satellite Town KHB Apartment      1
 Anekal                                   1
Name: location, Length: 1305, dtype: int64

In [291]:
df[df.location.isnull()]

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
568,Super built-up Area,,3,1600.0,3,2,86.0


In [282]:
df.location.value_counts()

Whitefield                540
Sarjapur  Road            399
Electronic City           302
Kanakpura Road            273
Thanisandra               234
                         ... 
Chuchangatta Colony         1
Bande Nallasandra           1
Banashankari 6th Stage      1
Sabari Nagar                1
Pillanna Gardens            1
Name: location, Length: 1305, dtype: int64

In [294]:
df.loc[568,'location'] = 'Whitefield'
df.location.value_counts()

Whitefield                541
Sarjapur  Road            399
Electronic City           302
Kanakpura Road            273
Thanisandra               234
                         ... 
Chuchangatta Colony         1
Bande Nallasandra           1
Banashankari 6th Stage      1
Sabari Nagar                1
Pillanna Gardens            1
Name: location, Length: 1305, dtype: int64

In [295]:
df.shape

(13320, 7)

In [316]:
df_location_stats = df.groupby('location')['location'].count().sort_values(ascending=False)
df_location_stats

location
Whitefield                              541
Sarjapur  Road                          399
Electronic City                         302
Kanakpura Road                          273
Thanisandra                             234
                                       ... 
Kodanda Reddy Layout                      1
Kirloskar layout, Basaveshwarnagar        1
Kengeri Satellite Town Stage II           1
Kengeri Satellite Town KHB Apartment      1
 Anekal                                   1
Name: location, Length: 1305, dtype: int64

###### Categorizing all the location whose count is less 10 to a single category

In [317]:
len(df_location_stats<=10)

1305

In [318]:
df_location_count_less_than_10 = df_location_stats[df_location_stats <=10]
df_location_count_less_than_10

location
BTM 1st Stage                           10
Basapura                                10
Ganga Nagar                             10
1st Block Koramangala                   10
Dodsworth Layout                        10
                                        ..
Kodanda Reddy Layout                     1
Kirloskar layout, Basaveshwarnagar       1
Kengeri Satellite Town Stage II          1
Kengeri Satellite Town KHB Apartment     1
 Anekal                                  1
Name: location, Length: 1064, dtype: int64

In [319]:
df.location = df.location.apply(lambda x : 'other' if x in df_location_count_less_than_10 else x)

In [320]:
df.location.unique().shape

(242,)

In [321]:
df.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2,1056.0,2,1,39.07
1,Plot Area,Chikka Tirupathi,4,2600.0,5,3,120.0
2,Built-up Area,Uttarahalli,3,1440.0,2,3,62.0
3,Super built-up Area,Lingadheeranahalli,3,1521.0,3,1,95.0
4,Super built-up Area,Kothanur,2,1200.0,2,1,51.0
