In [1]:
import numpy as np
import pandas as pd
from pandas import value_counts
from sklearn.compose import make_column_transformer

In [2]:
# Load the raw Bengaluru house-price data; the path is relative to the
# notebook's working directory.
df = pd.read_csv('dataset/Bengaluru_House_Data.csv')

In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.shape

(13320, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
# Print the value frequencies of every column, each followed by a dashed rule.
rule = '-' * 30
for col_name in df.columns:
    counts = df[col_name].value_counts()
    print(counts)
    print(rule)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
------------------------------
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
------------------------------
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
---------------

In [8]:
# These three columns are not used as model features, so discard them.
unused_columns = ['availability', 'area_type', 'society']
df = df.drop(columns=unused_columns)

In [9]:
df['balcony'].mode()

0    2.0
Name: balcony, dtype: float64

In [10]:
# Impute missing balcony counts with the most frequent value. The mode is
# computed from the data rather than hard-coded so the cell stays correct if
# the dataset changes.
balcony_mode = df['balcony'].mode().iloc[0]
df['balcony'] = df['balcony'].fillna(balcony_mode)

In [11]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
balcony        0
price          0
dtype: int64

In [12]:
# Drop every row that still has a missing value, then confirm no nulls remain.
df.dropna(inplace = True)
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [13]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13246.0,13246.0,13246.0
mean,2.692586,1.601163,112.389392
std,1.341506,0.804759,149.076587
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [14]:
# The leading token of `size` (e.g. "2 BHK", "4 Bedroom") is the room count.
size_tokens = df['size'].str.split()
df['bhk'] = size_tokens.str[0].astype(int)

In [15]:
df.drop(columns=['size'], inplace=True)

In [16]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      shape=(2067,), dtype=object)

In [17]:
def convertRange(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Accepts a plain number (``'1056'``) or a range written as
    ``'low - high'`` (``'1133 - 1384'``), for which the midpoint is returned.

    Parameters
    ----------
    x : str or number
        The raw cell value.

    Returns
    -------
    float or None
        The parsed value, or None when the entry cannot be interpreted
        (e.g. ``'34.46Sq. Meter'``, a malformed range, or a missing value).
    """
    # Try the plain-number case first: this also covers values such as '-5'
    # or '1e-3' that contain a hyphen but are not ranges.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can still be a 'low - high' range.
    if not isinstance(x, str):
        return None

    temp = x.split('-')
    if len(temp) != 2:
        return None
    try:
        return (float(temp[0]) + float(temp[1])) / 2
    except ValueError:
        # One side of the range is not numeric.
        return None

In [18]:
df['total_sqft'] = df['total_sqft'].apply(convertRange)

In [19]:
df.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Kothanur,1200.0,2.0,1.0,51.0,2


In [20]:
# Derived feature used only for outlier filtering (it is dropped again before
# modelling). The 100000 multiplier presumably converts `price` from lakhs to
# rupees — TODO confirm units.
df['price_per_sqft'] = df['price']*100000/df['total_sqft']

In [21]:
df[('price_per_sqft')]

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13246, dtype: float64

In [22]:
df['location'].value_counts()

location
Whitefield           534
Sarjapur  Road       392
Electronic City      302
Kanakpura Road       266
Thanisandra          233
                    ... 
Vidyapeeta             1
Maruthi Extension      1
Okalipura              1
Old Town               1
Abshot Layout          1
Name: count, Length: 1304, dtype: int64

In [23]:
# Trim stray leading/trailing whitespace so the same locality is not counted
# under several spellings. The vectorised .str accessor replaces the per-row
# lambda and, unlike it, tolerates missing values.
df['location'] = df['location'].str.strip()
location_count = df['location'].value_counts()

In [24]:
location_count_10 = location_count[location_count<=10]
location_count_10

location
Naganathapura                     10
Sadashiva Nagar                   10
Nagappa Reddy Layout              10
BTM 1st Stage                     10
Sector 1 HSR Layout               10
                                  ..
Vasantapura main road              1
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
Abshot Layout                      1
Name: count, Length: 1052, dtype: int64

In [25]:
# Collapse every sparsely represented locality (the index of
# `location_count_10`) into a single 'others' bucket.
rare_locations = location_count_10.index
is_rare = df['location'].isin(rare_locations)
df['location'] = df['location'].mask(is_rare, 'others')

In [26]:
df.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft
count,13200.0,13246.0,13246.0,13246.0,13246.0,13200.0
mean,1555.302783,2.692586,1.601163,112.389392,2.801902,7920.759
std,1237.323445,1.341506,0.804759,149.076587,1.295758,106727.2
min,1.0,1.0,0.0,8.0,1.0,267.8298
25%,1100.0,2.0,1.0,50.0,2.0,4267.701
50%,1275.0,2.0,2.0,72.0,3.0,5438.331
75%,1672.0,3.0,2.0,120.0,3.0,7317.073
max,52272.0,40.0,3.0,3600.0,43.0,12000000.0


In [27]:
df['location'].value_counts()

location
others                2881
Whitefield             535
Sarjapur  Road         392
Electronic City        304
Kanakpura Road         266
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

In [28]:
(df['total_sqft']/df['bhk']).describe()

count    13200.000000
mean       573.847262
std        388.079980
min          0.250000
25%        473.000000
50%        552.000000
75%        625.000000
max      26136.000000
dtype: float64

In [29]:
# Keep only listings with at least 300 sqft per bedroom. Rows whose ratio is
# NaN compare as False and are therefore dropped as well.
sqft_per_bhk = df['total_sqft'] / df['bhk']
has_enough_area = sqft_per_bhk >= 300
df = df[has_enough_area]
df.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft
count,12456.0,12456.0,12456.0,12456.0,12456.0,12456.0
mean,1590.189927,2.562781,1.60228,111.18796,2.649004,6308.502826
std,1260.404795,1.080275,0.798015,152.203367,0.976046,4168.127339
min,300.0,1.0,0.0,9.0,1.0,267.829813
25%,1115.0,2.0,1.0,49.0,2.0,4210.526316
50%,1300.0,2.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,2.0,115.0,3.0,6916.666667
max,52272.0,16.0,3.0,3600.0,16.0,176470.588235


In [30]:
df.shape

(12456, 7)

In [31]:
def remove_sqft_outlier(df):
    """Drop price-per-sqft outliers separately within each location.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies
    strictly within one standard deviation of the group mean are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows ordered by location group, with a fresh
        0..n-1 index.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        # Population standard deviation (ddof=0), the convention np.std uses.
        s = pps.std(ddof=0)
        kept.append(subdf[(pps > (m - s)) & (pps < (m + s))])

    if not kept:
        # No groups at all (e.g. empty input): return an empty frame that
        # still carries the input's columns.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once; growing a frame inside the loop re-copies all
    # previously collected rows on every iteration.
    return pd.concat(kept, ignore_index=True)
df = remove_sqft_outlier(df)
df.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft
count,10241.0,10241.0,10241.0,10241.0,10241.0,10241.0
mean,1503.877034,2.474075,1.605312,90.98273,2.57221,5657.702572
std,876.716232,0.981338,0.788137,86.147549,0.896219,2266.47698
min,300.0,1.0,0.0,10.0,1.0,1250.0
25%,1108.0,2.0,1.0,49.0,2.0,4244.762955
50%,1282.0,2.0,2.0,67.0,2.0,5172.413793
75%,1650.0,3.0,2.0,100.0,3.0,6426.099852
max,30400.0,16.0,3.0,2200.0,16.0,24509.803922


In [32]:
def bhk_outlier_remover(df):
    """Remove listings priced below the average smaller home in the same area.

    Within each ``location``, a row with ``bhk`` bedrooms is dropped when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk - 1`` listings in that location, provided more than five such
    smaller listings exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the excluded rows; the original index labels are kept.
    """
    # Collect labels in a plain list so they keep their original type
    # (a float array would coerce integer labels to float).
    excluded = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse the result for both the stats and the filter.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df['price_per_sqft'].mean(),
                'count': len(bhk_df),
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            # Skip when there is no smaller category or it is too small to
            # give a trustworthy mean.
            if stats is None or stats['count'] <= 5:
                continue
            below = bhk_df['price_per_sqft'] < stats['mean']
            excluded.extend(bhk_df.index[below.to_numpy()])
    return df.drop(index=excluded)
df = bhk_outlier_remover(df)

In [33]:
df.shape

(7329, 7)

In [34]:
df.drop(columns = ['price_per_sqft'], inplace = True)

In [35]:
df.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2


In [36]:
df['location'].value_counts()

location
others                   1152
Whitefield                241
Sarjapur  Road            191
Electronic City           162
Raja Rajeshwari Nagar     140
                         ... 
Vishwapriya Layout          4
HAL 2nd Stage               4
Thyagaraja Nagar            4
Banjara Layout              4
Marsur                      3
Name: count, Length: 242, dtype: int64

In [37]:
# Persist the cleaned frame. The row index is written as an extra leading
# column because `index` defaults to True.
# NOTE(review): pass index=False if downstream readers do not need that
# column — confirm against whatever consumes cleaned_df.csv.
df.to_csv('cleaned_df.csv')

In [38]:
x = df.drop(columns = ['price'])
y = df['price']

In [39]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

In [40]:
print(x_train.shape, x_test.shape)

(5863, 5) (1466, 5)


In [41]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [42]:
# One-hot encode `location` and pass the remaining columns through unchanged.
# handle_unknown='ignore' encodes a location that was not present during fit
# as an all-zero row instead of raising at predict time.
col_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

In [43]:
scaler = StandardScaler(with_mean=False)

In [44]:
lr = LinearRegression()

In [45]:
pipe_lr = make_pipeline(col_trans, scaler, lr)
pipe_lr.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [46]:
y_pred_lr = pipe_lr.predict(x_test)

In [47]:
r2_lr = r2_score(y_test, y_pred_lr)
r2_lr

0.8698036295770355

In [48]:
l1 = Lasso()
l2 = Ridge()
e = ElasticNet()

In [49]:
from sklearn.base import clone


def build_pipeline(estimator):
    """Return a pipeline for `estimator` with its own preprocessing steps.

    The column transformer and scaler are cloned so that fitting one pipeline
    cannot overwrite the fitted preprocessing state used by another.
    """
    return make_pipeline(clone(col_trans), clone(scaler), estimator)


pipe_l1 = build_pipeline(l1)
pipe_l2 = build_pipeline(l2)
pipe_e = build_pipeline(e)

In [50]:
pipe_l1.fit(x_train, y_train)
pipe_l2.fit(x_train, y_train)
pipe_e.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [51]:
y_pred_l1 = pipe_l1.predict(x_test)
y_pred_l2 = pipe_l2.predict(x_test)
y_pred_e = pipe_e.predict(x_test)

In [52]:
r2_l1 = r2_score(y_test, y_pred_l1)
r2_l1

0.8539289780022353

In [53]:
r2_l2 = r2_score(y_test, y_pred_l2)
r2_l2

0.8698117369106181

In [54]:
r2_e = r2_score(y_test, y_pred_e)
r2_e

0.8105660204805192

In [55]:
import pickle


In [56]:
# Serialise the fitted Ridge pipeline. The context manager guarantees the file
# handle is flushed and closed even if pickling fails.
with open('Ridge.pkl', 'wb') as model_file:
    pickle.dump(pipe_l2, model_file)

In [57]:
df[('bath')].unique()

array([ 4.,  3.,  2.,  5.,  8.,  1.,  6.,  7.,  9., 12., 16., 13.])

In [58]:
# Names saved alongside the model: the numeric columns followed by the
# lower-cased location names in alphabetical order.
# NOTE(review): the pipeline was fitted on the original-case `location` values
# with `location` as the first input column — confirm the consumer of
# columns.pkl does not feed these lower-cased names straight into the model.
columns = ['total_sqft', 'bath', 'balcony', 'bhk'] + sorted(df['location'].str.lower().unique())
# Use a context manager so the file handle is always closed.
with open("columns.pkl", "wb") as columns_file:
    pickle.dump(columns, columns_file)