# House Price Predictor (Bangalore)

## 1.) Data Preprocessing

In [4]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('Bengaluru_House_Data.csv')

In [6]:
# Discard the columns that are not referenced anywhere else in this notebook.
data = data.drop(columns=['area_type', 'availability', 'society', 'balcony'])

In [4]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [7]:
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [8]:
# Replace missing locations with a fixed, already-existing category.
# The literal has two spaces on purpose: it matches the spelling shown in the
# value_counts() output of the dataset.
data['location'] = data['location'].fillna('Sarjapur  Road')

In [9]:
# Replace missing 'size' entries with the fixed label '2 BHK' so that the
# later bedroom-count extraction (first token cast to int) never sees NaN.
data['size'] = data['size'].fillna('2 BHK')

In [10]:
# Impute missing bathroom counts with the column median.
data['bath'] = data['bath'].fillna(data['bath'].median())

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
location      13320 non-null object
size          13320 non-null object
total_sqft    13320 non-null object
bath          13320 non-null float64
price         13320 non-null float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
# The bedroom count is the leading token of 'size' (e.g. '2 BHK', '4 Bedroom').
size_tokens = data['size'].str.split()
data['bhk'] = size_tokens.str[0].astype(int)

In [12]:
data[data['bhk']>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [13]:
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [14]:
def convert_range(x):
    """Convert a raw ``total_sqft`` cell to a single float.

    Parameters
    ----------
    x : str or number or None
        Either a plain number (``'1056'``), a range written as
        ``'<low> - <high>'`` (``'1133 - 1384'``), or anything else.

    Returns
    -------
    float or None
        The number itself, the midpoint of the range, or ``None`` when the
        value cannot be interpreted (e.g. ``'34.46Sq. Meter'``, ``None``).
    """
    # Non-string cells (NaN, None, already-numeric values) have no .split();
    # convert them directly instead of crashing.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    parts = x.split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a "<low> - <high>" range after all (e.g. '1e-3', '-5',
            # '12 - abc'); fall through and try it as a plain number.
            pass

    try:
        return float(x)
    except ValueError:
        return None

In [15]:
data['total_sqft'] = data['total_sqft'].apply(convert_range)

In [16]:
data['total_sqft'].unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [17]:
# Scale the price by 100000 before dividing by the area.
# NOTE(review): presumably 'price' is quoted in lakhs, making this rupees per
# square foot - confirm against the dataset description.
scaled_price = data['price'] * 100000
data['price_per_sqft'] = scaled_price / data['total_sqft']

In [18]:
data['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [19]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [20]:
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    400
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [21]:
# Trim surrounding whitespace so differently padded spellings of the same
# location are counted together, then tally listings per location.
data['location'] = data['location'].map(str.strip)
location_count = data['location'].value_counts()

In [22]:
# Locations that appear in at most 10 listings.
location_count_less_10 = location_count.loc[location_count.le(10)]
location_count_less_10

Dairy Circle                      10
Nagappa Reddy Layout              10
Basapura                          10
1st Block Koramangala             10
Sector 1 HSR Layout               10
                                  ..
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
singapura paradise                 1
Abshot Layout                      1
Name: location, Length: 1053, dtype: int64

In [23]:
# Collapse every location with 10 or fewer listings into one 'other' bucket;
# all remaining locations keep their own name.
data['location'] = data['location'].where(
    ~data['location'].isin(location_count_less_10.index),
    'other',
)

In [24]:
# Keep only listings with at least 300 sqft per bedroom. Rows whose
# total_sqft is missing compare as False and are therefore dropped as well.
data = data.loc[data['total_sqft'].div(data['bhk']).ge(300)]


In [25]:
def remove_outliers_sqft(df):
    """Drop per-location price-per-sqft outliers.

    For each ``location`` group, only rows whose ``price_per_sqft`` lies in
    the half-open interval ``(mean - std, mean + std]`` are kept, where
    ``mean`` and ``std`` (population standard deviation, ``ddof=0``) are
    computed within that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        0..n-1 index. An input with no groups yields an empty frame that
        keeps the input's columns.
    """
    kept_frames = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        # ddof=0 matches what np.std computes on a Series.
        st = pps.std(ddof=0)
        kept_frames.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_frames:
        # pd.concat refuses an empty list; return an empty frame with the same schema.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept_frames, ignore_index=True)
data = remove_outliers_sqft(data)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [26]:
def bhk_outlier_remover(df):
    """Drop listings that are cheaper per sqft than smaller homes nearby.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows of the same location, provided that smaller group
    has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``bhk`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the excluded rows; ``df`` itself is not modified
        and the surviving rows keep their original index labels.
    """
    # Collect labels in a plain list: appending onto a float numpy array would
    # coerce the index labels to float and copy the array on every append.
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df['price_per_sqft'].mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }

        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            # Only trust the smaller-home mean when it is backed by enough rows.
            if stats is not None and stats['count'] > 5:
                too_cheap = bhk_df[bhk_df['price_per_sqft'] < stats['mean']]
                exclude_labels.extend(too_cheap.index.tolist())

    return df.drop(index=exclude_labels)

In [27]:
data = bhk_outlier_remover(data)

In [28]:
data.shape

(7360, 7)

In [29]:
# 'size' is superseded by 'bhk', and 'price_per_sqft' was only needed for the
# outlier filters above; neither is used as a model feature.
data = data.drop(columns=['size', 'price_per_sqft'])

In [30]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [31]:
# Persist the cleaned frame. No `index` argument is given, so pandas also
# writes the DataFrame index as an unnamed first column of the CSV.
data.to_csv('Cleaned_data.csv')

In [32]:
# Target: the listing price. Features: every remaining column.
y = data['price']
x = data.drop(columns='price')

## 2.) Building Model

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [34]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split (and therefore the reported scores) reproducible between runs.
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size = 0.2,random_state = 0)

In [35]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(5888, 4)
(5888,)
(1472, 4)
(1472,)


### Linear Regression 

In [36]:
# One-hot encode the categorical 'location' column as a dense array; every
# other column is passed through unchanged (remainder='passthrough').
# NOTE(review): `sparse=False` is the keyword of older scikit-learn releases;
# newer releases call it `sparse_output` - confirm against the installed version.
column_trans = make_column_transformer((OneHotEncoder(sparse= False),['location']),
                                       remainder='passthrough')

In [37]:
scaler = StandardScaler()

In [39]:
lr = LinearRegression()

In [40]:
pipe = make_pipeline(column_trans,scaler,lr)

In [41]:
pipe.fit(X_train,Y_train)



In [42]:
Y_pred_LR = pipe.predict(X_test)

In [43]:
r2_score(Y_test,Y_pred_LR)

0.8296165353105762

### Ridge Regression

In [44]:
ridge =Ridge()

In [45]:
pipe = make_pipeline(column_trans,scaler,ridge)

In [46]:
pipe.fit(X_train,Y_train)



In [47]:
Y_pred_ridge = pipe.predict(X_test)

In [48]:
r2_score(Y_test,Y_pred_ridge)

0.8296651410179635

### Lasso Regression 

In [67]:
lasso =Lasso()

In [68]:
pipe = make_pipeline(column_trans,scaler,lasso)

In [69]:
pipe.fit(X_train,Y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=False),
                                                  ['location'])],
                            

In [70]:
Y_pred_lasso = pipe.predict(X_test)

In [73]:
r2_score(Y_test,Y_pred_lasso)

0.8199181874762704

### Pickling

In [49]:
import pickle

In [51]:
# `pipe` is reassigned for every model above, so on a top-to-bottom run it
# holds the Lasso pipeline by the time this cell executes. Build and fit the
# Ridge pipeline explicitly so the file name and its content always agree.
ridge_pipe = make_pipeline(column_trans, scaler, ridge)
ridge_pipe.fit(X_train, Y_train)

# The context manager guarantees the file is flushed and closed.
with open('bestRidgeModel.pkl', 'wb') as model_file:
    pickle.dump(ridge_pipe, model_file)