<a href="https://colab.research.google.com/github/MishaelThomas/Housing-price-predictor-Web-App-/blob/main/Group2_Mishael%2CSaurav.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing the libraries needed for data analysis
import pandas as pd
import numpy as np

# Obtaining the input data from a CSV file as a dataframe
df1 = pd.read_csv('/content/Bengaluru Housing Price Data.csv')

# Displaying first 5 entries of dataframe
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


**Removing null values from dataset**

In [None]:
# Dimensions of our dataset
df1.shape

(13320, 9)

In [None]:
# Check for columns with null values
df1.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [None]:
# Remove the 'society' column since it has 5502 null values out of 13320 entries,
# then drop every remaining row that still has a null in any other column.
df2 = df1.drop('society', axis=1)
df2.dropna(inplace = True)
# NOTE: only the last expression of a cell is displayed, so this shape is not shown.
df2.shape
# Confirm that no nulls remain (every count should be 0).
df2.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

Following line retrieves the numerical data from 'size' column.

In [None]:
# Keep only the leading number of each 'size' entry (e.g. '2 BHK' -> 2, '4 Bedroom' -> 4).
df2['size'] = df2['size'].apply(lambda x: int(x.split(sep=' ')[0]))

Following function retrieves the numerical data from 'total_sqft' column.

In [None]:
# Conversion factor applied to listings quoted in square metres.
_SQFT_PER_SQ_METER = 10.7639


def convert_to_num(input_value):
  """Convert one raw 'total_sqft' entry to a float number of square feet.

  Handled forms:
    * a plain number, e.g. '1056'                      -> 1056.0
    * a range 'low - high', e.g. '1133 - 1384'         -> the midpoint, 1258.5
    * a value in square metres, e.g. '34.46Sq. Meter'  -> converted to sq. ft.
    * a value that is already numeric                  -> returned as a float,
      so applying the function to an already-converted column is harmless

  Returns None for anything else (other units such as '4125Perch', empty
  text, None, ...). Never raises for unparseable input.
  """
  # Non-string input: accept anything float() understands, reject the rest.
  if not isinstance(input_value, str):
    try:
      return float(input_value)
    except (TypeError, ValueError):
      return None

  text = input_value.strip()

  # Most entries are plain numbers; try that first so negative or
  # exponent-style literals are not mistaken for a 'low - high' range.
  try:
    return float(text)
  except ValueError:
    pass

  try:
    bounds = text.split('-')
    if len(bounds) == 2:
      return (float(bounds[0]) + float(bounds[1])) / 2
    metric = text.split('Sq. Meter')
    if len(metric) == 2:
      # The column is in square feet, so metric areas must be converted.
      return float(metric[0]) * _SQFT_PER_SQ_METER
  except ValueError:
    # One of the pieces was not a number (e.g. '-' or 'abcSq. Meter').
    return None
  return None

In [None]:
df2['total_sqft'] = df2['total_sqft'].apply(convert_to_num)

In [None]:
df2['total_sqft'].isnull().sum()

25

For removing null values from 'total_sqft' column

In [None]:
df2.dropna(inplace=True)

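Reducing the number of unique values in the 'location' column: every location with 10 or fewer listings is grouped under a single 'Other' label.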

In [None]:
# Number of listings per location; the resulting Series is indexed by location name.
loc_count = df2['location'].value_counts()
# Locations with 10 or fewer listings (note the `<=`: despite the variable
# name, a count of exactly 10 is included).
loc_count_less_10 = loc_count[loc_count<=10]

In [None]:
# `x in loc_count_less_10` tests membership in the Series' index (the location
# names), so every location with 10 or fewer listings is relabelled 'Other'.
df2['location'] = df2['location'].apply(lambda x: 'Other' if x in loc_count_less_10 else x)

# Treating Outliers

In [None]:
# Drop listings with less than 300 sq. ft. per room ('size' holds the room count).
# Note: ~(ratio < 300) also keeps rows whose ratio is NaN, unlike (ratio >= 300).
df2 = df2[~(df2['total_sqft']/df2['size'] < 300)]

In [None]:
# Price per square foot, used only for the outlier filtering below.
# The factor 100000 presumably converts 'price' from lakh rupees to rupees — TODO confirm the unit.
df2['price_per_sqft'] = df2['price'] * 100000 / df2['total_sqft']

In [None]:
# Per-location outlier removal: within each location keep only the listings whose
# price per sq. ft. lies strictly within one standard deviation of that
# location's mean.
kept_groups = []

for _, dfloc in df2.groupby('location'):
  pps = dfloc['price_per_sqft']
  m = pps.mean()
  st = pps.std(ddof=0)  # population standard deviation, i.e. what np.std computes

  kept_groups.append(dfloc[(pps > (m - st)) & (pps < (m + st))])

# Concatenate once at the end: growing a DataFrame with pd.concat inside the loop
# re-copies every accumulated row on each iteration.
df3 = pd.concat(kept_groups, ignore_index=True) if kept_groups else pd.DataFrame()

In [None]:
# Keep only listings with fewer bathrooms than (rooms + 2); the rest are dropped as outliers.
df3 = df3[df3['bath'] < df3['size']+2]

Calculating the number of months, counted from January 2014, until the house becomes available (0 for houses that are ready to move into)

In [None]:
import datetime
def avail_period(avail_date, base_year=2014, base_month=1):
  """Return the number of months from the reference month until availability.

  avail_date is one of:
    * 'Ready To Move' or 'Immediate Possession'  -> 0
    * text of the form 'YY-Mon', read as a two-digit year (20YY) and an
      English month abbreviation, e.g. '19-Dec' -> December 2019
    * a value that is not a string (already a month count, e.g. when the
      conversion cell is run a second time)      -> returned as int, unchanged

  base_year / base_month give the reference month (default January 2014).

  Surrounding whitespace and the letter case of the month are ignored.
  Raises ValueError when the text cannot be interpreted.
  """
  # Already converted: passing it through keeps `column.apply(avail_period)` re-runnable.
  if not isinstance(avail_date, str):
    return int(avail_date)

  text = avail_date.strip()
  if text in ('Ready To Move', 'Immediate Possession'):
    return 0

  months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

  parts = text.split('-')
  if len(parts) != 2:
    raise ValueError('unrecognised availability value: {!r}'.format(avail_date))

  year_text, month_text = parts
  month_name = month_text.strip().capitalize()
  if month_name not in months:
    raise ValueError('unrecognised month in availability value: {!r}'.format(avail_date))

  year = 2000 + int(year_text)  # int() raises ValueError for a non-numeric year
  month = months.index(month_name) + 1

  return (year - base_year) * 12 + (month - base_month)

In [None]:
df3['availability'] = df3['availability'].apply(avail_period)

One-hot encoding for 'area_type' and 'location' columns

In [None]:
# One indicator column per distinct area type.
df_encoded_area_type = pd.get_dummies(df3['area_type'])
# Append the indicators to the data and discard the original text column.
df4 = pd.concat([df3, df_encoded_area_type], axis=1).drop(columns='area_type')

In [None]:
df_encoded = pd.get_dummies(df4['location'])

In [None]:
df5 = pd.concat([df4,df_encoded],axis='columns')

In [None]:
df5.drop('location',axis=1,inplace=True)

In [None]:
df5.to_csv('Bengaluru Housing Price Data Processed.csv')

In [None]:
import pandas as pd
dfb = pd.read_csv('Bengaluru Housing Price Data Processed.csv',index_col=0)
dfb.head()

Unnamed: 0,availability,size,total_sqft,bath,balcony,price,price_per_sqft,Built-up Area,Carpet Area,Plot Area,Super built-up Area,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,Abbigere,Akshaya Nagar,Ambalipura,Ambedkar Nagar,Amruthahalli,Anandapura,Ananth Nagar,Anekal,Anjanapura,Ardendale,Arekere,Attibele,BEML Layout,BTM 2nd Stage,BTM Layout,Babusapalaya,Badavala Nagar,Balagere,...,Sahakara Nagar,Sanjay nagar,Sarakki Nagar,Sarjapur,Sarjapur Road,Sarjapura - Attibele Road,Sector 2 HSR Layout,Sector 7 HSR Layout,Seegehalli,Shampura,Shivaji Nagar,Singasandra,Somasundara Palya,Sompura,Sonnenahalli,Subramanyapura,Sultan Palaya,TC Palaya,Talaghattapura,Thanisandra,Thigalarapalya,Thubarahalli,Tindlu,Tumkur Road,Ulsoor,Uttarahalli,Varthur,Varthur Road,Vasanthapura,Vidyaranyapura,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,52,3,1250.0,2.0,3.0,44.0,3520.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,2,1250.0,2.0,2.0,40.0,3200.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,2,1200.0,2.0,2.0,83.0,6916.666667,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,2,1170.0,2.0,2.0,40.0,3418.803419,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,3,1425.0,2.0,2.0,65.0,4561.403509,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Features: every column except the target. 'price_per_sqft' is excluded as
# well because it was computed from 'price' itself.
X = dfb.drop(['price','price_per_sqft'],axis=1)
# Target, kept as a one-column DataFrame (double brackets), so the linear
# model's predictions come back 2-D.
Y = dfb[['price']]

In [None]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,random_state=2)

In [None]:
from sklearn.linear_model import LinearRegression
lmodel = LinearRegression()
lmodel.fit(xtrain,ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
lmodel.score(xtrain,ytrain)

0.8059946171243533

In [None]:
lmodel.score(xtest,ytest)

0.8235296212485703

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Only the non-default hyper-parameters are set. Arguments such as
# criterion='mse', max_features='auto' and min_impurity_split are deliberately
# not passed: they equal the library defaults and have been removed from newer
# scikit-learn releases, where passing them makes this cell fail.
rf = RandomForestRegressor(max_depth=10, min_samples_leaf=2,
                           n_estimators=800, random_state=42)
# Pass the target as a 1-D array; a one-column DataFrame makes scikit-learn
# emit a DataConversionWarning here.
rf.fit(xtrain, ytrain.values.ravel())

  if __name__ == '__main__':


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=800, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [None]:
rf.score(xtrain,ytrain)

0.849815542118161

In [None]:
rf.score(xtest,ytest)

0.7799485611876427

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Only the non-default hyper-parameters are set. Arguments such as
# criterion='mse', min_impurity_split and presort are deliberately not passed:
# they equal the library defaults and have been removed from newer
# scikit-learn releases, where passing them makes this cell fail.
dt = DecisionTreeRegressor(max_depth=10, min_samples_leaf=2, random_state=42)
dt.fit(xtrain, ytrain)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=42, splitter='best')

In [None]:
dt.score(xtrain,ytrain)

0.8375622982354954

In [None]:
dt.score(xtest,ytest)

0.7735578120298253

 Calculating Error values

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

print('Training Mean Absolute Error', mean_absolute_error(ytrain, lmodel.predict(xtrain)))
print('Testing Mean Absolute Error', mean_absolute_error(ytest, lmodel.predict(xtest)))

Training Mean Absolute Error 16.657652531276195
Testing Mean Absolute Error 17.539908253385924


In [None]:
print('Training Mean Squared Error', mean_squared_error(ytrain, lmodel.predict(xtrain)))
print('Testing Mean Squared Error', mean_squared_error(ytest, lmodel.predict(xtest)))

Training Mean Squared Error 1048.394596173145
Testing Mean Squared Error 999.3523306569582


Cross-validation (7 random shuffle splits, each holding out 20% of the data for testing)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# 7 independent random splits, each holding out 20% of the rows for testing;
# the fixed random_state makes the splits reproducible.
cross = ShuffleSplit(n_splits = 7, test_size=0.2, random_state=0)

# Score (R^2, the regressor's default) of a fresh LinearRegression on each split.
cross_val_score(LinearRegression() , X, Y, cv = cross)

array([0.80575975, 0.81918469, 0.81286504, 0.73690094, 0.83531596,
       0.81406607, 0.80516141])

In [None]:
def predict_my_price(area_type,availability,location,size,society,sqft,bath,balcony):
  """Predict the price of a single listing with the fitted model `lmodel`.

  The arguments mirror the columns of the raw dataset:
    area_type    -- must be the name of one of the area-type indicator columns of X
    availability -- raw availability text, converted with avail_period()
    location     -- location name; a name without an indicator column of its own
                    is treated as 'Other' when X has such a column
    size         -- text whose first space-separated token is the room count,
                    e.g. '2 BHK'
    society      -- ignored (the 'society' column is dropped during cleaning);
                    accepted so that a complete raw row can be passed
    sqft, bath, balcony -- numeric values

  Returns the result of lmodel.predict for the one-row input.
  Raises IndexError when area_type (or location, if X has no 'Other' column)
  does not correspond to a column of X.
  """
  columns = list(X.columns)

  def position_of(name, kind):
    # Position of the first column called `name`, with a readable error if absent.
    try:
      return columns.index(name)
    except ValueError:
      raise IndexError('no feature column for {} {!r}'.format(kind, name)) from None

  # Rare locations are merged into 'Other' during preprocessing, so a location
  # without a column of its own is handled the same way here.
  if location not in columns and 'Other' in columns:
    location = 'Other'

  x = np.zeros(len(columns))

  # Numeric features are looked up by column name rather than by fixed position.
  numeric_features = {
      'availability': avail_period(availability),
      'size': float(size.split(sep=' ')[0]),
      'total_sqft': sqft,
      'bath': bath,
      'balcony': balcony,
  }
  for feature_name, value in numeric_features.items():
    x[position_of(feature_name, 'feature')] = value

  x[position_of(area_type, 'area type')] = 1
  x[position_of(location, 'location')] = 1

  # A one-row DataFrame keeps the feature names the model was fitted with.
  return(lmodel.predict(pd.DataFrame([x], columns=X.columns)))

In [None]:
predict_my_price('Super built-up  Area',	'19-Dec',	'Electronic City Phase II',	'2 BHK',	'Coomee',	1056,	2.0,	1.0	)

array([[36.5171149]])

Saving the model for further deployment

In [None]:
import pickle
with open('Bengaluru_Housing_Price_Predictor.pickle','wb') as f:
  pickle.dump(lmodel,f)

In [None]:
import json
# Persist the feature order so that a deployed app can build its input vector
# with the columns in the same positions as X.
columns = {'data_columns': list(X.columns)}
with open("columns.json","w") as f:
    json.dump(columns, f)