In [38]:
from ast import increment_lineno
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib

In [39]:
# Read properties_data.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="properties_data.csv")
df.head(n=5)

Unnamed: 0,id,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,...,balcony,central_ac,childrens_play_area,covered_parking,private_pool,security,shared_gym,shared_pool,view_of_landmark,view_of_water
0,5528049,Palm Jumeirah,25.113208,55.138932,2700000,1079,2502.32,1,2,Medium,...,True,True,True,False,False,False,True,False,False,True
1,6008529,Palm Jumeirah,25.106809,55.151201,2850000,1582,1801.52,2,2,Medium,...,True,True,True,False,False,False,True,True,False,True
2,6034542,Jumeirah Lake Towers,25.063302,55.137728,1150000,1951,589.44,3,5,Medium,...,True,False,False,True,False,True,True,True,True,True
3,6326063,Culture Village,25.227295,55.341761,2850000,2020,1410.89,2,3,Low,...,True,False,False,True,False,False,False,False,False,False
4,6356778,Palm Jumeirah,25.114275,55.139764,1729200,507,3410.65,0,1,Medium,...,False,True,False,True,False,True,True,True,True,True


In [40]:
# These columns are dropped because they provide no value to the model.
# Neighborhoods with only a few listings are grouped under a single
# 'Other' label so that each remaining category has enough data.

df1 = df.drop(columns=['id', 'price_per_sqft', 'longitude', 'latitude'])

# Number of listings per neighborhood, indexed by neighborhood name.
neighborhood_num = df1.groupby('neighborhood')['neighborhood'].count()
# Note: despite the variable name the threshold is inclusive (count <= 10).
neighborhood_less_than_10 = neighborhood_num[neighborhood_num.le(10)]

# Relabel every row whose neighborhood appears in the rare set.
df1['neighborhood'] = df1['neighborhood'].mask(
    df1['neighborhood'].isin(neighborhood_less_than_10.index),
    'Other',
)
df1['neighborhood'].value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
neighborhood,Unnamed: 1_level_1
Downtown Dubai,302
Dubai Marina,288
Jumeirah Village Circle,200
Palm Jumeirah,178
Jumeirah Beach Residence,116
Other,115
Business Bay,97
Jumeirah Lake Towers,70
Dubai Hills Estate,53
The Views,47


In [41]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the ordered 'quality' labels as 0..3 (Low < Medium < High < Ultra).
# The guard keeps the cell re-runnable: once 'quality' has been replaced by
# 'quality_encoded', running the cell again is a no-op instead of a KeyError.
if 'quality' in df1.columns:
  real_estate_quality = df1[['quality']]

  # The category order is given explicitly so the integer codes follow the
  # quality ranking rather than alphabetical order.
  ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High', 'Ultra']])
  real_estate_quality_encoded = ordinal_encoder.fit_transform(real_estate_quality)

  # fit_transform returns an (n_rows, 1) array; flatten it into one column.
  df1['quality_encoded'] = real_estate_quality_encoded.ravel()
  df1 = df1.drop(columns=['quality'])

In [42]:
# Columns that are boolean in the raw frame are recast to integers in df1.
binary_columns = df.select_dtypes(include='bool').columns
df1[binary_columns] = df1.loc[:, binary_columns].astype('int')
df1.head(n=5)

Unnamed: 0,neighborhood,price,size_in_sqft,no_of_bedrooms,no_of_bathrooms,maid_room,unfurnished,balcony,central_ac,childrens_play_area,covered_parking,private_pool,security,shared_gym,shared_pool,view_of_landmark,view_of_water,quality_encoded
0,Palm Jumeirah,2700000,1079,1,2,0,0,1,1,1,0,0,0,1,0,0,1,1.0
1,Palm Jumeirah,2850000,1582,2,2,0,0,1,1,1,0,0,0,1,1,0,1,1.0
2,Jumeirah Lake Towers,1150000,1951,3,5,1,1,1,0,0,1,0,1,1,1,1,1,1.0
3,Culture Village,2850000,2020,2,3,0,1,1,0,0,1,0,0,0,0,0,0,0.0
4,Palm Jumeirah,1729200,507,0,1,0,0,0,1,0,1,0,1,1,1,1,1,1.0


In [43]:
# One-hot encode the neighborhood labels as integer indicator columns.
dummies = pd.get_dummies(df1['neighborhood'], dtype=int)
dummies.head(n=5)

Unnamed: 0,Al Furjan,Arjan,Business Bay,City Walk,Culture Village,DAMAC Hills,DIFC,Downtown Dubai,Dubai Creek Harbour (The Lagoons),Dubai Harbour,...,Meydan,Mohammed Bin Rashid City,Motor City,Old Town,Other,Palm Jumeirah,The Hills,The Views,Town Square,Umm Suqeim
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [44]:
# Replace the text 'neighborhood' column with its indicator columns. The
# 'Other' indicator is left out, so a row whose neighborhood indicators are
# all 0 stands for 'Other'.
df_final = pd.concat(
    [
        df1.drop(columns='neighborhood'),
        dummies.drop(columns='Other'),
    ],
    axis=1,
)

In [45]:
# Feature matrix: every column of df_final except the target 'price'.
X = df_final.drop(columns='price')
X.shape

(1905, 45)

In [46]:
# Target vector: the 'price' column of df_final.
y = df_final['price']
y.head(n=5)

Unnamed: 0,price
0,2700000
1,2850000
2,1150000
3,2850000
4,1729200


In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; the cell output is the
# model's score on the held-out split.
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.7440573723538669

In [49]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Five random 80/20 splits with a fixed seed; the output is one score per split.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

array([0.74405737, 0.68661051, 0.7741596 , 0.77197306, 0.7042387 ])

In [50]:
# We decided to use linear regression after all.
# Now let's create a function that predicts the price of a new listing.

def predict_price(neighborhood,size_in_sqft,no_of_bedrooms,
                        no_of_bathrooms,maid_room,unfurnished,balcony,
                        central_ac,childrens_play_area,covered_parking,
                        private_pool,security,shared_gym,
                        shared_pool,view_of_landmark,view_of_water,quality_encoded,
                        feature_columns=None,estimator=None):
  """Predict the price of a single listing.

  Parameters
  ----------
  neighborhood : str
      Neighborhood name. If it matches one of the neighborhood indicator
      columns, that indicator is set to 1. Any other value (including
      'Other', whose indicator was dropped from the feature matrix) leaves
      every indicator at 0, i.e. the listing is treated as 'Other'.
  size_in_sqft, no_of_bedrooms, no_of_bathrooms : number
      Numeric features of the listing.
  maid_room, unfurnished, balcony, central_ac, childrens_play_area,
  covered_parking, private_pool, security, shared_gym, shared_pool,
  view_of_landmark, view_of_water : 0/1 or bool
      Flag features of the listing.
  quality_encoded : number
      Ordinal quality code (0=Low, 1=Medium, 2=High, 3=Ultra).
  feature_columns : sequence of str, optional
      Ordered feature names expected by the estimator. Defaults to the
      notebook-level ``X.columns``.
  estimator : object with ``predict``, optional
      Fitted regressor. Defaults to the notebook-level ``model``.

  Returns
  -------
  The first (and only) element of ``estimator.predict`` for this listing.

  Raises
  ------
  KeyError
      If any of the non-neighborhood feature names is absent from
      ``feature_columns``.
  """
  columns = pd.Index(X.columns if feature_columns is None else feature_columns)
  regressor = model if estimator is None else estimator

  # Values are placed by column name rather than by position, so the result
  # does not depend on the column order of the training matrix.
  numeric_features = {
      'size_in_sqft': size_in_sqft,
      'no_of_bedrooms': no_of_bedrooms,
      'no_of_bathrooms': no_of_bathrooms,
      'maid_room': maid_room,
      'unfurnished': unfurnished,
      'balcony': balcony,
      'central_ac': central_ac,
      'childrens_play_area': childrens_play_area,
      'covered_parking': covered_parking,
      'private_pool': private_pool,
      'security': security,
      'shared_gym': shared_gym,
      'shared_pool': shared_pool,
      'view_of_landmark': view_of_landmark,
      'view_of_water': view_of_water,
      'quality_encoded': quality_encoded,
  }
  missing = [name for name in numeric_features if name not in columns]
  if missing:
    raise KeyError(f"feature columns not found: {missing}")

  # Start from an all-zero row, then fill in the supplied feature values.
  row = dict.fromkeys(columns, 0.0)
  row.update((name, float(value)) for name, value in numeric_features.items())

  # Switch on the matching neighborhood indicator, if there is one. The
  # second condition stops a neighborhood string that happens to equal a
  # numeric column name from overwriting that feature.
  if neighborhood in row and neighborhood not in numeric_features:
    row[neighborhood] = 1.0

  # Predict from a named one-row frame so the feature names line up with
  # the ones the estimator was fitted on.
  return regressor.predict(pd.DataFrame([row], columns=columns))[0]





In [51]:
# Example prediction; keyword arguments make each feature value explicit.
predict_price(
    neighborhood='Palm Jumeirah',
    size_in_sqft=1079,
    no_of_bedrooms=1,
    no_of_bathrooms=1,
    maid_room=0,
    unfurnished=0,
    balcony=1,
    central_ac=0,
    childrens_play_area=1,
    covered_parking=1,
    private_pool=0,
    security=0,
    shared_gym=0,
    shared_pool=0,
    view_of_landmark=1,
    view_of_water=1,
    quality_encoded=2,
)



2085544.8111586506

In [52]:
import pickle

# Serialize the fitted model to 'dubai_re_model.pickle'.
with open('dubai_re_model.pickle', mode='wb') as model_file:
  pickle.dump(model, model_file)

In [53]:
import json

# Write the lower-cased feature names, in the column order of X, to columns.json.
columns = {
    'data_columns': [name.lower() for name in X.columns],
}
with open('columns.json', mode='w') as columns_file:
  json.dump(columns, columns_file)