# Predicting Estimated Rent Price Of Houses In Lagos

### Data Description

In [127]:
# import the relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

### Load Data

In [192]:
data = pd.read_csv('training_test_data.csv')

# choose relevants columns
data.columns

Index(['location', 'price', 'bedroom', 'bathroom', 'toilet', 'parking_lot',
       'serviced', 'newly_built', 'furnished'],
      dtype='object')

### Data Encoding

In [193]:
# get dummny data
df = pd.get_dummies(data)
df

Unnamed: 0,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished,location_Ado-Odo/Ota,location_Agege,...,location_Maryland,location_Mowe Ofada,location_Ogudu,location_Ojodu,location_Ojota,location_Oshodi,location_Shomolu,location_Surulere,location_Victoria Island (VI),location_Yaba
0,1.2,2,3,3,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2,2,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.1,2,2,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.8,1,1,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,17.0,3,3,4,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1456,0.7,2,2,3,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0.8,2,2,3,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1458,2.5,3,3,3,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [194]:
data.describe()

Unnamed: 0,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,5.322496,2.415753,2.427397,3.153425,3.240411,0.069863,0.080822,0.012329
std,31.947491,0.976993,1.058697,1.280323,3.872241,0.255003,0.272655,0.110386
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.7,2.0,2.0,2.0,2.0,0.0,0.0,0.0
50%,1.8,3.0,3.0,4.0,2.0,0.0,0.0,0.0
75%,7.0,3.0,3.0,4.0,3.0,0.0,0.0,0.0
max,1200.0,12.0,6.0,12.0,50.0,1.0,1.0,1.0


### Data Preprocessing

In [225]:
# creating X and y variables
X_m = df.drop('price', axis=1)
y_m = df.price

In [250]:
# Checking for outliers
# Scaling
z = np.abs(stats.zscore(X_m))

In [227]:
len(np.where(z > 3)[0])

689

In [228]:
# removing outliers
outliers = list(set(np.where(z > 3)[0]))
X_new = df.drop(outliers,axis = 0).reset_index(drop = False)
display(X_new)

y_new = y_m[list(X_new["index"])]
len(y_new)

Unnamed: 0,index,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished,location_Ado-Odo/Ota,...,location_Maryland,location_Mowe Ofada,location_Ogudu,location_Ojodu,location_Ojota,location_Oshodi,location_Shomolu,location_Surulere,location_Victoria Island (VI),location_Yaba
0,0,1.2,2,3,3,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,2,2,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1.0,2,2,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1.8,1,1,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,6.0,2,2,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,1448,1.4,3,3,4,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,1454,14.0,4,4,6,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
887,1455,17.0,3,3,4,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
888,1456,0.7,2,2,3,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


890

### Feature Scaling

In [229]:
## pre-processing
X_new = X_new.drop('index', axis=1)

X_processed = StandardScaler().fit_transform(X_new)

### Machine Learning Application

In [230]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_new, test_size=0.2, random_state=42)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

### Define And Fit Model

In [231]:
# multiple linear regression
lx = LinearRegression()
lx.fit(X_train, y_train)

np.mean(cross_val_score(lx, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-4.293049986987167e-15

In [232]:
# lass regression
ls = Lasso()
ls.fit(X_train, y_train)

np.mean(cross_val_score(ls, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-0.7711537855941651

In [233]:
sv = SVR()
sv.fit(X_train, y_train)

np.mean(cross_val_score(sv, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-0.8322957314380723

In [234]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

np.mean(cross_val_score(dt, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-0.29342153846153846

In [235]:
# random forrest
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

np.mean(cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-0.3031067076923078

### Model Prediction

In [236]:
lx_pred = lx.predict(X_test)

In [237]:
ls_pred = ls.predict(X_test)

In [238]:
rf_pred = rf.predict(X_test)

In [239]:
dt_pred = dt.predict(X_test)

In [240]:
sv_pred = sv.predict(X_test)

In [241]:
mean_absolute_error(y_test, lx_pred)

6.1332805251535944e-15

In [242]:
mean_absolute_error(y_test, ls_pred)

0.6534222832219554

In [243]:
mean_absolute_error(y_test, rf_pred)

0.014478438202247389

In [244]:
mean_absolute_error(y_test, gs_pred)

0.042050734406982926

In [245]:
mean_absolute_error(y_test, dt_pred)

0.004171910112359563

<h3>Evaluate Models For Both train And Test Datasets</h3>

In [246]:
# Generalisation
best_model = pd.DataFrame({
    'model': ['Linear Regression', 'Lasso', 'SVR', 'Decision Tree', 'Random Forrest'],
    
    'score': [round(lx.score(X_test,y_test) * 100, 2), round(ls.score(X_test,y_test) * 100, 2),
                   round(sv.score(X_test,y_test) * 100, 2), round(dt.score(X_test,y_test) * 100, 2),
                  round(rf.score(X_test,y_test) * 100, 2)],
                   
    'mae': [round(mean_absolute_error(y_test, lx_pred), 3), round(mean_absolute_error(y_test, ls_pred), 3),
           round(mean_absolute_error(y_test, sv_pred), 3), round(mean_absolute_error(y_test, dt_pred), 3),
           round(mean_absolute_error(y_test, rf_pred), 3)]
})
best_model

Unnamed: 0,model,score,mae
0,Linear Regression,100.0,0.0
1,Lasso,98.35,0.653
2,SVR,95.79,0.481
3,Decision Tree,100.0,0.004
4,Random Forrest,99.98,0.014


### Downlaod As Pickle File

In [247]:
# save model to a file using python pickle
import pickle
pickl = {'model': gs.best_estimator_}
pickle.dump(pickl, open('model_file'+'.p','wb'))

file_name = 'model_file.p'
with open(file_name, 'rb') as pickled:
    data = pickle.load(pickled)
    model = data['model']