In [379]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [380]:
data = pd.read_csv('training_test_data.csv')

# choose relevants columns
data.columns

Index(['location', 'price', 'bedroom', 'bathroom', 'toilet', 'parking_lot',
       'serviced', 'newly_built', 'furnished'],
      dtype='object')

In [381]:
#data.price = data.price 

In [382]:
# get dummny data
df = pd.get_dummies(data)
df

Unnamed: 0,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished,location_Ado-Odo/Ota,location_Agege,...,location_Maryland,location_Mowe Ofada,location_Ogudu,location_Ojodu,location_Ojota,location_Oshodi,location_Shomolu,location_Surulere,location_Victoria Island (VI),location_Yaba
0,1.2,2,3,3,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2,2,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.1,2,2,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.8,1,1,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,17.0,3,3,4,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1456,0.7,2,2,3,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0.8,2,2,3,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1458,2.5,3,3,3,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [383]:
# Scaling
z = np.abs(stats.zscore(df))

In [384]:
len(np.where(z > 3)[0])

691

In [385]:
outliers = list(set(np.where(z > 3)[0]))
df2 = df.drop(outliers,axis = 0).reset_index(drop = False)
display(df2)

Unnamed: 0,index,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished,location_Ado-Odo/Ota,...,location_Maryland,location_Mowe Ofada,location_Ogudu,location_Ojodu,location_Ojota,location_Oshodi,location_Shomolu,location_Surulere,location_Victoria Island (VI),location_Yaba
0,0,1.2,2,3,3,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,2,2,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1.0,2,2,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1.8,1,1,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,6.0,2,2,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,1448,1.4,3,3,4,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
885,1454,14.0,4,4,6,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,1455,17.0,3,3,4,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
887,1456,0.7,2,2,3,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [386]:
# creating X and y variables
X = df2.drop(['price', 'index', 'toilet'], axis=1)
y = df2.price.values

In [387]:
## pre-processing
X_processed = StandardScaler().fit_transform(X)

In [388]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [389]:
# # multiple linear regression
# X_sm = X = sm.add_constant(X)
# model = sm.OLS(y, X_sm)
# model.fit().summary()

In [390]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

cross_val_score(LinearRegression(), X, y, cv = cv)

array([0.53015329, 0.54543866, 0.5479705 , 0.52477551, 0.5668773 ])

In [391]:
# multiple linear regression
lx = LinearRegression()
lx.fit(X_train, y_train)

np.mean(cross_val_score(lx, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.540893888263475

In [375]:
# lass regression
ls = Lasso()
ls.fit(X_train, y_train)

np.mean(cross_val_score(ls, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.932275434635399

In [376]:
sv = SVR()
sv.fit(X_train, y_train)

np.mean(cross_val_score(sv, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.271308624793678

In [377]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

np.mean(cross_val_score(dt, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.439381901510385

In [378]:
# random forrest
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

np.mean(cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.3955528548275824

In [None]:
# turn models using GridSearchCV
parameters = {'C':[0.1,1,100,1000], 'kernel':['rbf','poly','sigmond','linear'], 'gamma':['scale','auto']}

In [None]:
gs = GridSearchCV(sv, parameters, scoring='neg_mean_absolute_error', cv = cv)
gs.fit(X_train, y_train)

In [326]:
# test ensembles
gs.best_score_

-2.3854407032348806

In [327]:
gs.best_estimator_

RandomForestRegressor(criterion='mae', max_features='log2', n_estimators=50)

In [335]:
lx_pred = lx.predict(X_test)

In [336]:
ls_pred = ls.predict(X_test)

In [337]:
sv_pred = sv.predict(X_test)

In [338]:
dt_pred = dt.predict(X_test)

In [341]:
gs_pred = gs.predict(X_test)

In [342]:
mean_absolute_error(y_test, lx_pred)

2.604443863818862

In [343]:
mean_absolute_error(y_test, ls_pred)

2.5555624234059966

In [344]:
mean_absolute_error(y_test, gs_pred)

2.4519110112359552

In [345]:
# prediction score
lx.score(X_test, y_test)

0.5645577098613761

In [346]:
ls.score(X_test, y_test)

0.5643525402062047

In [347]:
gs.score(X_test, y_test)

-2.4519110112359552

In [348]:
sv.score(X_test, y_test)

0.5375264853004023

In [349]:
dt.score(X_test, y_test)

0.49945001373706344