In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats

In [2]:
data = pd.read_csv('training_test_data.csv')

# choose relevants columns
data.columns

Index(['location', 'price', 'bedroom', 'bathroom', 'toilet', 'parking_lot',
       'serviced', 'newly_built', 'furnished'],
      dtype='object')

In [3]:
#data.price = data.price 

In [4]:
# get dummny data
df = pd.get_dummies(data)
df

Unnamed: 0,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished,location_Ado-Odo/Ota,location_Agege,...,location_Maryland,location_Mowe Ofada,location_Ogudu,location_Ojodu,location_Ojota,location_Oshodi,location_Shomolu,location_Surulere,location_Victoria Island (VI),location_Yaba
0,1.2,2,3,3,12,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,2,2,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2,2,3,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.1,2,2,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.8,1,1,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,17.0,3,3,4,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1456,0.7,2,2,3,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1457,0.8,2,2,3,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1458,2.5,3,3,3,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Scaling
z = np.abs(stats.zscore(df))

In [6]:
len(np.where(z > 3)[0])

691

In [7]:
outliers = list(set(np.where(z > 3)[0]))
df2 = df.drop(outliers,axis = 0).reset_index(drop = False)
display(df2)

Unnamed: 0,index,price,bedroom,bathroom,toilet,parking_lot,serviced,newly_built,furnished,location_Ado-Odo/Ota,...,location_Maryland,location_Mowe Ofada,location_Ogudu,location_Ojodu,location_Ojota,location_Oshodi,location_Shomolu,location_Surulere,location_Victoria Island (VI),location_Yaba
0,0,1.2,2,3,3,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,2,2,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1.0,2,2,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1.8,1,1,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,6.0,2,2,3,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,1448,1.4,3,3,4,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
885,1454,14.0,4,4,6,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,1455,17.0,3,3,4,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
887,1456,0.7,2,2,3,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# creating X and y variables
X = df2.drop(['price', 'index', 'toilet'], axis=1)
y = df2.price.values

In [9]:
## pre-processing
X_processed = StandardScaler().fit_transform(X)

In [10]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [11]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state = 0)

cross_val_score(LinearRegression(), X, y, cv = cv)

array([0.53015329, 0.54543866, 0.5479705 , 0.52477551, 0.5668773 ])

In [12]:
# multiple linear regression
lx = LinearRegression()
lx.fit(X_train, y_train)

np.mean(cross_val_score(lx, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.540893888263475

In [13]:
# lass regression
ls = Lasso()
ls.fit(X_train, y_train)

np.mean(cross_val_score(ls, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.932275434635399

In [14]:
sv = SVR()
sv.fit(X_train, y_train)

np.mean(cross_val_score(sv, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.271308624793678

In [15]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

np.mean(cross_val_score(dt, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.442306585453893

In [16]:
# random forrest
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

np.mean(cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv = cv))

-2.404265469156874

In [17]:
# turn models using GridSearchCV
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

In [18]:
gs = GridSearchCV(sv, param_grid, scoring='neg_mean_absolute_error', cv = cv)
gs.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=0, test_size=0.2, train_size=None),
             estimator=SVR(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}],
             scoring='neg_mean_absolute_error')

In [19]:
# test ensembles
gs.best_score_

-2.25874429943278

In [20]:
gs.best_estimator_

SVR(C=1000, gamma=0.001)

In [21]:
lx_pred = lx.predict(X_test)

In [22]:
ls_pred = ls.predict(X_test)

In [23]:
rf_pred = rf.predict(X_test)

In [24]:
dt_pred = dt.predict(X_test)

In [25]:
gs_pred = gs.best_estimator_.predict(X_test)

In [26]:
mean_absolute_error(y_test, lx_pred)

2.5919414842817097

In [27]:
mean_absolute_error(y_test, ls_pred)

3.116117336509765

In [28]:
mean_absolute_error(y_test, rf_pred)

2.5094544646323294

In [29]:
mean_absolute_error(y_test, gs_pred)

2.3322447889060385

In [30]:
mean_absolute_error(y_test, dt_pred)

2.543280396968304

In [34]:
# prediction score
svR = SVR(C=1000, gamma=0.001)
svR.fit(X_train, y_train)
svR.score(X_test, y_test)

0.543162950254017

In [35]:
svR.score(X_train, y_train)

0.5323423657272678

In [50]:
# save model to a file using python pickle
import pickle
pickl = {'model': gs.best_estimator_}
pickle.dump(pickl, open('model_file'+'.p','wb'))

file_name = 'model_file.p'
with open(file_name, 'rb') as pickled:
    data = pickle.load(pickled)
    model = data['model']

In [53]:
model.predict(X_test.iloc[1, : ].values.reshape(1,-1))
X_test.iloc[1, : ].values

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'