#### Import libraries

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import statistics
import pickle
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings

In [42]:
data1 = pd.read_csv('../data_cleaning.preprocessing/location_data/real_dif_data.csv')
data2 = pd.read_csv('../data_cleaning.preprocessing/location_data/same_data_ok2.csv')

### First approach

##### Concatenate both data frames (one containing coordinates based on address, the other one coordinates based on neghbourhood)

In [43]:
data = pd.concat([data1, data2], axis=0)

In [44]:
data['price'] = data['price'].apply(lambda x: x*1000)
data = pd.DataFrame(data,columns=['address','neighbourhood','district','sqft_surface', 'bedrooms', 'bathrooms', 'new_construction', 'latitude', 'longitude', 'price'])

In [47]:
data.to_csv('data_ready.csv', index=False)

In [48]:
data.head()

Unnamed: 0,address,neighbourhood,district,sqft_surface,bedrooms,bathrooms,new_construction,latitude,longitude,price
0,Avinguda del CID,Soternes,L'Olivereta,100.0,3,1,,39.468547,-0.397934,104000.0
1,Avenida de Gaspar Aguilar,La Raiosa,Jesús,102.0,3,1,,39.45652,-0.390126,97000.0
2,Carrer de les Alqueríes de Bellver,Benicalap,Benicalap,83.0,3,1,,39.493323,-0.396183,81000.0
3,calle de Planas,En Corts,Quatre Carreres,77.0,3,1,,39.457017,-0.370454,99000.0
4,calle de Jaime Beltrán,La Raiosa,Jesús,88.0,1,1,,39.456544,-0.388492,129900.0


##### Getting dummies & dataset ready for model

In [6]:
dummies = pd.get_dummies(data[['new_construction','district','neighbourhood']], drop_first=True)

In [7]:
df = data.drop(columns=['index', 'price', 'address', 'neighbourhood', 'district','new_construction'])
X = pd.concat([df, dummies], axis=1)

In [8]:
y = data['price']

##### Train/test split and model training

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
RFReg = RandomForestRegressor(n_estimators = 700, random_state = 0)
  
#Fit the random forest regressor with training data represented by X_train and y_train
RFReg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=700, random_state=0)

In [11]:
test_predictions = RFReg.predict((X_test))
train_predictions = RFReg.predict((X_train))

##### Error matrix train set

In [12]:
MAE = mean_absolute_error(y_train,train_predictions)
MSE = mean_squared_error(y_train,train_predictions)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_train,train_predictions)
Adj_r2 = round(1-((1-(R2))*(2339-1))/(2339-110-1),3)

print("The mean absolute error of the model in the test set is: %6.2f" % (MAE))
print("The mean squared error of the model in the test set is: %6.2f" % (MSE))
print("The root mean squared error of the model in the test set is: %6.2f" % (RMSE))
print("The R2 of the model in the test set is: %4.2f" % (R2))
print("The Adjusted R2 of the model in the test set is:", (Adj_r2))


The mean absolute error of the model in the test set is: 12390.84
The mean squared error of the model in the test set is: 374625899.97
The root mean squared error of the model in the test set is: 19355.26
The R2 of the model in the test set is: 0.96
The Adjusted R2 of the model in the test set is: 0.959


##### Error matrix test set

In [13]:
MAE = mean_absolute_error(y_test,test_predictions)
MSE = mean_squared_error(y_test,test_predictions)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,test_predictions)
Adj_r2 = round(1-((1-(R2))*(2339-1))/(2339-110-1),3)

y_t = list(y_test)
y_pred = test_predictions
relative_error = []
for j in range(len(test_predictions)):
    relative_error.append(abs(y_t[j] - y_pred[j])/y_t[j])
rel_e = statistics.mean(relative_error)


print("The mean absolute error of the model in the test set is: %6.2f" % (MAE))
print("The mean squared error of the model in the test set is: %6.2f" % (MSE))
print("The root mean squared error of the model in the test set is: %6.2f" % (RMSE))
print("The R2 of the model in the test set is: %4.2f" % (R2))
print("The Adjusted R2 of the model in the test set is:", (Adj_r2))
print("MAPE is: %4.1f %%" % (rel_e*100.))

The mean absolute error of the model in the test set is: 30009.21
The mean squared error of the model in the test set is: 2168290628.98
The root mean squared error of the model in the test set is: 46564.91
The R2 of the model in the test set is: 0.78
The Adjusted R2 of the model in the test set is: 0.767
MAPE is: 18.2 %


### Second approach

In [23]:
data = pd.read_csv('../data_cleaning.preprocessing/location_data/real_dif_data.csv')

In [24]:
data['price'] = data['price'].apply(lambda x: x*1000)

##### Getting dummies & dataset ready for model

In [25]:
dummies = pd.get_dummies(data[['new_construction','district','neighbourhood']], drop_first=True)

In [26]:
df = data.drop(columns=['index', 'price', 'address', 'neighbourhood', 'district','new_construction'])
X = pd.concat([df, dummies], axis=1)

In [27]:
y = data['price']

##### Train/test split and model training

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [29]:
RFReg = RandomForestRegressor(n_estimators = 700, random_state = 0)
  
#Fit the random forest regressor with training data represented by X_train and y_train
RFReg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=700, random_state=0)

In [30]:
test_predictions = RFReg.predict((X_test))
train_predictions = RFReg.predict((X_train))

##### Error matrix train set

In [31]:
MAE = mean_absolute_error(y_train,train_predictions)
MSE = mean_squared_error(y_train,train_predictions)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_train,train_predictions)
Adj_r2 = round(1-((1-(R2))*(2339-1))/(2339-110-1),3)

print("The mean absolute error of the model in the test set is: %6.2f" % (MAE))
print("The mean squared error of the model in the test set is: %6.2f" % (MSE))
print("The root mean squared error of the model in the test set is: %6.2f" % (RMSE))
print("The R2 of the model in the test set is: %4.2f" % (R2))
print("The Adjusted R2 of the model in the test set is:", (Adj_r2))

The mean absolute error of the model in the test set is: 12006.66
The mean squared error of the model in the test set is: 316381547.67
The root mean squared error of the model in the test set is: 17787.12
The R2 of the model in the test set is: 0.97
The Adjusted R2 of the model in the test set is: 0.968


##### Error matrix test set

In [32]:
MAE = mean_absolute_error(y_test,test_predictions)
MSE = mean_squared_error(y_test,test_predictions)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,test_predictions)
Adj_r2 = round(1-((1-(R2))*(2339-1))/(2339-110-1),3)
relative_error = []
for j in range(len(test_predictions)):
    relative_error.append(abs(y_t[j] - y_pred[j])/y_t[j])
rel_e = statistics.mean(relative_error)

print("The mean absolute error of the model in the test set is: %6.2f" % (MAE))
print("The mean squared error of the model in the test set is: %6.2f" % (MSE))
print("The root mean squared error of the model in the test set is: %6.2f" % (RMSE))
print("The R2 of the model in the test set is: %4.2f" % (R2))
print("The Adjusted R2 of the model in the test set is:", (Adj_r2))
print("MAPE is: %4.1f %%" % (rel_e*100.))

The mean absolute error of the model in the test set is: 31494.38
The mean squared error of the model in the test set is: 2186998156.75
The root mean squared error of the model in the test set is: 46765.35
The R2 of the model in the test set is: 0.80
The Adjusted R2 of the model in the test set is: 0.794
MAPE is: 18.9 %


#### Create a pickle with the model

In [33]:
file_name = "model"
outfile = open(file_name,'wb')

pickle.dump(RFReg,outfile)
outfile.close()