# Model Loading and application

In this notebook, we load our trained model and use it to predict prices from our dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, ElasticNetCV, LassoCV, RidgeCV, Lasso, SGDRegressor
from data_cleaner import eliza_cleaning, eliza_fillna
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Normalizer, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import explained_variance_score as evs # evaluation metric
from sklearn.metrics import r2_score as r2 # evaluation metric
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error
from pickle import load

In [26]:
raw_datas = pd.read_csv('https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv')
datas = eliza_cleaning(raw_datas)
datas = eliza_fillna(datas)
median = pd.read_csv('median.csv')
post = pd.read_csv('post_codes.csv', sep=';')
median['Gemeente'] = median['Gemeente'].str.lower()
post['Commune Principale'] = post['Commune principale'].str.lower()
median_with_post = median.merge(post[['Code postal', 'Commune Principale']], how='left', left_on='Gemeente', right_on='Commune Principale')
median_with_post = median_with_post.groupby('Gemeente').median()
median_with_post['Mediaanprijs 2020'].fillna(median_with_post['Mediaanprijs 2019'], inplace=True)
median_with_post['Mediaanprijs 2020'].fillna(median_with_post['Mediaanprijs 2018'], inplace=True)
median_with_post.sort_values(by='Code postal', inplace=True)
median_with_post.fillna(method='bfill', inplace=True)
median_with_post.reset_index(inplace=True)
median = median.merge(median_with_post[['Gemeente', 'Mediaanprijs 2020']], on='Gemeente')
median_with_post = median.merge(post[['Code postal', 'Commune Principale']], how='left', left_on='Gemeente', right_on='Commune Principale')
median_prices = median_with_post[['Code postal', 'Mediaanprijs 2020_y']]
median_prices.columns = ['postal_code', 'median_price']
median_prices = median_prices.groupby('postal_code').mean()
median_prices.reset_index(inplace=True)
median_prices['postal_code'] = median_prices['postal_code'].astype('int64')
datas = datas.merge(median_prices, how='left', left_on='locality', right_on='postal_code')
datas.drop('postal_code', inplace=True, axis=1)
datas.sort_values(by='locality', ascending=False, inplace=True)
datas['median_price'].fillna(method='ffill', inplace=True)
datas.sort_index(inplace=True)
col = datas.columns
col = [col[0]]+[col[-1]]+list(col[1:-1])
datas = datas[col]

In [27]:
#y = datas.pop('price')
X = datas
X[~X.area.isnull()]

Unnamed: 0,price,median_price,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
2,764999.0,426.25,2970,2,153.0,0,0,0,1,62,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
4,294999.0,247.00,3200,2,80.0,0,0,0,0,0,0,0,0.0,,0,unkown,Vlaams-Brabant,Vlaams,apartment
6,233999.0,280.00,8211,2,90.0,0,0,0,0,0,0,0,0.0,,0,unkown,West-Vlanderen,Vlaams,apartment
7,329899.0,350.00,2630,1,87.0,0,0,0,1,28,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
9,359899.0,350.00,2630,1,95.0,0,0,0,1,47,0,0,0.0,4,0,unkown,Antwerp,Vlaams,apartment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72427,307242.0,228.50,8902,3,150.0,0,0,0,0,0,0,0,288.0,3,0,unkown,West-Vlanderen,Vlaams,house
72428,315000.0,250.00,9600,3,150.0,0,0,0,1,,0,0,582.0,3,0,good,Oost-Vlanderen,Vlaams,house
72429,315000.0,250.00,9600,3,150.0,0,0,0,1,,0,0,610.0,3,0,good,Oost-Vlanderen,Vlaams,house
72433,175000.0,126.00,6000,4,205.0,0,0,0,1,23,1,600,800.0,2,0,unkown,Hainaut,Wallonie,house


In [6]:
pipe = load(open('preprocessor.pkl', 'rb'))
model_xgb = xgb.XGBRegressor()
model_xgb.load_model("model.json")


In [11]:
X = pipe.transform(X)

preds = model_xgb.predict(X)
rmse = np.sqrt(mean_squared_error( y, np.exp(preds)))
print("RMSE: %f" % (rmse))
print("R2 score: ", model_xgb.score(X,  np.log(y)))

RMSE: 197875.914774
R2 score:  0.8365436930880337


In [34]:
building = {'median_price':[600], #get_median_price(loc)
            'locality':[1000],
            'number_of_rooms':[1],
            'area':[70],
            'fully_equipped_kitchen':[1],
            'furnished':[0],
            'open_fire':[0],
            "terrace":[0],
            'terrace_area':[0],
            'garden':[0],
            'garden_area':[0],
            'surface_of_the_land':[0],
            "number_of_facades":[2],
            'swimming_pool':[0],
            'state_of_the_building':['good'],
            'province':['Brussels'],
            'region': ['Brussels Capital'],
            'type_of_property':['apartment']}
data = pd.DataFrame.from_dict(building)
data

Unnamed: 0,median_price,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
0,600,1000,1,70,1,0,0,0,0,0,0,0,2,0,good,Brussels,Brussels Capital,apartment


In [35]:
X = pipe.transform(data)

preds = model_xgb.predict(X)
print(np.exp(preds))

[263872.44]
