In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, ElasticNetCV, LassoCV, RidgeCV, Lasso, SGDRegressor
from data_cleaner import eliza_cleaning, eliza_fillna
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Normalizer, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import explained_variance_score as evs # evaluation metric
from sklearn.metrics import r2_score as r2 # evaluation metric
from sklearn.metrics import mean_squared_error

In [4]:


raw_datas = pd.read_csv('https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv')
datas = eliza_cleaning(raw_datas)
datas = eliza_fillna(datas)
#datas.drop(columns=['locality'], inplace=True)
house = datas[datas['type_of_property']=='house'].copy()
appart = datas[datas['type_of_property']=='apartment'].copy()
datas = datas[datas.price>50000]
y = datas.pop('price')
X = datas
house_y = house.pop('price')
house_x = house
appart_y = appart.pop('price')
appart_x = appart

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)


fillna = ColumnTransformer(
        [ ('imp', KNNImputer(n_neighbors=2, weights="uniform"), list(range(1,12)))],
         remainder='passthrough')
fillna = ColumnTransformer(
        [('imp_col1', SimpleImputer(strategy='mean'), ['area', 'terrace_area', 'garden_area', 
                                                      'surface_of_the_land']),
         ('imp_col2', SimpleImputer(strategy='median'), ['number_of_rooms', 'number_of_facades']),
        ],remainder='passthrough')
enc = ColumnTransformer(
        [
         ('enc', OneHotEncoder(sparse = False, drop ='first'), [-4, -3,-2,-1]),
         #('enc2', OneHotEncoder(sparse = False, handle_unknown='ignore'), [-7])
        ], remainder='passthrough')


model = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=20, min_samples_leaf=10, 
                               n_jobs=-1, warm_start=True)
#model = ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio)

#pipe = make_pipeline(fillna, enc, Normalizer())
#pipe.fit(X_train)


#xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
#                max_depth = 5, alpha = 10, n_estimators = 10)
#xg_reg.fit(pipe.transform(X_train),y_train)
#preds = xg_reg.predict(pipe.transform(X_val))
#rmse = np.sqrt(mean_squared_error(y_val, preds))
#print("RMSE: %f" % (rmse))


alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
model = ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio)




In [None]:
pipe = make_pipeline(fillna, enc, Normalizer(), PolynomialFeatures(2), model )
pipe.fit(X_train, y_train)
preds = pipe.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print("RMSE: %f" % (rmse))
print(pipe.score(X_val, y_val))
print(evs(y_val, preds))
print(r2(y_val, preds))

preds_train = pipe.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(pipe.score(X_train, y_train))
print(evs(y_train, preds_train))
print(r2(y_train, preds_train))

In [5]:
pipe = make_pipeline(fillna, enc, StandardScaler())
pipe.fit(X_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)

In [18]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
model =  ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio)
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
print(evs( np.log1p(y_train),  preds_train))


RMSE: 0.510622
0.3055113531185044
0.3055345528609442
RMSE_Train: 0.509004
0.30966406006713276
0.30966406006713276


In [26]:
model =  make_pipeline(PolynomialFeatures(2), Ridge(5))
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
print(evs( np.log1p(y_train),  preds_train))


RMSE: 0.507959
0.312735116688046
0.31275222174227213
RMSE_Train: 0.506765
0.3157230210377473
0.3157230210377474


In [108]:
model = make_pipeline(PolynomialFeatures(2),Lasso(1))
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
print(evs( np.log1p(y_train),  preds_train))

RMSE: 0.613471
0.0025255843666625033
0.00252933853418158
RMSE_Train: 0.597570
0.006878350529813004
0.006878350529813004


In [66]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

my_XGB_model = XGBRegressor()
my_XGB_model.fit(X_train, np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(X_val)
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(X_train, np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(X_val, np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(X_val)
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log(y_val), preds))
print("RMSE: %f" % (rmse))
print(my_XGB_model.score(X_val,  np.log(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = my_XGB_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(X_train,  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))
print("RMSE: %f" % (rmse))
print(my_XGB_model.score(X_val,  np.log(y_val)))
print(evs( np.log(y_val), preds))




Validation MAE for XGBoost Model : 91290.46625236742
Validation MAE for multi-pass XGBoost Model : 89948.58695819805
RMSE: 0.299181
0.7627636546822527
0.7628442797722688
RMSE_Train: 0.265319
0.8042234441848317
0.8042234441926572
RMSE: 0.299181
0.7627636546822527
0.7628442797722688
RMSE_Train: 0.265319
0.8042234441848317
0.8042234441926572


In [72]:
poly = PolynomialFeatures(2)
poly.fit(X_train)

my_XGB_model = XGBRegressor()
my_XGB_model.fit(poly.transform(X_train), np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(poly.transform(X_val))
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(poly.transform(X_train), np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(poly.transform(X_val), np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(poly.transform(X_val))
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(poly.transform(X_val))
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(my_XGB_model.score(poly.transform(X_val),  np.log1p(y_val)))
print(evs( np.log(y_val), preds))

preds_train = my_XGB_model.predict(poly.transform(X_train))
rmse_train = np.sqrt(mean_squared_error( np.log(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(poly.transform(X_train),  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))




Validation MAE for XGBoost Model : 92382.0016101867
Validation MAE for multi-pass XGBoost Model : 89508.91307968073
RMSE: 0.296273
0.7673526209578967
0.7674292101448412
RMSE_Train: 0.242655
0.8362428318096249
0.8362428319105412


In [95]:
model = SGDRegressor(max_iter=10*12, tol=1e-3)
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
print(evs( np.log1p(y_train),  preds_train))


RMSE: 0.577182
0.11704270548828088
0.11764209670120385
RMSE_Train: 0.454427
0.4256801833173314
0.42571124577496877


In [93]:
model = make_pipeline(PolynomialFeatures(2), RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]))
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
print(evs( np.log1p(y_train),  preds_train))



RMSE: 0.429755
0.5104956760020802
0.5105589287315657
RMSE_Train: 0.380113
0.5981637241120028
0.5981637241120028


In [99]:
from sklearn.linear_model import Lars

model = make_pipeline(PolynomialFeatures(2), Lars(n_nonzero_coefs=40))
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
from sklearn.linear_model import Lars
print(evs( np.log1p(y_train),  preds_train))

RMSE: 0.442299
0.4815030602166884
0.481563756322101
RMSE_Train: 0.423742
0.5006227333347059
0.5006227333347059


In [106]:
from sklearn.linear_model import BayesianRidge


model = make_pipeline(PolynomialFeatures(2),BayesianRidge(n_iter=2000))
model.fit(X_train, np.log1p(y_train))
preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error( np.log1p(y_val), preds))
print("RMSE: %f" % (rmse))
print(model.score(X_val,  np.log1p(y_val)))
print(evs( np.log1p(y_val), preds))

preds_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( np.log1p(y_train), preds_train))
print("RMSE_Train: %f" % (rmse_train))
print(model.score(X_train,  np.log1p(y_train)))
print(evs( np.log1p(y_train),  preds_train))

RMSE: 0.427820
0.5148938307184496
0.514963611363113
RMSE_Train: 0.380648
0.5970303754128237
0.5970303754128237


In [6]:
datas.head()

Unnamed: 0,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
0,2970,1,,0,0,0,1,11,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
1,2970,1,,0,0,0,1,6,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
2,2970,2,153.0,0,0,0,1,62,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
3,2970,3,,0,0,0,1,160,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
4,3200,2,80.0,0,0,0,0,0,0,0,0.0,,0,unkown,Vlaams-Brabant,Vlaams,apartment
