In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, ElasticNetCV, LassoCV, RidgeCV, ElasticNet
from data_cleaner import eliza_cleaning, eliza_fillna
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Normalizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [2]:
raw_datas = pd.read_csv('https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv')
print(raw_datas.shape)
raw_datas.head()

(73510, 24)


Unnamed: 0.1,Unnamed: 0,Url,Source,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Area,...,Terrace Area,Garden,Garden Area,Surface of the land,Surface area of the plot of land,Number of facades,Swimming pool,State of the building,Province,Region
0,0,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,319799.0,regular sale,1.0,,...,11.0,0.0,,,,,0.0,,Antwerp,Vlaams
1,1,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,291999.0,regular sale,1.0,,...,6.0,0.0,,,,,0.0,,Antwerp,Vlaams
2,2,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,764999.0,regular sale,2.0,153.0,...,62.0,0.0,,,,,0.0,,Antwerp,Vlaams
3,3,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,2970,apartment,,660264.0,regular sale,3.0,,...,160.0,0.0,,,,,0.0,,Antwerp,Vlaams
4,4,https://www.logic-immo.be/fr/vente/appartement...,logic-immo.be,3200,apartment,,294999.0,regular sale,2.0,80.0,...,,0.0,,,,,0.0,,Vlaams-Brabant,Vlaams


In [3]:
datas = eliza_cleaning(raw_datas.copy())
datas = eliza_fillna(datas)
print(datas.shape)
datas.head()

(72392, 18)


Unnamed: 0,price,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
0,319799.0,2970,1,,0,0,0,1,11,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
1,291999.0,2970,1,,0,0,0,1,6,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
2,764999.0,2970,2,153.0,0,0,0,1,62,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
3,660264.0,2970,3,,0,0,0,1,160,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
4,294999.0,3200,2,80.0,0,0,0,0,0,0,0,0.0,,0,unkown,Vlaams-Brabant,Vlaams,apartment


In [4]:
100*datas.isnull().sum()/datas.shape[0]

price                      0.000000
locality                   0.000000
type_of_property           0.000000
number_of_rooms            1.099569
area                      15.130125
fully_equipped_kitchen     0.000000
furnished                  0.000000
open_fire                  0.000000
terrace                    0.000000
terrace_area              15.565256
garden                     0.000000
garden_area                9.184717
surface_of_the_land        9.436126
number_of_facades         50.600895
swimming_pool              0.000000
state_of_the_building      0.000000
province                   0.000000
region                     0.000000
dtype: float64

In [5]:
datas[datas['type_of_property'].isnull()].describe()

Unnamed: 0,price,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,garden,surface_of_the_land,number_of_facades,swimming_pool
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,
std,,,,,,,,,,,,
min,,,,,,,,,,,,
25%,,,,,,,,,,,,
50%,,,,,,,,,,,,
75%,,,,,,,,,,,,
max,,,,,,,,,,,,


In [6]:
datas['state_of_the_building'].unique()

array(['unkown', 'good', 'new', 'to renovate'], dtype=object)

In [7]:
house = datas[datas.type_of_property=='house']
house.isnull().sum()/house.shape[0]

price                     0.000000
locality                  0.000000
type_of_property          0.000000
number_of_rooms           0.007594
area                      0.216994
fully_equipped_kitchen    0.000000
furnished                 0.000000
open_fire                 0.000000
terrace                   0.000000
terrace_area              0.194326
garden                    0.000000
garden_area               0.130783
surface_of_the_land       0.075822
number_of_facades         0.391136
swimming_pool             0.000000
state_of_the_building     0.000000
province                  0.000000
region                    0.000000
dtype: float64

In [8]:
appart = datas[datas.type_of_property=='apartment']
appart.isnull().sum()/appart.shape[0]

price                     0.000000
locality                  0.000000
type_of_property          0.000000
number_of_rooms           0.013826
area                      0.087270
fully_equipped_kitchen    0.000000
furnished                 0.000000
open_fire                 0.000000
terrace                   0.000000
terrace_area              0.119785
garden                    0.000000
garden_area               0.055083
surface_of_the_land       0.110140
number_of_facades         0.612448
swimming_pool             0.000000
state_of_the_building     0.000000
province                  0.000000
region                    0.000000
dtype: float64

In [75]:
appart[(appart.surface_of_the_land.isnull()) & appart.garden_area==0 ].shape[0]/appart.shape[0]

0.9449438509248886

## na filling :

1) appartement : 
    -number of room: median
    -area: mean
    -furnished: 0
    -terrace_area: mean 
    -garden_area: mean 
    -surface_of_the_land: 0
    -number_of_facades: median
    -state_of_the_building: new cat
 2) house: 
     -number_of_rooms: median
     -area : mean
     -furnished:0
     -terrace_area:mean
     -garden_area:mean
     -surface_of_the_land: 0
     -number_of_facades: median
     -state_of_the_building: new cat

In [95]:
house_val = datas[datas.type_of_property=='house']['number_of_facades'].fill_function_1()
apart_val = datas[datas.type_of_property=='apartment']['number_of_facades'].apply(full_function_2)
datas['number_of_facades'] = datas.fillna(house_val).where(datas.type_of_property=='house', other=datas.fillna(apart_val))['number_of_facades']

## Train splittin and preprocessing

In [29]:
datas = eliza_cleaning(raw_datas.copy())
datas = eliza_fillna(datas)
house = datas[datas['type_of_property']=='house'].copy()
appart = datas[datas['type_of_property']=='apartment'].copy()
y = datas.pop('price')
X = datas
house_y = house.pop('price')
house_x = house
appart_y = appart.pop('price')
appart_x = appart

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [9]:
fillna = ColumnTransformer(
        [('imp_col1', SimpleImputer(strategy='mean'), ['area', 'terrace_area', 'garden_area', 
                                                      'surface_of_the_land']),
         ('imp_col2', SimpleImputer(strategy='median'), ['number_of_rooms', 'number_of_facades']),
        ],
        
        remainder='passthrough')
enc = ColumnTransformer(
        [
         ('enc', OneHotEncoder(sparse = False, drop ='first'), [-4, -3,-2,-1]),
         #('enc2', OneHotEncoder(sparse = False, handle_unknown='ignore'), [-7])
        ], remainder='passthrough')


model = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=20, min_samples_leaf=10, 
                               n_jobs=-1, warm_start=True)

In [10]:
pipe = make_pipeline(fillna, enc, model )

In [None]:
pipe.fit(X_train, y_train)

In [224]:
pipe.score(X_train, y_train)

0.44874589282123445

In [15]:
X_train.head()

Unnamed: 0,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
31922,4300,3,,0,0,0,0,0.0,0,0,643.0,2.0,0,good,Liège,Wallonie,house
27322,8670,4,,0,0,0,0,0.0,0,0,935.0,,0,unkown,West-Vlanderen,Vlaams,house
24554,8301,2,109.0,0,0,0,1,,0,0,0.0,2.0,0,good,West-Vlanderen,Vlaams,apartment
19379,9000,3,154.0,0,0,0,1,,0,0,0.0,2.0,0,good,Oost-Vlanderen,Vlaams,apartment
60080,7340,1,,0,0,0,0,0.0,0,0,135.0,,0,to renovate,Hainaut,Wallonie,house


In [26]:
fillna =  ColumnTransformer(
        [ ('imp', KNNImputer(n_neighbors=5, weights="uniform"), list(range(1,12)))],
         remainder='passthrough')
    


enc = ColumnTransformer(
        [
         ('enc', OneHotEncoder(sparse = False, drop ='first'), [-4, -3,-2,-1]),
        ], remainder='passthrough')
model = Ridge(5)
pipe = make_pipeline(enc, fillna, model )

In [20]:
pipe.fit(X_train, y_train)

ValueError: not enough values to unpack (expected 3, got 2)

In [28]:
pipe = make_pipeline(fillna, enc )
pipe.fit(X_train)
pd.DataFrame(pipe.transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0.0,0,0,643,2.0,4300,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0.0,0,0,935,3.6,8670,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,18.6,0,0,0,2.0,8301,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,16.6,0,0,0,2.0,9000,0
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0.0,0,0,135,3.0,7340,0


In [23]:
X_train.head()

Unnamed: 0,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
31922,4300,3,,0,0,0,0,0.0,0,0,643.0,2.0,0,good,Liège,Wallonie,house
27322,8670,4,,0,0,0,0,0.0,0,0,935.0,,0,unkown,West-Vlanderen,Vlaams,house
24554,8301,2,109.0,0,0,0,1,,0,0,0.0,2.0,0,good,West-Vlanderen,Vlaams,apartment
19379,9000,3,154.0,0,0,0,1,,0,0,0.0,2.0,0,good,Oost-Vlanderen,Vlaams,apartment
60080,7340,1,,0,0,0,0,0.0,0,0,135.0,,0,to renovate,Hainaut,Wallonie,house


In [52]:
np.log1p(y_val).std()

0.6091497334233146

# model Testing

In [45]:


from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, ElasticNetCV, LassoCV, RidgeCV
from data_cleaner import eliza_cleaning, eliza_fillna
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Normalizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

raw_datas = pd.read_csv('https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv')
datas = eliza_cleaning(raw_datas)
datas = eliza_fillna(datas)
#datas.drop(columns=['locality'], inplace=True)
house = datas[datas['type_of_property']=='house'].copy()
appart = datas[datas['type_of_property']=='apartment'].copy()
y = datas.pop('price')
X = datas
house_y = house.pop('price')
house_x = house
appart_y = appart.pop('price')
appart_x = appart

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

fillna = ColumnTransformer(
        [('imp_col1', SimpleImputer(strategy='mean'), ['area', 'terrace_area', 'garden_area', 
                                                      'surface_of_the_land']),
         ('imp_col2', SimpleImputer(strategy='median'), ['number_of_rooms', 'number_of_facades']),
        ],
        
        remainder='passthrough')
fillna = ColumnTransformer(
        [ ('imp', KNNImputer(n_neighbors=2, weights="uniform"), list(range(1,12)))],
         remainder='passthrough')
enc = ColumnTransformer(
        [
         ('enc', OneHotEncoder(sparse = False, drop ='first'), [-4, -3,-2,-1]),
         #('enc2', OneHotEncoder(sparse = False, handle_unknown='ignore'), [-7])
        ], remainder='passthrough')


model = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=20, min_samples_leaf=10, 
                               n_jobs=-1, warm_start=True)
#model = ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio)

#pipe = make_pipeline(fillna, enc, Normalizer())
#pipe.fit(X_train)


#xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
#                max_depth = 5, alpha = 10, n_estimators = 10)
#xg_reg.fit(pipe.transform(X_train),y_train)
#preds = xg_reg.predict(pipe.transform(X_val))
#rmse = np.sqrt(mean_squared_error(y_val, preds))
#print("RMSE: %f" % (rmse))


alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
model = ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio)

pipe = make_pipeline(fillna, enc,Normalizer(),  model )
pipe.fit(X_train, np.log1p(y_train))
preds = pipe.predict(X_val)
rmse = np.sqrt(mean_squared_error(np.log(y_val), preds))
print("RMSE: %f" % (rmse))
print(pipe.score(X_val, np.log1p(y_val)))


RMSE: 0.505654
0.32117467977123215


In [61]:


kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
    return (rmse)



In [62]:


alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]



In [63]:
pipe = make_pipeline(fillna, enc,Normalizer())

ridge = make_pipeline(pipe, RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(pipe, LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(pipe, ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(pipe, SVR(C= 20, epsilon= 0.008, gamma=0.0003,))



In [64]:


gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)                             



In [66]:


xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)




In [70]:
from mlxtend.regressor import StackingCVRegressor
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [71]:
score = cv_rmse(ridge)
score = cv_rmse(lasso)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(elasticnet)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

LASSO: 303903.3238 (32447.7771)
 2021-05-28 15:25:12.435163
elastic net: 303903.3604 (32447.7217)
 2021-05-28 15:48:18.130954
SVR: 405713.9556 (35808.7061)
 2021-05-28 17:02:44.788954
gbr: nan (nan)
 2021-05-28 17:02:44.950228


Traceback (most recent call last):
  File "/home/nathanael/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/nathanael/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_gb.py", line 409, in fit
    X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
  File "/home/nathanael/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/nathanael/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/home/nathanael/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/nathanael/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/home/nat

xgboost: nan (nan)
 2021-05-28 17:02:45.072968
