In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, ElasticNetCV, LassoCV, RidgeCV, Lasso, SGDRegressor
from data_cleaner import eliza_cleaning, eliza_fillna
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Normalizer, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import explained_variance_score as evs # evaluation metric
from sklearn.metrics import r2_score as r2 # evaluation metric
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# No extra data

In [2]:
raw_datas = pd.read_csv('https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv')
datas_clean = eliza_cleaning(raw_datas)
datas_clean = eliza_fillna(datas_clean)


## preprocessing with KNN

In [3]:
y = datas_clean.pop('price')
X = datas_clean

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)


fillna = ColumnTransformer(
        [ ('imp', KNNImputer(n_neighbors=10, weights="uniform"), list(range(1,12)))],
         remainder='passthrough') #2 sur le premier test

enc = ColumnTransformer(
        [
         ('enc', OneHotEncoder(sparse = False, drop ='first'), [-4, -3,-2,-1]),
         #('enc2', OneHotEncoder(sparse = False, handle_unknown='ignore'), [-7])
        ], remainder='passthrough')

In [None]:
pipe = make_pipeline(fillna, enc, StandardScaler())
pipe.fit(X_train)
X_train_knn = pipe.transform(X_train)
X_val_knn = pipe.transform(X_val)
X_test_knn = pipe.transform(X_test)

In [33]:
my_XGB_model = XGBRegressor()
my_XGB_model.fit(X_train_knn, np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(X_val_knn)
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(X_train_knn, np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(X_val_knn, np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(X_val_knn)
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(X_test_knn)
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(X_test_knn,  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

knn_raw =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(X_train_knn)
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(X_train_knn,  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))



Validation MAE for XGBoost Model : 90473.35796129025
Validation MAE for multi-pass XGBoost Model : 89223.46003470874
RMSE: 263084.930920
0.7452970683178033
0.7452970683178033
RMSE_Train: 158992.619017
0.8007125906547221
0.8007125908451034


In [38]:
poly = PolynomialFeatures(2)
poly.fit(X_train)

my_XGB_model = XGBRegressor()
my_XGB_model.fit(poly.transform(X_train_knn), np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(poly.transform(X_val_knn))
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(poly.transform(X_train_knn), np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(poly.transform(X_val_knn), np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(poly.transform(X_val_knn))
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(poly.transform(X_test_knn))
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(poly.transform(X_test_knn),  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

knn_raw2 =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(poly.transform(X_train_knn))
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(poly.transform(X_train_knn),  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))

Validation MAE for XGBoost Model : 91111.7588565948
Validation MAE for multi-pass XGBoost Model : 88883.5656012583
RMSE: 258963.821500
0.7464916645451543
0.7464916645451543
RMSE_Train: 147782.467474
0.8245784370370017
0.8245784370487407


## manual preprocessing

In [42]:
fillna = ColumnTransformer(
        [('imp_col1', SimpleImputer(strategy='mean'), ['area', 'terrace_area', 'garden_area', 
                                                      'surface_of_the_land']),
         ('imp_col2', SimpleImputer(strategy='median'), ['number_of_rooms', 'number_of_facades']),
        ],remainder='passthrough')

In [43]:
pipe = make_pipeline(fillna, enc, StandardScaler())
pipe.fit(X_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [44]:
my_XGB_model = XGBRegressor()
my_XGB_model.fit(X_train, np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(X_val)
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(X_train, np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(X_val, np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(X_val)
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(X_test)
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(X_test,  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

manual_raw =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(X_train,  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))

Validation MAE for XGBoost Model : 88050.8466914066
Validation MAE for multi-pass XGBoost Model : 85743.92652789918
RMSE: 251974.488014
0.7495535110272445
0.7495535110272445
RMSE_Train: 155946.592321
0.8112991371169392
0.8112991374031883


In [45]:
poly = PolynomialFeatures(2)
poly.fit(X_train)

my_XGB_model = XGBRegressor()
my_XGB_model.fit(poly.transform(X_train), np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(poly.transform(X_val))
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(poly.transform(X_train), np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(poly.transform(X_val), np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(poly.transform(X_val))
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(poly.transform(X_test))
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(poly.transform(X_test),  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

manual_raw2 =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(poly.transform(X_train))
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(poly.transform(X_train),  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))

Validation MAE for XGBoost Model : 88722.86407072865
Validation MAE for multi-pass XGBoost Model : 86480.35087891469
RMSE: 262379.207298
0.7465231937615295
0.7465231937615295
RMSE_Train: 149260.718587
0.8273149301211098
0.8273149301961114


### comparisons:

In [47]:
scores = pd.DataFrame([knn_raw, knn_raw2, manual_raw, manual_raw2], 
                      index = ['KNN imputer', 'KNN imputer def 2', 'manual imputer', 'manual imputer deg 2'],
                      columns =['RMSE', 'R2 score', 'explained variance score'])
scores.head()

Unnamed: 0,RMSE,R2 score,explained variance score
KNN imputer,263084.93092,0.745298,0.745297
KNN imputer def 2,258963.8215,0.746492,0.746492
manual imputer,251974.488014,0.749557,0.749554
manual imputer deg 2,262379.207298,0.746524,0.746523


# External Datas added:

In [63]:
raw_datas = pd.read_csv('https://raw.githubusercontent.com/JulienAlardot/challenge-collecting-data/main/Data/database.csv')
datas = eliza_cleaning(raw_datas)
datas = eliza_fillna(datas)
datas.head()

Unnamed: 0,price,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
0,319799.0,2970,1,,0,0,0,1,11,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
1,291999.0,2970,1,,0,0,0,1,6,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
2,764999.0,2970,2,153.0,0,0,0,1,62,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
3,660264.0,2970,3,,0,0,0,1,160,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
4,294999.0,3200,2,80.0,0,0,0,0,0,0,0,0.0,,0,unkown,Vlaams-Brabant,Vlaams,apartment


In [64]:
median = pd.read_csv('median.csv')
post = pd.read_csv('post_codes.csv', sep=';')
median['Gemeente'] = median['Gemeente'].str.lower()
post['Commune Principale'] = post['Commune principale'].str.lower()
median_with_post = median.merge(post[['Code postal', 'Commune Principale']], how='left', left_on='Gemeente', right_on='Commune Principale')
median_with_post = median_with_post.groupby('Gemeente').median()
median_with_post['Mediaanprijs 2020'].fillna(median_with_post['Mediaanprijs 2019'], inplace=True)
median_with_post['Mediaanprijs 2020'].fillna(median_with_post['Mediaanprijs 2018'], inplace=True)
median_with_post.sort_values(by='Code postal', inplace=True)
median_with_post.fillna(method='bfill', inplace=True)
median_with_post.reset_index(inplace=True)
median = median.merge(median_with_post[['Gemeente', 'Mediaanprijs 2020']], on='Gemeente')
median_with_post = median.merge(post[['Code postal', 'Commune Principale']], how='left', left_on='Gemeente', right_on='Commune Principale')
median_prices = median_with_post[['Code postal', 'Mediaanprijs 2020_y']]
median_prices.columns = ['postal_code', 'median_price']
median_prices = median_prices.groupby('postal_code').mean()
median_prices.reset_index(inplace=True)
median_prices['postal_code'] = median_prices['postal_code'].astype('int64')
datas = datas.merge(median_prices, how='left', left_on='locality', right_on='postal_code')
datas.drop('postal_code', inplace=True, axis=1)
datas.sort_values(by='locality', ascending=False, inplace=True)
datas['median_price'].fillna(method='ffill', inplace=True)
datas.sort_index(inplace=True)
#datas.drop(columns=['locality', 'terrace', 'garden','province', 'region' ], inplace=True)
col = datas.columns
col = [col[0]]+[col[-1]]+list(col[1:-1])
datas = datas[col]
datas.head()

Unnamed: 0,price,median_price,locality,number_of_rooms,area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,province,region,type_of_property
0,319799.0,426.25,2970,1,,0,0,0,1,11,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
1,291999.0,426.25,2970,1,,0,0,0,1,6,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
2,764999.0,426.25,2970,2,153.0,0,0,0,1,62,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
3,660264.0,426.25,2970,3,,0,0,0,1,160,0,0,0.0,,0,unkown,Antwerp,Vlaams,apartment
4,294999.0,247.0,3200,2,80.0,0,0,0,0,0,0,0,0.0,,0,unkown,Vlaams-Brabant,Vlaams,apartment


## KNN preprocessing

In [65]:
y = datas.pop('price')
X = datas

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)


fillna = ColumnTransformer(
        [ ('imp', KNNImputer(n_neighbors=2, weights="uniform"), list(range(1,14)))],
         remainder='passthrough')

enc = ColumnTransformer(
        [
         ('enc', OneHotEncoder(sparse = False, drop ='first'), [-4, -3,-2,-1]),
         #('enc2', OneHotEncoder(sparse = False, handle_unknown='ignore'), [-7])
        ], remainder='passthrough')

In [66]:
pipe = make_pipeline(fillna, enc, StandardScaler())
pipe.fit(X_train)
X_train_knn = pipe.transform(X_train)
X_val_knn = pipe.transform(X_val)
X_test_knn = pipe.transform(X_test)

In [67]:
my_XGB_model = XGBRegressor()
my_XGB_model.fit(X_train_knn, np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(X_val_knn)
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(X_train_knn, np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(X_val_knn, np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(X_val_knn)
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(X_test_knn)
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(X_test_knn,  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

knn =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(X_train_knn)
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(X_train_knn,  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))



Validation MAE for XGBoost Model : 87442.30877552232
Validation MAE for multi-pass XGBoost Model : 86419.91952915912
RMSE: 241518.028781
0.7258708041654378
0.7258708041654378
RMSE_Train: 169972.954138
0.7929494215499377
0.7929494225217547


In [69]:
poly = PolynomialFeatures(2)
poly.fit(X_train_knn)

my_XGB_model = XGBRegressor()
my_XGB_model.fit(poly.transform(X_train_knn), np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(poly.transform(X_val_knn))
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(poly.transform(X_train_knn), np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(poly.transform(X_val_knn), np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(poly.transform(X_val_knn))
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(poly.transform(X_test_knn))
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(poly.transform(X_test_knn),  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

knn2 =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(poly.transform(X_train_knn))
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(poly.transform(X_train_knn),  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))

Validation MAE for XGBoost Model : 88258.69999220976
Validation MAE for multi-pass XGBoost Model : 86109.41107185639
RMSE: 243168.871752
0.7251136328821335
0.7251136328821335
RMSE_Train: 156995.905984
0.8194789135228127
0.8194789142239381


## Manual Preprocessing

In [70]:
fillna = ColumnTransformer(
        [('imp_col1', SimpleImputer(strategy='mean'), ['area', 'terrace_area', 'garden_area', 
                                                      'surface_of_the_land']),
         ('imp_col2', SimpleImputer(strategy='median'), ['number_of_rooms', 'number_of_facades']),
        ],remainder='passthrough')

In [71]:
pipe = make_pipeline(fillna, enc, StandardScaler())
pipe.fit(X_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [72]:
my_XGB_model = XGBRegressor()
my_XGB_model.fit(X_train, np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(X_val)
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(X_train, np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(X_val, np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(X_val)
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(X_test)
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(X_test,  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

manual =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(X_train,  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))

Validation MAE for XGBoost Model : 85802.56968118687
Validation MAE for multi-pass XGBoost Model : 83565.6799839339
RMSE: 238601.376548
0.7431936378579027
0.7431936378579027
RMSE_Train: 165370.207656
0.8095363307373893
0.8095363309763812


In [73]:
poly = PolynomialFeatures(2)
poly.fit(X_train)

my_XGB_model = XGBRegressor()
my_XGB_model.fit(poly.transform(X_train), np.log(y_train), verbose=False)

# make predictions
XGB_predictions = my_XGB_model.predict(poly.transform(X_val))
XGB_predictions = np.exp(XGB_predictions)
# Print MAE for initial XGB model
XGB_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for XGBoost Model : " + str(XGB_mae))
      
# Additional Passes
my_XGB_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_XGB_model.fit(poly.transform(X_train), np.log(y_train), early_stopping_rounds=5, 
             eval_set=[(poly.transform(X_val), np.log(y_val))], verbose=False)
XGB_predictions = my_XGB_model.predict(poly.transform(X_val))
XGB_predictions = np.exp(XGB_predictions)
XGB_mult_mae = mean_absolute_error(XGB_predictions, y_val)
print("Validation MAE for multi-pass XGBoost Model : " + str(XGB_mult_mae))



preds = my_XGB_model.predict(poly.transform(X_test))
rmse = np.sqrt(mean_squared_error( y_test, np.exp(preds)))
print("RMSE: %f" % (rmse))
score = my_XGB_model.score(poly.transform(X_test),  np.log(y_test))
print(score)
exp_reg_score = evs( np.log(y_test), preds)        
print(score)

manual2 =[rmse, exp_reg_score, score] 
               
preds_train = my_XGB_model.predict(poly.transform(X_train))
rmse_train = np.sqrt(mean_squared_error( y_train, np.exp(preds_train)))
print("RMSE_Train: %f" % (rmse_train))
print(my_XGB_model.score(poly.transform(X_train),  np.log(y_train)))
print(evs( np.log(y_train),  preds_train))

Validation MAE for XGBoost Model : 85528.79124140713
Validation MAE for multi-pass XGBoost Model : 85266.78001873704
RMSE: 242014.243486
0.7357884549796763
0.7357884549796763
RMSE_Train: 166041.632108
0.809314839350535
0.8093148395952371


# Comparisons

In [83]:
scores = pd.DataFrame([knn_raw, knn_raw2, manual_raw, manual_raw2,
                      knn, knn2, manual, manual2], 
                      index = ['KNN imputer', 'KNN imputer deg 2', 'manual imputer', 'manual imputer deg 2',
                              'KNN imputer extra', 'KNN imputer deg 2 extra', 'manual imputer extra', 'manual impter deg 2 extra'],
                      columns =['RMSE', 'R2 score', 'explained variance score'])
scores.drop(columns=['explained variance score'], inplace=True)
scores.sort_values(by='RMSE').head(10)

Unnamed: 0,RMSE,R2 score
manual imputer extra,238601.376548,0.743195
KNN imputer extra,241518.028781,0.725871
manual impter deg 2 extra,242014.243486,0.73579
KNN imputer deg 2 extra,243168.871752,0.725114
manual imputer,251974.488014,0.749557
KNN imputer deg 2,258963.8215,0.746492
manual imputer deg 2,262379.207298,0.746524
KNN imputer,263084.93092,0.745298
