In [33]:
# Je regroupe les imports en début de notebook
import os
import time
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import xgboost as xgb # installation : conda install -c conda-forge xgboost

# Je me positionne dans mon répertoire de travail personnel
os.chdir("C:\\Users\\Julien Gremillot\\OneDrive\\Documents\\OpenClassrooms\\Jupyter")
os.getcwd()

# Lecture du fichier CSV
data = pd.read_csv("building-energy-model-data.csv", sep=',') # séparateur = virgule

In [2]:
data.columns

Index(['CouncilDistrictCode', 'NumberofBuildings', 'NumberofFloors',
       'PropertyGFAParking', 'PropertyGFABuilding(s)',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA',
       'Latitude', 'Longitude', 'ZipCode',
       ...
       '5.2', '6.2', '7.2', '8.2', '9.2', '10.2', '11.2', '0.4', '1.4', '2.4'],
      dtype='object', length=117)

In [3]:
index_consommation = -1
index_emissions = -1
for i in range(len(data.columns)):
    if data.iloc[:, i].name == 'consommation':
        index_consommation = i
    if data.iloc[:, i].name == 'emissions':
        index_emissions = i
print('index_consommation =', index_consommation, '\nindex_emissions =', index_emissions)

index_consommation = 11 
index_emissions = 12


In [4]:
# On récupère les features d'un côté...
X = pd.concat([data.iloc[:,:index_consommation - 1], data.iloc[:,index_emissions:]], axis = 1)

# et les targets de l'autre
y_consommation = data.iloc[:,index_consommation]
y_emissions = data.iloc[:,index_emissions]

In [5]:
print(X.columns, y_consommation.head(), y_emissions.head())

Index(['CouncilDistrictCode', 'NumberofBuildings', 'NumberofFloors',
       'PropertyGFAParking', 'PropertyGFABuilding(s)',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA',
       'Latitude', 'Longitude', 'ZipCode',
       ...
       '5.2', '6.2', '7.2', '8.2', '9.2', '10.2', '11.2', '0.4', '1.4', '2.4'],
      dtype='object', length=115) 0    16.107916
1    15.691287
2    16.634519
3    15.314463
4    16.354152
Name: consommation, dtype: float64 0    4.234251
1    3.855241
2    4.760805
3    4.902976
4    5.424598
Name: emissions, dtype: float64


On commence définir nos jeux de données pour le modèle de prédiction de la consommation d'énergie

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y_consommation, test_size=0.2)

std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

In [7]:
X_train

Unnamed: 0,CouncilDistrictCode,NumberofBuildings,NumberofFloors,PropertyGFAParking,PropertyGFABuilding(s),SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA,Latitude,Longitude,ZipCode,...,5.2,6.2,7.2,8.2,9.2,10.2,11.2,0.4,1.4,2.4
12,5,1.0,1.0,0,21633,0.0,0.0,47.729694,-122.345432,98133,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
958,4,1.0,3.0,0,33176,15000.0,0.0,47.632450,-122.325290,98102,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1437,4,1.0,2.0,0,78401,9940.0,4960.0,47.664500,-122.312240,98105,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
540,7,1.0,5.0,117867,133177,12717.0,0.0,47.612870,-122.350220,98121,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1446,7,1.0,1.0,0,21420,5000.0,2000.0,47.601350,-122.333270,98104,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,7,1.0,3.0,0,45864,0.0,0.0,47.619540,-122.358470,98119,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1338,7,1.0,39.0,112124,962428,303000.0,141450.0,47.613760,-122.334450,98101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,7,1.0,7.0,0,122810,26450.0,8000.0,47.611550,-122.341140,98101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1433,5,1.0,1.0,0,29400,0.0,0.0,47.708330,-122.344290,98133,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X_train_std

array([[ 0.25402893, -0.12451079, -0.51221037, ...,  1.76431428,
        -0.10603951, -0.09067098],
       [-0.20410613, -0.12451079, -0.18442489, ..., -0.56679244,
        -0.10603951, -0.09067098],
       [-0.20410613, -0.12451079, -0.34831763, ...,  1.76431428,
        -0.10603951, -0.09067098],
       ...,
       [ 1.17029907, -0.12451079,  0.47114607, ...,  1.76431428,
        -0.10603951, -0.09067098],
       [ 0.25402893, -0.12451079, -0.51221037, ..., -0.56679244,
        -0.10603951, -0.09067098],
       [ 0.25402893, -0.12451079, -0.34831763, ..., -0.56679244,
        -0.10603951, -0.09067098]])

Ici on définit une première fonction qui va nous afficher les différentes mesures de qualité d'après les prédictions de nos modèles et une seconde fonction qui va entrainer le modèle du type passé en paramètre, faire des prédictions et appeler la première méthode pour afficher les résultats de ce modèle.

In [9]:
def show_metrics(ytest, ypred):
    print("mean_absolute_error : {:.2f}".format(mean_absolute_error(ytest, ypred)))
    print("mean_squared_error : {:.2f}".format(mean_squared_error(ytest, ypred)))
    print("root_mean_squared_error : {:.2f}".format(np.sqrt(mean_squared_error(ytest, ypred))))
    print("rmpse : {:.2f}".format(np.sqrt(np.mean(np.square(((ytest - ypred) / ytest))))))
    print("mean_absolute_percentage_error : {:.2f}".format(mean_absolute_percentage_error(ytest, ypred)))
    print("r2_score : {:.2f}".format(r2_score(ytest, ypred)))
    
def fit_predict_and_show_metrics(model, Xtrain, Xtest, ytrain, ytest):
    model.fit(Xtrain, ytrain)
    ypred = model.predict(Xtest)
    show_metrics(ytest, ypred)

On commence par entrainer un modèle "Dummy" (qui renvoie toujours la médiane) afin d'avoir une base de comparaison des autres résultats

In [10]:
dummy_regressor = DummyRegressor(strategy="median")
fit_predict_and_show_metrics(dummy_regressor, X_train_std, X_test_std, y_train, y_test)

mean_absolute_error : 0.98
mean_squared_error : 1.60
root_mean_squared_error : 1.26
rmpse : 0.09
mean_absolute_percentage_error : 0.07
r2_score : -0.00


Puis on affiche les scores pour un modèle de régression linéaire et ses dérivées Ridge et Lasso :

In [11]:
linear_regression = LinearRegression()
fit_predict_and_show_metrics(linear_regression, X_train, X_test, y_train, y_test)

mean_absolute_error : 0.35
mean_squared_error : 0.31
root_mean_squared_error : 0.56
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.81


In [12]:
linear_regression = LinearRegression()
fit_predict_and_show_metrics(linear_regression, X_train_std, X_test_std, y_train, y_test) # HEUUUUUUUUUU ?????

mean_absolute_error : 29172933.42
mean_squared_error : 287515917106719904.00
root_mean_squared_error : 536205107.31
rmpse : 42027809.03
mean_absolute_percentage_error : 2286493.70
r2_score : -180201209070575936.00


In [13]:
ridge = Ridge(random_state=123)
fit_predict_and_show_metrics(ridge, X_train, X_test, y_train, y_test)

mean_absolute_error : 0.34
mean_squared_error : 0.30
root_mean_squared_error : 0.55
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.81


In [14]:
lasso = Lasso(random_state=123)
fit_predict_and_show_metrics(lasso, X_train, X_test, y_train, y_test)

mean_absolute_error : 0.73
mean_squared_error : 2.68
root_mean_squared_error : 1.64
rmpse : 0.09
mean_absolute_percentage_error : 0.05
r2_score : -0.68


De ces 3 modèles, on constate que c'est la variante Ridge qui présente les meilleurs résultats (score R2 le plus proche de 1), suivi de très prêt par la régression linéaire.

On va tester également 2 autres types de modèles : le gradient et le random forest.

In [39]:
gradient_boosting_regressor = GradientBoostingRegressor(random_state=123)
fit_predict_and_show_metrics(gradient_boosting_regressor, X_train, X_test, y_train, y_test)
print("---")
random_forest_regressor = RandomForestRegressor(random_state=123)
fit_predict_and_show_metrics(random_forest_regressor, X_train, X_test, y_train, y_test)
print("---")
xgb_regressor = xgb.XGBRegressor(random_state=123)
fit_predict_and_show_metrics(xgb_regressor, X_train, X_test, y_train, y_test)

mean_absolute_error : 0.25
mean_squared_error : 0.13
root_mean_squared_error : 0.36
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.92
---
mean_absolute_error : 0.26
mean_squared_error : 0.15
root_mean_squared_error : 0.38
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.91
---
mean_absolute_error : 0.27
mean_squared_error : 0.15
root_mean_squared_error : 0.38
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.91




In [40]:
gradient_boosting_regressor.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 123,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

D'après les chiffres obtenus, le modèle du Gradient est le plus performant avec un score R2 de 92%.
Nous allons donc essayer d'afiner ce résultat en modifiant ses paramètres.

In [21]:
tps1 = time.time()
gradient_boosting_regressor = GradientBoostingRegressor(random_state=123)
parameters = {
    'learning_rate': [ 0.01, 0.02, 0.03, 0.04 ],
    'subsample'    : [ 0.9, 0.5, 0.2, 0.1 ],
    'n_estimators' : [ 100, 500, 1000, 1500 ],
    'max_depth'    : [ 4, 6, 8, 10 ]
}
grid_search_cv = GridSearchCV(estimator = gradient_boosting_regressor, param_grid = parameters, cv = 2, n_jobs=-1)
grid_search_cv.fit(X_train, y_train)
print("best estimator :", grid_search_cv.best_estimator_)
print("best score :", grid_search_cv.best_score_)
print("best params :", grid_search_cv.best_params_)
tps2 = time.time()
print("\nTemps de traitement total :", tps2 - tps1, "secondes")

best estimator : GradientBoostingRegressor(learning_rate=0.01, max_depth=8, n_estimators=1000,
                          random_state=123, subsample=0.2)
best score : 0.9180428893715404
best params : {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1000, 'subsample': 0.2}

Temps de traitement total : 275.19457268714905 secondes


On ré-exécute le Gradient avec les meilleurs paramètres trouvés afin de vérifier son score via la méthode précédente.

In [22]:
gradient_boosting_regressor = GradientBoostingRegressor(learning_rate=0.01, max_depth=8, n_estimators=1000,
                          random_state=123, subsample=0.2)
fit_predict_and_show_metrics(gradient_boosting_regressor, X_train, X_test, y_train, y_test)

mean_absolute_error : 0.25
mean_squared_error : 0.14
root_mean_squared_error : 0.37
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.91


MOINS BIEN QU'AVEC LES PARAMETRES PAR DEFAUT ???

On essaie d'affiner les résultats du Random Forest qui obtenait le deuxième meilleur score lors de notre première comparaison, pour voir s'il dépasse le Gradient avec des meilleurs paramètres.

In [30]:
tps1 = time.time()
random_forest_regressor = RandomForestRegressor(random_state=123)
parameters = {
    'bootstrap': [True, False],
    'max_depth': [50, 60, 80],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [1, 2, 3],
    'n_estimators': [200, 300, 400]
}
grid_search_cv = GridSearchCV(estimator = random_forest_regressor, param_grid = parameters, cv = 3, n_jobs = -1)
grid_search_cv.fit(X_train, y_train)
print("best estimator :", grid_search_cv.best_estimator_)
print("best score :", grid_search_cv.best_score_)
print("best params :", grid_search_cv.best_params_)
tps2 = time.time()
print("\nTemps de traitement total :", tps2 - tps1, "secondes")

 0.90831379 0.90827367 0.90802417        nan        nan        nan
 0.90920125 0.90930957 0.90926233 0.90920125 0.90930957 0.90926233
        nan        nan        nan 0.90893858 0.90880437 0.90881245
 0.90893858 0.90880437 0.90881245        nan        nan        nan
 0.90821843 0.90834224 0.90809414 0.90831379 0.90827367 0.90802417
        nan        nan        nan 0.90920125 0.90930957 0.90926233
 0.90920125 0.90930957 0.90926233        nan        nan        nan
 0.90893858 0.90880437 0.90881245 0.90893858 0.90880437 0.90881245
        nan        nan        nan 0.90821843 0.90834224 0.90809414
 0.90831379 0.90827367 0.90802417        nan        nan        nan
 0.90920125 0.90930957 0.90926233 0.90920125 0.90930957 0.90926233
        nan        nan        nan 0.90893858 0.90880437 0.90881245
 0.90893858 0.90880437 0.90881245        nan        nan        nan
 0.84921478 0.84945712 0.84946759 0.85031589 0.85048222 0.85043319
        nan        nan        nan 0.85755328 0.85762017 0.8576

best estimator : RandomForestRegressor(max_depth=50, min_samples_leaf=2, n_estimators=300,
                      random_state=123)
best score : 0.9093095747710763
best params : {'bootstrap': True, 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}

Temps de traitement total : 325.37567615509033 secondes


In [36]:
random_forest_regressor = RandomForestRegressor(max_depth=50, min_samples_leaf=2, n_estimators=300,
                      random_state=123)
fit_predict_and_show_metrics(random_forest_regressor, X_train, X_test, y_train, y_test)

mean_absolute_error : 0.26
mean_squared_error : 0.14
root_mean_squared_error : 0.38
rmpse : 0.03
mean_absolute_percentage_error : 0.02
r2_score : 0.91


In [35]:
tps1 = time.time()
xgb_regressor = xgb.XGBRegressor(random_state=123)
parameters = {
    'gamma':[ i/10.0 for i in range(3,6) ],
    'nthread':[ 4 ],
    'objective':[ 'reg:linear' ],
    'learning_rate': [ .03, 0.05, .07 ],
    'max_depth': [ 2, 4, 6, 8 ],
    'min_child_weight': [4, 5],
    'silent': [ 1 ],
    'subsample': [ i/10.0 for i in range(6, 11) ],
    'colsample_bytree': [ i/10.0 for i in range(6, 11) ],
    'n_estimators': [ 500 ]
}
grid_search_cv = GridSearchCV(estimator = xgb_regressor, param_grid = parameters, cv = 3, n_jobs=-1)
grid_search_cv.fit(X_train, y_train)
print("best estimator :", grid_search_cv.best_estimator_)
print("best score :", grid_search_cv.best_score_)
print("best params :", grid_search_cv.best_params_)
tps2 = time.time()
print("\nTemps de traitement total :", tps2 - tps1, "secondes")

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


best estimator : XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.3, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=8,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
             objective='reg:linear', random_state=123, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, silent=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)
best score : 0.9181128318111208
best params : {'colsample_bytree':

In [38]:
xgb_regressor = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.3, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=8,
             min_child_weight=5, monotone_constraints='()',
             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
             objective='reg:linear', random_state=123, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, silent=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

fit_predict_and_show_metrics(xgb_regressor, X_train, X_test, y_train, y_test)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


mean_absolute_error : 0.25
mean_squared_error : 0.12
root_mean_squared_error : 0.35
rmpse : 0.02
mean_absolute_percentage_error : 0.02
r2_score : 0.92


