In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("whitegrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
print(os.listdir("data"))

['2015-building-energy-benchmarking.csv', '2016-building-energy-benchmarking.csv', 'clean-building-energy-benchmarking.csv', 'socrata_metadata_2015-building-energy-benchmarking.json', 'socrata_metadata_2016-building-energy-benchmarking.json']


In [2]:
raw = pd.read_csv("data/clean-building-energy-benchmarking.csv")
data = raw.copy()

In [3]:
data.head()

Unnamed: 0,DataYear,ENERGYSTARScore,NumberofBuildings,NumberofFloors,OSEBuildingID,PropertyGFABuilding(s),PropertyGFAParking,PropertyGFATotal,SiteEnergyUseWN(kBtu),TotalGHGEmissions,...,Neighborhood_EAST,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,ENERGYSTARCertified
0,2015,65.0,1.0,12.0,1,88434,0,88434,7097539.0,249.43,...,0,0,0,0,0,0,0,0,0,0
1,2015,51.0,1.0,11.0,2,88502,15064,103566,8765788.0,263.51,...,0,0,0,0,0,0,0,0,0,0
2,2015,18.0,1.0,41.0,3,961990,0,961990,,,...,0,0,0,0,0,0,0,0,0,0
3,2015,,1.0,10.0,5,61320,0,61320,28363444.0,,...,0,0,0,0,0,0,0,0,0,0
4,2015,67.0,1.0,18.0,8,107430,12460,119890,15078243.0,507.7,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import time

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [51]:
def test(models, X_train, X_test, y_train, y_test):
    results = {}
    for i in models:
        start_time = time.time()
        models[i].fit(X_train, y_train)
        y_pred = models[i].predict(X_test)
        r2_predict = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        end_time = time.time() - start_time
        results[i] = [r2_predict, rmse, end_time]
    return pd.DataFrame(results, index=["R2 Score", "RMSE", "Time"])

In [4]:
df = data.copy()

In [7]:
df.dropna(subset=['TotalGHGEmissions', 'SiteEnergyUseWN(kBtu)', 'ENERGYSTARScore'], inplace=True)

# TotalGHGEmissions

In [8]:
X = df.drop(["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"], axis=1)
y = df[["TotalGHGEmissions"]]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
models = {'OLS': LinearRegression(),
          'Lasso': Lasso(),
          'Ridge': Ridge(),
          'SVM': SVR(kernel="linear"),
          'Random Forest': RandomForestRegressor(),
         }

In [13]:
baseline_ghg = test(models, X_train, X_test, y_train, y_test)

In [14]:
baseline_ghg

Unnamed: 0,OLS,Lasso,Ridge,SVM,Random Forest
R2 Score,0.393114,0.54326,0.535914,0.446793,0.785375
RMSE,85.247178,73.953882,74.546229,81.389879,50.695121
Time,0.010971,0.031914,0.003989,1.96386,0.292241


In [15]:
lasso_params = {'alpha': np.logspace(-2, -1, 20)}

grid_lasso = GridSearchCV(Lasso(), param_grid=lasso_params, n_jobs=-1, verbose = 5)
grid_lasso.fit(X_train, y_train)
grid_lasso.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  60 | elapsed:    3.7s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  55 out of  60 | elapsed:    3.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.0s finished


Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [16]:
ridge_params = {'alpha': np.logspace(-5, 5, 20)}

grid_ridge = GridSearchCV(Ridge(), param_grid=ridge_params, n_jobs=-1, verbose = 5)
grid_ridge.fit(X_train, y_train)
grid_ridge.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  60 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  29 out of  60 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished


Ridge(alpha=233.57214690901213, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
svm_params = {'kernel': ['linear'], 'C': np.logspace(0, 2, 3)}
grid_svm = GridSearchCV(SVR(), param_grid=svm_params, n_jobs=-1, verbose = 5)
grid_svm.fit(X_train, y_train)
grid_svm.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    1.3s remaining:    4.8s
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    1.8s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    2.1s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   11.2s finished


SVR(C=100.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
rfr_params = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],
              'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

random_rfr = RandomizedSearchCV(RandomForestRegressor(), param_distributions=rfr_params, n_iter=100, cv=3, verbose=5, n_jobs=-1)
random_rfr.fit(X_train, y_train)
random_rfr.best_estimator_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.6min finished


RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=60,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=400,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [19]:
models2 = {'OLS': LinearRegression(),
           'Lasso': grid_lasso.best_estimator_,
           'Ridge': grid_ridge.best_estimator_,
           'SVM': grid_svm.best_estimator_,
           'Random Forest': random_rfr.best_estimator_}

In [20]:
optimize_ghg = test(models2, X_train, X_test, y_train, y_test)
optimize_ghg

Unnamed: 0,OLS,Lasso,Ridge,SVM,Random Forest
R2 Score,0.393114,0.55118,0.540757,0.44858,0.795122
RMSE,85.247178,73.309862,74.156232,81.258284,49.530608
Time,0.009973,0.190491,0.004986,18.304625,3.94461


In [21]:
scores_ghg = pd.concat([baseline_ghg, optimize_ghg], keys=['Baseline', 'Optimize'])
scores_ghg

Unnamed: 0,Unnamed: 1,OLS,Lasso,Ridge,SVM,Random Forest
Baseline,R2 Score,0.393114,0.54326,0.535914,0.446793,0.785375
Baseline,RMSE,85.247178,73.953882,74.546229,81.389879,50.695121
Baseline,Time,0.010971,0.031914,0.003989,1.96386,0.292241
Optimize,R2 Score,0.393114,0.55118,0.540757,0.44858,0.795122
Optimize,RMSE,85.247178,73.309862,74.156232,81.258284,49.530608
Optimize,Time,0.009973,0.190491,0.004986,18.304625,3.94461


## SiteEnergyUseWN(kBtu)

In [52]:
X = df.drop(["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"], axis=1)
y = df[["SiteEnergyUseWN(kBtu)"]]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [54]:
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [55]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [56]:
baseline_energy = test(models, X_train, X_test, y_train, y_test)

In [57]:
optimize_energy = test(models2, X_train, X_test, y_train, y_test)

In [70]:
scores_energy = pd.concat([baseline_energy, optimize_energy], keys=['Baseline', 'Optimize'])
scores_energy

Unnamed: 0,Unnamed: 1,OLS,Lasso,Ridge,SVM,Random Forest
Baseline,R2 Score,0.6762,0.5641,0.5643,-0.1382,0.8325
Baseline,RMSE,2622389.0,3042724.0,3041984.0,4916434.0,1886242.0
Baseline,Time,0.0099,0.2154,0.005,1.7065,0.2744
Optimize,R2 Score,0.6762,0.564,0.6001,0.0511,0.8824
Optimize,RMSE,2622389.0,3042840.0,2914192.0,4489220.0,1580436.0
Optimize,Time,0.0109,0.2084,0.004,1.7045,3.9572


## EnergyStarScore

Nous allons mesurer l'intérêt de l'EnergyStarScore pour la prédiction de TotalGHGEmissions

In [29]:
df = data.copy()

In [30]:
df.dropna(subset=['TotalGHGEmissions', 'ENERGYSTARScore'], inplace=True)
df = df.drop("SiteEnergyUseWN(kBtu)", axis=1)

In [31]:
X = df.drop(["TotalGHGEmissions"], axis=1)
y = df[["TotalGHGEmissions"]]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [33]:
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [34]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [35]:
baseline_ghg_ess = test(models, X_train, X_test, y_train, y_test)

In [36]:
optimize_ghg_ess = test(models2, X_train, X_test, y_train, y_test)

In [65]:
scores_ghg_ess = pd.concat([baseline_ghg_ess, optimize_ghg_ess], keys=['Baseline', 'Optimize'])
scores_ghg_ess

Unnamed: 0,Unnamed: 1,OLS,Lasso,Ridge,SVM,Random Forest
Baseline,R2 Score,-2.688221e+23,0.625,0.5952,0.5469,0.8143
Baseline,RMSE,59339750000000.0,70.0901,72.8181,77.035,49.3186
Baseline,Time,0.013,0.0728,0.004,1.9973,0.3098
Optimize,R2 Score,-2.688221e+23,0.6249,0.6125,0.5498,0.8877
Optimize,RMSE,59339750000000.0,70.0958,71.2481,76.7936,38.3607
Optimize,Time,0.011,0.1725,0.007,16.2444,4.0352


In [72]:
final_ghg = pd.concat([scores_ghg, scores_ghg_ess], keys=["Sans EnergyStarScore", "Avec EnergyStarScore"])
final_ghg

Unnamed: 0,Unnamed: 1,Unnamed: 2,OLS,Lasso,Ridge,SVM,Random Forest
Sans EnergyStarScore,Baseline,R2 Score,0.3931138,0.54326,0.535914,0.446793,0.785375
Sans EnergyStarScore,Baseline,RMSE,85.24718,73.953882,74.546229,81.389879,50.695121
Sans EnergyStarScore,Baseline,Time,0.01097083,0.031914,0.003989,1.96386,0.292241
Sans EnergyStarScore,Optimize,R2 Score,0.3931138,0.55118,0.540757,0.44858,0.795122
Sans EnergyStarScore,Optimize,RMSE,85.24718,73.309862,74.156232,81.258284,49.530608
Sans EnergyStarScore,Optimize,Time,0.009973288,0.190491,0.004986,18.304625,3.94461
Avec EnergyStarScore,Baseline,R2 Score,-2.688221e+23,0.624952,0.595189,0.546946,0.814307
Avec EnergyStarScore,Baseline,RMSE,59339750000000.0,70.090068,72.818105,77.035011,49.318618
Avec EnergyStarScore,Baseline,Time,0.01296711,0.072837,0.003956,1.997331,0.309827
Avec EnergyStarScore,Optimize,R2 Score,-2.688221e+23,0.624891,0.612457,0.549781,0.887657
