In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("whitegrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
print(os.listdir("data"))

['socrata_metadata_2016-building-energy-benchmarking.json', '.DS_Store', 'socrata_metadata_2015-building-energy-benchmarking.json', 'clean-building-energy-benchmarking.csv', '2015-building-energy-benchmarking.csv', '2016-building-energy-benchmarking.csv']


In [2]:
raw = pd.read_csv("data/clean-building-energy-benchmarking.csv")
data = raw.copy()

In [3]:
data.head()

Unnamed: 0,DataYear,ENERGYSTARScore,GHGEmissionsIntensity,NumberofBuildings,NumberofFloors,OSEBuildingID,PropertyGFABuilding(s),PropertyGFAParking,PropertyGFATotal,SiteEnergyUse(kBtu),...,Neighborhood_EAST,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,ENERGYSTARCertified
0,2015,65.0,2.64,1.0,12.0,1,88434,0,88434,6981428.0,...,0,0,0,0,0,0,0,0,0,0
1,2015,51.0,2.38,1.0,11.0,2,88502,15064,103566,8354235.0,...,0,0,0,0,0,0,0,0,0,0
2,2015,18.0,1.92,1.0,41.0,3,961990,0,961990,,...,0,0,0,0,0,0,0,0,0,0
3,2015,,,1.0,10.0,5,61320,0,61320,28229320.0,...,0,0,0,0,0,0,0,0,0,0
4,2015,67.0,4.02,1.0,18.0,8,107430,12460,119890,14829099.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [5]:
seed = 42

linear = LinearRegression(n_jobs=-1)
ridge = Ridge(random_state=seed)
lasso = Lasso(random_state=seed)
svm = SVR()
svm_linear = SVR(kernel="linear")
svm_rbf = SVR(kernel="rbf")
rfr = RandomForestRegressor(n_jobs=-1, random_state=seed)

In [6]:
df = data.copy()

# TotalGHGEmissions

In [7]:
X = df.drop(["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"], axis=1)
y = df[["TotalGHGEmissions"]]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
y_train = imputer.fit_transform(y_train)
y_test = imputer.transform(y_test)

In [10]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
def train_test_split_score(model):
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    rmse = np.sqrt(np.round(mean_squared_error(prediction, y_test), 5))
    r2 = r2_score(y_test, prediction)
    return rmse, r2

In [12]:
models = [linear, lasso, ridge, svm_linear, svm_rbf, rfr]
train_test_split_rmse = []
train_test_split_r2 = []
for model in models:
    rmse, r2 = train_test_split_score(model)
    train_test_split_rmse.append(rmse)
    train_test_split_r2.append(r2)

In [13]:
train_test_score = pd.DataFrame(data = train_test_split_rmse, columns= ['RMSE'])
train_test_score['R2 Score'] = train_test_split_r2
train_test_score.index = ['LinearRegression', 'Lasso', 'Ridge', 'SVM Linear', 'SVM RBF', 'RandomForestRegressor']
train_test_score = train_test_score.round(5)
train_test_score

Unnamed: 0,RMSE,R2 Score
LinearRegression,71.92054,0.66952
Lasso,72.03212,0.6685
Ridge,71.92549,0.66948
SVM Linear,77.30484,0.61819
SVM RBF,109.14383,0.23892
RandomForestRegressor,37.21975,0.91149


## Optimisation des hyperparamètres

In [18]:
def grid_search_cv(model, params):
    global best_params, best_score
    from sklearn.model_selection import GridSearchCV
    grid_search = GridSearchCV(estimator = model, param_grid = params, cv = 10, verbose = 1,
                            scoring = 'neg_mean_squared_error', n_jobs = -1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_grid = grid_search.best_estimator_
    best_score = np.sqrt(-1*(np.round(grid_search.best_score_, 5)))/100
    return best_params, best_score

### Optimisation de Lasso

In [19]:
# 0.74916
# 0.75256

In [21]:
alpha_grid = np.logspace(-3, 3, 20)

lasso_params = {'alpha': alpha_grid, 'random_state': [seed]}

lasso_best_params, lasso_best_score = grid_search_cv(lasso, lasso_params)
print('Lasso best params: {} & best_score: {:0.5f}'.format(lasso_best_params, lasso_best_score))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.9s


Lasso best params: {'alpha': 1.438449888287663, 'random_state': 42} & best_score: 0.74916


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   24.7s finished


### Optimisation de Ridge

In [16]:
alpha_grid = np.logspace(-1, 1, 20)
ridge_params = {'alpha': alpha_grid, 'random_state': [seed]}

grid_search_cv(ridge, ridge_params)
ridge_best_params, ridge_best_score = best_params, best_score
print('Ridge best params: {} & best_score: {:0.5f}'.format(ridge_best_params, ridge_best_score))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    1.7s


Ridge best params: {'alpha': 10.0, 'random_state': 42} & best_score: 0.78576


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.3s finished


### Optimisation de SVM Linear & RBF

In [17]:
gamma_grid = np.logspace(-5, 5, 10)
svm_params = {'kernel': ['linear', 'rbf'], 'C': [4,5], 'gamma': gamma_grid}

grid_search_cv(svm, svm_params)
svm_best_params, svm_best_score = best_params, best_score
print('SVM best params: {} & best_score: {:0.5f}'.format(svm_best_params, svm_best_score))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 13.1min finished


SVM best params: {'C': 5, 'gamma': 1e-05, 'kernel': 'linear'} & best_score: 0.78276


### Optimisation du Random Forest Regressor

In [None]:
params_grid = {'max_depth': [80, 90, 100, 110], 
              'min_samples_leaf': [2, 3, 4, 5],
              'min_samples_split': [4, 6, 8],
              'n_estimators': [100, 200, 300, 500]
             }

In [None]:
params_grid = {
    'min_samples_split': [4, 6, 8],
    'n_estimators': [100, 200]
              }

grid_search_cv(rfr, params_grid)
rfr_best_params, rfr_best_score = best_params, best_score
print('RFR best params: {} & best_score: {:0.5f}'.format(rfr_best_params, rfr_best_score))

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
train_test_score['R2 Score'] = train_test_split_r2
train_test_score.index = ['LinearRegression', 'Lasso', 'Ridge', 'SVM Linear', 'SVM RBF', 'RandomForestRegressor']
train_test_score = train_test_score.round(5)
train_test_score