In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("whitegrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
print(os.listdir("data"))

['socrata_metadata_2016-building-energy-benchmarking.json', '.DS_Store', 'socrata_metadata_2015-building-energy-benchmarking.json', 'clean-building-energy-benchmarking.csv', '2015-building-energy-benchmarking.csv', '2016-building-energy-benchmarking.csv']


In [2]:
raw = pd.read_csv("data/clean-building-energy-benchmarking.csv")
data = raw.copy()

In [3]:
data.head()

Unnamed: 0,DataYear,ENERGYSTARScore,GHGEmissionsIntensity,NumberofBuildings,NumberofFloors,OSEBuildingID,PropertyGFABuilding(s),PropertyGFAParking,PropertyGFATotal,SiteEnergyUse(kBtu),...,Neighborhood_EAST,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,ENERGYSTARCertified
0,2015,65.0,2.64,1.0,12.0,1,88434,0,88434,6981428.0,...,0,0,0,0,0,0,0,0,0,0
1,2015,51.0,2.38,1.0,11.0,2,88502,15064,103566,8354235.0,...,0,0,0,0,0,0,0,0,0,0
2,2015,18.0,1.92,1.0,41.0,3,961990,0,961990,,...,0,0,0,0,0,0,0,0,0,0
3,2015,,,1.0,10.0,5,61320,0,61320,28229320.0,...,0,0,0,0,0,0,0,0,0,0
4,2015,67.0,4.02,1.0,18.0,8,107430,12460,119890,14829099.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [5]:
seed = 42

linear = LinearRegression(n_jobs=-1)
ridge = Ridge(random_state=seed)
lasso = Lasso(random_state=seed)
svm = SVR()
svm_linear = SVR(kernel="linear")
svm_rbf = SVR(kernel="rbf")
rfr = RandomForestRegressor(n_jobs=-1, random_state=seed)

In [6]:
df = data.copy()

imputer = SimpleImputer()
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# TotalGHGEmissions

In [7]:
X = df.drop(["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"], axis=1)
y = df[["TotalGHGEmissions"]]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [10]:
def train_test_split_score(model):
    model.fit(X_train_std, y_train)
    prediction = model.predict(X_test_std)
    rmse = np.sqrt(mean_squared_error(prediction, y_test))
    r2 = r2_score(y_test, prediction)
    return rmse, r2

In [11]:
models = [linear, lasso, ridge, svm_linear, svm_rbf, rfr]
train_test_split_rmse = []
train_test_split_r2 = []
for model in models:
    rmse, r2 = train_test_split_score(model)
    train_test_split_rmse.append(rmse)
    train_test_split_r2.append(r2)

In [12]:
train_test_score = pd.DataFrame(data = train_test_split_rmse, columns= ['RMSE'])
train_test_score['R2 Score'] = train_test_split_r2
train_test_score.index = ['LinearRegression', 'Lasso', 'Ridge', 'SVM Linear', 'SVM RBF', 'RandomForestRegressor']
train_test_score = train_test_score.round(5)
train_test_score

Unnamed: 0,RMSE,R2 Score
LinearRegression,74.22998,0.64796
Lasso,74.27277,0.64755
Ridge,74.25448,0.64773
SVM Linear,77.42409,0.61701
SVM RBF,109.03902,0.24038
RandomForestRegressor,38.18362,0.90685


## Optimisation des hyperparamètres

In [13]:
def grid_search_cv(model, params):
    global best_params, best_score
    from sklearn.model_selection import GridSearchCV
    grid_search = GridSearchCV(estimator = model, param_grid = params, cv = 10, verbose = 1,
                            scoring = 'neg_mean_squared_error', n_jobs = -1)
    grid_search.fit(X_train_std, y_train)
    best_params = grid_search.best_params_ 
    best_score = np.sqrt(-1*(np.round(grid_search.best_score_, 5)))
    return best_params, best_score

### Optimisation de Lasso

In [14]:
alpha = [0.0001, 0.0002, 0.00025, 0.0003, 0.00031, 0.00032, 0.00033, 0.00034, 0.00035, 0.00036, 0.00037, 0.00038, 
         0.0004, 0.00045, 0.0005, 0.00055, 0.0006, 0.0008,  0.001, 0.002, 0.005, 0.007, 0.008, 0.01]

lasso_params = {'alpha': alpha, 'random_state': [seed]}

grid_search_cv(lasso, lasso_params)
lasso_best_params, lasso_best_score = best_params, best_score
print('Lasso best params: {} & best_score: {:0.5f}'.format(lasso_best_params, lasso_best_score))

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min


Lasso best params: {'alpha': 0.01, 'random_state': 42} & best_score: 81.02388


[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.4min finished


In [15]:
ridge_params = {'alpha': [ 9, 9.2, 9.4, 9.5, 9.52, 9.54, 9.56, 9.58, 9.6, 9.62, 9.64, 9.66, 9.68, 9.7,  9.8], 'random_state': [seed]}

grid_search_cv(ridge, ridge_params)
ridge_best_params, ridge_best_score = best_params, best_score
print('Ridge best params: {} & best_score: {:0.5f}'.format(ridge_best_params, ridge_best_score))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 15 candidates, totalling 150 fits
Ridge best params: {'alpha': 9.8, 'random_state': 42} & best_score: 80.78596


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    4.0s finished


In [None]:
svm_params = {'kernel': ['linear', 'rbf'], 'C': [4,5], 'gamma': [0.0001, 0.001]}

grid_search_cv(svm, svm_params)
svm_best_params, svm_best_score = best_params, best_score
print('SVM best params: {} & best_score: {:0.5f}'.format(svm_best_params, svm_best_score))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 8 candidates, totalling 80 fits
