In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("whitegrid")
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session (this also hides
# scikit-learn conversion/convergence warnings raised by later cells).
warnings.filterwarnings("ignore")

import os
# Show which files are available in the relative "data" directory.
print(os.listdir("data"))

['socrata_metadata_2016-building-energy-benchmarking.json', '.DS_Store', 'socrata_metadata_2015-building-energy-benchmarking.json', 'clean-building-energy-benchmarking.csv', '2015-building-energy-benchmarking.csv', '2016-building-energy-benchmarking.csv']


In [2]:
# `raw` keeps the frame exactly as read from disk; all later work uses the copy.
raw = pd.read_csv("data/clean-building-energy-benchmarking.csv")
data = raw.copy(deep=True)

In [3]:
# Preview the first five rows of the working frame.
data.head(n=5)

Unnamed: 0,DataYear,ENERGYSTARScore,GHGEmissionsIntensity,NumberofBuildings,NumberofFloors,OSEBuildingID,PropertyGFABuilding(s),PropertyGFAParking,PropertyGFATotal,SiteEnergyUse(kBtu),...,Neighborhood_EAST,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,ENERGYSTARCertified
0,2015,65.0,2.64,1.0,12.0,1,88434,0,88434,6981428.0,...,0,0,0,0,0,0,0,0,0,0
1,2015,51.0,2.38,1.0,11.0,2,88502,15064,103566,8354235.0,...,0,0,0,0,0,0,0,0,0,0
2,2015,18.0,1.92,1.0,41.0,3,961990,0,961990,,...,0,0,0,0,0,0,0,0,0,0
3,2015,,,1.0,10.0,5,61320,0,61320,28229320.0,...,0,0,0,0,0,0,0,0,0,0
4,2015,67.0,4.02,1.0,18.0,8,107430,12460,119890,14829099.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [5]:
def scatter_plot(x, y, size, hue, data=None):
    """Draw a seaborn scatter plot on a new 10x10-inch figure and show it.

    Parameters
    ----------
    x, y : vector or str
        Values for each axis, or names of columns in ``data``.
    size : float
        Marker size, forwarded to seaborn/matplotlib as ``s``.
    hue : vector or str
        Values (or a column name in ``data``) used to colour the markers.
    data : pandas.DataFrame, optional
        Frame that column-name arguments are looked up in. When omitted and
        at least one of ``x``, ``y`` or ``hue`` is a string, the
        notebook-level ``train_score`` frame is used, which is the frame the
        previous implementation always plotted.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # The previous body ignored every argument and always plotted
    # train_score["Training_R2"]; honour the arguments, but keep that frame as
    # the fallback so calls that pass column names without `data` still work.
    if data is None and any(isinstance(arg, str) for arg in (x, y, hue)):
        data = train_score

    plt.figure(figsize=(10, 10))
    sns.scatterplot(x=x, y=y, hue=hue, s=size, data=data)
    plt.tight_layout()
    plt.show()

In [6]:
# Shared seed for every estimator that accepts `random_state`.
seed = 42

# Linear models.
linear = LinearRegression(n_jobs=-1)
lasso = Lasso(random_state=seed)
ridge = Ridge(random_state=seed)

# Support-vector regressors, one per kernel.
svm_rbf = SVR(kernel="rbf")
svm_linear = SVR(kernel="linear")

# Tree ensemble, using all available cores.
rfr = RandomForestRegressor(random_state=seed, n_jobs=-1)

# TotalGHGEmissions

In [7]:
# Work on a fresh copy so `data` itself is never modified.
df = data.copy()

# A default SimpleImputer fills each column's missing values with that
# column's mean.
# NOTE(review): the imputer is fitted on every row and every column before the
# train/test split, so held-out rows influence the fill values and any missing
# target values would be imputed as well. Confirm this is intended.
imputer = SimpleImputer()
imputed_values = imputer.fit_transform(df)
df = pd.DataFrame(imputed_values, columns=df.columns)

In [8]:
# Target column, kept as a one-column DataFrame.
y = df[["TotalGHGEmissions"]]

# Features: every remaining column except the three excluded below.
# NOTE(review): GHGEmissionsIntensity is still part of X. It is presumably
# derived from the target, so verify that it does not leak the answer.
X = df.drop(columns=["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"])

In [9]:
# Hold out 20% of the rows for evaluation; `random_state` makes the split repeatable.
# NOTE(review): the split uses random_state=1 while the estimators use `seed` (42);
# confirm the two different seeds are intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [10]:
# Learn the scaling statistics from the training features only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [11]:
def train_test_split_score(model):
    """Fit ``model`` on the training split and return its RMSE on the test split.

    The function reads the notebook-level ``X_train_std``, ``y_train``,
    ``X_test_std`` and ``y_test`` variables, so the split and scaling cells
    must have been run first.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's instance is trained after this call.

    Returns
    -------
    float
        Square root of the mean squared error between ``y_test`` and the
        model's predictions for ``X_test_std``.
    """
    model.fit(X_train_std, y_train)
    prediction = model.predict(X_test_std)
    # mean_squared_error expects (y_true, y_pred); the arguments used to be
    # passed the other way round. MSE is symmetric, so the value is unchanged.
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    return rmse

In [12]:
# Fit and score every candidate in a fixed order; the labels of the results
# table are written by hand in this same order.
models = [linear, lasso, ridge, svm_linear, svm_rbf, rfr]
train_test_split_rmse = [train_test_split_score(candidate) for candidate in models]

In [13]:
# One row per model (same order as `models`), RMSE rounded to five decimals.
train_test_score = pd.DataFrame(
    {'Train_Test_RMSE': train_test_split_rmse},
    index=['LinearRegression', 'Lasso', 'Ridge', 'SVM Linear', 'SVM RBF', 'RandomForestRegressor'],
).round(5)
train_test_score

Unnamed: 0,Train_Test_RMSE
LinearRegression,74.22998
Lasso,74.27277
Ridge,74.25448
SVM Linear,77.42409
SVM RBF,109.03902
RandomForestRegressor,38.18362


## Optimisation des hyperparamètres

In [15]:
def grid_search_cv(model, params):
    """Run a 10-fold grid search for ``model`` on the standardized training split.

    The search is scored with negative mean squared error and fitted on the
    notebook-level ``X_train_std`` / ``y_train`` using all available cores.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict or list of dict
        Parameter grid to explore.

    Returns
    -------
    tuple
        ``(best_params, best_score)``, where ``best_score`` is the square root
        of the negated best mean score after rounding it to five decimals,
        i.e. a cross-validated RMSE.

    Notes
    -----
    Both results are also stored in the module-level names ``best_params``
    and ``best_score``.
    """
    global best_params, best_score
    from sklearn.model_selection import GridSearchCV

    searcher = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
        verbose=1,
    )
    searcher.fit(X_train_std, y_train)

    best_params = searcher.best_params_
    # best_score_ is a negated MSE: round it first, then flip the sign and
    # take the square root to obtain an RMSE.
    rounded_neg_mse = np.round(searcher.best_score_, 5)
    best_score = np.sqrt(-1 * rounded_neg_mse)
    return best_params, best_score