In [1]:
# Core numerical / tabular libraries.
import numpy as np
import pandas as pd

# Plotting stack; seaborn's "whitegrid" style is applied to every figure.
import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("whitegrid")
%matplotlib inline

# NOTE(review): this silences every warning for the whole session, including
# library deprecation and convergence warnings -- consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Show which files are available in the local "data" directory.
import os
print(os.listdir("data"))

['socrata_metadata_2016-building-energy-benchmarking.json', '.DS_Store', 'socrata_metadata_2015-building-energy-benchmarking.json', 'clean-building-energy-benchmarking.csv', '2015-building-energy-benchmarking.csv', '2016-building-energy-benchmarking.csv']


In [2]:
# Read the benchmarking table from disk; `raw` is kept as the untouched
# reference and `data` is an independent (deep) copy to work on.
raw = pd.read_csv(filepath_or_buffer="data/clean-building-energy-benchmarking.csv")
data = raw.copy(deep=True)

In [3]:
# Preview the first rows of the working copy.
data.head()

Unnamed: 0,DataYear,ENERGYSTARScore,GHGEmissionsIntensity,NumberofBuildings,NumberofFloors,OSEBuildingID,PropertyGFABuilding(s),PropertyGFAParking,PropertyGFATotal,SiteEnergyUse(kBtu),...,Neighborhood_EAST,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,ENERGYSTARCertified
0,2015,65.0,2.64,1.0,12.0,1,88434,0,88434,6981428.0,...,0,0,0,0,0,0,0,0,0,0
1,2015,51.0,2.38,1.0,11.0,2,88502,15064,103566,8354235.0,...,0,0,0,0,0,0,0,0,0,0
2,2015,18.0,1.92,1.0,41.0,3,961990,0,961990,,...,0,0,0,0,0,0,0,0,0,0
3,2015,,,1.0,10.0,5,61320,0,61320,28229320.0,...,0,0,0,0,0,0,0,0,0,0
4,2015,67.0,4.02,1.0,18.0,8,107430,12460,119890,14829099.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Separate copy for the modelling steps, so `data` itself is not modified below.
df = data.copy()

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# TotalGHGEmissions

In [6]:
# Features: every column except the target and the two other columns listed here.
# NOTE(review): GHGEmissionsIntensity and SiteEnergyUse(kBtu) (both visible in the
# preview of `data`) remain in X -- confirm they are not derived from the target,
# otherwise the scores below are inflated by leakage.
X = df.drop(columns=["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"])
# The list selector keeps the target as a one-column DataFrame (2-D), not a Series.
y = df.loc[:, ["TotalGHGEmissions"]]

In [7]:
# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
# Mean-impute the features. The statistics are learned on the training split only
# and then re-used on the test split.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# The target gets its own imputer: refitting `imputer` on y would overwrite the
# feature means it has just learned and make it unusable on new feature data.
# NOTE(review): filling a missing target with the training mean lets fabricated
# labels enter both training and evaluation -- consider dropping those rows instead.
target_imputer = SimpleImputer(strategy="mean")
y_train = target_imputer.fit_transform(y_train)
y_test = target_imputer.transform(y_test)

In [9]:
# Standardise the features using the mean/variance of the training split only.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [10]:
def test(models, X_train, X_test, y_train, y_test):
    """Fit every model once and report its R^2 on the training and test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    X_train, y_train : array-like
        Data each estimator is fitted on (and scored on for the "Train" row).
    X_test, y_test : array-like
        Held-out data used for the "Test" row.

    Returns
    -------
    pandas.DataFrame
        One column per model name, with rows "Train" and "Test".
    """
    results = {}
    for name, model in models.items():
        # Fit once and score that same fitted model on both splits. Fitting
        # separately for each score doubles the cost and, for unseeded
        # estimators, compares two different models.
        fitted = model.fit(X_train, y_train)
        r2_train = r2_score(y_train, fitted.predict(X_train))
        r2_test = r2_score(y_test, fitted.predict(X_test))
        results[name] = [r2_train, r2_test]
    return pd.DataFrame(results, index=["Train", "Test"])

In [11]:
# Baseline estimators with default hyperparameters, keyed by display name.
models = {
    'OLS': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'SVM Linear': SVR(kernel="linear"),
    # Seeded so that repeated runs build the same forest and report the same scores.
    'Random Forest': RandomForestRegressor(random_state=1),
}

In [12]:
# Train/test R^2 of every baseline model on the prepared split.
test(models, X_train, X_test, y_train, y_test)

Unnamed: 0,OLS,Lasso,Ridge,SVM Linear,Random Forest
Train,0.686158,0.682039,0.686156,0.639873,0.978782
Test,0.669525,0.668498,0.669479,0.618191,0.898168


In [13]:
from sklearn.linear_model import LassoCV
# Third-party dependency: this cell needs the `yellowbrick` package to be installed.
from yellowbrick.regressor import AlphaSelection

# The notebook's own training split is used below. Loading a demo dataset into
# `X, y` here would clobber the feature matrix and target that later cells split again.

# Create a list of alphas to cross-validate against
alphas = np.logspace(-10, 1, 400)

# Instantiate the linear model and visualizer
model = LassoCV(alphas=alphas)
visualizer = AlphaSelection(model)
# y_train is a single-column 2-D array after imputation; flatten it to the
# 1-D target that LassoCV works with.
visualizer.fit(X_train, np.ravel(y_train))
visualizer.show()

ModuleNotFoundError: No module named 'yellowbrick'

In [64]:
# Baseline Ridge with its default settings, scored by R^2 on the test split.
ridge = Ridge().fit(X_train, y_train)
pred = ridge.predict(X_test)
r2_score(y_true=y_test, y_pred=pred)

0.6694791418836389

In [70]:
# Tune Ridge's alpha with a cross-validated grid search.
ridge_params = {'alpha': np.logspace(-2, -1, 20)}
# Keep the fitted search object itself. With the default refit=True it already
# holds the best estimator retrained on the full training set (no second fit
# needed), predicts through it, and exposes best_estimator_ / best_score_.
grid = GridSearchCV(Ridge(), param_grid=ridge_params).fit(X_train, y_train)
pred = grid.predict(X_test)
r2_score(y_test, pred)

0.6694804573993667

In [18]:
# `grid` has to be a fitted GridSearchCV here: a plain estimator has neither
# best_estimator_ nor best_score_.
# NOTE(review): the recorded output below shows a Lasso estimator although the
# preceding cell searches over Ridge, so it was produced from a different kernel
# state -- re-run the notebook top to bottom to refresh it.
print(grid.best_estimator_, grid.best_score_)

Lasso(alpha=3.593813663804626, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False) 0.6462881723479159


In [14]:
# Hyperparameter grids for the tuned models.
lasso_params = {'alpha': np.logspace(-2, -1, 20)}
ridge_params = {'alpha': np.logspace(-5, 5, 20)}
# Defined here but not consumed by `models2` (no SVM entry is built in this cell).
svm_params = {'kernel': ['linear', 'rbf'], 'C': [4,5], 'gamma': np.logspace(-5, 5, 10)}
rfr_params = {'min_samples_leaf': [1,2,4], 'min_samples_split': [2], 'n_estimators': [100]}

# OLS has nothing to tune; every other entry is the best estimator found by a
# grid search fitted on the training split.
models2 = {'OLS': LinearRegression()}
models2.update({
    label: GridSearchCV(estimator, param_grid=grid_spec).fit(X_train, y_train).best_estimator_
    for label, estimator, grid_spec in (
        ('Lasso', Lasso(), lasso_params),
        ('Ridge', Ridge(), ridge_params),
        ('Random Forest', RandomForestRegressor(), rfr_params),
    )
})

In [15]:
# Train/test R^2 of the tuned models on the same split as the baselines.
test(models2, X_train, X_test, y_train, y_test)

Unnamed: 0,OLS,Lasso,Ridge,Random Forest
Train,0.686158,0.666551,0.684188,0.968234
Test,0.669525,0.654999,0.667422,0.9122


In [None]:
# Candidate random-forest hyperparameters (tree depth, leaf/split sizes, forest size).
# NOTE(review): nothing in this cell consumes the grid, and `params_grid` is
# reassigned further down -- confirm whether it is still needed.
params_grid = {'max_depth': [80, 90, 100, 110], 
              'min_samples_leaf': [2, 3, 4, 5],
              'min_samples_split': [4, 6, 8],
              'n_estimators': [100]
             }

In [None]:
def test(models, X, Y, iterations = 100, random_state=None):
    """Average each model's train/test R^2 over repeated random 80/20 splits.

    This definition replaces the earlier five-argument ``test`` helper once the
    cell has run, so earlier calls cannot be re-executed afterwards.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    X, Y : array-like
        Full feature matrix and target; a fresh split is drawn per iteration.
    iterations : int, default 100
        Number of random splits to average over; must be at least 1.
    random_state : int or None, default None
        Seed for the sequence of splits. ``None`` keeps the previous behaviour
        (splits drawn from NumPy's global random state).

    Returns
    -------
    pandas.DataFrame
        One column per model name, rows "Train" and "Test" (mean R^2).

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1 (the mean would be undefined).
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # A single generator is shared by all splits, so every iteration draws a
    # different split while the whole sequence stays reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    results = {}
    for name, model in models.items():
        r2_train = []
        r2_test = []
        for _ in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(
                X, Y, test_size=0.2, random_state=rng)
            # Fit once per split and score that same fitted model on both parts.
            fitted = model.fit(X_train, y_train)
            r2_train.append(r2_score(y_train, fitted.predict(X_train)))
            r2_test.append(r2_score(y_test, fitted.predict(X_test)))
        results[name] = [np.mean(r2_train), np.mean(r2_test)]
    return pd.DataFrame(results, index=["Train", "Test"])

In [None]:
# Rebuild the 80/20 split from X and y with the same seed (random_state=1).
# NOTE(review): this relies on X and y still holding the notebook's features
# and target at this point -- verify after a clean Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Mean-impute the features. The statistics are learned on the training split only
# and then re-used on the test split.
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# The target gets its own imputer: refitting `imputer` on y would overwrite the
# feature means it has just learned and make it unusable on new feature data.
# NOTE(review): filling a missing target with the training mean lets fabricated
# labels enter both training and evaluation -- consider dropping those rows instead.
target_imputer = SimpleImputer(strategy="mean")
y_train = target_imputer.fit_transform(y_train)
y_test = target_imputer.transform(y_test)

In [None]:
# Standardise the features using the mean/variance of the training split only.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
def train_test_split_score(model):
    """Fit ``model`` on the notebook's training split and score it on the test split.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; the estimator passed in is fitted in place.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    # Metrics are called as (y_true, y_pred). The square root is taken from the
    # unrounded MSE: rounding the MSE first distorts small errors (an MSE below
    # 5e-6 would be reported as an RMSE of exactly 0).
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    r2 = r2_score(y_test, prediction)
    return rmse, r2

In [None]:
# Build the estimators in this cell so it does not depend on variables left over
# from an earlier kernel session.
# NOTE(review): default hyperparameters are assumed, mirroring the baseline
# models defined earlier in the notebook -- adjust if other settings were intended.
linear = LinearRegression()
lasso = Lasso()
ridge = Ridge()
svm_linear = SVR(kernel="linear")
svm_rbf = SVR(kernel="rbf")
rfr = RandomForestRegressor()

models = [linear, lasso, ridge, svm_linear, svm_rbf, rfr]

# Collect RMSE and R^2 on the held-out split, in the same order as `models`.
train_test_split_rmse = []
train_test_split_r2 = []
for model in models:
    rmse, r2 = train_test_split_score(model)
    train_test_split_rmse.append(rmse)
    train_test_split_r2.append(r2)

In [None]:
# Summary table: one row per model, RMSE and R^2 rounded to 5 decimals.
train_test_score = pd.DataFrame(
    {'RMSE': train_test_split_rmse, 'R2 Score': train_test_split_r2},
    index=['LinearRegression', 'Lasso', 'Ridge', 'SVM Linear', 'SVM RBF', 'RandomForestRegressor'],
).round(5)
train_test_score

## Optimisation des hyperparamètres

In [None]:
def grid_search_cv(model, params):
    """Run a 10-fold, R^2-scored grid search for ``model`` on the training split.

    Reads the notebook-level ``X_train`` / ``y_train`` and relies on the
    ``GridSearchCV`` import made at the top of the notebook.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_params_``, ``best_score_``,
        ``best_estimator_`` and ``cv_results_`` are available on it).

    Side effects
    ------------
    Updates the module-level ``best_params`` and ``best_score`` so cells that
    read those globals after calling this function keep working.
    """
    global best_params, best_score
    # The `iid` argument is intentionally not passed: it was deprecated and then
    # removed from GridSearchCV, and supplying it raises a TypeError on current
    # scikit-learn releases.
    grid_search = GridSearchCV(estimator=model,
                               param_grid=params,
                               cv=10,
                               verbose=3,
                               scoring='r2',
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    return grid_search
# To also track RMSE, pass scoring={'r2': 'r2', 'e2': 'neg_mean_squared_error'}
# together with refit='r2' and read the per-metric columns of cv_results_.

### Optimisation de Lasso

In [None]:
#0.6420310325756636
#0.7525624884353458
#

In [None]:
alpha_grid = np.logspace(-5, 5, 20)

# Only alpha is searched. random_state is left out because Lasso uses it solely
# with selection='random' (the default is cyclic), and no `seed` value is
# defined anywhere in this notebook.
lasso_params = {'alpha': alpha_grid}

# A fresh estimator is passed so the cell does not depend on a `lasso` variable
# from earlier kernel state.
grid_search = grid_search_cv(Lasso(), lasso_params)

In [None]:
# Mean cross-validated score of the best parameter combination in `grid_search`.
grid_search.best_score_

### Optimisation de Ridge

In [None]:
alpha_grid = np.logspace(-1, 1, 20)
# random_state is left out: Ridge only uses it with the 'sag'/'saga' solvers,
# and no `seed` value is defined anywhere in this notebook.
ridge_params = {'alpha': alpha_grid}

# Read the results from the returned search object instead of from globals,
# so the cell works no matter which search ran last.
ridge_search = grid_search_cv(Ridge(), ridge_params)
ridge_best_params, ridge_best_score = ridge_search.best_params_, ridge_search.best_score_
print('Ridge best params: {} & best_score: {:0.5f}'.format(ridge_best_params, ridge_best_score))

### Optimisation de SVM Linear & RBF

In [None]:
gamma_grid = np.logspace(-5, 5, 10)
# Two sub-grids: gamma is a kernel coefficient that the linear kernel ignores,
# so crossing it with 'linear' would only repeat identical fits.
svm_params = [
    {'kernel': ['linear'], 'C': [4,5]},
    {'kernel': ['rbf'], 'C': [4,5], 'gamma': gamma_grid},
]

# A fresh SVR is passed (no `svm` variable is defined in this notebook) and the
# results are read from the returned search object instead of from globals.
svm_search = grid_search_cv(SVR(), svm_params)
svm_best_params, svm_best_score = svm_search.best_params_, svm_search.best_score_
print('SVM best params: {} & best_score: {:0.5f}'.format(svm_best_params, svm_best_score))

### Optimisation du Random Forest Regressor

In [None]:
# Wide random-forest grid (depth, leaf/split sizes, forest size).
# NOTE(review): the next cell reassigns `params_grid` before any search runs,
# so this grid is never used -- confirm which of the two is intended.
params_grid = {'max_depth': [80, 90, 100, 110], 
              'min_samples_leaf': [2, 3, 4, 5],
              'min_samples_split': [4, 6, 8],
              'n_estimators': [100, 200, 300, 500]
             }

In [None]:
# Reduced random-forest grid: split size and number of trees only.
params_grid = {
    'min_samples_split': [4, 6, 8],
    'n_estimators': [100, 200],
}

# A fresh estimator is passed so the cell does not depend on an `rfr` variable
# from earlier kernel state, and the results are read from the returned search
# object instead of from globals.
rfr_search = grid_search_cv(RandomForestRegressor(), params_grid)
rfr_best_params, rfr_best_score = rfr_search.best_params_, rfr_search.best_score_
print('RFR best params: {} & best_score: {:0.5f}'.format(rfr_best_params, rfr_best_score))

In [None]:
# Re-applies the R2 column, the row labels and the 5-decimal rounding to the
# summary table, then displays it.
# NOTE(review): `train_test_split_r2` is not recomputed after the grid searches
# above, so this presumably shows the same untuned scores as before -- confirm
# whether tuned results were meant to be reported here.
train_test_score['R2 Score'] = train_test_split_r2
train_test_score.index = ['LinearRegression', 'Lasso', 'Ridge', 'SVM Linear', 'SVM RBF', 'RandomForestRegressor']
train_test_score = train_test_score.round(5)
train_test_score