In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("whitegrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
print(os.listdir("data"))

['2015-building-energy-benchmarking.csv', '2016-building-energy-benchmarking.csv', 'clean-building-energy-benchmarking.csv', 'socrata_metadata_2015-building-energy-benchmarking.json', 'socrata_metadata_2016-building-energy-benchmarking.json']


In [2]:
raw = pd.read_csv("data/clean-building-energy-benchmarking.csv")
data = raw.copy()

In [3]:
data.head()

Unnamed: 0,DataYear,ENERGYSTARScore,GHGEmissionsIntensity,NumberofBuildings,NumberofFloors,OSEBuildingID,PropertyGFABuilding(s),PropertyGFAParking,PropertyGFATotal,SiteEnergyUse(kBtu),...,Neighborhood_EAST,Neighborhood_GREATER DUWAMISH,Neighborhood_LAKE UNION,Neighborhood_MAGNOLIA / QUEEN ANNE,Neighborhood_NORTH,Neighborhood_NORTHEAST,Neighborhood_NORTHWEST,Neighborhood_SOUTHEAST,Neighborhood_SOUTHWEST,ENERGYSTARCertified
0,2015,65.0,2.64,1.0,12.0,1,88434,0,88434,6981428.0,...,0,0,0,0,0,0,0,0,0,0
1,2015,51.0,2.38,1.0,11.0,2,88502,15064,103566,8354235.0,...,0,0,0,0,0,0,0,0,0,0
2,2015,18.0,1.92,1.0,41.0,3,961990,0,961990,,...,0,0,0,0,0,0,0,0,0,0
3,2015,,,1.0,10.0,5,61320,0,61320,28229320.0,...,0,0,0,0,0,0,0,0,0,0
4,2015,67.0,4.02,1.0,18.0,8,107430,12460,119890,14829099.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df = data.copy()

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [6]:
def test(models, X_train, X_test, y_train, y_test):
    results = {}
    for i in models:
        r2_predict = r2_score(y_test, models[i].fit(X_train, y_train).predict(X_test))
        results[i] = r2_predict
    return pd.DataFrame(results, index=["R2 Score"])

# TotalGHGEmissions

In [7]:
X = df.drop(["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"], axis=1)
y = df[["TotalGHGEmissions"]]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
y_train = imputer.fit_transform(y_train)
y_test = imputer.transform(y_test)

In [10]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
models = {'OLS': LinearRegression(),
         'Lasso': Lasso(),
         'Ridge': Ridge(),
         'SVM Linear': SVR(kernel="linear"),
          'Random Forest': RandomForestRegressor(),
         }

In [12]:
r2_baseline = test(models, X_train, X_test, y_train, y_test)

In [None]:
lasso_params = {'alpha': np.logspace(-2, -1, 20)}
ridge_params = {'alpha': np.logspace(-5, 5, 20)}
svm_params = {'kernel': ['linear'],'C': [4,5], 'gamma': np.logspace(-5, 5, 10)}
rfr_params = {'min_samples_leaf': [1,2,4], 'min_samples_split': [2], 'n_estimators': [100]}

models2 = {'OLS': LinearRegression(),
           'Lasso': GridSearchCV(Lasso(), 
                               param_grid=lasso_params).fit(X_train, y_train).best_estimator_,
           'Ridge': GridSearchCV(Ridge(), 
                               param_grid=ridge_params).fit(X_train, y_train).best_estimator_,
           'SVM Linear': GridSearchCV(SVR(), param_grid=svm_params).fit(X_train, y_train).best_estimator_,
          'Random Forest': GridSearchCV(RandomForestRegressor(), param_grid=rfr_params).fit(X_train, y_train).best_estimator_,}

In [None]:
r2_optimize = test(models2, X_train, X_test, y_train, y_test)

In [None]:
r2_scores = pd.concat([r2_baseline, r2_optimize])
r2_scores

## SiteEnergyUseWN(kBtu)

In [15]:
X = df.drop(["TotalGHGEmissions", "SiteEnergyUseWN(kBtu)", "ENERGYSTARScore"], axis=1)
y = df[["SiteEnergyUseWN(kBtu)"]]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [18]:
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
y_train = imputer.fit_transform(y_train)
y_test = imputer.transform(y_test)

In [19]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
test(models, X_train, X_test, y_train, y_test)

Unnamed: 0,OLS,Lasso,Ridge,SVM Linear,Random Forest
Train,0.79201,0.792004,0.792004,-0.128755,0.993934
Test,0.82904,0.829048,0.829039,-0.127118,0.937307


In [21]:
test(models2, X_train, X_test, y_train, y_test)

Unnamed: 0,OLS,Lasso,Ridge,Random Forest
Train,0.79201,0.792004,0.790374,0.99082
Test,0.82904,0.829048,0.825264,0.947728
