In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from pickle import dump

# Import des données

In [5]:
# Load the cleaned quantitative and qualitative datasets.
data_quanti = pd.read_csv('../files_cleaned/data_quanti.csv')
data_quali = pd.read_csv('../files_cleaned/data_quali.csv')

# Drop leftover index columns if they are present. errors='ignore' skips
# labels that do not exist, so no exception has to be swallowed.
data_quanti = data_quanti.drop(columns=['Unnamed: 0', 'index'], errors='ignore')

# Mise en place d'une fonction d'erreur

In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error

In [7]:
# Empty results table; error_metric() appends one row per evaluated model.
# Note: the 'MSE' column receives sqrt(MSE) and 'Med_abs_error' the median
# absolute error, as computed in error_metric().
saved_errors = pd.DataFrame(
    columns=[
        'Modèle',
        'MSE',
        'Med_abs_error',
        'mean_abs_error',
    ]
)

In [8]:
def error_metric(modele_name, y_test, y_predict, saved=False):
    """Compute regression error metrics and optionally log them.

    Parameters
    ----------
    modele_name : str
        Label written to the 'Modèle' column of ``saved_errors``.
    y_test : array-like
        Ground-truth target values.
    y_predict : array-like
        Predicted target values.
    saved : bool, default False
        When truthy, append a row to the module-level ``saved_errors``
        DataFrame.

    Returns
    -------
    float
        The median absolute error (not the mean absolute error).
    """
    # Root of the mean squared error; it is stored under the 'MSE' column.
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    median_abs_error = median_absolute_error(y_test, y_predict)
    mean_abs_error = mean_absolute_error(y_test, y_predict)

    if saved:
        # Values are assigned positionally, so the order must follow the
        # columns of saved_errors: model, 'MSE', 'Med_abs_error', 'mean_abs_error'.
        saved_errors.loc[len(saved_errors)] = [
            modele_name, rmse, median_abs_error, mean_abs_error
        ]

    return median_abs_error

# TotalGHGEmissions

In [9]:
# Column to predict in this section; used below to filter rows and to build y.
target = "TotalGHGEmissions"

In [10]:
# Keep only the rows where the target column is not missing.
data_quanti_ = data_quanti.dropna(subset=[target])

In [11]:
# Columns removed from the feature matrix: the target, the other energy /
# emission measurements, ENERGYSTARScore and the identifier / year columns
# (presumably excluded so the model cannot read the target from them).
# Each label is listed once; a repeated label would not change the result.
non_feature_columns = [
    "SiteEnergyUseWN(kBtu)",
    "TotalGHGEmissions",
    "SiteEnergyUse(kBtu)",
    'ENERGYSTARScore',
    'GHGEmissionsIntensity',
    'Electricity(kWh)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)',
    'NaturalGas(therms)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'SteamUse(kBtu)',
    'OSEBuildingID',
    'DataYear',
    'SiteEUI(kBtu/sf)',
]
x = data_quanti_.drop(columns=non_feature_columns)

# Keep the target as a one-column DataFrame.
y = data_quanti_.loc[:, [target]]

In [12]:
x.columns

Index(['CouncilDistrictCode', 'LargestPropertyUseTypeGFA', 'Latitude',
       'Longitude', 'NumberofBuildings', 'NumberofFloors', 'YearBuilt',
       'BuildingType_Campus', 'BuildingType_Multifamily HR (10+)',
       'BuildingType_Multifamily LR (1-4)',
       'BuildingType_Multifamily MR (5-9)', 'BuildingType_NonResidential',
       'BuildingType_Nonresidential COS', 'BuildingType_Nonresidential WA',
       'BuildingType_SPS-District K-12'],
      dtype='object')

In [13]:
# Split the features by name rather than by position, so the imputation
# stays correct if a column is added, removed or reordered upstream.
is_building_type = x.columns.str.startswith('BuildingType_')
x_1 = x.loc[:, ~is_building_type]  # numeric features
x_2 = x.loc[:, is_building_type]   # BuildingType_* indicator columns

# Numeric features: fill gaps with the column mean; indicators: fill with 0.
x_1 = x_1.fillna(x_1.mean())
x_2 = x_2.fillna(0)

# Both parts share the same index, so a column-wise concat reassembles x.
x = pd.concat([x_1, x_2], axis=1)

# NOTE(review): casting to int truncates the fractional part of the target;
# confirm this loss of precision is intended.
y = y.astype(int)

In [14]:
x.head()

Unnamed: 0,CouncilDistrictCode,LargestPropertyUseTypeGFA,Latitude,Longitude,NumberofBuildings,NumberofFloors,YearBuilt,BuildingType_Campus,BuildingType_Multifamily HR (10+),BuildingType_Multifamily LR (1-4),BuildingType_Multifamily MR (5-9),BuildingType_NonResidential,BuildingType_Nonresidential COS,BuildingType_Nonresidential WA,BuildingType_SPS-District K-12
0,7.0,88434.0,47.6122,-122.33799,1.0,12.0,1927.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,7.0,83880.0,47.61317,-122.33393,1.0,11.0,1996.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,7.0,756493.0,47.61393,-122.3381,1.0,41.0,1969.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,7.0,61320.0,47.61412,-122.33664,1.0,10.0,1926.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,7.0,123445.0,47.61375,-122.34047,1.0,18.0,1980.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
from sklearn import preprocessing

# Standardise the features; the fitted scaler is kept so it can be pickled
# at the end of the notebook.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split below; confirm this is acceptable.
std_scale_GHG = preprocessing.StandardScaler()
x = std_scale_GHG.fit_transform(x)

## Division des datasets en deux 

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported errors) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Random Forest

Préparation des données

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

In [19]:
# Random forest for TotalGHGEmissions. random_state fixes the bootstrap
# sampling and feature selection so the fitted model is reproducible.
rf_TotalGHGEmissions = RandomForestRegressor(n_estimators=888, min_samples_split=2,
                                             max_depth=75, n_jobs=-1,
                                             random_state=42)
# y_train is a one-column DataFrame; flatten it to 1-D, which is the shape
# the estimator expects for a single target.
rf_TotalGHGEmissions.fit(x_train, np.ravel(y_train))
y_predict = rf_TotalGHGEmissions.predict(x_test)
# Logs the metrics in saved_errors and displays the median absolute error.
error_metric('Random_forest Opti', y_test, y_predict, saved=True)

  This is separate from the ipykernel package so we can avoid doing imports until


15.30518018018018

# SiteEnergyUse(kBtu)

In [20]:
# Column to predict in this section; used below to filter rows and to build z.
target = "SiteEnergyUse(kBtu)"

In [21]:
# Keep only the rows where both the target and ENERGYSTARScore are present
# (ENERGYSTARScore is not dropped from the features in this section).
data_quanti_ = data_quanti.dropna(subset=[target, 'ENERGYSTARScore'])

In [22]:
# Columns removed from the feature matrix: the target, the other energy /
# emission measurements and the identifier / year columns (presumably
# excluded so the model cannot read the target from them).
# ENERGYSTARScore is kept as a feature here.
# Each label is listed once; a repeated label would not change the result.
non_feature_columns = [
    "SiteEnergyUseWN(kBtu)",
    "TotalGHGEmissions",
    "SiteEnergyUse(kBtu)",
    'GHGEmissionsIntensity',
    'Electricity(kWh)',
    'Electricity(kBtu)',
    'NaturalGas(kBtu)',
    'NaturalGas(therms)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)',
    'SourceEUIWN(kBtu/sf)',
    'SteamUse(kBtu)',
    'OSEBuildingID',
    'DataYear',
    'SiteEUI(kBtu/sf)',
]
x = data_quanti_.drop(columns=non_feature_columns)

# Keep the target as a one-column DataFrame.
z = data_quanti_.loc[:, [target]]

In [23]:
x.columns

Index(['CouncilDistrictCode', 'ENERGYSTARScore', 'LargestPropertyUseTypeGFA',
       'Latitude', 'Longitude', 'NumberofBuildings', 'NumberofFloors',
       'YearBuilt', 'BuildingType_Campus', 'BuildingType_Multifamily HR (10+)',
       'BuildingType_Multifamily LR (1-4)',
       'BuildingType_Multifamily MR (5-9)', 'BuildingType_NonResidential',
       'BuildingType_Nonresidential COS', 'BuildingType_Nonresidential WA',
       'BuildingType_SPS-District K-12'],
      dtype='object')

In [24]:
# Split the features by name rather than by position, so the imputation
# stays correct if a column is added, removed or reordered upstream.
is_building_type = x.columns.str.startswith('BuildingType_')
x_1 = x.loc[:, ~is_building_type]  # numeric features
x_2 = x.loc[:, is_building_type]   # BuildingType_* indicator columns

# Numeric features: fill gaps with the column mean; indicators: fill with 0.
x_1 = x_1.fillna(x_1.mean())
x_2 = x_2.fillna(0)

# Both parts share the same index, so a column-wise concat reassembles x.
x = pd.concat([x_1, x_2], axis=1)

# NOTE(review): casting to int truncates the fractional part of the target;
# confirm this loss of precision is intended.
z = z.astype(int)

In [25]:
# Standardise the features; the fitted scaler is kept so it can be pickled
# at the end of the notebook.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split below; confirm this is acceptable.
std_scale_ENERGY = preprocessing.StandardScaler()
x = std_scale_ENERGY.fit_transform(x)

## Division du dataset en deux

In [26]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported errors) reproducible across runs.
x_train, x_test, z_train, z_test = train_test_split(
    x, z, test_size=0.2, random_state=42
)

## Random Forest

In [27]:
# Random forest for SiteEnergyUse(kBtu). random_state fixes the bootstrap
# sampling and feature selection so the fitted model is reproducible.
rf_SiteEnergyUse = RandomForestRegressor(n_estimators=777, min_samples_split=2,
                                         max_depth=60, n_jobs=-1,
                                         random_state=42)
# z_train is a one-column DataFrame; flatten it to 1-D, which is the shape
# the estimator expects for a single target.
rf_SiteEnergyUse.fit(x_train, np.ravel(z_train))
y_predict = rf_SiteEnergyUse.predict(x_test)
# Logs the metrics in saved_errors and displays the median absolute error.
error_metric('Random_forest Opti', z_test, y_predict, saved=True)

  This is separate from the ipykernel package so we can avoid doing imports until


316027.9987129987

# Sauvegarde des modèles et des scalers

In [28]:
# Persist both fitted scalers and both fitted models with pickle.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails.
artifacts = {
    'std_scale_GHG.pkl': std_scale_GHG,
    'std_scale_ENERGY.pkl': std_scale_ENERGY,
    'rf_TotalGHGEmissions.pkl': rf_TotalGHGEmissions,
    'rf_SiteEnergyUse.pkl': rf_SiteEnergyUse,
}
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as handle:
        dump(fitted_object, handle)