In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Import des données

In [2]:
# Load the two cleaned tables produced upstream (paths are relative to the notebook).
data_quanti = pd.read_csv('files_cleaned/data_quanti.csv')
data_quali = pd.read_csv('files_cleaned/data_quali.csv')
# Remove the 'Unnamed: 0' column from the quantitative table
# (presumably a saved index written by an earlier to_csv - TODO confirm).
data_quanti = data_quanti.drop('Unnamed: 0', axis='columns')

# Mise en place d'une fonction d'erreur

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error

In [4]:
# Empty results table; error_metric() appends one row per logged model.
saved_errors = pd.DataFrame(
    columns=['Modèle', 'MSE', 'Med_abs_error', 'mean_abs_error'],
)

In [5]:
def error_metric(modele_name, y_test, y_predict, saved=False):
    """Score predictions against the ground truth and optionally log the scores.

    Three scikit-learn metrics are computed: the square root of the mean
    squared error, the median absolute error and the mean absolute error.

    Parameters
    ----------
    modele_name
        Label stored in the first column of ``saved_errors`` when logging.
    y_test
        Ground-truth target values.
    y_predict
        Predicted target values.
    saved : bool, default False
        When truthy, append one row (name + the three metrics) to the
        module-level ``saved_errors`` DataFrame.

    Returns
    -------
    The median absolute error (not the mean absolute error).
    """
    # This is the *root* mean squared error. It is written to the column
    # named 'MSE' in saved_errors; the column name is kept for compatibility.
    rmse = np.sqrt(mean_squared_error(y_test, y_predict))
    median_abs_error = median_absolute_error(y_test, y_predict)
    mean_abs_error = mean_absolute_error(y_test, y_predict)

    if saved:
        # Append at the next integer label of the results table.
        saved_errors.loc[len(saved_errors)] = [
            modele_name, rmse, median_abs_error, mean_abs_error,
        ]

    return median_abs_error

# TotalGHGEmissions

In [6]:
# Column predicted in this section.
target = 'TotalGHGEmissions'

In [7]:
# Keep only the rows where both the target and ENERGYSTARScore are present.
data_quanti = data_quanti.dropna(
    how='any',
    subset=[target, 'ENERGYSTARScore'],
)

In [8]:
# Features: every column of data_quanti except the energy / emission
# measurements, the identifier and year columns, and ENERGYSTARScore.
# (The original list named "SiteEnergyUseWN(kBtu)" twice; it is listed once here.)
# NOTE(review): ENERGYSTARScore is removed from the features although rows are
# filtered on it and the conclusion below mentions using it - confirm intent.
x = data_quanti.drop(columns=[
    'TotalGHGEmissions', 'GHGEmissionsIntensity',
    'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
    'Electricity(kWh)', 'Electricity(kBtu)',
    'NaturalGas(kBtu)', 'NaturalGas(therms)',
    'SteamUse(kBtu)',
    'OSEBuildingID', 'DataYear',
    'ENERGYSTARScore',
])
# Double brackets keep y as a one-column DataFrame.
y = data_quanti[[target]]

In [9]:
# Replace missing feature values by the mean of their column.
# NOTE(review): the means are computed on the whole table, i.e. before the
# train/test split, so test rows contribute to the imputed values.
x = x.fillna(value=x.mean())
# Cast the target to integers.
# NOTE(review): if the target is stored as float this discards the fractional
# part - confirm that this is intended for a regression target.
y = y.astype(dtype=int)

## Division des datasets en deux 

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# 80/20 hold-out split. The seed is fixed so that a fresh "Restart & Run All"
# reproduces the same split and therefore the same reported errors.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [12]:
from sklearn import preprocessing
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
std_scale = preprocessing.StandardScaler()
std_scale.fit(x_train)
x_train, x_test = std_scale.transform(x_train), std_scale.transform(x_test)

## Mise en place d'une baseline

## Random Forest

Préparation des données

In [13]:
# Convert the four splits to NumPy arrays (np.array makes a copy of each).
x_train, x_test = np.array(x_train), np.array(x_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

In [15]:
# scikit-learn expects a 1-D target in fit(); passing the (n, 1) column vector
# triggers a DataConversionWarning, so it is flattened once here.
y_train_1d = np.ravel(y_train)

# Step 1 - forest fitted on every feature, used only to rank them.
rf_selector = RandomForestRegressor(n_estimators=1000, random_state=42)
rf_selector.fit(x_train, y_train_1d)

# Step 2 - keep the features whose importance reaches the threshold.
# The selector has its own estimator: a prefit SelectFromModel relies on the
# fitted estimator it wraps, so refitting that same object on the reduced
# matrix would leave `select` unusable on full-width data afterwards.
select = SelectFromModel(rf_selector, prefit=True, threshold=0.003)
x_train2 = select.transform(x_train)
x_test2 = select.transform(x_test)

# Step 3 - final model on the selected features (same hyper-parameters and seed).
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(x_train2, y_train_1d)
y_predict = rf.predict(x_test2)

# Last expression of the cell: the value returned by error_metric is displayed.
error_metric('Random_forest Opti', y_test, y_predict, saved=True)

  
  


9.045499999999997

Il y a donc une amélioration du modèle en utilisant l'ENERGY STAR Score (erreur absolue médiane affichée ci-dessus).

# SiteEnergyUse(kBtu)

In [16]:
# Column predicted in this section.
target = 'SiteEnergyUse(kBtu)'

In [17]:
# Keep only the rows where both the target and ENERGYSTARScore are present.
# data_quanti was already filtered on TotalGHGEmissions by the earlier dropna,
# so the remaining rows have both targets filled in.
data_quanti = data_quanti.dropna(
    how='any',
    subset=[target, 'ENERGYSTARScore'],
)

In [18]:
# Features: every column of data_quanti except the energy / emission
# measurements, the identifier and year columns, and ENERGYSTARScore.
# (The original list named "SiteEnergyUseWN(kBtu)" twice; it is listed once here.)
# NOTE(review): ENERGYSTARScore is removed from the features although rows are
# filtered on it and the conclusion below mentions using it - confirm intent.
x = data_quanti.drop(columns=[
    'TotalGHGEmissions', 'GHGEmissionsIntensity',
    'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)',
    'SiteEUIWN(kBtu/sf)',
    'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
    'Electricity(kWh)', 'Electricity(kBtu)',
    'NaturalGas(kBtu)', 'NaturalGas(therms)',
    'SteamUse(kBtu)',
    'OSEBuildingID', 'DataYear',
    'ENERGYSTARScore',
])
# Double brackets keep y as a one-column DataFrame.
y = data_quanti[[target]]

In [19]:
# Replace missing feature values by the mean of their column.
# NOTE(review): the means are computed on the whole table, i.e. before the
# train/test split, so test rows contribute to the imputed values.
x = x.fillna(value=x.mean())
# Cast the target to integers.
# NOTE(review): if the target is stored as float this discards the fractional
# part - confirm that this is intended for a regression target.
y = y.astype(dtype=int)

## Division du dataset en deux

In [20]:
# 80/20 hold-out split. The seed is fixed so that a fresh "Restart & Run All"
# reproduces the same split and therefore the same reported errors.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [21]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
std_scale = preprocessing.StandardScaler()
std_scale.fit(x_train)
x_train, x_test = std_scale.transform(x_train), std_scale.transform(x_test)

## Mise en place d'une baseline

## Random Forest

In [22]:
# Convert the four splits to NumPy arrays (np.array makes a copy of each).
x_train, x_test = np.array(x_train), np.array(x_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [23]:
# scikit-learn expects a 1-D target in fit(); passing the (n, 1) column vector
# triggers a DataConversionWarning, so it is flattened once here.
y_train_1d = np.ravel(y_train)

# Step 1 - forest fitted on every feature, used only to rank them.
rf_selector = RandomForestRegressor(n_estimators=1000, random_state=42)
rf_selector.fit(x_train, y_train_1d)

# Step 2 - keep the features whose importance reaches the threshold.
# The selector has its own estimator: a prefit SelectFromModel relies on the
# fitted estimator it wraps, so refitting that same object on the reduced
# matrix would leave `select` unusable on full-width data afterwards.
select = SelectFromModel(rf_selector, prefit=True, threshold=0.003)
x_train2 = select.transform(x_train)
x_test2 = select.transform(x_test)

# Step 3 - final model on the selected features (same hyper-parameters and seed).
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(x_train2, y_train_1d)
y_predict = rf.predict(x_test2)

# Last expression of the cell: the value returned by error_metric is displayed.
# NOTE(review): the label is the same as the one logged for the other target,
# so the two rows of saved_errors cannot be told apart by name.
error_metric('Random_forest Opti', y_test, y_predict, saved=True)

  
  


44676.51399999991

Il y a donc une amélioration du modèle en utilisant l'ENERGY STAR Score (erreur absolue médiane affichée ci-dessus).