In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time

In [2]:
data = pd.read_csv('../day10/data/energy_data.csv')
data.head(10)

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,7063,76,10,29.84,Weekday,2713.95
1,Commercial,44372,66,45,16.72,Weekday,5744.99
2,Industrial,19255,37,17,14.3,Weekend,4101.24
3,Residential,13265,14,41,32.82,Weekday,3009.14
4,Commercial,13375,26,18,11.92,Weekday,3279.17
5,Commercial,37377,26,32,16.24,Weekend,4687.67
6,Industrial,38638,92,14,21.01,Weekend,5526.83
7,Residential,34950,60,18,28.24,Weekday,4116.32
8,Industrial,29741,99,44,13.08,Weekday,5841.65
9,Residential,17467,42,36,28.84,Weekday,3419.13


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        1100 non-null   object 
 1   Square Footage       1100 non-null   int64  
 2   Number of Occupants  1100 non-null   int64  
 3   Appliances Used      1100 non-null   int64  
 4   Average Temperature  1100 non-null   float64
 5   Day of Week          1100 non-null   object 
 6   Energy Consumption   1100 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 60.3+ KB


In [4]:
data['Energy Usage Density'] = data['Appliances Used'] / (data['Number of Occupants'] + 1)
data['Temperature Anomaly'] = abs(data['Average Temperature'] - 22)

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
X = data.drop('Energy Consumption', axis=1)
y = data['Energy Consumption']
num_features = X.select_dtypes(include=['int64','float64']).columns
cat_features = X.select_dtypes(include=object).columns
print(num_features)
print(cat_features)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('label', OneHotEncoder())
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transformer, num_features),
        ('cat',categorical_transformer, cat_features)
    ]
)
X = data.drop('Energy Consumption', axis=1)
y = data['Energy Consumption']
X = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

Index(['Square Footage', 'Number of Occupants', 'Appliances Used',
       'Average Temperature', 'Energy Usage Density', 'Temperature Anomaly'],
      dtype='object')
Index(['Building Type', 'Day of Week'], dtype='object')


In [6]:
from sklearn.ensemble import RandomForestRegressor
time_start = time.time()
forest = RandomForestRegressor(random_state=0)
forest_params = param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
forest_grid = GridSearchCV(forest, forest_params)
forest_grid.fit(X_train, y_train)
print('RandomForest best params:', forest_grid.best_params_)
print('RandomForest best score:', forest_grid.best_score_)
time_end = time.time()
print(f'Total time: {time_end - time_start} seconds') 

RandomForest best params: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RandomForest best score: 0.973288206734925
Total time: 284.74564480781555 seconds


In [7]:
from sklearn.ensemble import ExtraTreesRegressor
time_start = time.time()

etr = ExtraTreesRegressor(random_state=0)

etr_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

etr_grid = GridSearchCV(etr, etr_params, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
etr_grid.fit(X_train, y_train)

print('Extra Trees best params:', etr_grid.best_params_)
print('Extra Trees best score:', etr_grid.best_score_)

time_end = time.time()
print(f'Total time: {time_end - time_start} seconds')


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Extra Trees best params: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Extra Trees best score: -13794.15987198241
Total time: 19.454933643341064 seconds


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=0)

gbr_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gbr_grid = GridSearchCV(gbr, gbr_params, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
gbr_grid.fit(X_train, y_train)

print('Gradient Boosting best params:', gbr_grid.best_params_)
print('Gradient Boosting best score:', gbr_grid.best_score_)

time_end = time.time()
print(f'Total time: {time_end - time_start} seconds')

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Gradient Boosting best params: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Gradient Boosting best score: -6919.601685731203
Total time: 48.89579510688782 seconds
