In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sqlalchemy import create_engine
from urllib.parse import quote_plus

In [2]:
#connect to SQLAlchemy
engine = create_engine("postgresql://postgres:%s@localhost/energy_consumption" % quote_plus("PW"))
engine.connect()

<sqlalchemy.engine.base.Connection at 0x176e6c20b90>

In [3]:
df_housing = pd.read_sql_table('housing_characteristics', engine)
df_housing.head()

Unnamed: 0,doeid,typehuq,yearmaderange,totrooms,walltype,rooftype,adqinsul,numfrig,equipm,acequipm_pub,totsqft_en,totalbtu,totaldol
0,100001,2,4,8,1,5,2,2,3,1,2100,144647.71,2656.89
1,100002,5,5,3,1,-2,2,1,3,1,590,28034.61,975.0
2,100003,5,3,4,1,-2,2,0,2,1,900,30749.71,522.65
3,100004,2,5,9,3,5,2,2,3,1,2100,86765.19,2061.77
4,100005,5,3,3,7,-2,2,2,3,1,800,59126.93,1463.04


In [4]:
#Read housing_characteristics csv file in the Table_CSVs folder
#df_housing = pd.read_csv("Table_CSVs/housing_characteristics.csv")

#drop missing values
df_housing.dropna(inplace=True)

#drop unnecessary 
df_housing.drop(columns=["doeid", 'totaldol'], inplace=True)

#review dataframe
df_housing.head(20)

Unnamed: 0,typehuq,yearmaderange,totrooms,walltype,rooftype,adqinsul,numfrig,equipm,acequipm_pub,totsqft_en,totalbtu
0,2,4,8,1,5,2,2,3,1,2100,144647.71
1,5,5,3,1,-2,2,1,3,1,590,28034.61
2,5,3,4,1,-2,2,0,2,1,900,30749.71
3,2,5,9,3,5,2,2,3,1,2100,86765.19
4,5,3,3,7,-2,2,2,3,1,800,59126.93
5,2,6,8,1,5,1,2,3,1,4520,85400.64
6,2,2,5,1,5,3,1,3,1,2100,131875.03
7,5,7,4,3,-2,2,1,4,-2,900,41446.59
8,5,7,3,7,-2,2,1,5,4,750,14512.02
9,5,5,4,4,-2,2,1,4,1,760,12393.76


In [5]:
#Split data into features and target variables
X = df_housing.drop(['totalbtu'], axis=1)
y = df_housing['totalbtu']

In [6]:
#Split the data into training and testing datasets by using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

Linear Regression Model

In [7]:
#initialize the model
LR_model = LinearRegression()

#train the model
LR_model.fit(X_train, y_train)

In [8]:
#make prediction
y_pred = LR_model.predict(X_test)

In [9]:
#Evaluate the model using MSE & r2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1463427265.7045362
R-squared: 0.4257262460037563


In [10]:
coefficients = LR_model.coef_
intercept = LR_model.intercept_
#print("Coefficients:", coefficients)
#print("Intercept:", intercept)
# Get the names of the features from your dataset
feature_names = X.columns

# Combine the coefficients with their corresponding feature names
coefficients_with_names = list(zip(feature_names, coefficients))

# Print the coefficients along with their corresponding feature names
for feature, coefficient in coefficients_with_names:
    print(f"{feature}: {coefficient}")

# Print the intercept
print("Intercept:", intercept)

typehuq: -5377.108072722938
yearmaderange: -3685.516981941524
totrooms: 4347.508307754276
walltype: 6.71460556343866
rooftype: -20.299562432743738
adqinsul: 236.8157271841776
numfrig: 7845.921980032588
equipm: -130.22385060959314
acequipm_pub: 858.9995591175184
totsqft_en: 17.696840403620865
Intercept: 37978.744140971925


Optimize the Linear Regression Model

In [11]:
#drop the features with the weakest relationship
X2 = df_housing.drop(['totalbtu', 'walltype', 'rooftype', 'equipm', 'totsqft_en'], axis=1)
y2 = df_housing['totalbtu']

In [12]:
#split the data
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

#initialize the model
LR2_model = LinearRegression()

#train the model
LR2_model.fit(X2_train, y2_train)

#make prediction
y2_pred = LR2_model.predict(X2_test)

#Evaluate the model using MSE & r2
mse = mean_squared_error(y2_test, y2_pred)
r2 = r2_score(y2_test, y2_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1635380750.5435705
R-squared: 0.3582487733849117


In [13]:
coefficients = LR_model.coef_
intercept = LR_model.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [-5.37710807e+03 -3.68551698e+03  4.34750831e+03  6.71460556e+00
 -2.02995624e+01  2.36815727e+02  7.84592198e+03 -1.30223851e+02
  8.58999559e+02  1.76968404e+01]
Intercept: 37978.744140971925


Trying Lasso

In [14]:
#import modules
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [15]:
#define the model
LR_model = Lasso()

#define hyperparameters
param_grid = {'alpha': [0.01, 0.1, 1, 10]}

#perform grid search
grid_search = GridSearchCV(LR_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

#best hyperparameters
best_params = grid_search.best_params_

print(best_params)


{'alpha': 10}


Use Best Parameters

In [16]:
#use best_params
best_LR_model = Lasso(alpha=best_params['alpha'])


#train
best_LR_model.fit(X_train, y_train)

In [17]:
#evaluate
y_pred = best_LR_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(mse)
print(r2)

1463421257.2659745
0.42572860381725985


Random Forest Regressor

In [43]:
#import Random Forest module
from sklearn.ensemble import RandomForestRegressor

#train model
RF_model= RandomForestRegressor(n_estimators=100, random_state=42)
#RF_model= RandomForestRegressor(n_estimators=500, random_state=42)

#fit model
RF_model.fit(X_train, y_train)


In [44]:
#evaluate model
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1358081282.0452914
R-squared: 0.46706580207338866


Optimizing Random Forest

In [20]:
feature_importance = pd.Series(RF_model.feature_importances_, index=X.columns)
print("Feature Importance:")
print(feature_importance)

Feature Importance:
typehuq          0.072862
yearmaderange    0.084581
totrooms         0.085815
walltype         0.054569
rooftype         0.040222
adqinsul         0.035159
numfrig          0.044820
equipm           0.103470
acequipm_pub     0.032609
totsqft_en       0.445895
dtype: float64


In [21]:
#remove 'unimportant' features
RF_X = df_housing.drop(['totalbtu', 'acequipm_pub', 'adqinsul', 'walltype', 'rooftype', 'numfrig'], axis=1)
RF_y = df_housing['totalbtu']

#Split the data into training and testing datasets by using train_test_split
RF_X_train, RF_X_test, RF_y_train, RF_y_test = train_test_split(RF_X, RF_y, test_size=0.2, random_state=42)

In [22]:
#train optimized model
ORF_model= RandomForestRegressor(n_estimators=100, random_state=42)

#fit model
ORF_model.fit(RF_X_train, RF_y_train)

In [23]:
#evaluate optimized model
ORF_y_pred = ORF_model.predict(RF_X_test)
mse = mean_squared_error(RF_y_test, ORF_y_pred)
r2 = r2_score(RF_y_test, ORF_y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 1617793756.3282416
R-squared: 0.3651502090942462


In [24]:
#import necessary modules
#from sklearn.model_selection import GridSearchCV

In [25]:
#initialize model
#optimized_rf = RandomForestRegressor()

In [26]:
#define hyperparameters
#param_grid = {
    #'n_estimators': [100, 200, 300],
    #'max_depth': [None, 5, 10, 15],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4],
    #'max_features': ['auto', 'sqrt', 'log2']
#}

In [27]:
#perform hyperparameter tuning
#grid_search = GridSearchCV(estimator=optimized_rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
#grid_search.fit(X_train, y_train)

Deep Neural Network

In [28]:
#import modules
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

In [29]:
#normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [30]:
#define model

DNN_model = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(128, activation='relu'), 
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [31]:
#compile model
DNN_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

#train
history = DNN_model.fit(X_scaled, y, batch_size=32, epochs=100, validation_split=0.2)

Epoch 1/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 9684916224.0000 - val_loss: 9502686208.0000
Epoch 2/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 10117497856.0000 - val_loss: 9502686208.0000
Epoch 3/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9948770304.0000 - val_loss: 9502686208.0000
Epoch 4/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9818743808.0000 - val_loss: 9502686208.0000
Epoch 5/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9944419328.0000 - val_loss: 9502686208.0000
Epoch 6/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9943287808.0000 - val_loss: 9502686208.0000
Epoch 7/100
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 10445696000.0000 - val_loss: 9502686208.0000
Epoch 8/100
[1m463/463[

In [32]:
#evaluate
loss = DNN_model.evaluate(X_scaled, y)
y_pred = DNN_model(X_test)

r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", loss)
print("R-squared:", r2)

[1m578/578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9482345472.0000
Mean Squared Error: 9719939072.0
R-squared: -2.649817173100648


Gradient Boosting Regressor

In [33]:
#import modules
from sklearn.ensemble import GradientBoostingRegressor

In [34]:
#initiate model
GBR_model = GradientBoostingRegressor()

#train the model
GBR_model.fit(X_train, y_train)

In [35]:
#make predictions
y_pred = GBR_model.predict(X_test)

#evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Value:", r2)

Mean Squared Error: 1197767836.5721838
R-squared Value: 0.5299755252317797


Optimize Gradient Boosting Regressor Model

In [36]:
#hypertuning
#define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

#initialize grid search
grid_search = GridSearchCV(estimator=GBR_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

#run grid search
grid_search.fit(X_train, y_train)

In [37]:
#get best params
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [38]:
#train 
best_GBR_model = GradientBoostingRegressor(**best_params)
best_GBR_model.fit(X_train, y_train)

In [39]:
#evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Value:", r2)

Mean Squared Error: 1197767836.5721838
R-squared Value: 0.5299755252317797
