In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from urllib.parse import quote_plus

In [2]:
# SQLAlchemy connectable
# DEFAULT engine = create_engine("postgresql://USERNAME:%s@HOST/mydatabase" % quote_plus("Password"))
engine = create_engine("postgresql://postgres:%s@localhost/Energy_Output_Expenses" % quote_plus("Password"))
engine.connect()

# Read in SQL table
data = pd.read_sql_table('demographics',engine)

# Previous binning technique that did not work as intended
# labels = [1, 2, 3, 4, 5]
#data['totalbtu'] = pd.qcut(data['totalbtu'], 5, labels=labels)
#data['TOTALDOL'] = pd.qcut(data['TOTALDOL'], 5, labels=labels)

data.head()

Unnamed: 0,doeid,hhsex,hhage,employhh,education,sdescent,householder_race,nhsldmem,numchild,moneypy,totalbtu,totaldol
0,100001,1,65,3,5,0,1,2,0,13,144647.71,2656.89
1,100002,1,79,3,3,0,1,1,0,6,28034.61,975.0
2,100003,2,82,3,4,0,1,1,0,11,30749.71,522.65
3,100004,2,70,3,3,0,1,2,0,10,86765.19,2061.77
4,100005,2,30,1,5,0,1,2,0,16,59126.93,1463.04


In [3]:
# Drop any rows with missing values
data.dropna(inplace=True)

# Drop the 'DOEID' column and update the DataFrame
data.drop('doeid', axis=1, inplace=True)

In [4]:
# First attempt
# Target and feature variables
y = data['totalbtu']
X = data[['hhsex', 'hhage', 'employhh', 'education', 'sdescent', 'householder_race', 'nhsldmem', 'numchild', 'moneypy']]  # Features

In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# Initialize the Linear Regression model and train the model
model = LinearRegression()
model.fit(X_train, y_train)

In [7]:
# Make predictions on the test set
predictions = model.predict(X_test)

In [8]:
# Calculate mean squared error
mse = mean_squared_error(y_test, predictions)
mse

2328086457.795613

In [9]:
# Calculate r2 score
r2 = r2_score(y_test, predictions)
r2

0.18746129854499471

In [10]:
# Attempt 2
# Define target vector
y = data['totalbtu']
X = data[['hhsex', 'hhage', 'employhh', 'education', 'sdescent', 'householder_race', 'nhsldmem', 'numchild', 'moneypy']]

In [11]:
import sklearn as skl
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create a Keras Sequential model and add more than one Dense hidden layer
import tensorflow as tf

nn_model = tf.keras.models.Sequential()

nn_model.add(tf.keras.layers.Dense(units=6, activation="relu", input_dim=2))

nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))

nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Compile the model and train over more than 100 epochs
# nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# fit_model = nn_model.fit(X_train_scaled, y_train, epochs=10)

# ERROR:
# ValueError: Exception encountered when calling Sequential.call().

# Input 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 2, but received input with shape (None, 9)

In [14]:
# Attempt 3 (SUCCESSFUL)

# Define a function to perform binning on totalbtu column
def bin_total_btu(total_btu):
    if total_btu < 50000:
        return 'Low'
    elif total_btu >= 50000 and total_btu < 100000:
        return 'Medium'
    else:
        return 'High'

In [15]:
# Apply binning function to create a new column 'BTU_Bin'
data['BTU_Bin'] = data['totalbtu'].apply(bin_total_btu)

In [16]:
# Perform one-hot encoding on the 'BTU_Bin' column
X_encoded = pd.get_dummies(data.drop(['totalbtu'], axis=1), columns=['BTU_Bin'], drop_first=True)
X_encoded = X_encoded.rename(str,axis="columns")
y = data['totalbtu']

In [17]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.1, random_state=45)

In [18]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

In [20]:
# Train the model
lr_model.fit(X_train_scaled, y_train)

In [21]:
# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

In [22]:
# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 595052131.5206738
R-squared (R2): 0.7938565177892568
