In [1]:
# Imports
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import sklearn as skl
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [2]:
# SQLAlchemy connectable
# Template: create_engine("postgresql://USERNAME:%s@HOST/mydatabase" % quote_plus("Password"))
# The password is taken from the PGPASSWORD environment variable so a real secret
# never has to be typed into the notebook; the fallback keeps the original
# local-development value working unchanged.
# Note: the postgresql:// URL needs a PostgreSQL driver to be installed; the
# recorded run of this cell failed with "No module named 'psycopg2'".
db_password = os.environ.get("PGPASSWORD", "postgres")
engine = create_engine(
    "postgresql://postgres:%s@localhost/Energy_Output_Expenses" % quote_plus(db_password)
)

# Read in SQL table.
# The connection is opened in a `with` block so it is handed back to the pool as
# soon as the read finishes (a bare engine.connect() call would leave it open).
with engine.connect() as connection:
    data = pd.read_sql_table('demographics', connection)

# Previous binning technique that did not work as intended
# labels = [1, 2, 3, 4, 5]
#data['TOTALBTU'] = pd.qcut(data['TOTALBTU'], 5, labels=labels)
#data['TOTALDOL'] = pd.qcut(data['TOTALDOL'], 5, labels=labels)

data.head()

ModuleNotFoundError: No module named 'psycopg2'

In [None]:
# Drop any rows with missing values, then remove the 'DOEID' column.
# `data` is rebound to the cleaned frame instead of being mutated in place, and
# errors='ignore' tolerates 'DOEID' already being gone, so re-running this cell
# no longer raises KeyError.
# NOTE(review): 'DOEID' is presumably a record identifier with no predictive
# value — confirm against the table definition.
data = data.dropna().drop(columns='DOEID', errors='ignore')

In [None]:
# Attempt 1
# Predictor columns first, then the response column the model should estimate
X = data.loc[:, [
    'HHSEX',
    'HHAGE',
    'EMPLOYHH',
    'EDUCATION',
    'SDESCENT',
    'HOUSEHOLDER_RACE',
    'NHSLDMEM',
    'NUMCHILD',
    'MONEYPY',
]]
y = data.loc[:, 'TOTALBTU']

In [None]:
# Split data into training (80%) and testing (20%) sets.
# random_state pins the shuffle so the MSE/R2 reported further down are the same
# on every Restart & Run All instead of changing with each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create the linear regressor, then fit it on the training split
# (the fit call is the cell's last expression, so the fitted estimator is displayed)
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Score the held-out rows with the fitted regressor
predictions = model.predict(X=X_test)

In [None]:
# Mean squared error between the held-out targets and the predictions
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mse

In [None]:
# R-squared of the predictions against the held-out targets
r2 = r2_score(y_true=y_test, y_pred=predictions)
r2

In [None]:
# Attempt 2
# Feature matrix (same nine columns as before) and the target vector
X = data[
    [
        'HHSEX', 'HHAGE', 'EMPLOYHH',
        'EDUCATION', 'SDESCENT', 'HOUSEHOLDER_RACE',
        'NHSLDMEM', 'NUMCHILD', 'MONEYPY',
    ]
]
y = data.loc[:, 'TOTALBTU']

In [None]:
# Hold out a test set; random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create scaler instance.
# StandardScaler is already imported in the notebook's import cell, so it is used
# directly here rather than through a second, mid-notebook `import sklearn as skl`
# and the `skl.preprocessing` attribute lookup.
X_scaler = StandardScaler()

# Fit the scaler on the training split only
X_scaler.fit(X_train)

# Scale the data (both splits use the statistics learned from the training split)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a Keras Sequential model and add more than one Dense hidden layer
import tensorflow as tf

# Take the input width from the scaled training matrix instead of hard-coding it.
# A fixed input_dim=2 made Keras reject the 9-column training data at fit time
# ("expected axis -1 of input shape to have value 2, but received ... (None, 9)").
n_features = X_train_scaled.shape[1]

nn_model = tf.keras.models.Sequential([
    # Explicit Input layer declares the expected shape of one sample
    tf.keras.Input(shape=(n_features,)),
    # Two hidden layers of 6 ReLU units each
    tf.keras.layers.Dense(units=6, activation="relu"),
    tf.keras.layers.Dense(units=6, activation="relu"),
    # NOTE(review): a sigmoid output is bounded to (0, 1), but y is the raw
    # TOTALBTU column; for regression a linear output with an MSE-style loss is
    # the usual choice — confirm the intended task before training.
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the Sequential model
nn_model.summary()

In [None]:
# Compile the model and train over more than 100 epochs
# nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# fit_model = nn_model.fit(X_train_scaled, y_train, epochs=10)

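# ERROR raised by the fit call above:
# ValueError: Exception encountered when calling Sequential.call().

# Input 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 2, but received input with shape (None, 9)
# Cause: the first Dense layer was declared with input_dim=2, while X_train_scaled carries 9 feature columns.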

In [None]:
# Attempt 3 (SUCCESSFUL)

# Define a function to perform binning on TOTALBTU column
def bin_total_btu(total_btu, low_threshold=50000, high_threshold=100000):
    """Map a total-BTU reading to a coarse consumption label.

    Parameters
    ----------
    total_btu : number
        The reading to classify.
    low_threshold : number, default 50000
        Readings strictly below this value are labelled 'Low'.
    high_threshold : number, default 100000
        Readings at or above this value are labelled 'High'.

    Returns
    -------
    str or missing value
        'Low', 'Medium' or 'High'. A missing reading (NaN / None) is returned
        unchanged instead of being given a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # reading would fall through both tests and be reported as 'High'.
    if pd.isna(total_btu):
        return total_btu
    if total_btu < low_threshold:
        return 'Low'
    # Reaching this point already implies total_btu >= low_threshold.
    if total_btu < high_threshold:
        return 'Medium'
    return 'High'

In [None]:
# Run every TOTALBTU value through bin_total_btu and store the labels in 'BTU_Bin'
data['BTU_Bin'] = data['TOTALBTU'].map(bin_total_btu)

In [None]:
# Build the feature matrix and the regression target.
# 'BTU_Bin' is computed directly from 'TOTALBTU', so keeping it among the
# predictors (even one-hot encoded) hands the model a coarse copy of the value it
# is asked to predict and makes the reported MSE/R2 meaningless. Both the target
# and its binned copy are therefore left out of the features.
# NOTE(review): a commented-out line in the data-loading cell references a
# 'TOTALDOL' column; if the table carries it, it may be another close proxy for
# the target — confirm whether it belongs in the features.
X_encoded = data.drop(columns=['TOTALBTU', 'BTU_Bin'])
y = data['TOTALBTU']

In [None]:
# Hold out 10% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=45,
    test_size=0.1,
)

In [None]:
# Standardise the features: learn the scaling statistics from the training
# split, then apply that same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Linear regressor for this attempt (intercept fitting spelled out; it is the default)
lr_model = LinearRegression(fit_intercept=True)

In [None]:
# Fit the regressor on the standardised training features
lr_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the target for the standardised held-out features
y_pred = lr_model.predict(X=X_test_scaled)

In [None]:
# Score the predictions against the held-out targets, then report both metrics
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)