In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.utils.multiclass import type_of_target

import psycopg2
import numpy as np
import pandas as pd
import altair as alt
import os  # <-- NEW: Import the os module

In [75]:
import dotenv

dotenv.load_dotenv(dotenv_path='database.env')

# --- Configuration ---
DB_NAME = os.environ['DB_NAME']
DB_USER = os.environ['DB_USER']
DB_PASS = os.environ['DB_PASS']
HOST = os.environ['HOST']
PORT = os.environ['PORT']

In [77]:
# 2. Database Connection and Setup
conn = psycopg2.connect(
    dbname=DB_NAME, user=DB_USER, password=DB_PASS, host=HOST, port=PORT
)
conn.autocommit = True
cur = conn.cursor()

cur.execute('SELECT "facilityId","co2e","fuelHint","createdAt" FROM stationary_combustion_activity')

data = pd.DataFrame(cur.fetchall(), columns = ["facilityId","co2e","fuelHint","createdAt"])
print(data)

                                facilityId          co2e  \
0     68f4851d-4959-49b6-96a1-63d80c816ed3      0.000000   
1     bf2278d8-0d33-4e28-a55e-ad3c1537af7e      0.000054   
2     4f4bf05a-0a52-48db-b66d-3b3da6b23619      0.000054   
3     4f4bf05a-0a52-48db-b66d-3b3da6b23619      0.004033   
4     4f4bf05a-0a52-48db-b66d-3b3da6b23619      0.010790   
...                                    ...           ...   
6160  beae4852-c882-4706-8dd9-f8ce4dc11989      0.222116   
6161  d0f9256f-17f4-4b7e-abc7-2eeeba8d59c0      0.053958   
6162  d0f9256f-17f4-4b7e-abc7-2eeeba8d59c0   9049.547300   
6163  e0bd2a9c-69d4-403e-beed-3b9a6b08b382  12163.778891   
6164  e0bd2a9c-69d4-403e-beed-3b9a6b08b382   6588.125943   

                           fuelHint                         createdAt  
0                            Bamboo  2025-01-05 21:53:26.836000-08:00  
1                       Natural Gas  2025-01-20 00:33:24.881000-08:00  
2                       Natural Gas  2025-01-20 02:59:01.888000

In [41]:
# 1. Convert 'createdAt' to datetime objects and sort the data (crucial for time series)
data['createdAt'] = pd.to_datetime(data['createdAt'], utc=True)
data = data.sort_values(by='createdAt').reset_index(drop=True)

# 2. Engineer Temporal Feature (e.g., as Unix timestamp)
# This converts the datetime object to a continuous numerical feature (seconds since epoch).
data['timestamp'] = data['createdAt'].apply(lambda x: x.timestamp())

# 3. Encode Categorical Features ('facilityId' and 'fuelHint')
# Using One-Hot Encoding (get_dummies) is a good general starting point.
data_encoded = pd.get_dummies(data, columns=['facilityId', 'fuelHint'], dtype=int)

# 4. Prepare X and Y
TARGET_VAR = "co2e"
# Drop the original 'createdAt' column and the target variable from features (X)
X = data_encoded.drop(columns=[TARGET_VAR, 'createdAt']).values.astype('float32') 
Y = data_encoded[TARGET_VAR].values.astype('float32')

# *** NEW: Apply a log transformation to Y to stabilize the variance and mitigate outliers ***
# Use log(1+x) to handle zero values
Y_log = np.log1p(Y) 

# 5. Scale the features (X) and the target (Y)
# Initialize separate scalers for X and Y to be able to inverse_transform the predictions later
scaler_X = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler_X.fit_transform(X)

# Reshape Y_log to (n_samples, 1) for the scaler
Y_log = Y_log.reshape(-1, 1) 
scaler_Y = MinMaxScaler(feature_range=(0, 1))
Y_scaled = scaler_Y.fit_transform(Y_log) # Fit on the log-transformed data

# 6. Define time step
time_step = 5

# The create_sequences function needs adjustment to handle multiple input features (X_scaled)
def create_sequences_multivariate(X, Y, time_steps):
    X_seq, y_seq = [], []
    # Loop until the last time step for which a target exists
    for i in range(len(X) - time_steps):
        # The input is a sequence of 'time_steps' observations of all features
        X_seq.append(X[i:(i + time_steps), :])
        # The target is the 'co2e' value at time 'i + time_steps'
        y_seq.append(Y[i + time_steps, 0])
    return np.array(X_seq), np.array(y_seq)

# Now, create the sequences
X1, y1 = create_sequences_multivariate(X_scaled, Y_scaled, time_step)

# Reshape X1 for LSTM: [samples, timesteps, features] - it should already be this shape
# if the original X_scaled has more than one feature.
print(f"Shape of X1 (sequences): {X1.shape}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, shuffle=False, random_state=42)
# NOTE: For time series, it's typically better to set shuffle=False to maintain temporal order.
# The original code used shuffle=True, which is generally incorrect for time series forecasting.

# Note: The Keras model input_shape in the next cell (cell 21) will need to be updated to:
# input_shape=(time_step, X1.shape[2])
# Currently, it is incorrectly using X_scaled.shape
# model.add(LSTM(units = 32, activation=activation, input_shape=(time_step, X1.shape[2])))

Shape of X1 (sequences): (6160, 5, 213)


In [17]:
# time_step = 5
# TARGET_VAR = "co2e"

# X = np.array(data[['facilityId', 'fuelHint', 'createdAt']].copy())
# X_scaled = MinMaxScaler(feature_range = (0,1)).fit_transform(X.reshape(-1, 1))
# Y = data.loc[:, TARGET_VAR:TARGET_VAR]

# def create_sequences(data, time_steps):
#     X, y = [], []
#     for i in range(len(data) - time_steps):
#         X.append(data[i:(i + time_steps), 0])
#         y.append(data[i + time_steps, 0])
#     return np.array(X), np.array(y)

# X1, y1 = create_sequences(X_scaled, time_step)

# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [57]:
def create_model(activation='relu',
                 batch_size = 1,
                 dropout_rate = 0.02):

    model = Sequential()
    model.add(LSTM(units = 32, activation=activation, input_shape=(time_step, X1.shape[2])))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss = 'mean_squared_error', optimizer=optimizer)

    return model


estimator=KerasRegressor(build_fn=create_model, verbose=0, epochs = 10)

param_grid = {
    'batch_size':   [5, 10, 20, 40, 60],
    'epochs': [10, 15, 20]
}


In [17]:
X1.shape

(324, 108)

In [59]:
scoring = {
    "MSE": make_scorer(mean_squared_error, greater_is_better=False),
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "R2": make_scorer(r2_score)
}

grid_search = GridSearchCV(estimator, param_grid, cv = 5, scoring = scoring, refit = "R2", n_jobs = -1, verbose = True)
grid_search.fit(X_train, y_train, validation_data=(X_test, y_test))

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


  X, y = self._initialize(X, y)
  super().__init__(**kwargs)


In [61]:
r2_scores = cross_val_score(best_model, X1, y1, cv = 5, scoring = "r2")
mse_scores = -cross_val_score(best_model, X1, y1, cv = 5, scoring = "neg_mean_squared_error")
mae_scores = -cross_val_score(best_model, X1, y1, cv = 5, scoring = "neg_mean_absolute_error")

results_list = []

  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)


In [63]:
results_list.append({
    'Model': 'LSTM',
    'Best Params': best_params,
    'MAE': np.mean(mae_scores),
    'MAE Std': np.std(mae_scores),
    'RMSE': np.sqrt(np.mean(mse_scores)),
    'RMSE Std': np.std(np.sqrt(mse_scores)),
    'MSE': np.mean(mse_scores),
    'MSE Std': np.std(mse_scores),
    'R2': np.mean(r2_scores),
    'R2 Std': np.std(r2_scores)
        })

In [65]:
# Convert results to a DataFrame
df_results = pd.DataFrame(results_list)
df_results.head()

Unnamed: 0,Model,Best Params,MAE,MAE Std,RMSE,RMSE Std,MSE,MSE Std,R2,R2 Std
0,LSTM,"{'batch_size': 5, 'epochs': 20}",0.05094,0.063304,0.128258,0.103064,0.01645,0.030887,-84.455953,85.297424


In [67]:
# Assuming time_step = 5 (your lookback window)
time_step = 5 

# 1. Get the last sequence of observations from the complete scaled data
# This represents the sequence immediately preceding the value you want to predict.
last_sequence = X_scaled[-time_step:]

# 2. Reshape the sequence to the required 3D format for the LSTM
# (Number of Samples, time_step, Number of Features)
X_predict = last_sequence.reshape(1, time_step, last_sequence.shape[1])

print(f"Prediction Input Shape: {X_predict.shape}")

Prediction Input Shape: (1, 5, 213)


In [69]:
# Use the trained model to generate the prediction (scaled output)
scaled_prediction = best_model.predict(X_predict)

# The result is a scaled value (e.g., between 0 and 1)
print(f"Scaled Prediction: {scaled_prediction[0]:.4f}")

# Reshape the single scaled prediction back to (1, 1) as required by the scaler
scaled_prediction_reshaped = scaled_prediction.reshape(-1, 1) 

# Use the target scaler (scaler_Y) to convert the scaled value back to the original units
actual_prediction = scaler_Y.inverse_transform(scaled_prediction_reshaped)

# Use the target scaler (scaler_Y) to convert the scaled value back to the original units
actual_prediction_log = scaler_Y.inverse_transform(scaled_prediction_reshaped)

# *** NEW: Inverse the log transformation (exp(x) - 1) ***
final_co2e_prediction = np.expm1(actual_prediction_log[0][0]) 

print("-" * 30)
print(f"Predicted CO2e Emission (Actual Units): {final_co2e_prediction:,.2f}")
print("-" * 30)

Scaled Prediction: -0.0663
------------------------------
Predicted CO2e Emission (Actual Units): -0.63
------------------------------


In [73]:
# Clean up
cur.close()
conn.close()