In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.utils.multiclass import type_of_target

import numpy as np
import pandas as pd
import altair as alt
import os  # <-- NEW: Import the os module

In [4]:
data = pd.read_csv("emissions_data.csv")
m_fac = pd.read_csv("material_factors.csv")

data['mat'] = data['material'].map({"Steel":1,"Aluminum":2,"Plastic":3})

print(data)
print(m_fac)

     month  year material  kg_used  emissions_kgCO2e  mat
0        1  2022    Steel      512            1189.6    1
1        2  2022    Steel      478            1107.4    1
2        3  2022    Steel      534            1254.2    1
3        4  2022    Steel      601            1392.3    1
4        5  2022    Steel      642            1493.9    1
..     ...   ...      ...      ...               ...  ...
103      8  2024  Plastic      780            1168.2    3
104      9  2024  Plastic      749            1119.7    3
105     10  2024  Plastic      719            1079.1    3
106     11  2024  Plastic      686            1030.6    3
107     12  2024  Plastic      659             990.5    3

[108 rows x 6 columns]
   material  emission_factor_kgCO2e_per_kg
0     Steel                            2.3
1  Aluminum                            8.1
2   Plastic                            1.5
3  Concrete                            0.9
4    Copper                            3.8


In [6]:
time_step = len(data)
TARGET_VAR = "emissions_kgCO2e"

X = np.array(data[['month', 'year', 'mat', 'kg_used']].copy())
X_scaled = MinMaxScaler(feature_range = (0,1)).fit_transform(X.reshape(-1, 1))
Y = data.loc[:, TARGET_VAR:TARGET_VAR]

def create_sequences(data, time_steps):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps), 0])
        y.append(data[i + time_steps, 0])
    return np.array(X), np.array(y)

X1, y1 = create_sequences(X_scaled, time_step)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [16]:
def create_model(activation='linear',
                 batch_size = 1,
                 dropout_rate = 0.02):

    model = Sequential()
    model.add(LSTM(units = 32, activation=activation, input_shape=(X_scaled.shape)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss = 'mean_squared_error', optimizer=optimizer)

    return model


estimator=KerasRegressor(build_fn=create_model, verbose=0, epochs = 10)

param_grid = {
    'batch_size':   [20, 40, 60, 80],
    'epochs': [10, 15, 20]
}


In [20]:
X_train.shape

(259, 108)

In [22]:
scoring = {
    "MSE": make_scorer(mean_squared_error, greater_is_better=False),
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "R2": make_scorer(r2_score)
}

grid_search = GridSearchCV(estimator, param_grid, cv = 5, scoring = scoring, refit = "R2", n_jobs = -1, verbose = True)
grid_search.fit(X_train, y_train, validation_data=(X_test, y_test))

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  X, y = self._initialize(X, y)
  super().__init__(**kwargs)


In [24]:
r2_scores = cross_val_score(best_model, X1, y1, cv = 5, scoring = "r2")
mse_scores = -cross_val_score(best_model, X1, y1, cv = 5, scoring = "neg_mean_squared_error")
mae_scores = -cross_val_score(best_model, X1, y1, cv = 5, scoring = "neg_mean_absolute_error")

results_list = []

  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)




  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)
  X, y = self._initialize(X, y)
  super().__init__(**kwargs)


In [26]:
results_list.append({
    'Model': 'LSTM',
    'Best Params': best_params,
    'MAE': np.mean(mae_scores),
    'MAE Std': np.std(mae_scores),
    'RMSE': np.sqrt(np.mean(mse_scores)),
    'RMSE Std': np.std(np.sqrt(mse_scores)),
    'MSE': np.mean(mse_scores),
    'MSE Std': np.std(mse_scores),
    'R2': np.mean(r2_scores),
    'R2 Std': np.std(r2_scores)
        })

In [28]:
# Convert results to a DataFrame
df_results = pd.DataFrame(results_list)
df_results.head()

Unnamed: 0,Model,Best Params,MAE,MAE Std,RMSE,RMSE Std,MSE,MSE Std,R2,R2 Std
0,LSTM,"{'batch_size': 20, 'epochs': 10}",0.270265,0.014369,0.352071,0.020551,0.123954,0.01428,-4994.608824,9989.584944
