# Grid search
Hyperparameter grid search for the MLP model used in the seasonality-adjustment methodology.

In [1]:
# Load Packages
import pandas as pd
import numpy as np
import plotly.express as px

import warnings
from typing import List
import os, sys, time

from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import keras

# Repository root relative to the current working directory.
rootpath = ".."

# Make the project's model packages importable. Each path is inserted at the
# front of sys.path, so the one inserted last (source_models) is searched first.
for _extra_path in (
    f"{os.getcwd()}/{rootpath}/base_models",
    f"{os.getcwd()}/{rootpath}/source_models",
):
    sys.path.insert(0, _extra_path)

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

import model_prep


# Look-back window length in timesteps (the duplicated assignment was removed).
step_back = 6  # window size = 6*5 = 30 mins

# Calendar month numbers (1-12) that make up each season.
season_map = {
    "spring": [3, 4, 5],
    "summer": [6, 7, 8],
    "fall": [9, 10, 11],
    "winter": [12, 1, 2],
}

2023-12-01 01:01:49.331153: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# --- Source ("from") dataset: used below to build the CSV path and pick the season ---
from_building_name = "ESB"
from_tower_number = 1
from_season = "summer"

# --- Destination ("to") dataset: not referenced by the cells visible in this notebook ---
to_building_name = "ESB"
to_tower_number = 2
to_season = "summer"

# --- Model inputs and output ---
features = [
    "FlowEvap",
    "PerHumidity",
    "TempAmbient",
    "TempCondIn",
    "TempCondOut",
    "TempEvapIn",
    "TempEvapOut",
    "TempWetBulb",
    "PerFreqConP",
    "Tonnage",
    "PerFreqFan",
]
target = "EnergyConsumption"

# --- Training configuration ---
train_percentage = 0.8  # test_size is derived as 1 - train_percentage
shuffle_seed = 42  # forwarded as random_state to train_test_split
use_delta = True  # model the target as its difference from the first value

# NOTE(review): the settings below are not read by any cell visible here;
# presumably they belong to the transfer-learning workflow - confirm before removing.
finetuning_percentage = 0.8
source_epochs = 100
finetune_epochs = 100
display_results = True

In [3]:
"""
1. Load data and do basic preprocessing
"""
# load the preprocessed CSV of the source tower, indexed by its "time" column
df = pd.read_csv(
    f"{rootpath}/data/{from_building_name.lower()}/{from_building_name.lower()}{from_tower_number}_preprocessed.csv",
    index_col="time",
)
df.index = pd.to_datetime(df.index)

# only take data for one season
df = model_prep.choose_season(df, season=from_season)

# remove rows in which the tower was OFF (target not strictly positive).
# NOTE: this does not remove ON rows whose look-back window would contain OFF rows.
on_condition = df[target] > 0
df = df.drop(df[~on_condition].index, axis=0)

# keep only the feature columns, the DayOfWeek seasonality key and the target.
# A plain column selection replaces the previous self-join (`join(..., on=df.index)`),
# which passed a key array where column/level names are expected and could add
# a synthetic key column or multiply rows on duplicate timestamps.
# `.copy()` makes the later column assignment operate on an independent frame.
df = df[features + ["DayOfWeek", target]].copy()

# remember the first target value; when use_delta is set, the target is
# expressed as the difference from that first value
first_val = df.iloc[0, df.columns.get_loc(target)]
if use_delta:
    df[target] = df[target] - first_val

In [4]:
"""
2. Seasonality removal
"""

def calculate_seasonal_index(time_series, seasonality_column, m, target_column=None):
    """
    Calculate the seasonal index for each seasonality value in the time series.

    For every distinct value of ``seasonality_column`` the index is

        sp = (1 / y_bar) * (1 / m) * sum(first m target values of that group)

    where ``y_bar`` is the mean of the target column over the whole frame.
    Only the first ``m`` rows of each group (in the frame's row order) are
    summed, and the sum is always divided by ``m``, even if the group holds
    fewer than ``m`` rows.

    Parameters:
    - time_series: Pandas DataFrame containing the target column and a column with the seasonality values.
    - seasonality_column: String representing the column name containing the seasonality values (e.g., days of the week).
    - m: Positive integer, the number of data points taken from each seasonality group.
    - target_column: Optional name of the target column. Defaults to the module-level ``target``.

    Returns:
    - Pandas DataFrame with the columns ``[seasonality_column, 'sp']``, one row per seasonality value.

    Raises:
    - ValueError: if ``m`` is not positive.
    """
    if target_column is None:
        # Backward-compatible default: fall back to the notebook-level target name.
        target_column = target
    if m <= 0:
        raise ValueError("m must be a positive integer")

    # Average of the target only; averaging the whole frame would also touch
    # unrelated (possibly non-numeric) columns.
    y_bar = time_series[target_column].mean()

    # One (seasonality value, seasonal index) pair per group.
    rows = [
        (key, 1 / y_bar * (1 / m) * members.iloc[:m][target_column].sum())
        for key, members in time_series.groupby(seasonality_column)
    ]

    return pd.DataFrame(rows, columns=[seasonality_column, 'sp'])

def operate_with_sp(col, sp_df, operation):
    """
    Multiply or divide a series by the seasonal index of each row's weekday.

    Parameters:
    - col: Pandas Series with a DatetimeIndex; the weekday of each row is taken from ``col.index.dayofweek``.
    - sp_df: Pandas DataFrame with a 'DayOfWeek' column (unique values) and an 'sp' column holding the seasonal index.
    - operation: Either 'multiply' or 'divide'.

    Returns:
    - Pandas Series with the same index and name as ``col``. Rows whose weekday
      is missing from ``sp_df`` become NaN.

    Raises:
    - ValueError: if ``operation`` is neither 'multiply' nor 'divide'.
    """
    # Validate up front so a bad argument fails before any work is done.
    if operation not in ('multiply', 'divide'):
        raise ValueError('Invalid operation')

    # Seasonal index per weekday, expanded to one value per row of `col`.
    sp_by_day = sp_df.set_index('DayOfWeek')['sp']
    sp_values = sp_by_day.reindex(col.index.dayofweek).to_numpy()

    # Plain array arithmetic keeps the index and name of `col` and also works
    # for an unnamed Series.
    if operation == 'multiply':
        return col * sp_values
    return col / sp_values

In [5]:
# Seasonality removal: build one seasonal index per DayOfWeek value (summing
# at most the first 7 rows of each group), then divide the target by the
# index matching each row's weekday.
sdf = calculate_seasonal_index(time_series=df, seasonality_column='DayOfWeek', m=7)
df[target] = operate_with_sp(col=df[target], sp_df=sdf, operation='divide')

In [6]:
"""
3. Split data into training and testing sets
"""

# Discard rows containing NaN (original note: the first value is NaN due to a
# zero division), then separate the feature matrix from the target column.
df = df.dropna()
X, y = df[features], df[target]

# Hold-out split; shuffle=False keeps the rows in their existing order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=(1 - train_percentage),
    shuffle=False,
    random_state=shuffle_seed,
)

# Fit the min-max scaler on the training features only, then apply the same
# transformation to both partitions.
scaler = MinMaxScaler().fit(X_train)
for feature_frame in (X_train, X_test):
    feature_frame[feature_frame.columns] = scaler.transform(feature_frame)

# Plain array views of the scaled features and of the targets.
vec_X_train, vec_X_test = X_train.values, X_test.values
vec_y_train, vec_y_test = y_train.values, y_test.values

In [10]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold

# Function to create the Keras model
def create_model(units=20):
    """
    Build and compile a single-hidden-layer MLP regressor.

    Parameters:
    - units: Number of neurons in the hidden (ReLU) layer.

    Returns:
    - A compiled Keras Sequential model with one input per entry in the
      module-level ``features`` list and a single linear output, trained with
      the Adam optimizer on mean squared error.
    """
    hidden_layer = Dense(
        units,
        input_shape=(len(features),),
        kernel_initializer='normal',
        activation='relu',
    )
    output_layer = Dense(units=1, activation='linear')

    network = Sequential([hidden_layer, output_layer])
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

# Adapt the Keras model builder to the scikit-learn estimator interface.
# The epochs/batch_size given here are overridden by the values in param_grid.
keras_regressor = KerasRegressor(
    build_fn=create_model,
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate hyperparameter values: hidden-layer width, batch size and epochs.
param_grid = dict(
    units=[30, 80],
    batch_size=[10],
    epochs=[200],
)

# Exhaustive search with 3-fold cross-validation, scored by negated MAE.
grid = GridSearchCV(
    estimator=keras_regressor,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=3),
)
grid_result = grid.fit(X_train, y_train)

# Report the best parameters and the corresponding MAE (the score is a
# negated MAE, so the sign is flipped back for display).
print("Best parameters found: ", grid_result.best_params_)
print("Best MAE found: ", -grid_result.best_score_)


Best parameters found:  {'batch_size': 10, 'epochs': 200, 'units': 80}
Best MAE found:  71.07040794588964


Best parameters found:  {'batch_size': 10, 'epochs': 200, 'units': 30}
Best MAE found:  70.4610976722637

Best parameters found:  {'batch_size': 10, 'epochs': 200, 'units': 80}
Best MAE found:  70.49244807956524

Best parameters found:  {'batch_size': 10, 'epochs': 200, 'units': 80}
Best MAE found:  71.07040794588964