In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Define file paths for each dataset
file_paths = {
    '2015': './PUB_PriceHOEPPredispOR_2015_v1.csv',
    '2016': './PUB_PriceHOEPPredispOR_2016_v1.csv',
    '2017': './PUB_PriceHOEPPredispOR_2017_v1.csv',
    '2018': './PUB_PriceHOEPPredispOR_2018_v148.csv',
    '2019': './PUB_PriceHOEPPredispOR_2019_v395.csv',
    '2020': './PUB_PriceHOEPPredispOR_2020_v396.csv',
    '2021': './PUB_PriceHOEPPredispOR_2021_v395.csv',
    '2022': './PUB_PriceHOEPPredispOR_2022_v396.csv',
    '2023': './PUB_PriceHOEPPredispOR_2023_v393.csv'
}

# Function to load and preprocess the data
def load_and_preprocess(file_path):
    df = pd.read_csv(file_path, skiprows=2)
    # Set correct column names
    df.columns = ['Date', 'Hour', 'HOEP', 'Hour 1 Predispatch', 'Hour 2 Predispatch', 'Hour 3 Predispatch', 'OR 10 Min Sync', 'OR 10 Min non-sync', 'OR 30 Min']
    # Drop unnecessary columns
    df = df[['Date', 'Hour', 'HOEP']].reset_index(drop=True)
    # Remove rows with non-numeric 'Hour' values
    df = df[df['Hour'].apply(lambda x: str(x).isnumeric())]
    # Convert 'Hour' to integer and 'HOEP' to numeric
    df['Hour'] = df['Hour'].astype(int) - 1  # Adjusting for 0-23 hour format
    df['HOEP'] = pd.to_numeric(df['HOEP'], errors='coerce')
    # Drop rows with NaN values in 'HOEP'
    df.dropna(subset=['HOEP'], inplace=True)
    return df

# Load and concatenate the training datasets
df_list = [load_and_preprocess(file_paths[year]) for year in file_paths if year != '2023']
data_train = pd.concat(df_list)

# Preprocess the data: Filling missing values and scaling
data_train['HOEP'].fillna(method='ffill', inplace=True)

# Feature Engineering: Create a lagged feature dataset
def create_lagged_features(df, n_lags=24):
    """
    Creates lagged features from a time series data.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the 'HOEP' column.
    n_lags (int): Number of lagged features to create.
    
    Returns:
    pd.DataFrame: DataFrame containing lagged features and original features.
    """
    df_lagged = pd.DataFrame(index=df.index)
    # Create lagged features
    for lag in range(1, n_lags + 1):
        df_lagged[f'HOEP_lag_{lag}'] = df['HOEP'].shift(lag)
    df_lagged['HOEP'] = df['HOEP']  # Current HOEP to predict
    df_lagged.dropna(inplace=True)  # Drop rows with NaN values resulting from shifting
    return df_lagged

# Apply the function to create lagged features
data_train_lagged = create_lagged_features(data_train, n_lags=24)

# Normalize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_train_lagged.drop('HOEP', axis=1))
scaled_labels = scaler.fit_transform(data_train_lagged['HOEP'].values.reshape(-1, 1))

# Split the data into features and labels
X_train = scaled_features
y_train = scaled_labels.ravel()  # Flatten to 1D array

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model Design: Using a simple feed-forward network as an example
model = Sequential([
    Dense(100, activation='relu', input_dim=X_train.shape[1]),
    Dense(50, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# Preprocess the evaluation dataset
data_eval = pd.read_csv(file_paths['2023'], skiprows=2)
data_eval['HOEP'].fillna(method='ffill', inplace=True)
data_eval_lagged = create_lagged_features(data_eval, n_lags=24)

# Normalize the evaluation data using the scaler fitted on the training data
scaled_features_eval = scaler.transform(data_eval_lagged.drop('HOEP', axis=1))
scaled_labels_eval = scaler.transform(data_eval_lagged['HOEP'].values.reshape(-1, 1))

X_eval = scaled_features_eval
y_eval = scaled_labels_eval.ravel()  # Flatten to 1D array

# Evaluate the model
evaluation = model.evaluate(X_eval, y_eval, verbose=0)

print(f'Evaluation MSE: {evaluation}')


2024-01-29 15:16:02.920987: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/software/quadis/latest/quadis/build/lib:/opt/conda/lib
2024-01-29 15:16:02.921048: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


KeyError: 'HOEP'