In [45]:
# Import packages and modules
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
df  = pd.read_csv("../Data/dataset_dk3619_preprocessed_v1.csv")

In [47]:
df.shape

(3323747, 44)

In [48]:
print(df['DK3619Code'])
dkcode_list = list(set(df['DK3619Code'].to_list()))
dkcode_list.sort()
id2dkcode = enumerate(dkcode_list)
id2dkcode = dict(id2dkcode)

dkcode2id = {}
for i in id2dkcode.items():
    dkcode2id[i[1]] = i[0]
print(dict(dkcode2id))

### Mapping data to numberic
# df['DK3619Code'] = df['DK3619Code'].map(dkcode2id)

0                M_MA
1          C_CE_CF_CD
2                 G_G
3                Q_QA
4                 P_P
              ...    
3323742           E_E
3323743           F_F
3323744           I_I
3323745           H_H
3323746           A_A
Name: DK3619Code, Length: 3323747, dtype: object
{'A_A': 0, 'C_CA': 1, 'C_CC': 2, 'C_CE_CF_CD': 3, 'C_CG': 4, 'C_CH': 5, 'C_CI': 6, 'C_CJ': 7, 'C_CK': 8, 'C_CL': 9, 'C_CM_CB': 10, 'D_D': 11, 'E_E': 12, 'F_F': 13, 'G_G': 14, 'H_H': 15, 'I_I': 16, 'J_JA': 17, 'J_JB_JC': 18, 'K_K': 19, 'L_L': 20, 'M_MA': 21, 'M_MB': 22, 'M_MC': 23, 'N_N': 24, 'O_O': 25, 'PR_PR': 26, 'P_P': 27, 'Q_QA': 28, 'Q_QB': 29, 'R_R': 30, 'S_S': 31}


### Choose Feauture will use for Model

In [49]:
columns = ['HourUTC', 'HourDK', 'hour', 'day_of_week',
              'weekday_name', 'quarter', 'month', 'year', 'day_of_year',
              'day_of_month', 'week_of_year', 'season', 'holiday', 'Consumption_MWh']


In [50]:
df.columns

Index(['HourUTC', 'HourDK', 'DK3619Code', 'hour', 'day_of_week',
       'weekday_name', 'quarter', 'month', 'year', 'day_of_year',
       'day_of_month', 'week_of_year', 'season', 'holiday', 'DK36Code',
       'DK36Title', 'DK19Code', 'DK19Title', 'Consumption_MWh', 'lag_24',
       'lag_25', 'lag_26', 'lag_27', 'lag_28', 'lag_29', 'lag_30', 'lag_31',
       'lag_32', 'lag_33', 'lag_34', 'lag_35', 'lag_36', 'lag_37', 'lag_38',
       'lag_39', 'lag_40', 'lag_41', 'lag_42', 'lag_43', 'lag_44', 'lag_45',
       'lag_46', 'lag_47', 'lag_48'],
      dtype='object')

In [9]:
# print(df['weekday_name'])
# weekday_list = list(set(df['weekday_name'].to_list()))
# weekday_list.sort()
# id2day = enumerate(weekday_list)
# id2day = dict(id2day)

# day2id = {}
# for i in id2day.items():
#     day2id[i[1]] = i[0]
# print(dict(day2id))

### Mapping data to numberic
# df['weekday_name'] = df['weekday_name'].map(day2id)

In [51]:
weekday_mapping = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}
### Mapping data to numberic
df['weekday_name'] = df['weekday_name'].map(weekday_mapping)

In [11]:
# print(df['season'])
# season_list = list(set(df['season'].to_list()))
# season_list.sort()
# id2season = enumerate(season_list)
# id2season = dict(id2season)

# season2id = {}
# for i in id2season.items():
#     season2id[i[1]] = i[0]
# print(dict(season2id))

### Season mapping
# df['season'] = df['season'].map(season2id)


In [62]:

### Season mapping

seasons_mapping = {
    'Spring': 0,
    'Summer': 1,
    'Autumn': 2,
    'Winter': 3
}

df['season'] = df['season'].map(seasons_mapping)

In [63]:
df['day_of_month'] = df['day_of_month'].astype('int')

In [64]:
df.head()

Unnamed: 0,HourUTC,HourDK,DK3619Code,hour,day_of_week,weekday_name,quarter,month,year,day_of_year,...,lag_39,lag_40,lag_41,lag_42,lag_43,lag_44,lag_45,lag_46,lag_47,lag_48
0,2012-06-01T23:00:00,2012-06-02 01:00:00,M_MA,1,5,5,2,6,2012,154,...,,,,,,,,,,
1,2012-06-01T23:00:00,2012-06-02 01:00:00,C_CE_CF_CD,1,5,5,2,6,2012,154,...,,,,,,,,,,
2,2012-06-02T00:00:00,2012-06-02 02:00:00,G_G,2,5,5,2,6,2012,154,...,,,,,,,,,,
3,2012-06-02T00:00:00,2012-06-02 02:00:00,Q_QA,2,5,5,2,6,2012,154,...,,,,,,,,,,
4,2012-06-02T00:00:00,2012-06-02 02:00:00,P_P,2,5,5,2,6,2012,154,...,,,,,,,,,,


### Split data

In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,GRU, Dense, Bidirectional
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.layers import TimeDistributed, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

import os.path


In [56]:
# Create sequences
def create_sequences(df, seq_length):
    sequences = []
    labels = []
    for i in range(len(df) - seq_length):
        seq = df.iloc[i:i+seq_length].values
        label = df['Consumption_MWh'].iloc[i+seq_length]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

In [65]:
df_filter = df[df['DK3619Code'] == 'M_MA']
df_corr = df_filter[columns]

df_corr['HourUTC'] = pd.to_datetime(df_corr['HourUTC'])
df_corr['HourDK'] = pd.to_datetime(df_corr['HourDK'])

df_corr = df_corr.drop(['HourUTC', 'HourDK'], axis=1)

# Normalize the data
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df_corr[['season', 'year', 'month', 'day_of_month', 'hour', 'Consumption_MWh']])

scaled_df = pd.DataFrame(scaled_df, columns=['season', 'year', 'month', 'day_of_month', 'hour', 'Consumption_MWh'])

#create sequence for training
SEQ_LENGTH = 24  # for 24 hours sequence
sequences, labels = create_sequences(scaled_df, SEQ_LENGTH)

# Split the data into training (60%), validation (20%), and test (20%) sets
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, shuffle=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corr['HourUTC'] = pd.to_datetime(df_corr['HourUTC'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corr['HourDK'] = pd.to_datetime(df_corr['HourDK'])


In [66]:
X_test

array([[[1.        , 0.83333333, 0.        , 0.7       , 0.56521739,
         0.41874931],
        [1.        , 0.83333333, 0.        , 0.7       , 0.60869565,
         0.41349038],
        [1.        , 0.83333333, 0.        , 0.7       , 0.65217391,
         0.40803718],
        ...,
        [1.        , 0.83333333, 0.        , 0.73333333, 0.43478261,
         0.40509322],
        [1.        , 0.83333333, 0.        , 0.73333333, 0.47826087,
         0.41143037],
        [1.        , 0.83333333, 0.        , 0.73333333, 0.52173913,
         0.41164813]],

       [[1.        , 0.83333333, 0.        , 0.7       , 0.60869565,
         0.41349038],
        [1.        , 0.83333333, 0.        , 0.7       , 0.65217391,
         0.40803718],
        [1.        , 0.83333333, 0.        , 0.7       , 0.69565217,
         0.41717204],
        ...,
        [1.        , 0.83333333, 0.        , 0.73333333, 0.47826087,
         0.41143037],
        [1.        , 0.83333333, 0.        , 0.73333333, 0.521

In [67]:
y_test

array([0.41240995, 0.40975682, 0.40826677, ..., 0.40813699, 0.43167281,
       0.47514127])

In [68]:
X_train

array([[[0.33333333, 0.        , 0.45454545, 0.03333333, 0.04347826,
         0.        ],
        [0.33333333, 0.        , 0.45454545, 0.03333333, 0.08695652,
         0.15203586],
        [0.33333333, 0.        , 0.45454545, 0.03333333, 0.13043478,
         0.15577927],
        ...,
        [0.33333333, 0.        , 0.45454545, 0.03333333, 0.95652174,
         0.15670099],
        [0.33333333, 0.        , 0.45454545, 0.03333333, 1.        ,
         0.15276802],
        [0.33333333, 0.        , 0.45454545, 0.06666667, 0.        ,
         0.14878711]],

       [[0.33333333, 0.        , 0.45454545, 0.03333333, 0.08695652,
         0.15203586],
        [0.33333333, 0.        , 0.45454545, 0.03333333, 0.13043478,
         0.15577927],
        [0.33333333, 0.        , 0.45454545, 0.03333333, 0.17391304,
         0.15528827],
        ...,
        [0.33333333, 0.        , 0.45454545, 0.03333333, 1.        ,
         0.15276802],
        [0.33333333, 0.        , 0.45454545, 0.06666667, 0.   

In [70]:
#all_dkcode_trainings
if not os.path.exists("../Checkpoints/"):
    os.makedirs("../Checkpoints/")

for i in dkcode_list:
    if os.path.exists("../Checkpoints/" + i + "_LSTMmodel.h5"):
        continue
    print('Training LSTM model with DKCode: ', i)
    df_filter = df[df['DK3619Code'] == i]
    df_corr = df_filter[columns]

    df_corr['HourUTC'] = pd.to_datetime(df_corr['HourUTC'])
    df_corr['HourDK'] = pd.to_datetime(df_corr['HourDK'])

    df_corr = df_corr.drop(['HourUTC', 'HourDK'], axis=1)

    # Normalize the data
    scaler = MinMaxScaler()
    scaled_df = scaler.fit_transform(df_corr[['season', 'year', 'month', 'hour', 'Consumption_MWh']])

    scaled_df = pd.DataFrame(scaled_df, columns=['season', 'year', 'month', 'hour', 'Consumption_MWh'])

    #create sequence for training
    SEQ_LENGTH = 24  # for 24 hours sequence
    sequences, labels = create_sequences(scaled_df, SEQ_LENGTH)

    # Split the data into training (60%), validation (20%), and test (20%) sets
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, shuffle=False)

    #LSTM modeling
    model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(SEQ_LENGTH, X_train.shape[2])),
    Dropout(0.2),
    LSTM(50),
    Dropout(0.2),
    Dense(1)
])

    model.compile(optimizer='adam', loss='mean_squared_error')

    # Callbacks for early stopping and learning rate reduction
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

    # Model Training
    history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping, reduce_lr])

    model.save("../Checkpoints/" + i + "_LSTMmodel.h5")
    model = tf.keras.models.load_model("../Checkpoints/" + i + "_LSTMmodel.h5")


    # Model Evaluation
    predictions = model.predict(X_test)

    # Calculate evaluation metrics
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    print(f'R² Score: {r2}')
    print(f'Mean Absolute Error (MAE): {mae}')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'Root Mean Squared Error (RMSE): {rmse}')

Training LSTM model with DKCode:  A_A


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corr['HourUTC'] = pd.to_datetime(df_corr['HourUTC'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_corr['HourDK'] = pd.to_datetime(df_corr['HourDK'])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 