In [1]:
import pandas as pd
import numpy as np
from DB_Manager import DatabaseManager
import plotly.express as px
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

def smape(y_true, y_pred):
    """Calculate Symmetric Mean Absolute Percentage Error."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return 100 * np.mean(np.abs((y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred) + 1e-10)))

In [11]:
db = DatabaseManager()
feeder = 1
actuals = db.fetch_data(feeder, "2024-05-01", "2024-07-31")[['Net_Load_Demand']]
actuals.index = actuals.index.tz_convert('UTC').tz_localize(None)

forecasts = pd.read_csv(f'forecast_output_feeder_{feeder}_24hr_rls.csv', index_col=0, parse_dates=True)
forecasts.index = forecasts.index.tz_convert('UTC').tz_localize(None)
forecasts.columns = ['Forecast', 'Actual']

forecasts_base = pd.read_csv(f'forecast_output_feeder_{feeder}_24hr_base.csv', index_col=0, parse_dates=True)
forecasts_base.index = forecasts_base.index.tz_convert('UTC').tz_localize(None)


results = pd.concat([actuals, forecasts_base, forecasts], axis=1)
results.columns = ['Actual',  'Forecast_Base', 'Forecast_RLS', 'Actual_RLS']
results = results.dropna()
results

Fetching data for Feeder 1 from 2024-05-01 to 2024-07-31...
Fetched 2184 records.


Unnamed: 0_level_0,Actual,Forecast_Base,Forecast_RLS,Actual_RLS
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-05-02 00:00:00,6349.938000,6459.9100,2820.1514,6349.938000
2024-05-02 01:00:00,5965.801667,6015.1787,4114.8610,5965.801667
2024-05-02 02:00:00,5672.610000,5734.1530,2441.4404,5672.610000
2024-05-02 03:00:00,5510.938000,5545.8047,3391.5027,5510.938000
2024-05-02 04:00:00,5384.245714,5480.7617,2954.9001,5384.245714
...,...,...,...,...
2024-07-31 19:00:00,7575.710000,6940.2500,7331.1357,7575.710000
2024-07-31 20:00:00,7996.772222,7194.3560,7997.0312,7996.772222
2024-07-31 21:00:00,7879.154444,7016.4250,7900.1840,7879.154444
2024-07-31 22:00:00,7580.424444,6824.7803,7614.5957,7580.424444


In [12]:
results_copy = results.copy()
results_copy = results_copy['2024-06-01':'2024-07-30']
px.line(results_copy).show()

print("RLS Forecast RMSE:", root_mean_squared_error(results_copy['Actual'], results_copy['Forecast_RLS']))
print("RLS Forecast MAE:", mean_absolute_error(results_copy['Actual'], results_copy['Forecast_RLS']))
print("RLS Forecast sMAPE:", smape(results_copy['Actual'], results_copy['Forecast_RLS']))
print("Base Forecast RMSE:", root_mean_squared_error(results_copy['Actual'], results_copy['Forecast_Base']))
print("Base Forecast MAE:", mean_absolute_error(results_copy['Actual'], results_copy['Forecast_Base']))
print("Base Forecast sMAPE:", smape(results_copy['Actual'], results_copy['Forecast_Base']))

RLS Forecast RMSE: 1077.6672645391438
RLS Forecast MAE: 613.1343174632758
RLS Forecast sMAPE: 18.330416132387278
Base Forecast RMSE: 1115.4587572538328
Base Forecast MAE: 717.0369595531367
Base Forecast sMAPE: 19.73394001509051


Testing_9
RLS Forecast RMSE: 1100.3003324434687
RLS Forecast MAE: 642.515822425072
RLS Forecast sMAPE: 19.95600926373078
Base Forecast RMSE: 1014.1402633873743
Base Forecast MAE: 706.9873865710803
Base Forecast sMAPE: 20.68236121451009

Testing_10 - ANN neurons increased to 128 and 64
RLS Forecast RMSE: 1027.2848396697655
RLS Forecast MAE: 586.9513832443973
RLS Forecast sMAPE: 18.61108375960213
Base Forecast RMSE: 1021.1841482079919
Base Forecast MAE: 708.7904292605953
Base Forecast sMAPE: 20.644239333938977

Testing_11 - batch_size decreased to 8
RLS Forecast RMSE: 1043.5002528877308
RLS Forecast MAE: 578.9244920672354
RLS Forecast sMAPE: 17.923997935802642
Base Forecast RMSE: 969.2387468434622
Base Forecast MAE: 686.6931712615536
Base Forecast sMAPE: 20.251243705978496

Testing_12 - Increased neurons in second dense layer from 64 to 128
RLS Forecast RMSE: 949.6398911788949
RLS Forecast MAE: 525.1560414026321
RLS Forecast sMAPE: 16.687866151323135
Base Forecast RMSE: 929.1286609780144
Base Forecast MAE: 623.4729768075932
Base Forecast sMAPE: 18.69363673055586


Testing_13 - Increased neurons in first dense layer from 128 to 256
RLS Forecast RMSE: 1005.8519452789614
RLS Forecast MAE: 519.1379662079645
RLS Forecast sMAPE: 15.852592109095411
Base Forecast RMSE: 891.4653670430395
Base Forecast MAE: 557.4931437959266
Base Forecast sMAPE: 17.104895952292097

Testing_14 - Changed batch_size to 4
RLS Forecast RMSE: 963.132608102938
RLS Forecast MAE: 508.3463454685732
RLS Forecast sMAPE: 15.820239713182897
Base Forecast RMSE: 862.7627771994034
Base Forecast MAE: 493.8173296337357
Base Forecast sMAPE: 15.841644938695648


v1.4.7_LSTM_Testing_1 - Changed to LSTM 16 neurons, Dense 16 neurons
RLS Forecast RMSE: 1036.637368994676
RLS Forecast MAE: 601.4083776170836
RLS Forecast sMAPE: 18.431522995485203
Base Forecast RMSE: 978.3244212919634
Base Forecast MAE: 658.610672624832
Base Forecast sMAPE: 19.33282630522247

v1.4.7_LSTM_Testing_2 - Changed to LSTM 128 neurons, Dense 64 neurons
RLS Forecast RMSE: 1075.3076392136077
RLS Forecast MAE: 600.7509811917228
RLS Forecast sMAPE: 17.735635468856497
Base Forecast RMSE: 1011.6125614405108
Base Forecast MAE: 665.3823876594329
Base Forecast sMAPE: 20.525080395531592

v1.4.7_LSTM_Testing_3 - Changed to LSTM 128 neurons, Dense 256 neurons, flatten layer, dense 64 neurons
RLS Forecast RMSE: 1042.1164257050896
RLS Forecast MAE: 510.97156368997685
RLS Forecast sMAPE: 14.684968218185338
Base Forecast RMSE: 915.7394050242486
Base Forecast MAE: 577.7100891146943
Base Forecast sMAPE: 17.46968602562478

In [117]:
from sklearn.ensemble import IsolationForest

actuals_pivot = actuals.pivot_table(index=actuals.index.date, columns=actuals.index.hour, values='Net_Load_Demand')
outlier_model = IsolationForest(contamination=0.01, random_state=42)
outlier_model.fit(actuals_pivot)
outlier_scores = outlier_model.decision_function(actuals_pivot)
outlier_labels = outlier_model.predict(actuals_pivot)

actuals_pivot['Outlier'] = outlier_labels
actuals_pivot['Outlier_Score'] = outlier_scores
actuals_pivot[actuals_pivot['Outlier'] == -1]

Timestamp,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,Outlier,Outlier_Score
2024-04-13,6591.444286,6279.175,6009.375714,5742.137143,5622.621667,5507.73,5550.345556,4940.337273,2797.48475,2379.6305,...,8038.869167,8584.94,10923.825,10954.025714,10984.226429,11014.427143,11044.627857,11074.828571,-1,-0.174698
2024-04-14,10771.325,10430.2,10001.323333,9730.351818,9409.713333,9380.646,9560.076364,9127.493333,6204.216667,4209.043333,...,5661.735833,5735.295833,5618.354167,7076.61,7613.105455,7563.203636,7224.3,6825.938889,-1,-0.150367


In [113]:
def drop_outlier_days(df):
    # --- Outlier Day Removal ---
    net_load = df[["Net_Load_Demand"]].copy()
    net_load["date"] = net_load.index.date
    daily_stats = net_load.groupby("date").agg(["mean", "std"])
    daily_mean = daily_stats["Net_Load_Demand"]["mean"]
    daily_std = daily_stats["Net_Load_Demand"]["std"]

    daily_mean.index = pd.to_datetime(daily_mean.index)
    daily_std.index = pd.to_datetime(daily_std.index)

    # Define cutoffs
    lower_cutoff = daily_mean.quantile(0.01)
    upper_cutoff = daily_mean.quantile(0.99)

    # Define outlier condition (e.g., high daily variance)
    outlier_dates = daily_mean[(daily_mean <= lower_cutoff) & (daily_mean >= upper_cutoff)].index

    print("Daily mean and std deviation:")
    print(daily_std.loc["2024-06-10":"2024-06-20"])
    print(daily_mean.loc["2024-06-10":"2024-06-20"])
    # print(daily_mean)
    # print(daily_std)
    print(lower_cutoff)
    print(upper_cutoff)
    print(daily_mean[daily_mean <= lower_cutoff])
    print(daily_mean[daily_mean >= upper_cutoff])

    print(f"Outlier dates identified: {outlier_dates}")
    
drop_outlier_days(actuals)

Daily mean and std deviation:
date
2024-06-10     617.424566
2024-06-11     918.902664
2024-06-12     878.945460
2024-06-13     914.089899
2024-06-14    1015.946841
2024-06-15    1698.409746
2024-06-16    1972.691571
2024-06-17     987.823968
2024-06-18     512.691132
2024-06-19     949.023343
2024-06-20    1011.954888
Name: std, dtype: float64
date
2024-06-10    3194.817308
2024-06-11    3050.031881
2024-06-12    3066.484606
2024-06-13    3027.385781
2024-06-14    2854.945731
2024-06-15    2020.710786
2024-06-16    1643.923077
2024-06-17    3198.481061
2024-06-18    3397.482105
2024-06-19    2932.833163
2024-06-20    2922.449665
Name: mean, dtype: float64
1355.8575900835272
3425.423322649574
date
2024-03-31    1029.480745
2024-04-01    1293.047737
Name: mean, dtype: float64
date
2024-05-23    3530.535524
2024-06-04    3641.322382
Name: mean, dtype: float64
Outlier dates identified: DatetimeIndex([], dtype='datetime64[ns]', name='date', freq=None)


In [4]:
db = DatabaseManager()
start_date = "2024-01-01"
end_date = "2024-05-31"

feeder_id = 1
scenario_type = '24hr'

combined_data = db.fetch_data(feeder_id, start_date, end_date)
combined_data.index = combined_data.index.tz_convert('UTC').tz_localize(None)


features = combined_data.copy().drop(columns=['Net_Load_Demand'])
target = combined_data.copy()[['Net_Load_Demand']]

feeder_stats = combined_data.describe().drop(columns=['Feeder_ID']).T
feeder_stats.reset_index(inplace=True)
feeder_stats.columns = ['feature_name', 'count', 'mean', 'std', 'min', '25th_percentile', '50th_percentile', '75th_percentile', 'max']
feeder_stats

Fetching data for Feeder 1 from 2024-01-01 to 2024-05-31...
Fetched 3600 records.


Unnamed: 0,feature_name,count,mean,std,min,25th_percentile,50th_percentile,75th_percentile,max
0,Net_Load_Demand,3600.0,4229.116376,2927.53877,-4113.889167,1973.728952,5285.972917,6333.887813,11074.828571
1,Temperature_Historic,3600.0,26.836097,2.049195,22.3025,25.2025,26.4525,28.4025,32.4025
2,Temperature_Forecast,3600.0,27.092013,1.639556,24.0275,25.7775,26.8275,28.3275,31.7275
3,Shortwave_Radiation_Historic,3600.0,250.857222,321.754285,0.0,0.0,10.0,530.25,1026.0
4,Shortwave_Radiation_Forecast,3600.0,252.5625,321.552756,0.0,0.0,12.5,544.25,987.0
5,Cloud_Cover_Historic,3600.0,50.476667,33.677738,0.0,20.0,43.0,88.0,100.0
6,Cloud_Cover_Forecast,3600.0,56.116389,23.382977,0.0,41.0,53.0,72.0,100.0


In [None]:
db.save_feeder_stats(feeder_id, scenario_type, start_date, end_date, feeder_stats)

Saved 7 feature statistics to metadata.feeder_stats.


TypeError: 'APIResponse[~_ReturnT]' object is not subscriptable

In [139]:
features

Unnamed: 0_level_0,Temperature_Historic,Temperature_Forecast,Shortwave_Radiation_Historic,Shortwave_Radiation_Forecast,Cloud_Cover_Historic,Cloud_Cover_Forecast
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-05-01 00:00:00,26.2525,26.177500,0,0,6,35
2024-05-01 01:00:00,26.1525,25.977499,0,0,30,33
2024-05-01 02:00:00,25.6025,26.077500,0,0,9,42
2024-05-01 03:00:00,24.8525,25.977499,0,0,10,42
2024-05-01 04:00:00,24.9525,25.927500,0,0,10,36
...,...,...,...,...,...,...
2024-07-30 19:00:00,27.3525,28.227499,6,4,16,70
2024-07-30 20:00:00,26.8525,27.877499,0,0,5,18
2024-07-30 21:00:00,26.0525,27.677500,0,0,4,11
2024-07-30 22:00:00,25.9525,27.377499,0,0,7,0


In [55]:
feeder_id = 2
start_date = "2024-07-01"
end_date = "2024-07-31"
db = DatabaseManager(tag = 'main')
results = db.load_forecasts(feeder_id, "v1.6_Fresh_Testing_6", "24hr", "LSTM", start_date, end_date)[['actual_value', 'forecast_value']]
results.rename(columns={'forecast_value': 'forecast_value'}, inplace=True)
hp_results = db.load_forecasts(feeder_id, "v1.7_HP_Tuning_1", "24hr", "LSTM", start_date, end_date, "exp_HP")[['forecast_value']]
hp_results.rename(columns={'forecast_value': 'forecast_value_hp'}, inplace=True)

combined_results = pd.concat([results, hp_results], axis=1)

print("Shallow Forecast RMSE:", root_mean_squared_error(combined_results['actual_value'], combined_results['forecast_value']))
print("HP Forecast RMSE:", root_mean_squared_error(combined_results['actual_value'], combined_results['forecast_value_hp']))


print("Shallow Forecast MAE:", mean_absolute_error(combined_results['actual_value'], combined_results['forecast_value']))
print("HP Forecast MAE:", mean_absolute_error(combined_results['actual_value'], combined_results['forecast_value_hp']))

print("Shallow Forecast SMAPE:", smape(combined_results['actual_value'], combined_results['forecast_value']))
print("HP Forecast SMAPE:", smape(combined_results['actual_value'], combined_results['forecast_value_hp']))

px.line(combined_results).show()

Loaded 720 forecast entries with tag 'main'.
Loaded 720 forecast entries with tag 'exp_HP'.
Shallow Forecast RMSE: 160.99420186436393
HP Forecast RMSE: 163.76676060190724
Shallow Forecast MAE: 88.08137841360895
HP Forecast MAE: 92.740641135564
Shallow Forecast SMAPE: 5.435969072998445
HP Forecast SMAPE: 5.723966656994094


In [34]:
combined_results[combined_results.isna().any(axis=1)]

Unnamed: 0_level_0,actual_value,forecast_value,forecast_value_hp
target_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-05-02 00:00:00+00:00,6349.938000,2885.969727,
2024-05-02 01:00:00+00:00,5965.801667,3499.391602,
2024-05-02 02:00:00+00:00,5672.610000,5212.417480,
2024-05-02 03:00:00+00:00,5510.938000,3969.153076,
2024-05-02 04:00:00+00:00,5384.245714,4308.634277,
...,...,...,...
2024-05-20 19:00:00+00:00,7090.002222,7343.163086,
2024-05-20 20:00:00+00:00,7638.687500,7738.245605,
2024-05-20 21:00:00+00:00,7609.742222,7947.754883,
2024-05-20 22:00:00+00:00,7235.120000,7337.399414,


In [20]:
# from datetime import timedelta
from datetime import timedelta, datetime
datetime.strptime(start_date, "%Y-%m-%d") + timedelta(days=1)

datetime.datetime(2024, 5, 2, 0, 0)