In [1]:
from src.component.data_info import *
from src.paths import *
import pandas as pd

In [2]:
#1-load electricity data for a year
start_date = datetime(2024, 1, 1)  # Replace with your desired start date
end_date = datetime(2025, 1, 7)    # Replace with your desired end date


electricity_data=load_daily_electricity_data(start_date,end_date)

Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-01.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-02.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-03.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-04.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-05.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-06.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-07.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-08.json
Loading file D:\Electricity_demand_predictor2\data\raw\electricity_raw_data\hourly_demand_2024-01-09.json
Loading file D:\Electricity_demand_predictor2\

In [3]:
electricity_data

Unnamed: 0,date,sub_region_code,demand
0,2024-01-02 00:00:00+00:00,53,9276
1,2024-01-02 00:00:00+00:00,59,8985
2,2024-01-02 00:00:00+00:00,61,1504
3,2024-01-02 00:00:00+00:00,67,55
4,2024-01-02 00:00:00+00:00,21,11427
...,...,...,...
771461,2025-01-07 00:00:00+00:00,64,525
771462,2025-01-07 00:00:00+00:00,65,4073
771463,2025-01-07 00:00:00+00:00,68,4587
771464,2025-01-07 00:00:00+00:00,70,1457


In [5]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
def fetch_weather_data(start_date, end_date):
	start_date = pd.to_datetime(start_date,  format="YYYY-MM-DD")
	end_date = pd.to_datetime(end_date,  format="YYYY-MM-DD")

	url = "https://archive-api.open-meteo.com/v1/archive"
	params = {
	"latitude": 52.52,
	"longitude": 13.41,
	"start_date": start_date,
	"end_date": end_date,
	"hourly": ["temperature_2m", "weather_code"],
	"timeformat": "unixtime",
	"timezone": "America/New_York"}
	responses = openmeteo.weather_api(url, params=params)

   # Process first location. Add a for-loop for multiple locations or weather models
	response = responses[0]
	print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
	print(f"Elevation {response.Elevation()} m asl")
	print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
	print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    # Process hourly data. The order of variables needs to be the same as requested.
	hourly = response.Hourly()
	hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
	hourly_weather_code = hourly.Variables(1).ValuesAsNumpy()

	hourly_data = {"date": pd.date_range(
		start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
		end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
		freq = pd.Timedelta(seconds = hourly.Interval()),
		inclusive = "left"
	)}

	hourly_data["temperature_2m"] = hourly_temperature_2m

	hourly_dataframe = pd.DataFrame(data = hourly_data)
	return hourly_dataframe

In [6]:
#read weather data
weather_data=fetch_weather_data(start_date,end_date)
weather_data

OpenMeteoRequestsError: {'error': True, 'reason': "Invalid date format. Make sure to use 'YYYY-MM-DD'"}

In [21]:
#merge them into full data frame
full_data=merge_data(electricity_data,weather_data)
full_data

Unnamed: 0,date,sub_region_code,demand,temperature_2m
0,2024-01-02 00:00:00,53,9276,4.8085
1,2024-01-02 00:00:00,59,8985,4.8085
2,2024-01-02 00:00:00,61,1504,4.8085
3,2024-01-02 00:00:00,67,55,4.8085
4,2024-01-02 00:00:00,21,11427,4.8085
...,...,...,...,...
758601,2025-01-01 00:00:00,64,407,0.8085
758602,2025-01-01 00:00:00,65,3899,0.8085
758603,2025-01-01 00:00:00,68,4422,0.8085
758604,2025-01-01 00:00:00,70,1171,0.8085


In [23]:

full_data.to_parquet(TRANSFORMED_DATA_DIR/f'full_data.parquet',index=False)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_demand_trend(data, start_date, end_date, region_code):
    """
    Plots the demand trend for a specific region within a given date range.
    
    Parameters:
        data (pd.DataFrame): The dataframe containing the demand data.
        start_date (str): The start date for the plot in 'YYYY-MM-DD' format.
        end_date (str): The end date for the plot in 'YYYY-MM-DD' format.
        region_code (int): The sub_region_code to filter the data for plotting.
    """
    # Convert date column to datetime if it's not already
    if data['date'].dtype == object:
        data['date'] = pd.to_datetime(data['date'])
    
    # Filter data based on date range and region
    mask = (data['date'] >= start_date) & (data['date'] <= end_date) & (data['sub_region_code'] == region_code)
    filtered_data = data[mask]
    
    # Check if filtered data is empty
    if filtered_data.empty:
        print(f"No data available for Region {region_code} between {start_date} and {end_date}.")
        return
    
    # Plotting
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=filtered_data, x='date', y='demand', label='sub_region_code')
    plt.title(f'Demand Trend for Region {region_code} from {start_date} to {end_date}')

    plt.xlabel('Date')
    plt.ylabel('Demand')
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [2]:
# Example usage
plot_demand_trend(full_data, "2024-01-01", "2024-04-01", 53)

NameError: name 'full_data' is not defined

In [24]:
import pandas as pd
full_data=pd.read_parquet('../data/transformed/full_data.parquet')
full_data

Unnamed: 0,date,sub_region_code,demand,temperature_2m
0,2024-01-02 00:00:00,53,9276,4.8085
1,2024-01-02 00:00:00,59,8985,4.8085
2,2024-01-02 00:00:00,61,1504,4.8085
3,2024-01-02 00:00:00,67,55,4.8085
4,2024-01-02 00:00:00,21,11427,4.8085
...,...,...,...,...
758601,2025-01-01 00:00:00,64,407,0.8085
758602,2025-01-01 00:00:00,65,3899,0.8085
758603,2025-01-01 00:00:00,68,4422,0.8085
758604,2025-01-01 00:00:00,70,1171,0.8085


In [36]:
#feature engineering on the merged data
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def feature_engineering(data):
    data['date']= pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')

    data['hour'] = data['date'].dt.hour
    data['day_of_week'] = data['date'].dt.dayofweek
    data['month'] = data['date'].dt.month
    data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)


# add (find) bank holiday (binary mask?)
    holidays = calendar().holidays(start=data['date'].min(), end=data['date'].max())
    data['is_holiday'] = data['date'].isin(holidays).astype(int)
    return data

In [40]:
full_data_with_FE=feature_engineering(full_data)

In [2]:

#full_data_with_FE.to_parquet(TRANSFORMED_DATA_DIR/f'full_data_with_FE.parquet',index=False)
#full_data_with_FE

In [6]:
import pandas as pd
from src.component.data_info import *
full_data_with_FE=pd.read_parquet(('../data/transformed/full_data_with_FE.parquet'))
full_data_with_FE

Unnamed: 0,date,sub_region_code,demand,temperature_2m,hour,day_of_week,month,is_weekend,is_holiday
0,2024-01-02,53,9276,4.8085,0,1,1,0,0
1,2024-01-02,59,8985,4.8085,0,1,1,0,0
2,2024-01-02,61,1504,4.8085,0,1,1,0,0
3,2024-01-02,67,55,4.8085,0,1,1,0,0
4,2024-01-02,21,11427,4.8085,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...
758601,2025-01-01,64,407,0.8085,0,2,1,0,1
758602,2025-01-01,65,3899,0.8085,0,2,1,0,1
758603,2025-01-01,68,4422,0.8085,0,2,1,0,1
758604,2025-01-01,70,1171,0.8085,0,2,1,0,1


In [7]:
import tqdm
features, targets = transform_ts_data_into_features_and_target(
    full_data_with_FE,
    input_seq_len=24*28*1, # one month
    step_size=24,
)

print(f'{features.shape=}')
print(f'{targets.shape=}'),

100%|██████████| 83/83 [00:15<00:00,  5.35it/s]

features.shape=(29307, 675)
targets.shape=(29307,)





(None,)

In [5]:
tabular_data = features
tabular_data['target_demand_next_hour'] = targets

from src.paths import TRANSFORMED_DATA_DIR
tabular_data.to_parquet(TRANSFORMED_DATA_DIR / 'tabular_data_with_FE.parquet')

In [None]:
features_and_target = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
features_and_target

Unnamed: 0,demand_previous_672_hour,demand_previous_671_hour,demand_previous_670_hour,demand_previous_669_hour,demand_previous_668_hour,demand_previous_667_hour,demand_previous_666_hour,demand_previous_665_hour,demand_previous_664_hour,demand_previous_663_hour,...,demand_previous_6_hour,demand_previous_5_hour,demand_previous_4_hour,demand_previous_3_hour,demand_previous_2_hour,demand_previous_1_hour,date,sub_region_code,temperature_2m,target_demand_next_hour
0,11233.0,10916.0,10704.0,10387.0,10196.0,9905.0,9600.0,9254.0,9137.0,9045.0,...,9199.0,9215.0,8763.0,8954.0,8954.0,9607.0,2024-01-28 02:00:00,53,0.1085,10318.0
1,11360.0,11147.0,11114.0,10904.0,10317.0,9762.0,9408.0,9056.0,8839.0,8796.0,...,9245.0,8668.0,8401.0,8388.0,8419.0,8419.0,2024-01-29 01:00:00,53,0.1585,8861.0
2,12702.0,12591.0,12286.0,12122.0,11578.0,10916.0,10271.0,10111.0,9917.0,10061.0,...,10282.0,9847.0,8928.0,9189.0,9403.0,9696.0,2024-01-30 00:00:00,53,1.1585,9696.0
3,11807.0,12549.0,12565.0,12310.0,12062.0,11694.0,11073.0,10446.0,10039.0,10055.0,...,11168.0,10028.0,9399.0,9044.0,9114.0,8829.0,2024-01-31 00:00:00,53,6.3585,9193.0
4,10352.0,11299.0,12290.0,12391.0,12174.0,11998.0,11487.0,10970.0,10378.0,10148.0,...,11025.0,11324.0,11523.0,11523.0,11311.0,11422.0,2024-01-31 23:00:00,53,2.4085,11696.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29302,3221.0,3175.0,3154.0,3167.0,3199.0,3274.0,3372.0,3475.0,3522.0,3500.0,...,3354.0,3464.0,3464.0,3470.0,3425.0,3380.0,2024-12-28 04:00:00,71,-1.0415,3322.0
29303,3351.0,3294.0,3295.0,3338.0,3405.0,3495.0,3659.0,3955.0,4150.0,4147.0,...,3025.0,3080.0,3253.0,3253.0,3330.0,3333.0,2024-12-29 03:00:00,71,-2.3415,3307.0
29304,3866.0,3723.0,3641.0,3624.0,3635.0,3686.0,3778.0,3964.0,4255.0,4403.0,...,2998.0,2993.0,3047.0,3226.0,3226.0,3303.0,2024-12-30 02:00:00,71,2.2085,3325.0
29305,3950.0,3763.0,3589.0,3481.0,3407.0,3363.0,3329.0,3396.0,3561.0,3831.0,...,3551.0,3523.0,3482.0,3494.0,3616.0,3616.0,2024-12-31 01:00:00,71,1.6085,3629.0


In [4]:
features = features_and_target.drop(columns=['target_demand_next_hour'])
targets = features_and_target['target_demand_next_hour']