

- How should the differing generation-data columns of the old bidding zone (`DE_AT_LU`, valid until 2018/09/30) and the new bidding zone (`DE_LU`, valid since 2018/10/01) be handled? The old data contains all columns of the new data plus additional ones, mostly about `'Actual Consumption'`, and one extra category, `'Fossil Coal-derived gas Actual Aggregated'`.
- Which time span to include in general for training data?

## Data-loading playground with `entsoe-py`

### Imports

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
pd.set_option('display.max_rows', 100)

### Settings

In [2]:
import os

# The ENTSO-E web token is a credential and must not be stored in the notebook.
# It is read from the environment variable ENTSOE_API_KEY instead
# (the value is None when the variable is not set).
api_key = os.environ.get("ENTSOE_API_KEY") # web token for RESTful API
country_code = "10Y1001A1001A83F" # Germany
BZ_code = "DE_LU" # new bidding zone, valid since 2018/10/01
BZ_code_old = "DE_AT_LU" # old bidding zone, valid until 2018/09/30
time_zone = "Europe/Berlin" # time zone for Germany

### Helper functions

In [3]:
def get_col_diff_intersect(df1, df2):
    """
    Compare the column labels of two dataframes.

    Params
    ------
    df1 : pandas.DataFrame
          dataframe whose columns serve as reference
    df2 : pandas.DataFrame
          dataframe whose columns are compared against those of df1

    Returns
    -------
    pandas.Index : columns that occur in df1 but not in df2
    pandas.Index : columns that occur in both df1 and df2
    """
    cols_first, cols_second = df1.columns, df2.columns
    only_in_first = cols_first.difference(cols_second)
    in_both = cols_first.intersection(cols_second)
    return only_in_first, in_both

In [4]:
def get_load_intervals(start_date, end_date, time_zone="Europe/Berlin"):
    """
    Get time points for sequential data loading from ENTSO-E transparency platform.

    For one request, the time delta for loading data from the platform is limited to one year.
    The requested range is therefore split at every year start (1 January) it contains;
    consecutive entries of the returned series delimit one chunk each.

    Params
    ------
    start_date : str
                 start date as "yyyymmdd"
    end_date : str
               end date as "yyyymmdd"
    time_zone : str
                time zone as string, e.g. "Europe/Berlin"

    Returns
    -------
    pd.Series
    Time-zone-naive timestamps of the time points to consider between start and end date
    (both included), always labelled with a fresh integer index 0..n-1.
    """
    # Time-zone-aware timestamps are only needed to test whether a boundary is a year start.
    start = pd.Timestamp(start_date, tz=time_zone)
    end = pd.Timestamp(end_date, tz=time_zone)

    # All year starts within [start_date, end_date].
    points = list(pd.date_range(start=start_date, end=end_date, freq="YS", inclusive="both"))

    # Prepend / append the boundaries themselves unless they already are year starts.
    if not start.is_year_start:
        points.insert(0, pd.Timestamp(start_date))
    if not end.is_year_start:
        points.append(pd.Timestamp(end_date))

    # Building the series from a plain list guarantees an integer index in every case.
    # Previously the index consisted of timestamps whenever both boundaries were year
    # starts, which broke integer lookups such as `dates[i]` in the caller.
    return pd.Series(points, dtype="datetime64[ns]")

In [5]:
def load_data(start_date, 
              end_date, 
              api_key, 
              country_code="10Y1001A1001A83F", 
              time_zone="Europe/Berlin"):
    """
    Load actual load and actual aggregated generation per production type for requested time interval.

    Both queries are sent through an `entsoe.EntsoePandasClient`; the shapes of the
    intermediate and final dataframes are printed.

    Params
    ------
    start_date : str
                 start date as "yyyymmdd"
    end_date : str
               end date as "yyyymmdd"
    api_key : str
              RESTful API web key
    country_code : str
                   code for country, bidding zone, etc.
    time_zone : str
                time zone as string, e.g. "Europe/Berlin"

    Returns
    -------
    pd.DataFrame with time points as indices and load + generation per type as columns.
    """
    from entsoe import EntsoePandasClient

    # Initialize client and settings.
    client = EntsoePandasClient(api_key=api_key)
    start = pd.Timestamp(start_date, tz=time_zone)
    end = pd.Timestamp(end_date, tz=time_zone)

    # Query actual load.
    df_load = client.query_load(country_code, start=start, end=end)
    print(f"Actual load has shape {df_load.shape}.")

    # Query generation for all production types.
    df_gen = client.query_generation(country_code, start=start, end=end, psr_type=None)
    # Flatten two-level column labels such as ("Biomass", "Actual Aggregated") into
    # "Biomass Actual Aggregated". Only tuple labels are joined: joining a plain string
    # label would insert a space between every character.
    # NOTE(review): whether single-level labels can occur depends on the entsoe-py
    # version and the returned data - confirm; such labels are kept unchanged here.
    df_gen.columns = [
        " ".join(label) if isinstance(label, tuple) else label
        for label in df_gen.columns.to_flat_index()
    ]
    print(f"Actual generation per production type has shape {df_gen.shape}.")

    # Concatenate dataframes in columns dimension.
    df_final = pd.concat([df_load, df_gen], axis=1)
    print(f"Concatenated data frame has shape {df_final.shape}.")

    return df_final

In [6]:
def fetch_data(start_date, 
               end_date, 
               api_key, 
               country_code="10Y1001A1001A83F", 
               time_zone="Europe/Berlin",
               drop_consumption=True,
               create_pslp_columns=True):
    """
    Fetch data from ENTSO-E transparency platform as requested.

    The requested range is split into chunks by `get_load_intervals`, every chunk is
    loaded with `load_data`, and the successfully loaded chunks are concatenated along
    the time axis. Chunks that fail to load are reported and skipped.

    Parameters
    ----------
    start_date : str
                 start date as "yyyymmdd"
    end_date : str
               end date as "yyyymmdd"
    api_key : str
              RESTful API web key
    country_code : str
                   code for country, bidding zone, etc.
    time_zone : str
                time zone as string, e.g. "Europe/Berlin"
    drop_consumption : Bool
                       Drop columns containing actual consumption.
    create_pslp_columns : Bool
                          Create columns for subsequent PSLP and residuals calculation.

    Returns
    -------
    pd.DataFrame : actual load and generation per type for requested time interval
                   (plus empty "<header> PSLP" columns if `create_pslp_columns` is set)
    pd.Index : column headers of the data before the PSLP columns were added

    Raises
    ------
    ValueError : if not a single data chunk could be loaded
    """
    # Determine sequence of dates to consider when loading data.
    dates = get_load_intervals(start_date, end_date, time_zone)
    print(f"Consider the following dates:\n{dates}")
    df_list = []

    # Consecutive time points delimit one chunk each; access them by position so that
    # the labels of the series' index do not matter.
    for i in range(len(dates) - 1):
        chunk_start, chunk_end = dates.iloc[i], dates.iloc[i + 1]
        try:
            print(f"Trying to load data chunk for time interval [{chunk_start}, {chunk_end}]...")
            df_temp = load_data(start_date=chunk_start, 
                                end_date=chunk_end,
                                api_key=api_key,
                                time_zone=time_zone,
                                country_code=country_code)
            print(df_temp.shape)
            df_list.append(df_temp)
            print("Loading successful!")
        except Exception as e:
            # Best effort: a failing chunk is reported and skipped on purpose.
            print("Loading failed!", e)

    if not df_list:
        raise ValueError(
            f"No data could be loaded for the time interval [{start_date}, {end_date}]."
        )

    print("Returning final data frame...")
    df_final = pd.concat(df_list, axis=0) # Concatenate dataframes along time axis (index).
    df_final.index = pd.to_datetime(df_final.index, utc=True).tz_convert(tz="UTC+01:00")

    # Drop columns containing actual consumption?
    if drop_consumption:
        print("Dropping columns containing actual consumption...")
        df_final.drop(list(df_final.filter(regex='Consumption')), axis=1, inplace=True)

    # Remember the data columns before any helper columns are appended.
    original_headers = df_final.columns

    # Create PSLP columns?
    if create_pslp_columns:
        print("Creating columns for PSLP calculation...")
        for header in original_headers:
            df_final[str(header) + " PSLP"] = pd.Series(dtype='float')

    return df_final, original_headers

In [7]:
# Fetch the same interval twice: one dataframe is later processed with
# renewable=False (df_normal), the other one with renewable=True (df_renewa).
start_date = "20171225"
end_date = "20180125"
df_normal, original_headers_normal = fetch_data(start_date, end_date, api_key)
df_renewa, original_headers_renewa = fetch_data(start_date, end_date, api_key)

Consider the following dates:
0   2017-12-25
1   2018-01-01
2   2018-01-25
dtype: datetime64[ns]
Trying to load data chunk for time interval [2017-12-25 00:00:00, 2018-01-01 00:00:00]...
Actual load has shape (672, 1).
Actual generation per production type has shape (672, 32).
Concatenated data frame has shape (672, 33).
(672, 33)
Loading successful!
Trying to load data chunk for time interval [2018-01-01 00:00:00, 2018-01-25 00:00:00]...
Actual load has shape (2304, 1).
Actual generation per production type has shape (2304, 20).
Concatenated data frame has shape (2304, 21).
(2304, 21)
Loading successful!
Returning final data frame...
Dropping columns containing actual consumption...
Creating columns for PSLP calculation...
Consider the following dates:
0   2017-12-25
1   2018-01-01
2   2018-01-25
dtype: datetime64[ns]
Trying to load data chunk for time interval [2017-12-25 00:00:00, 2018-01-01 00:00:00]...
Actual load has shape (672, 1).
Actual generation per production type has shape

In [8]:
def _correct_time_shift(df, usual_length=96):
    """
    Report the days in the dataframe index whose number of rows is unusual.

    Such days are taken to be CET-CEST time shift dates. The dataframe itself
    is not modified; every finding is printed and collected.

    Params
    ------
    df : pandas.DataFrame
         considered dataframe with datetime index
    usual_length : int
                   usual length of one day (96 for 15 min frequency)

    Returns
    -------
    list : list of [date string 'YYYY-MM-DD', number of rows] for every deviating day
    """
    # One entry per calendar day, in order of first appearance in the index.
    calendar_days = df.index.to_series().dt.date.drop_duplicates()

    shifted = []
    for day in calendar_days:
        date = pd.to_datetime(day).strftime('%Y-%m-%d')
        length = df.loc[date].shape[0]
        if length == usual_length:
            continue
        print(f"Time shift at {date}, length is {length}.")
        shifted.append([date, length])
    return shifted

In [None]:
# Quick inspection of a dataframe named df_test.
# NOTE(review): df_test and original_headers are not assigned in any cell visible here
# (the other mentions of df_test are commented out) - confirm before running.
print(df_test.shape)
print(df_test.columns)
print(df_test["Actual Load"].isna().sum())
print(original_headers)

In [9]:
def get_pslp_category(date, weekday=None, holiday=None, country_code='DE'):
    """
    Get PSLP category from date, weekday information, and holiday information.
    0 : weekday
    1 : Saturday
    2 : Sunday and holiday

    Christmas eve and New Year's eve are treated as Saturdays unless they fall on a Sunday.

    Params
    ------
    date : str or date-like
           date in 'YYYYMMDD' format, or an object providing `day`, `month` and `weekday()`
    weekday : int
              corresponding weekday; derived from `date` if None
              0 - Mon, 1 - Tue, 2 - Wed, 3 - Thu, 4 - Fri, 5 - Sat, 6 - Sun
    holiday : Bool
              True if public holiday, False if not; looked up via `holidays` if None.
    country_code : str
                   country used for the holiday lookup when `holiday` is None

    Returns
    -------
    int : PSLP category

    Raises
    ------
    ValueError : if `weekday` is not in 0..6
    """
    # Convert string-type date to datetime object.
    if isinstance(date, str):
        date = pd.to_datetime(date)

    # Assign weekday if not given.
    if weekday is None:
        weekday = date.weekday()
    if weekday not in range(7):
        raise ValueError(f"weekday must be in 0..6, got {weekday!r}.")

    # Assign holiday category if not given.
    if holiday is None:
        import holidays
        holiday = date in holidays.country_holidays(country_code)
    # Normalise to a plain bool: identity tests against True/False fail for
    # numpy booleans and previously left the category unassigned.
    holiday = bool(holiday)

    # Special treatment for Christmas eve and New year's eve as Saturdays.
    if date.month == 12 and date.day in (24, 31) and weekday != 6:
        return 1
    # Sundays and holidays
    if holiday or weekday == 6:
        return 2
    # Saturdays
    if weekday == 5:
        return 1
    # weekdays
    return 0

In [10]:
def assign_pslp_categories(df, country_code='DE'):
    """
    Add weekday, holiday, and PSLP category columns for a dataframe's datetime index.

    PSLP categories: 0 is weekday, 1 is Saturday, 2 is Sunday or holiday.
    Christmas eve and New Year's eve get special treatment (as Saturdays).
    The dataframe is modified in place and returned.

    Params
    ------
    df : pandas.Dataframe
         data with datetime index
    country_code : str
                   country to determine holidays for
    Returns
    -------
    pandas.Dataframe
    The same dataframe with the additional columns "PSLP Category", "Holiday", and "Weekday"
    """
    import holidays as holidays_lib

    # Holiday calendar of the specified country (passing a state is also possible).
    calendar = holidays_lib.country_holidays(country_code)

    stamps = df.index.to_series()      # datetime index as series
    plain_dates = stamps.dt.date       # plain dates of the datetime objects
    day_of_week = stamps.dt.weekday    # weekdays of the datetime objects
    is_holiday = [day in calendar for day in plain_dates]

    df["PSLP Category"] = [
        get_pslp_category(day, wd, hd)
        for day, wd, hd in zip(plain_dates, day_of_week, is_holiday)
    ]
    df["Holiday"] = is_holiday
    df["Weekday"] = day_of_week
    return df

In [11]:
# Add the "PSLP Category", "Holiday" and "Weekday" columns to both dataframes.
date_str = "20180120"
#df_test = assign_pslp_categories(df_test)
df_normal = assign_pslp_categories(df_normal)
df_renewa = assign_pslp_categories(df_renewa)
#print(df_test.loc[date_str])
#print(df_test.columns)
#print(df_test["PSLP Category"])

In [12]:
def _get_nearest_future_pslp_date(date_str, pslp_category=None, country_code='DE'):
    """
    For a given date, get the nearest future day that has the same PSLP category.

    The seven days following the given date are searched in chronological order.

    Params
    ------
    date_str : str
               considered date
    pslp_category : int
                    PSLP category of considered date; determined from `date_str` if None
    country_code : str
                   considered country (for holidays)

    Returns
    -------
    datetime.date : first of the following seven days with the requested PSLP category

    Raises
    ------
    IndexError : if none of the following seven days has the requested PSLP category
                 (e.g. a Saturday whose following Saturday is a public holiday)
    """
    if pslp_category is None:
        pslp_category = get_pslp_category(date_str, country_code=country_code)

    # Candidates are the days date+1 ... date+7 (time of day is discarded).
    first_day = pd.to_datetime(date_str).date() + pd.Timedelta(days=1)
    future_dates = pd.date_range(start=first_day.strftime('%Y%m%d'), periods=7, freq="D")

    for candidate in future_dates:
        if get_pslp_category(candidate, country_code=country_code) == pslp_category:
            return candidate.date()

    raise IndexError(
        f"No day with PSLP category {pslp_category} within one week after {date_str}."
    )
    
# Sanity check: the comparison uses `!=`, so False is printed when the nearest
# future date found for 2018-09-09 equals 2018-09-16.
fut_date = _get_nearest_future_pslp_date('20180909')
print(pd.to_datetime("20180916").date() != fut_date)

False


In [13]:
def _calculate_pslp(df, original_headers, date_str, lookback=3, country_code='DE', renewable=False, DEBUG=False):
    """
    Calculate the PSLPs of all given headers for one date.

    The data is categorized into weekdays, Saturdays, and Sundays/holidays.
    The `lookback` most recent days from the specified date's category are used to
    calculate the corresponding PSLP as the average. The result is written to the
    rows of that date in the "<header> PSLP" columns.

    Params
    ------
    df : pandas.Dataframe
         data to calculate PSLP for, must have datetime index and "<header> PSLP" columns
    original_headers : list of str
                       categories to calculate PSLP for
    date_str : str
               date 'YYYYMMDD' or 'YYYY-MM-DD' to calculate PSLP for
    lookback : int
               number of days to consider in each category for calculating PSLP
    country_code : str
                   considered country (for holidays)
    renewable : Bool
                If True, headers containing "Solar" or "Wind" use the `lookback` days
                directly preceding the date, irrespective of their PSLP category.
    DEBUG : Bool
            Print additional information.

    Returns
    -------
    pandas.Dataframe : the dataframe with PSLP values filled in for the given date

    Raises
    ------
    IndexError : if the date is not covered by the data, if there are fewer than
                 `lookback` preceding days, or if the days involved differ in length
    """
    unique_dates = df.index.to_series().dt.date.drop_duplicates().tolist() # Get unique dates in data index.
    df = assign_pslp_categories(df, country_code)

    print(f"Calculating PSLP for date {date_str}...")
    date = pd.to_datetime(date_str)
    target_day = date.date()

    # Use the same country for the date itself as for the dataframe's categories.
    pslp_category = get_pslp_category(date_str, country_code=country_code)
    if DEBUG: 
        print(f"PSLP category of {target_day} is {pslp_category}.")

    # Check whether date is in range of given dataframe.
    if not unique_dates:
        raise IndexError(f"PSLP cannot be calculated. Dataframe is empty.")
    if target_day < unique_dates[0]:
        raise IndexError(f"PSLP cannot be calculated. Date {date_str} is in the past.")
    if target_day > unique_dates[-1] + pd.Timedelta(days=1):
        raise IndexError(f"PSLP cannot be calculated. Date {date_str} is too far in the future.")
    # Explicit check instead of `assert`, which is stripped when running with -O.
    if target_day not in unique_dates:
        raise IndexError(f"PSLP cannot be calculated. Date {date_str} is not contained in the data.")

    def _as_labels(days):
        # Day labels usable for partial-string row selection.
        return [pd.to_datetime(d).strftime('%Y-%m-%d') for d in days]

    # Most recent days of the same PSLP category.
    unique_dates_pslp = df[df['PSLP Category'] == pslp_category].index.to_series().dt.date.drop_duplicates().tolist()
    idx_pslp = unique_dates_pslp.index(target_day)
    if DEBUG:
        print(f"Index in unique days of PSLP category is {idx_pslp}.")
    if idx_pslp - lookback < 0:
        raise IndexError(f"PSLP cannot be calculated. Less than {lookback} samples in PSLP category for date {date_str}.")
    lookback_dates = _as_labels(unique_dates_pslp[idx_pslp-lookback:idx_pslp])
    print(f"Dates to consider for calculating PSLP: {lookback_dates}")

    # Directly preceding days, irrespective of category (used for solar and wind).
    if renewable:
        idx_renewable = unique_dates.index(target_day)
        if idx_renewable - lookback < 0:
            raise IndexError(f"SLP cannot be calculated. Less than {lookback} samples for date {date_str}.")
        lookback_dates_renewable = _as_labels(unique_dates[idx_renewable-lookback:idx_renewable])
        print(f"Dates to consider for calculating SLP for renewables: {lookback_dates_renewable}")

    n_target = df.loc[date_str].shape[0]

    for header in original_headers:
        if renewable and ( "Solar" in header or "Wind" in header ):
            source_days = lookback_dates_renewable
        else:
            source_days = lookback_dates

        # Average the look-back days position by position (n-th sample of every day).
        profile = pd.concat(
            [df.loc[d, header].reset_index(drop=True) for d in source_days], axis=1
        ).mean(axis=1)

        # Days of different length (e.g. time shift days) cannot be matched by position.
        if len(profile) != n_target:
            raise IndexError(
                f"PSLP cannot be calculated. Days {source_days} and date {date_str} differ in length."
            )

        # Write with a single .loc call on the dataframe: the former chained
        # assignment `df[col].at[...] = ...` operates on an intermediate object and
        # does not reach the dataframe when pandas Copy-on-Write is active.
        # Raw values are assigned to avoid index alignment.
        df.loc[date_str, header + " PSLP"] = profile.to_numpy()
    return df

In [14]:
def calculate_pslps(df, original_headers, date_str=None, lookback=3, country_code='DE', renewable=False, DEBUG=False):
    """
    Calculate PSLPs for all dates in dataframe or for given date from given data.

    The data is categorized into weekdays, Saturdays, and Sundays/holidays.
    The `lookback` most recent days from the specified date's category are used to
    calculate the corresponding PSLP as the average.

    Params
    ------
    df : pandas.Dataframe
         data to calculate PSLP for, must have datetime index
    original_headers : list of str
                       categories to calculate PSLP for
    date_str : str
               date 'YYYYMMDD' to calculate PSLP for; if None, calculate PSLP for all dates
    lookback : int
               number of days to consider in each category for calculating PSLP
    country_code : str
                   considered country (for holidays)
    renewable : Bool
                Passed on to `_calculate_pslp` (special look-back for "Solar"/"Wind" headers).
    DEBUG : Bool
            Print additional information.

    Returns
    -------
    pandas.Dataframe : the dataframe with PSLP values filled in

    Notes
    -----
    When all dates are processed, dates for which `_calculate_pslp` raises an
    IndexError are reported and skipped. For a single date the error propagates.
    """
    if date_str is not None:
        print(f"Calculating PSLP for date {date_str} only...")
        # `renewable` is forwarded here as well; it used to be dropped in this branch.
        return _calculate_pslp(df, 
                               original_headers, 
                               date_str, 
                               lookback=lookback, 
                               country_code=country_code, 
                               renewable=renewable, 
                               DEBUG=DEBUG)

    print("Calculating PSLPs for all dates in dataframe...")
    unique_dates = df.index.to_series().dt.date.drop_duplicates().tolist() # Get unique dates in data index.
    unique_dates = [pd.to_datetime(d).strftime('%Y-%m-%d') for d in unique_dates]

    for date in unique_dates:
        try:
            df = _calculate_pslp(df, 
                                original_headers, 
                                date_str=date, 
                                lookback=lookback, 
                                country_code=country_code, 
                                renewable=renewable, 
                                DEBUG=DEBUG)
        except IndexError as e:
            # Not enough history (or date not covered): report and go on with the next date.
            print(e)
    return df

In [15]:
# Calculate PSLPs for all dates: once purely category-based (df_normal) and once
# with the special look-back for "Solar"/"Wind" headers (df_renewa).
#calculate_pslps(df_test, original_headers, renewable=True)#, date_str)
calculate_pslps(df_normal, original_headers_normal, renewable=False)#, date_str)
calculate_pslps(df_renewa, original_headers_renewa, renewable=True)#, date_str)

Calculating PSLPs for all dates in dataframe...
Calculating PSLP for date 2017-12-25...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-25.
Calculating PSLP for date 2017-12-26...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-26.
Calculating PSLP for date 2017-12-27...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-27.
Calculating PSLP for date 2017-12-28...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-28.
Calculating PSLP for date 2017-12-29...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-29.
Calculating PSLP for date 2017-12-30...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-30.
Calculating PSLP for date 2017-12-31...
PSLP cannot be calculated. Less than 3 samples in PSLP category for date 2017-12-31.
Calculating PSLP for date 2018-01-01...
Dates to consider for calculating PSL

Calculating PSLP for date 2018-01-20...
Dates to consider for calculating PSLP: ['2017-12-30', '2018-01-06', '2018-01-13']
Dates to consider for calculating SLP for renewables: ['2018-01-17', '2018-01-18', '2018-01-19']
Calculating PSLP for date 2018-01-21...
Dates to consider for calculating PSLP: ['2018-01-01', '2018-01-07', '2018-01-14']
Dates to consider for calculating SLP for renewables: ['2018-01-18', '2018-01-19', '2018-01-20']
Calculating PSLP for date 2018-01-22...
Dates to consider for calculating PSLP: ['2018-01-17', '2018-01-18', '2018-01-19']
Dates to consider for calculating SLP for renewables: ['2018-01-19', '2018-01-20', '2018-01-21']
Calculating PSLP for date 2018-01-23...
Dates to consider for calculating PSLP: ['2018-01-18', '2018-01-19', '2018-01-22']
Dates to consider for calculating SLP for renewables: ['2018-01-20', '2018-01-21', '2018-01-22']
Calculating PSLP for date 2018-01-24...
Dates to consider for calculating PSLP: ['2018-01-19', '2018-01-22', '2018-01-23

Unnamed: 0,Actual Load,Biomass Actual Aggregated,Fossil Brown coal/Lignite Actual Aggregated,Fossil Coal-derived gas Actual Aggregated,Fossil Gas Actual Aggregated,Fossil Hard coal Actual Aggregated,Fossil Oil Actual Aggregated,Geothermal Actual Aggregated,Hydro Pumped Storage Actual Aggregated,Hydro Run-of-river and poundage Actual Aggregated,...,Nuclear Actual Aggregated PSLP,Other Actual Aggregated PSLP,Other renewable Actual Aggregated PSLP,Solar Actual Aggregated PSLP,Waste Actual Aggregated PSLP,Wind Offshore Actual Aggregated PSLP,Wind Onshore Actual Aggregated PSLP,PSLP Category,Holiday,Weekday
2017-12-25 00:00:00+01:00,43926.0,4731.0,5230.0,433.0,1021.0,1443.0,174.0,26.0,418.0,1507.0,...,,,,,,,,2,True,0
2017-12-25 00:15:00+01:00,43025.0,4727.0,5208.0,382.0,1018.0,1450.0,174.0,26.0,180.0,1487.0,...,,,,,,,,2,True,0
2017-12-25 00:30:00+01:00,42163.0,4731.0,5153.0,382.0,1021.0,1451.0,174.0,26.0,157.0,1480.0,...,,,,,,,,2,True,0
2017-12-25 00:45:00+01:00,41341.0,4716.0,5141.0,382.0,1024.0,1451.0,174.0,26.0,146.0,1477.0,...,,,,,,,,2,True,0
2017-12-25 01:00:00+01:00,40810.0,4694.0,5139.0,382.0,1021.0,1456.0,174.0,26.0,141.0,1471.0,...,,,,,,,,2,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-24 22:45:00+01:00,62329.0,4749.0,13469.0,522.0,3631.0,3573.0,205.0,25.0,128.0,1778.0,...,9182.666667,564.000000,206.666667,0.0,631.333333,2550.000000,14223.000000,0,False,2
2018-01-24 23:00:00+01:00,60741.0,4738.0,12861.0,522.0,3600.0,3469.0,205.0,25.0,425.0,1775.0,...,9215.333333,564.333333,206.666667,0.0,633.000000,2558.666667,14292.666667,0,False,2
2018-01-24 23:15:00+01:00,60044.0,4745.0,12478.0,481.0,3563.0,3436.0,205.0,25.0,280.0,1772.0,...,9235.000000,563.666667,206.666667,0.0,632.666667,2555.000000,14402.000000,0,False,2
2018-01-24 23:30:00+01:00,59121.0,4740.0,12191.0,481.0,3527.0,3407.0,204.0,25.0,154.0,1772.0,...,9204.666667,563.333333,206.333333,0.0,633.666667,2662.000000,14550.666667,0,False,2


In [None]:
# Show all rows of 2018-01-02.
# NOTE(review): df_test is not assigned in any cell visible here - confirm before running.
print(df_test.loc["20180102"])

In [16]:
def calculate_residuals(df, original_headers):
    """
    Add the residuals of the actual data w.r.t. the PSLPs to the dataframe.

    For every header, the column "<header> Residuals" is set to the difference
    "<header>" minus "<header> PSLP". The dataframe is modified in place.

    Params
    ------
    df : pandas.DataFrame
         pre-processed data containing actual values and PSLPs
    original_headers : list of str
                       original headers in ENTSO-E dataframe

    Returns
    -------
    pandas.DataFrame : the same dataframe, extended by the residual columns
    """
    for name in original_headers:
        actual = df[name]
        profile = df[name + " PSLP"]
        df[name + " Residuals"] = actual.sub(profile)
    return df

In [18]:
# Add the "<header> Residuals" columns to both dataframes.
#calculate_residuals(df_test, original_headers)
calculate_residuals(df_normal, original_headers_normal)
calculate_residuals(df_renewa, original_headers_renewa)

Unnamed: 0,Actual Load,Biomass Actual Aggregated,Fossil Brown coal/Lignite Actual Aggregated,Fossil Coal-derived gas Actual Aggregated,Fossil Gas Actual Aggregated,Fossil Hard coal Actual Aggregated,Fossil Oil Actual Aggregated,Geothermal Actual Aggregated,Hydro Pumped Storage Actual Aggregated,Hydro Run-of-river and poundage Actual Aggregated,...,Hydro Pumped Storage Actual Aggregated Residuals,Hydro Run-of-river and poundage Actual Aggregated Residuals,Hydro Water Reservoir Actual Aggregated Residuals,Nuclear Actual Aggregated Residuals,Other Actual Aggregated Residuals,Other renewable Actual Aggregated Residuals,Solar Actual Aggregated Residuals,Waste Actual Aggregated Residuals,Wind Offshore Actual Aggregated Residuals,Wind Onshore Actual Aggregated Residuals
2017-12-25 00:00:00+01:00,43926.0,4731.0,5230.0,433.0,1021.0,1443.0,174.0,26.0,418.0,1507.0,...,,,,,,,,,,
2017-12-25 00:15:00+01:00,43025.0,4727.0,5208.0,382.0,1018.0,1450.0,174.0,26.0,180.0,1487.0,...,,,,,,,,,,
2017-12-25 00:30:00+01:00,42163.0,4731.0,5153.0,382.0,1021.0,1451.0,174.0,26.0,157.0,1480.0,...,,,,,,,,,,
2017-12-25 00:45:00+01:00,41341.0,4716.0,5141.0,382.0,1024.0,1451.0,174.0,26.0,146.0,1477.0,...,,,,,,,,,,
2017-12-25 01:00:00+01:00,40810.0,4694.0,5139.0,382.0,1021.0,1456.0,174.0,26.0,141.0,1471.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-24 22:45:00+01:00,62329.0,4749.0,13469.0,522.0,3631.0,3573.0,205.0,25.0,128.0,1778.0,...,-27.000000,-167.666667,44.666667,-7.666667,-72.000000,-21.666667,0.0,19.666667,1573.000000,14691.000000
2018-01-24 23:00:00+01:00,60741.0,4738.0,12861.0,522.0,3600.0,3469.0,205.0,25.0,425.0,1775.0,...,246.666667,-171.333333,26.000000,-18.333333,-74.333333,-21.666667,0.0,20.000000,1560.333333,14198.333333
2018-01-24 23:15:00+01:00,60044.0,4745.0,12478.0,481.0,3563.0,3436.0,205.0,25.0,280.0,1772.0,...,128.666667,-177.333333,11.000000,-60.000000,-73.666667,-21.666667,0.0,20.333333,1849.000000,13868.000000
2018-01-24 23:30:00+01:00,59121.0,4740.0,12191.0,481.0,3527.0,3407.0,204.0,25.0,154.0,1772.0,...,17.333333,-180.666667,15.000000,-0.666667,-73.333333,-21.333333,0.0,19.333333,1979.000000,13756.333333


In [24]:
# Compare both variants: sum of the solar residuals over 2018-01-01.
#df_test["Actual Load Residuals"].loc["20180101"]
print(df_normal["Solar Actual Aggregated Residuals"].loc["20180101"].sum())
print(df_renewa["Solar Actual Aggregated Residuals"].loc["20180101"].sum())
#print(df_normal["Solar Actual Aggregated Residuals"].loc["20180101"])

29056.0
40634.666666666664


In [25]:
def plot_data(df, original_headers):
    """
    Show actual data, PSLP, and residuals of every header in a plotly figure.

    The figure has one subplot row per header; each row contains three traces
    plotted over the dataframe index.

    Params
    ------
    df : pandas.DataFrame
         pre-processed data with PSLPs and residuals
    original_headers : list of str
                       original headers in ENTSO-E dataframe
    """
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    fig = make_subplots(rows=len(original_headers), cols=1, subplot_titles=original_headers)

    # Column suffixes of the three traces drawn into each subplot, in drawing order.
    suffixes = ("", " PSLP", " Residuals")

    for row, header in enumerate(original_headers, start=1):
        for suffix in suffixes:
            column = header + suffix
            trace = go.Scatter(x=df.index, y=df[column], name=column)
            fig.add_trace(trace, row=row, col=1)

    fig.update_layout(height=10000, width=1200)
    fig.show()

In [26]:
def calculate_errors(df, original_headers):
    """
    Print the forecasting errors of the PSLPs for preprocessed ENTSO-E load and generation data.

    For every header, MAE, MSE, and MAPE between the actual values and the PSLP are
    computed on the rows where both are available and printed.

    Params
    ------
    df : pandas.DataFrame
         pre-processed data with PSLPs and residuals
    original_headers : list of str
                       original headers in ENTSO-E dataframe
    """
    from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

    for header in original_headers:
        pslp_name = header + " PSLP"
        # Keep only the rows where actual value and PSLP are both present.
        paired = pd.concat([df[header], df[pslp_name]], axis=1).dropna()
        actual, predicted = paired[header], paired[pslp_name]

        mae = mean_absolute_error(actual, predicted)
        mape = mean_absolute_percentage_error(actual, predicted)
        mse = mean_squared_error(actual, predicted)

        print(f"{header}:\nMAE = {mae}\nMSE = {mse}\nMAPE = {mape}\n")

In [27]:
# Print MAE, MSE and MAPE of the PSLPs for both variants.
#calculate_errors(df_test, original_headers)
calculate_errors(df_normal, original_headers_normal)
calculate_errors(df_renewa, original_headers_renewa)

Actual Load:
MAE = 3064.049873737374
MSE = 14478984.376893941
MAPE = 0.04975828224211815

Biomass Actual Aggregated:
MAE = 33.752367424242415
MSE = 1855.3331755050503
MAPE = 0.006980931855779489

Fossil Brown coal/Lignite Actual Aggregated:
MAE = 2401.349431818182
MSE = 11039677.868371213
MAPE = 0.1658434747113002

Fossil Coal-derived gas Actual Aggregated:
MAE = 88.40151515151516
MSE = 10846.137626262625
MAPE = 0.20476633438169559

Fossil Gas Actual Aggregated:
MAE = 1688.6437815656564
MSE = 4055023.2790930136
MAPE = 0.32715685071427586

Fossil Hard coal Actual Aggregated:
MAE = 3226.268465909091
MSE = 17572828.59832702
MAPE = 0.5135025455441473

Fossil Oil Actual Aggregated:
MAE = 15.577178030303031
MSE = 807.0528724747473
MAPE = 0.0723223357719505

Geothermal Actual Aggregated:
MAE = 1.2410037878787878
MSE = 3.1984953703703702
MAPE = 0.056620737215362975

Hydro Pumped Storage Actual Aggregated:
MAE = 684.3385416666666
MSE = 1106019.5224642258
MAPE = 1.3715724486431502

Hydro Run-of-

In [None]:
# Plot actual data, PSLPs and residuals.
# NOTE(review): df_test and original_headers are not assigned in any cell visible here - confirm before running.
plot_data(df_test, original_headers)

In [None]:
#END

### Complete parameter list
https://transparency.entsoe.eu/content/static_content/Static%20content/web%20api/Guide.html#_complete_parameter_list

### Queries returning Pandas Series

`client.query_day_ahead_prices(country_code, start=start,end=end)` <br>
`client.query_net_position(country_code, start=start, end=end, dayahead=True)` <br>
`client.query_crossborder_flows(country_code_from, country_code_to, start, end)` <br>
`client.query_scheduled_exchanges(country_code_from, country_code_to, start, end, dayahead=False)` <br>
`client.query_net_transfer_capacity_dayahead(country_code_from, country_code_to, start, end)` <br>
`client.query_net_transfer_capacity_weekahead(country_code_from, country_code_to, start, end)` <br>
`client.query_net_transfer_capacity_monthahead(country_code_from, country_code_to, start, end)` <br>
`client.query_net_transfer_capacity_yearahead(country_code_from, country_code_to, start, end)` <br>
`client.query_intraday_offered_capacity(country_code_from, country_code_to, start, end,implicit=True)` <br>
`client.query_offered_capacity(country_code_from, country_code_to, start, end, contract_marketagreement_type, implicit=True)` <br>
`client.query_aggregate_water_reservoirs_and_hydro_storage(country_code, start, end)`

### Queries returning Pandas DataFrames

`client.query_load(country_code, start=start,end=end)` <br>
`client.query_load_forecast(country_code, start=start,end=end)` <br>
`client.query_load_and_forecast(country_code, start=start, end=end)` <br>
`client.query_generation_forecast(country_code, start=start,end=end)` <br>
`client.query_wind_and_solar_forecast(country_code, start=start,end=end, psr_type=None)` <br>
`client.query_generation(country_code, start=start,end=end, psr_type=None)` <br>
`client.query_generation_per_plant(country_code, start=start,end=end, psr_type=None)` <br>
`client.query_installed_generation_capacity(country_code, start=start,end=end, psr_type=None)` <br>
`client.query_installed_generation_capacity_per_unit(country_code, start=start,end=end, psr_type=None)` <br>
`client.query_imbalance_prices(country_code, start=start,end=end, psr_type=None)` <br>
`client.query_contracted_reserve_prices(country_code, start, end, type_marketagreement_type, psr_type=None)` <br>
`client.query_contracted_reserve_amount(country_code, start, end, type_marketagreement_type, psr_type=None)` <br>
`client.query_unavailability_of_generation_units(country_code, start=start,end=end, docstatus=None, periodstartupdate=None, periodendupdate=None)` <br>
`client.query_unavailability_of_production_units(country_code, start, end, docstatus=None, periodstartupdate=None, periodendupdate=None)` <br>
`client.query_unavailability_transmission(country_code_from, country_code_to, start, end, docstatus=None, periodstartupdate=None, periodendupdate=None)` <br>
`client.query_withdrawn_unavailability_of_generation_units(country_code, start, end)` <br>
`client.query_import(country_code, start, end)` <br>
`client.query_generation_import(country_code, start, end)` <br>
`client.query_procured_balancing_capacity(country_code, start, end, process_type, type_marketagreement_type=None)`

## Load data from client

In [None]:
# Collect load, load forecast and generation forecast in one dataframe.
# NOTE(review): client, start and end are not assigned at module level in any cell
# visible here - presumably an EntsoePandasClient and two timestamps; confirm.
df = pd.DataFrame()
df["load forecast"] = client.query_load_forecast(country_code, start=start,end=end)
df["load"] = client.query_load(country_code, start=start,end=end)
# Forecast error is defined as forecast minus actual value.
df["load forecast error"] = df["load forecast"] - df["load"]
df["generation forecast"] = client.query_generation_forecast(country_code, start=start,end=end)

In [None]:
# Query generation for all production types (psr_type=None) and preview the result.
# NOTE(review): client, start and end are not assigned in any cell visible here - confirm.
df_gen = client.query_generation(country_code, start=start,end=end, psr_type=None)
df_gen.head()

In [None]:
# Select the "Actual Aggregated" entries of the second column level.
df_gen.xs(key="Actual Aggregated", level=1, axis=1)

In [None]:
# Plot the "Actual Aggregated" generation of all production types.
# NOTE(review): plt is not imported in the import cell visible here - presumably
# matplotlib.pyplot; confirm.
fig, ax = plt.subplots(figsize=(14,7))
ax.plot(df_gen.xs(key="Actual Aggregated", level=1, axis=1))
# NOTE(review): no labels are passed to ax.plot, so the legend may be empty - confirm.
ax.legend()

In [None]:
# Total generation as the row-wise sum over all columns of df_gen.
# NOTE(review): if df_gen also contains 'Actual Consumption' columns they are
# included in this sum - confirm that this is intended.
df["generation"] = df_gen.sum(axis=1)

In [None]:
# Forecast error is defined as forecast minus actual value.
df["generation forecast error"] = df["generation forecast"] - df["generation"]

In [None]:
# Line plot of all columns of df over its index.
px.line(df)

## Save to csv file

In [None]:
# Write the dataframe to 'entsoe.csv' (relative to the current working directory).
df.to_csv('entsoe.csv')