This code is from Linda Punt for the paper "On Efficient Operation of VPP"

It is my personal perception that I don't understand many of the decisions taken in preprocessing this data.
For example:
* Removing negative prices from Day Ahead

In [21]:
import pandas as pd
import numpy as np
from scipy.stats import truncnorm, rv_discrete
import logging
from datetime import timedelta
import matplotlib.pyplot as plt
import yaml

In [22]:
def read_yaml(file):
    with open(file, 'r') as yml_file:
        config = yaml.load(yml_file, Loader=yaml.FullLoader)
        return config

In [23]:
def process_columns_da(dt):
    # Change column names
    #dt.columns = ['date', 'price_da', 'country']
    dt.columns = ['date', 'price_da']
    #dt.drop(columns='country', inplace=True)

    # Remove - prices
    dt = dt[dt.price_da != '-']
    dt['price_da'] = dt['price_da'].astype(float)
    dt = dt.dropna()

    # Change dates
    dt['date'] = dt.date.str.split('-').str[0]
    dt['date'] = pd.to_datetime(dt.date, format='%d.%m.%Y %H:%M ')

    # Get month and hour column
    dt['id_hour'] = dt.date.dt.hour

    print("Process DA")
    display(dt.describe())
    return dt

In [24]:
def process_columns_imbalance(dt):
    # Change dates
    dt['date'] = dt.apply(lambda row: pd.to_datetime(f'{row["Date"]} {row["period_until"]}',
                                                        format='%m/%d/%Y %H:%M'), axis=1)
    dt.rename(columns={'Consume': 'price_imbalance', 'Regulation state': 'regulation_state'},
              inplace=True)
    print("Process imbalance")
    display(dt.describe())
    return dt

In [25]:
def process_columns_mid(dt):
    dt['date'] = dt.apply(lambda row: pd.to_datetime(f'{row["Date"]} {row["Time"]}',
                                                        format='%m/%d/%Y %H:%M'), axis=1)
    dt['date'] = dt['date'].dt.round('15min')
    dt.rename(columns={'Mid_price_upward': 'mid_price'}, inplace=True)
    dt = dt.drop_duplicates(subset='date')
    print("Process mid")
    display(dt.describe())
    return dt

In [26]:
def process_dt(dt_da, dt_imbalance, dt_midprice):
    
    #quarter_hours = pd.DataFrame(data={'minutes': [0, 15, 30, 45], 'key': 1})
    quarter_hours = pd.DataFrame(data={'minutes': [0], 'key': 1})
    
    # Get quarter hour
    dt_da['key'] = 1
    dt = pd.merge(dt_da[['date', 'price_da', 'key']], quarter_hours, on='key')
    dt['date'] = dt.apply(lambda row: row['date'].replace(minute=row['minutes']), axis=1)

    # Get imbalance prices
    dt = pd.merge(dt, dt_imbalance[['date', 'price_imbalance', 'regulation_state']], on='date')
    dt = pd.merge(dt, dt_midprice[['date', 'mid_price']], on='date', how='left')
    dt['price_imbalance'] = np.where(dt['regulation_state'] == 2, dt['mid_price'], dt['price_imbalance'])

    # Add hour of the day and day of the year
    dt['hour'] = dt.date.dt.hour
    dt['day_year'] = dt.date.dt.dayofyear

    dt.drop(columns=['date', 'mid_price', 'regulation_state', 'key'], inplace=True)

    return dt

In [27]:
def simulate_dt(d_config, dt):
    # Make dataframe with hours and dates
    days = pd.DataFrame(data={'date': pd.date_range(start=pd.to_datetime(d_config['general']['start_date'],
                                                                         format='%d-%m-%Y'),
                                                    periods=d_config['general']['n_days']),
                              'key': 1})
    hours = pd.DataFrame(data={'hour': list(range(0, 24)),
                               'key': 1})
    
    #quarter_hours = pd.DataFrame(data={'minutes': [0, 15, 30, 45], 'key': 1})
    quarter_hours = pd.DataFrame(data={'minutes': [0], 'key': 1})
    
    data_simulated = pd.merge(days, hours, on='key')
    data_simulated = pd.merge(data_simulated, quarter_hours, on='key')
    data_simulated['date'] = data_simulated.apply(
        lambda row: row['date'].replace(hour=row['hour'], minute=row['minutes']), axis=1)
    data_simulated['day_year'] = data_simulated.date.dt.dayofyear

    # Merge dataframes
    data_simulated = pd.merge(data_simulated, dt, on=['day_year', 'minutes', 'hour'], how='left')

    # Multiply by error term
    data_simulated['error'] = 1 #+ np.random.normal(0, 0.2, data_simulated.shape[0]) # Commented out by Javier
    data_simulated['price_da'] = data_simulated['price_da'] * data_simulated['error']
    data_simulated['price_imbalance'] = data_simulated['price_imbalance'] * data_simulated['error']

    return data_simulated[['date', 'price_da', 'price_imbalance']]

In [28]:
def run(d_config):
    logging.info('Running d_process_pv_prices.py')
    dt_da = pd.read_csv('./data/Linda/day-ahead_prices_2019.csv')
    dt_imbalance = pd.read_csv('./data/Linda/imbalance_prices_2019.csv')
    dt_midprice = pd.read_csv('./data/Linda/mid_price_2019.csv')

    # Rename columns
    dt_da = process_columns_da(dt_da)
    dt_imbalance = process_columns_imbalance(dt_imbalance)
    dt_midprice = process_columns_mid(dt_midprice)

    # Process data
    dt_processed = process_dt(dt_da, dt_imbalance, dt_midprice)
    print("Processed data")
    display(dt_processed.describe())

    # Simulate data
    dt_simulated = simulate_dt(d_config, dt_processed)

    return dt_simulated, dt_processed

In [29]:
#if __name__ == "__main__":
#import os
#os.chdir('..')
run_config = read_yaml('./data/Linda/config.yml')
data_config = read_yaml('./data/Linda/data_config.yml')

final_dt, processed_dt = run(data_config)

Process DA


Unnamed: 0,price_da,id_hour
count,8760.0,8760.0
mean,41.195801,11.5
std,11.278806,6.922582
min,-9.02,0.0
25%,34.1,5.75
50%,39.7,11.5
75%,47.3925,17.25
max,121.46,23.0


Process imbalance


Unnamed: 0,PTE,To regulate up,To regulate down,Incentive component,price_imbalance,Feed,regulation_state
count,35040.0,19971.0,16910.0,35040.0,35040.0,35040.0,35040.0
mean,48.500457,63.592258,18.612762,0.0,42.554082,40.39106,0.223459
std,27.712496,62.519171,27.792309,0.0,52.003945,51.227613,1.022846
min,1.0,0.0,-487.65,0.0,-487.65,-487.65,-1.0
25%,24.75,32.6,18.0,0.0,25.53,24.47,-1.0
50%,48.5,40.63,24.5,0.0,32.16,31.11,1.0
75%,72.25,57.105,30.2,0.0,42.54,40.9,1.0
max,100.0,936.12,179.01,0.0,936.12,936.12,2.0


Process mid


Unnamed: 0,Sequence_number,IGCCContribution_up,IGCCContribution_down,To regulate up,To regulate down,To regulate up_reserve,To regulate down_reserve,Emergency capacity,Highest_price_upward,mid_price,Lowest_price_downward
count,35038.0,35036.0,35036.0,35036.0,35036.0,35036.0,35036.0,35038.0,15042.0,35036.0,12000.0
mean,721.52737,35.396935,38.903043,39.567845,23.787733,0.155954,0.002569,0.002454,55.35557,30.47897,22.961194
std,415.693744,75.558363,85.56818,75.1633,55.291123,2.354327,0.256207,0.049483,54.065099,8.065918,21.774535
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.31,-429.95
25%,369.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.29,25.78,21.0575
50%,729.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.08,29.76,26.08
75%,1085.25,31.0,32.0,45.0,14.0,0.0,0.0,0.0,50.19,35.19,31.78
max,1494.0,555.0,770.0,559.0,541.0,75.0,35.0,1.0,936.12,107.4,179.01


Processed data


Unnamed: 0,price_da,minutes,price_imbalance,hour,day_year
count,8760.0,8760.0,8760.0,8760.0,8760.0
mean,41.193841,0.0,40.972971,11.5,183.047945
std,11.280501,0.0,51.920979,6.922582,105.377493
min,-9.02,0.0,-239.9,0.0,1.0
25%,34.1,0.0,24.4,5.75,92.0
50%,39.7,0.0,30.975,11.5,183.0
75%,47.3925,0.0,41.225,17.25,274.0
max,121.46,0.0,936.12,23.0,365.0


In [30]:
final_dt

Unnamed: 0,date,price_da,price_imbalance
0,2018-07-01 00:00:00,28.98,23.89
1,2018-07-01 01:00:00,26.23,28.34
2,2018-07-01 02:00:00,25.02,26.09
3,2018-07-01 03:00:00,24.87,23.65
4,2018-07-01 04:00:00,25.90,28.47
...,...,...,...
46661,2023-10-26 19:00:00,37.37,43.51
46662,2023-10-26 20:00:00,35.77,17.57
46663,2023-10-26 21:00:00,35.05,21.65
46664,2023-10-26 22:00:00,31.82,34.19


In [31]:
final_dt = final_dt.set_index('date')

In [32]:
final_dt = final_dt[~final_dt.index.duplicated(keep='first')]

In [33]:
#Replace NaN values with mean

final_dt['price_da'] = final_dt['price_da'].fillna(final_dt['price_da'].mean())
final_dt['price_imbalance'] = final_dt['price_imbalance'].fillna(final_dt['price_imbalance'].mean())

In [34]:
#final_dt.to_csv('J:/Thesis code/thesis_code_saidur/Code_Yashar_Project/Yashar_Collab_Linda_Code/Linda NEW/synthetic_price_data_saidur.csv',index=True)

In [35]:
final_dt.describe()

Unnamed: 0,price_da,price_imbalance
count,46656.0,46656.0
mean,40.98801,40.780491
std,11.197233,51.924871
min,-9.02,-239.9
25%,33.95,24.21
50%,39.505,30.61
75%,47.18,40.95
max,121.46,936.12


In [36]:
final_dt_cp = final_dt.copy()
final_dt_cp["year"] = final_dt_cp.index.year
final_dt_2019 = final_dt_cp[final_dt_cp["year"] == 2019]
final_dt_2019 = final_dt_2019.drop(columns=["year"])
final_dt_2019.describe()

Unnamed: 0,price_da,price_imbalance
count,8760.0,8760.0
mean,41.197656,40.980199
std,11.277584,51.918739
min,-9.02,-239.9
25%,34.1,24.4075
50%,39.71,30.985
75%,47.3925,41.225
max,121.46,936.12


In [37]:
processed_dt.describe()

Unnamed: 0,price_da,minutes,price_imbalance,hour,day_year
count,8760.0,8760.0,8760.0,8760.0,8760.0
mean,41.193841,0.0,40.972971,11.5,183.047945
std,11.280501,0.0,51.920979,6.922582,105.377493
min,-9.02,0.0,-239.9,0.0,1.0
25%,34.1,0.0,24.4,5.75,92.0
50%,39.7,0.0,30.975,11.5,183.0
75%,47.3925,0.0,41.225,17.25,274.0
max,121.46,0.0,936.12,23.0,365.0


In [38]:
asdasdasdad

NameError: name 'asdasdasdad' is not defined

In [None]:
#price_dt_2018 = final_dt

In [None]:
# REMEMBER TO CHANGE THE START DATE IN D_CONFIG FILE FIRST!!!!!!

In [None]:
price_dt_2018.head()

In [None]:
price_dt_2020.head()

In [None]:
price_dt_2019.head()

In [None]:
price_dataset_saidur_FINAL = pd.concat([price_dt_2018, price_dt_2019, price_dt_2020])

In [None]:
#price_dataset_saidur_FINAL.to_csv('price_dataset_saidur_FINAL.csv', index=False)