# Load the hourly historical data

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('../')

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv, find_dotenv

In [2]:
# Data directories, all built from the PROJECT_ROOT environment variable.
# load_dotenv(find_dotenv()) populates the environment from a .env file when one is found.
load_dotenv(find_dotenv(), verbose=True)
dir_project = os.getenv('PROJECT_ROOT')
if dir_project is None:
    # Fail early with a clear message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the next line.
    raise EnvironmentError(
        "PROJECT_ROOT is not set; define it in the environment or in a .env file"
    )
dir_data_raw = os.path.join(dir_project, 'data/raw/')
dir_data_interim = os.path.join(dir_project, 'data/interim/')
dir_data_processed = os.path.join(dir_project, 'data/processed/')
dir_data_external = os.path.join(dir_project, 'data/external/')
dir_models = os.path.join(dir_project, 'models/')

In [3]:
# Read the 'Feuil1' sheet of the 12-01-2018 outlook workbook from the raw data directory.
df_outlook = pd.read_excel(
    os.path.join(dir_data_raw, 'Data 18 months Outlook/12-01-2018/12012018.xlsx'),
    sheet_name='Feuil1',
)

The original Excel file has several sheets; only `Feuil1` is loaded above. The columns of interest are:

In [4]:
# Outlook columns to keep. They correspond to the variables
# ['HOEP', 'Ontario.Demand', 'Temp', 'CDD', 'HDD', 'NUCLEAR', 'GAS', 'HYDRO', 'WIND', 'SOLAR', 'BIOFUEL'].
# NOTE(review): 7 outlook columns are listed against 11 names above, so the
# correspondence is not one-to-one — confirm which variables are intentionally omitted.
independ_vars = [
    'HOEP',
    'Ontario ED',
    'Normal Average Temperature (°C)',
    'Expected Nuclear Output',
    'Expected Hydro Output',
    'Expected Wind Output',
    'Expected Self-Scheduling & Intermittent Output',
]

In [56]:
# Keep only the week-ending date column plus the selected outlook variables.
df = df_outlook[['Date (week ending)', *independ_vars]]

In [57]:
# Preview the selected columns (one row per 'Date (week ending)' value).
df

Unnamed: 0,Date (week ending),HOEP,Ontario ED,Normal Average Temperature (°C),Expected Nuclear Output,Expected Hydro Output,Expected Wind Output,Expected Self-Scheduling & Intermittent Output
0,2019-01-06 23:00:00,13.066012,2616.954613,-4.533333,10800,2756,1693.698047,78.546684
1,2019-01-13 23:00:00,20.495655,2883.187716,-7.480952,10815,2756,1693.698047,78.870922
2,2019-01-20 23:00:00,32.777738,2871.823750,-5.152381,10463,2756,1473.056923,78.870922
3,2019-01-27 23:00:00,33.042917,2877.760451,-5.433333,10463,2756,1473.056923,78.870922
4,2019-02-03 23:00:00,31.676548,2890.869172,-5.000000,9649,3010,1693.698047,74.438802
...,...,...,...,...,...,...,...,...
74,2020-06-07 23:00:00,11.187560,2515.027323,20.947619,9627,2611,741.360842,51.106821
75,2020-06-14 23:00:00,5.162381,2518.644544,20.595238,9627,2611,741.360842,51.106821
76,2020-06-21 23:00:00,13.651845,2586.715333,21.747619,10395,2611,741.360842,51.106821
77,2020-06-28 23:00:00,13.430060,2645.172573,22.552381,10395,2611,741.360842,51.106821


# Pad to hourly

In [29]:
# First week-ending timestamp. Positional .iloc[0] is used instead of the label
# lookup [0], which stops returning a scalar once the index holds a duplicate label 0.
df['Date (week ending)'].iloc[0]

Timestamp('2019-01-06 23:00:00')

We are going to shift the dataframe from week-ending to week-beginning timestamps:

In [58]:
# Week-ending timestamp of a dummy week placed seven days before the first row.
# .iloc[0] picks the first row by position, so the result does not depend on index labels.
extra_weekending = df['Date (week ending)'].iloc[0] - pd.Timedelta(days=7)
extra_weekending

Timestamp('2018-12-30 23:00:00')

In [75]:
# Build the dummy row to prepend: a copy of the first row, re-dated to extra_weekending.
extra_row = df.iloc[:1].assign(**{'Date (week ending)': extra_weekending})

In [76]:
# Check the dummy row: first-row values with the date replaced by extra_weekending.
extra_row

Unnamed: 0,Date (week ending),HOEP,Ontario ED,Normal Average Temperature (°C),Expected Nuclear Output,Expected Hydro Output,Expected Wind Output,Expected Self-Scheduling & Intermittent Output
0,2018-12-30 23:00:00,13.066012,2616.954613,-4.533333,10800,2756,1693.698047,78.546684


In [79]:
# Prepend the dummy row. The guard keeps this cell idempotent (re-running it
# does not stack further copies of the row), and ignore_index=True avoids the
# duplicate index label 0 that a plain concat of the two frames would leave behind.
if not df['Date (week ending)'].eq(extra_weekending).any():
    df = pd.concat([extra_row, df], ignore_index=True)

In [101]:
# Expand the weekly rows to an hourly frame.
# Resampler.pad() was removed in pandas 2.0 and the 'H' alias is deprecated in
# recent releases, so the forward fill uses ffill() and the rule is given as a
# Timedelta; both are equivalent to the original pad()/'H' spelling.
df_hourly = (
    df.set_index('Date (week ending)')
    .shift(-1)                             # move each week's values onto the previous week-ending stamp
    .resample(rule=pd.Timedelta(hours=1))  # hourly grid between the first and last stamps
    .ffill()                               # carry each stamp's values forward over the following hours
    .shift()                               # one row (= one hour) later, so a new week starts the hour after a week-ending stamp
    .dropna()                              # drop rows left without values by the shifts
)

In [103]:
# Inspect the hours around the first week boundary to verify the padding.
df_hourly[160:200]

Unnamed: 0_level_0,HOEP,Ontario ED,Normal Average Temperature (°C),Expected Nuclear Output,Expected Hydro Output,Expected Wind Output,Expected Self-Scheduling & Intermittent Output
Date (week ending),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-06 16:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 17:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 18:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 19:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 20:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 21:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 22:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-06 23:00:00,13.066012,2616.954613,-4.533333,10800.0,2756.0,1693.698047,78.546684
2019-01-07 00:00:00,20.495655,2883.187716,-7.480952,10815.0,2756.0,1693.698047,78.870922
2019-01-07 01:00:00,20.495655,2883.187716,-7.480952,10815.0,2756.0,1693.698047,78.870922


Nice!

In [104]:
# Persist the hourly outlook as CSV in the processed-data directory.
path_hourly_csv = os.path.join(dir_data_processed, '12012018_outlook_hourly.csv')
df_hourly.to_csv(path_hourly_csv)