In [71]:
%run ts_processing.py
%run ts_modelling.py

In [72]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns 

import inspect
import warnings
warnings.filterwarnings("ignore")

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
plt.style.use('seaborn-colorblind')
sns.set_theme()
%matplotlib inline
plt.rcParams["figure.figsize"] = (20, 10)
plt.rcParams['figure.dpi'] = 150

# Functions

In [84]:
print(inspect.getsource(data_processing)) # multivariate is the same without the datetime features

def data_processing(url_base, univ=True):
    """
    Takes NordPool's data in DataFrame format and corrects the misnaming of the hours,
    Assigns the correct index,
    Changes the price units to €/KWh (from €/MWh),
    Normalizes the prices,
    Adds datetime features to help explicitly infer the seasonality.
    """
    
    # Get data into a DataFrame
    currency = 'eur'
    filetype = ".xls"
    years = ["17", "18", "19", "20", "21", "22"] #Years we want to get historical data
    for y in years:
        if y == years[0]: prices = read_spot_prices(url_base+currency+str(y)+filetype)
        else: 
            df2 = read_spot_prices(url_base+currency+str(y)+filetype)
            prices = prices.append(df2)
    prices.dropna(subset = ["date"], how = "any", axis = 0, inplace=True)
    prices.drop_duplicates(subset=["date"], keep = "last", ignore_index = True, inplace=True)
    price_dat = prices.drop("date", axis = 1)
    dat = []
    # Solve the misnaming of Nordpool (24h to 00h f

In [73]:
regions = {'NO1':'os-',
           'NO3':'trh-',
           'NO4':'tro-',
           'SE1':'lul',
           'SE3':'sto',
           'SE4':'mal',
           'DK1':'ode',
           'DK2':'cph'}

# Univariate

In [82]:
def data_processing(url_base, univ=True):
    """
    Takes NordPool's data in DataFrame format and corrects the misnaming of the hours,
    Assigns the correct index,
    Changes the price units to €/KWh (from €/MWh),
    Normalizes the prices,
    Adds datetime features to help explicitly infer the seasonality.
    """
    
    # Get data into a DataFrame
    currency = 'eur'
    filetype = ".xls"
    years = ["17", "18", "19", "20", "21", "22"] #Years we want to get historical data
    for y in years:
        if y == years[0]: prices = read_spot_prices(url_base+currency+str(y)+filetype)
        else: 
            df2 = read_spot_prices(url_base+currency+str(y)+filetype)
            prices = prices.append(df2)
    prices.dropna(subset = ["date"], how = "any", axis = 0, inplace=True)
    prices.drop_duplicates(subset=["date"], keep = "last", ignore_index = True, inplace=True)
    price_dat = prices.drop("date", axis = 1)
    dat = []
    # Solve the misnaming of Nordpool (24h to 00h format)
    for ii, ro in price_dat.iterrows():
        dat.extend(ro.values)
    
    # Remove first 23 hours since day 1st jan 2017 is ordered incorrectly
    dat = dat[23:] 
    # Keep the values for the dates that have passed by, remove future prices
    dat = dat[:45505] 
    
    # Give DataFrame format with date range
    date_rng = pd.date_range(start='1/02/2017', end='3/13/2022', freq='H')
    df = pd.DataFrame(data = dat, columns = ["price"])
    
    # Add datetime features
    df["datetime"] = date_rng
    df = df[:-1]
    df.interpolate(inplace = True) # To check if ok use """df[1990:2000]"""
    
    if univ == True:
        df["weekday"] = df["datetime"].dt.weekday
        df["week"] = df["datetime"].dt.week
        df["day"] = df["datetime"].dt.day
    
    return df


In [83]:
### Save univariate series to each regions folder
for reg in regions.keys():
    url_base=f'NordPool/{reg}/{regions[reg]}'
    df = data_processing(url_base)
    df.to_csv(f'NordPool/{reg}/Spot_price_{reg}.csv')

# Multivariate

In [74]:
### Save multivariate data to NordPool's folder
reg_num = 0
for reg in regions.keys():
    reg_num += 1
    url_base=f'NordPool/{reg}/{regions[reg]}'
    if reg_num==1:
        df = multivariate_processing(url_base, univ=False)
    else:
        df2 = multivariate_processing(url_base, univ=False)
        df=df.merge(df2, on='datetime', how = 'outer', suffixes=(None,f'_{reg}'))
df.interpolate(inplace = True)
df.columns=['NO1', 'datetime', 'NO3', 'NO4', 'SE1', 'SE3', 'SE4', 'DK1', 'DK2']
df["weekday"] = df["datetime"].dt.weekday
df["week"] = df["datetime"].dt.week
df["day"] = df["datetime"].dt.day
df["hour"] = df["datetime"].dt.hour
df.to_csv("NordPool/Spot_Price_Nordics.csv")

In [75]:
df.sample(15)

Unnamed: 0,NO1,datetime,NO3,NO4,SE1,SE3,SE4,DK1,DK2,weekday,week,day,hour
34863,19.12,2020-12-24 15:00:00,13.99,13.99,14.38,19.12,19.12,19.12,19.12,3,52,24,15
27620,10.48,2020-02-26 20:00:00,18.7,18.7,18.7,42.17,42.17,42.17,42.17,2,9,26,20
28775,4.62,2020-04-14 23:00:00,4.15,4.34,4.15,4.62,4.62,4.62,4.62,1,16,14,23
16988,49.02,2018-12-10 20:00:00,49.02,49.02,49.02,49.02,49.02,49.39,49.39,0,50,10,20
530,28.93,2017-01-24 02:00:00,28.93,23.91,28.93,28.93,28.93,28.93,28.93,1,4,24,2
14119,49.78,2018-08-13 07:00:00,50.41,49.04,53.69,53.69,53.69,53.69,53.69,0,33,13,7
21657,25.91,2019-06-23 09:00:00,25.78,25.78,25.78,25.78,25.78,25.78,25.78,6,25,23,9
5124,27.24,2017-08-03 12:00:00,27.58,21.05,33.87,33.87,33.87,33.87,33.87,3,31,3,12
15983,41.07,2018-10-29 23:00:00,41.07,41.07,41.07,41.07,41.07,30.33,30.33,0,44,29,23
35024,24.37,2020-12-31 08:00:00,20.2,17.72,24.37,24.37,30.0,37.97,30.0,3,53,31,8
