In [2]:
import pytz
import plotly.express as px
import seaborn as sns
from datetime import datetime
from heatmap import corrplot
from datetime import timezone
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import holidays
%matplotlib inline

# Data loading

In [3]:
ts15 = pd.read_csv('../../RDN/Load Data (2018-2019)/INRG-load-series.csv', delimiter=',', header=0, index_col=0, parse_dates=True)
ts15

Unnamed: 0_level_0,Load
Date,Unnamed: 1_level_1
2018-01-01 00:00:00,4689.365428
2018-01-01 00:15:00,4653.100916
2018-01-01 00:30:00,4617.549863
2018-01-01 00:45:00,4595.858857
2018-01-01 01:00:00,4574.072846
...,...
2019-12-31 22:45:00,5418.783371
2019-12-31 23:00:00,5355.531827
2019-12-31 23:15:00,5285.912722
2019-12-31 23:30:00,5207.156101


# Check for invalid data
No invalid data is detected

In [4]:
print(f"NaNs count: {ts15.isnull().sum().item()}")
print(f"Negative values count: {len(ts15)-ts15[ts15 < 0].isnull().sum().item()}")
print(f"Duplicate indices count: {len(ts15[ts15.index.duplicated()])}")

NaNs count: 0
Negative values count: 0
Duplicate indices count: 0


# Resampling
We create a new load only dataset and change its timestep 60min (hourly).

In [5]:
ts30 = ts15.resample('30T').sum()
ts60 = ts15.resample('60T').sum()
ts60.head()


Unnamed: 0_level_0,Load
Date,Unnamed: 1_level_1
2018-01-01 00:00:00,18555.875064
2018-01-01 01:00:00,18001.483151
2018-01-01 02:00:00,17017.792074
2018-01-01 03:00:00,16187.432017
2018-01-01 04:00:00,15621.660524


We also store the result.

In [6]:
ts60.to_csv('../../RDN/Load Data (2018-2019)/artifacts/load_60min.csv')
ts15.to_csv('../../RDN/Load Data (2018-2019)/artifacts/load_15min.csv')
# ts30.to_csv('../../RDN/Load Data (2018-2019)/artifacts/load_30min.csv')


# Calendar
In this section a calendar is created containing auxiliary info for all unique dates that appear in the core dataset

In [40]:
local_tz = pytz.timezone('Europe/Lisbon')


def isholiday(x, holiday_list):
    if x in holiday_list:
        return True
    return False


def isweekend(x):
    if x == 6 or x == 0:
        return True
    return False


def create_calendar(timestep_minutes, holiday_list):

    unique_dates = {'15': ts15.index.tolist(),
                    '30': ts30.index.tolist(),
                    '60': ts60.index.tolist()}

    calendar = pd.DataFrame(
        unique_dates[str(timestep_minutes)],
        columns=['datetime']
    )

    calendar['year'] = calendar['datetime'].apply(lambda x: x.year)
    calendar['month'] = calendar['datetime'].apply(lambda x: x.month)
    calendar['day'] = calendar['datetime'].apply(lambda x: x.day)
    calendar['hour'] = calendar['datetime'].apply(lambda x: x.hour)
    calendar['minute'] = calendar['datetime'].apply(lambda x: x.minute)
    calendar['second'] = calendar['datetime'].apply(lambda x: x.second)
    calendar['weekday'] = calendar['datetime'].apply(lambda x: x.weekday())
    calendar['weekend'] = calendar['weekday'].apply(lambda x: isweekend(x))
    # first convert to utc and then to timestamp
    calendar['timestamp'] = calendar['datetime'].apply(lambda x: local_tz.localize(
        x).replace(tzinfo=timezone.utc).timestamp()).astype(int)

    # national_holidays = Province(name="valladolid").national_holidays()
    # regional_holidays = Province(name="valladolid").regional_holidays()
    # local_holidays = Province(name="valladolid").local_holidays()
    # holiday_list = national_holidays + regional_holidays + local_holidays
    
    calendar['holiday'] = calendar['datetime'].apply(lambda x: isholiday(x.date(), holiday_list))
    return calendar

An extended holiday set is chosen for Portugal that includes days most people have off.

In [41]:
pt_holidays = holidays.PortugalExt()

In [42]:
holiday_list = [i for i in ts15.index if i.date() in pt_holidays]
calendar15 = create_calendar(15, holiday_list)
print("15 min Calendar dataset structure:")
calendar15.head()


  if x in holiday_list:


15 min Calendar dataset structure:


Unnamed: 0,datetime,year,month,day,hour,minute,second,weekday,weekend,timestamp,holiday
0,2018-01-01 00:00:00,2018,1,1,0,0,0,0,True,1514764800,True
1,2018-01-01 00:15:00,2018,1,1,0,15,0,0,True,1514765700,True
2,2018-01-01 00:30:00,2018,1,1,0,30,0,0,True,1514766600,True
3,2018-01-01 00:45:00,2018,1,1,0,45,0,0,True,1514767500,True
4,2018-01-01 01:00:00,2018,1,1,1,0,0,0,True,1514768400,True


In [39]:
holiday_list = [i for i in ts60.index if i in holidays.PortugalExt()]
calendar60 = create_calendar(60, holiday_list)
print("60 min Calendar dataset structure:")
calendar60.head()

  if x in holiday_list:


60 min Calendar dataset structure:


Unnamed: 0,datetime,year,month,day,hour,minute,second,weekday,weekend,timestamp,holiday
0,2018-01-01 00:00:00,2018,1,1,0,0,0,0,True,1514764800,True
1,2018-01-01 01:00:00,2018,1,1,1,0,0,0,True,1514768400,True
2,2018-01-01 02:00:00,2018,1,1,2,0,0,0,True,1514772000,True
3,2018-01-01 03:00:00,2018,1,1,3,0,0,0,True,1514775600,True
4,2018-01-01 04:00:00,2018,1,1,4,0,0,0,True,1514779200,True


## Store calendar datasets

In [43]:
ts15_plus_time = pd.merge(ts15, calendar15, how='left', left_index=True, right_on='datetime').set_index('datetime')
ts15_plus_time.to_csv("../../RDN/Load Data (2018-2019)/artifacts/timeseries_plus_time_15min.csv")

# ts30_plus_time = pd.merge(ts30, create_calendar(30), how='left', left_index=True, right_on='datetime').set_index('datetime')
# ts30_plus_time.to_csv("../VEOLIA/artifacts/timeseries_plus_time_30min.csv")

ts60_plus_time = pd.merge(ts60, calendar60, how='left', left_index=True, right_on='datetime').set_index('datetime')
ts60_plus_time.to_csv("../../RDN/Load Data (2018-2019)/artifacts/timeseries_plus_time_60min.csv")

time15 = calendar15
time15.to_csv("../../RDN/Load Data (2018-2019)/artifacts/time_15min.csv")

# time30 = create_calendar(30)
# time30.to_csv("../VEOLIA/artifacts/time_30min.csv")

time60 = calendar60
time60.to_csv("../../RDN/Load Data (2018-2019)/artifacts/time_60min.csv")