In [2]:
# Importing packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pyarrow

We are still working with the month of January 2021

In [66]:
path_P = "../21-01/year_month=21-01/plugin=ipmi_pub/metric=total_power/a_0.parquet"
dataset_P = pd.read_parquet(path_P, engine='pyarrow')

# Casting of node values into integers
dataset_P['node'] = dataset_P['node'].astype(int)

# I want to extract from the whole dataset only the rows related to the node 0
dataset_P_node0 = dataset_P[dataset_P['node'] == 0]
del(dataset_P)
print(dataset_P_node0)

                          timestamp  value  node
211842    2021-01-20 23:36:20+00:00    440     0
211843    2021-01-20 23:36:40+00:00    420     0
211844    2021-01-20 23:37:00+00:00    440     0
211845    2021-01-20 23:37:20+00:00    440     0
211846    2021-01-20 23:37:40+00:00    440     0
...                             ...    ...   ...
112158837 2021-01-31 22:04:00+00:00    440     0
112158838 2021-01-31 22:04:20+00:00    440     0
112158839 2021-01-31 22:04:40+00:00    440     0
112158840 2021-01-31 22:05:00+00:00    440     0
112158841 2021-01-31 22:05:20+00:00    440     0

[114586 rows x 3 columns]


In [67]:
# As usual I need to order by datetime the samples I have
dataset_P_node0['timestamp'] = pd.to_datetime(dataset_P_node0['timestamp'])

dataset_P_node0 = dataset_P_node0.sort_values(by='timestamp')

print(dataset_P_node0)

                         timestamp  value  node
55240462 2021-01-01 00:00:00+00:00    380     0
55240463 2021-01-01 00:00:20+00:00    360     0
55240464 2021-01-01 00:00:40+00:00    360     0
55240465 2021-01-01 00:01:00+00:00    360     0
55240466 2021-01-01 00:01:20+00:00    360     0
...                            ...    ...   ...
69375705 2021-01-31 23:30:00+00:00    440     0
69375706 2021-01-31 23:30:20+00:00    420     0
69375707 2021-01-31 23:30:40+00:00    440     0
69375708 2021-01-31 23:31:00+00:00    420     0
69375709 2021-01-31 23:43:40+00:00    440     0

[114586 rows x 3 columns]


In [68]:
# Now we need to sum the values in order to obtain a cumulative single value for each hour
dataset_P_node0['date_hour'] = dataset_P_node0['timestamp'].dt.strftime('%Y-%m-%d %H')
hourly_sum_P = dataset_P_node0.groupby('date_hour')['value'].sum()

# And here we convert the pandas series into a dataframe with the corresponding datetimes
dataset_P_node0 = pd.DataFrame({'datetime': pd.to_datetime(hourly_sum_P.index, format='%Y-%m-%d %H'), 'power': hourly_sum_P.values})
del(hourly_sum_P)

print(dataset_P_node0)

               datetime  power
0   2021-01-01 00:00:00  65920
1   2021-01-01 01:00:00  66480
2   2021-01-01 02:00:00  66300
3   2021-01-01 03:00:00  66220
4   2021-01-01 04:00:00  66560
..                  ...    ...
686 2021-01-31 19:00:00  12160
687 2021-01-31 20:00:00   7820
688 2021-01-31 21:00:00  20500
689 2021-01-31 22:00:00   7420
690 2021-01-31 23:00:00   6020

[691 rows x 2 columns]


In [69]:
# Since some hours are missing in the dataframe we need to add them with a null power value
start_datetime = pd.Timestamp('2021-01-01 00:00:00')
end_datetime = pd.Timestamp('2021-01-31 23:00:00')
expected_datetimes = pd.date_range(start=start_datetime, end=end_datetime, freq='h')

missing_datetimes = expected_datetimes[~expected_datetimes.isin(dataset_P_node0['datetime'])]
missing_data = pd.DataFrame({'datetime': missing_datetimes, 'power': 0})

dataset_P_node0 = pd.concat([dataset_P_node0, missing_data]).sort_values(by='datetime').reset_index(drop=True)
print(dataset_P_node0)

               datetime  power
0   2021-01-01 00:00:00  65920
1   2021-01-01 01:00:00  66480
2   2021-01-01 02:00:00  66300
3   2021-01-01 03:00:00  66220
4   2021-01-01 04:00:00  66560
..                  ...    ...
739 2021-01-31 19:00:00  12160
740 2021-01-31 20:00:00   7820
741 2021-01-31 21:00:00  20500
742 2021-01-31 22:00:00   7420
743 2021-01-31 23:00:00   6020

[744 rows x 2 columns]


In [70]:
# Now that we have our power dataset formatted as needed, it's the time to extract also the carbon intensity dataset
path_CI = "../IT-NO_2021_hourly.csv"
dataset_CI = pd.read_csv(path_CI)
dataset_CI = dataset_CI.drop(columns=['Carbon Intensity gCO₂eq/kWh (LCA)', 'Country', 'Low Carbon Percentage' ,'Renewable Percentage' ,'Zone Name', 'Zone Id', 'Data Source', 'Data Estimated', 'Data Estimation Method'])

dataset_CI['Carbon Intensity gCO₂eq/kWh (direct)'] = dataset_CI['Carbon Intensity gCO₂eq/kWh (direct)'].fillna(dataset_CI.describe(include='all').loc['mean'].loc['Carbon Intensity gCO₂eq/kWh (direct)'])

dataset_CI['Datetime (UTC)'] = pd.to_datetime(dataset_CI['Datetime (UTC)'])

end_of_january = pd.to_datetime('2021-02-01')
dataset_CI = dataset_CI[dataset_CI['Datetime (UTC)'] < end_of_january]

dataset_CI = dataset_CI.rename(columns={'Datetime (UTC)': 'datetime'})

print(dataset_CI)

               datetime  Carbon Intensity gCO₂eq/kWh (direct)
0   2021-01-01 00:00:00                                303.29
1   2021-01-01 01:00:00                                303.45
2   2021-01-01 02:00:00                                295.04
3   2021-01-01 03:00:00                                295.56
4   2021-01-01 04:00:00                                308.06
..                  ...                                   ...
739 2021-01-31 19:00:00                                244.04
740 2021-01-31 20:00:00                                253.52
741 2021-01-31 21:00:00                                263.28
742 2021-01-31 22:00:00                                262.85
743 2021-01-31 23:00:00                                266.70

[744 rows x 2 columns]


## Moving Average computation

In [71]:
# We'll use window sizes of 6, 12 and 24 hours, 1 week and also 1 month and compare the results
window_size = 6 # hours
dataset_P_MA = dataset_P_node0.rolling(window=window_size).mean()

dataset_CI_MA = dataset_CI.rolling(window=window_size).mean()

print(dataset_P_MA)
print(dataset_CI_MA)

DataError: Cannot aggregate non-numeric type: datetime64[ns]