## Data manipulation

The following code formats and manipulates the power dataset we need to work with, then saves the result as a CSV file.

In [1]:
# Importing packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pyarrow

In [2]:
# Parquet file holding the raw total-power readings
path_P = "../21-01/year_month=21-01/plugin=ipmi_pub/metric=total_power/a_0.parquet"
dataset_P = pd.read_parquet(path=path_P, engine='pyarrow')

# Node identifiers become integers ...
dataset_P['node'] = dataset_P['node'].astype(int)
# ... and the timestamp column becomes a proper datetime column
dataset_P['timestamp'] = pd.to_datetime(dataset_P['timestamp'])

print(dataset_P)

                          timestamp  value  node
0         2021-01-21 06:42:40+00:00    700   128
1         2021-01-21 06:43:00+00:00    700   128
2         2021-01-21 06:43:20+00:00    580   128
3         2021-01-21 06:43:40+00:00    700   128
4         2021-01-21 06:44:00+00:00    680   128
...                             ...    ...   ...
112525874 2021-01-22 16:19:00+00:00    540   968
112525875 2021-01-22 16:19:20+00:00    540   968
112525876 2021-01-22 16:19:40+00:00    560   968
112525877 2021-01-22 16:20:00+00:00    540   968
112525878 2021-01-22 16:20:20+00:00    540   968

[112525879 rows x 3 columns]


In [3]:
# Pivot the long table into a wide one: one row per timestamp, one column
# per node. `pivot` already returns a frame indexed by 'timestamp', and that
# column was cast to datetime when the data was loaded, so there is no need
# to reset the index, re-cast it and set it again (each of those steps
# rebuilds a very large frame for no change in the result).
dataset_pivoted = dataset_P.pivot(index='timestamp', columns='node')

# Release the long-format frame; only the pivoted one is used from here on.
# NOTE: because of this `del`, the cell cannot be re-run without first
# re-running the loading cell.
del dataset_P

In [4]:
# Keep only the rows whose timestamp falls inside the chosen window
# (label-based slice on the index, both bounds inclusive); earlier months
# are discarded.
window_start = '2020-05-01 00:00:00+00:00'
window_end = '2022-09-28 22:00:00+00:00'
dataset_pivoted = dataset_pivoted.loc[window_start:window_end]

In [5]:
# Group the rows into one-hour bins on the timestamp index, then replace
# each bin with the mean of the readings it contains.
hourly_bins = dataset_pivoted.resample('1h')
dataset_pivoted = hourly_bins.mean()

dataset_pivoted

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
node,0,1,2,3,4,5,6,7,8,9,...,970,971,972,973,974,975,976,977,978,979
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-01 00:00:00+00:00,366.222222,403.555556,416.555556,365.444444,439.444444,395.111111,445.666667,421.222222,420.111111,420.444444,...,650.111111,458.222222,514.333333,479.888889,459.222222,460.555556,601.444444,757.541899,638.666667,480.000000
2021-01-01 01:00:00+00:00,369.333333,405.555556,417.000000,362.777778,437.666667,392.555556,444.222222,421.888889,420.888889,420.222222,...,635.888889,458.666667,514.666667,482.000000,459.000000,461.444444,970.333333,755.111111,646.888889,478.222222
2021-01-01 02:00:00+00:00,368.333333,403.666667,417.222222,366.444444,438.000000,394.444444,444.666667,423.444444,421.555556,420.444444,...,633.222222,460.333333,512.333333,479.000000,462.444444,461.555556,955.555556,756.222222,663.222222,478.000000
2021-01-01 03:00:00+00:00,367.888889,402.333333,416.555556,364.111111,437.777778,395.666667,444.111111,422.777778,421.111111,420.888889,...,642.888889,458.777778,514.333333,478.777778,457.777778,460.111111,944.111111,755.777778,648.333333,477.444444
2021-01-01 04:00:00+00:00,369.777778,404.111111,415.777778,363.111111,438.000000,395.888889,444.666667,423.555556,422.222222,420.444444,...,773.000000,460.666667,507.555556,481.333333,461.222222,458.888889,968.888889,752.777778,464.000000,476.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-31 19:00:00+00:00,434.285714,400.714286,413.571429,362.142857,440.000000,398.571429,445.714286,427.857143,422.962963,420.000000,...,539.285714,496.428571,536.428571,459.259259,507.857143,520.000000,508.148148,541.428571,519.047619,524.285714
2021-01-31 20:00:00+00:00,434.444444,400.000000,412.727273,365.000000,440.000000,396.842105,447.368421,422.857143,421.000000,420.000000,...,523.000000,501.666667,535.789474,466.666667,531.578947,514.545455,515.789474,548.333333,514.736842,515.789474
2021-01-31 21:00:00+00:00,436.170213,400.000000,414.000000,361.666667,440.425532,394.893617,449.777778,423.333333,422.083333,420.425532,...,525.161290,506.818182,538.723404,471.363636,523.829787,519.148936,520.425532,554.090909,505.106383,516.923077
2021-01-31 22:00:00+00:00,436.470588,400.000000,415.294118,362.352941,440.000000,396.470588,448.235294,425.882353,423.529412,422.352941,...,524.705882,534.117647,540.000000,462.352941,507.058824,511.764706,510.588235,517.647059,498.823529,522.352941


In [6]:
# Replace NaN values by linear interpolation: first down each column
# (axis=0, i.e. along the timestamp index), then across each row (axis=1,
# i.e. between node columns) for any NaN still remaining.
dataset_pivoted = (
    dataset_pivoted
    .interpolate(method='linear', axis=0)
    .interpolate(method='linear', axis=1)
)

dataset_pivoted

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
node,0,1,2,3,4,5,6,7,8,9,...,970,971,972,973,974,975,976,977,978,979
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-01 00:00:00+00:00,366.222222,403.555556,416.555556,365.444444,439.444444,395.111111,445.666667,421.222222,420.111111,420.444444,...,650.111111,458.222222,514.333333,479.888889,459.222222,460.555556,601.444444,757.541899,638.666667,480.000000
2021-01-01 01:00:00+00:00,369.333333,405.555556,417.000000,362.777778,437.666667,392.555556,444.222222,421.888889,420.888889,420.222222,...,635.888889,458.666667,514.666667,482.000000,459.000000,461.444444,970.333333,755.111111,646.888889,478.222222
2021-01-01 02:00:00+00:00,368.333333,403.666667,417.222222,366.444444,438.000000,394.444444,444.666667,423.444444,421.555556,420.444444,...,633.222222,460.333333,512.333333,479.000000,462.444444,461.555556,955.555556,756.222222,663.222222,478.000000
2021-01-01 03:00:00+00:00,367.888889,402.333333,416.555556,364.111111,437.777778,395.666667,444.111111,422.777778,421.111111,420.888889,...,642.888889,458.777778,514.333333,478.777778,457.777778,460.111111,944.111111,755.777778,648.333333,477.444444
2021-01-01 04:00:00+00:00,369.777778,404.111111,415.777778,363.111111,438.000000,395.888889,444.666667,423.555556,422.222222,420.444444,...,773.000000,460.666667,507.555556,481.333333,461.222222,458.888889,968.888889,752.777778,464.000000,476.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-31 19:00:00+00:00,434.285714,400.714286,413.571429,362.142857,440.000000,398.571429,445.714286,427.857143,422.962963,420.000000,...,539.285714,496.428571,536.428571,459.259259,507.857143,520.000000,508.148148,541.428571,519.047619,524.285714
2021-01-31 20:00:00+00:00,434.444444,400.000000,412.727273,365.000000,440.000000,396.842105,447.368421,422.857143,421.000000,420.000000,...,523.000000,501.666667,535.789474,466.666667,531.578947,514.545455,515.789474,548.333333,514.736842,515.789474
2021-01-31 21:00:00+00:00,436.170213,400.000000,414.000000,361.666667,440.425532,394.893617,449.777778,423.333333,422.083333,420.425532,...,525.161290,506.818182,538.723404,471.363636,523.829787,519.148936,520.425532,554.090909,505.106383,516.923077
2021-01-31 22:00:00+00:00,436.470588,400.000000,415.294118,362.352941,440.000000,396.470588,448.235294,425.882353,423.529412,422.352941,...,524.705882,534.117647,540.000000,462.352941,507.058824,511.764706,510.588235,517.647059,498.823529,522.352941


In [7]:
# Write the processed frame to disk as CSV so later steps can reload it;
# the index column is labelled 'timestamp' in the file.
output_file = '../PowerDataset_pivoted.csv'
dataset_pivoted.to_csv(path_or_buf=output_file, index_label='timestamp')