## Imports

In [1]:
import pandas as pd
import numpy as np

### Format station data

Testing formatting and cutting of individual station data

In [None]:
# Load a single station file and turn it into a frame indexed by timestamp.
df: pd.DataFrame = (
    pd.read_csv(
        'data/omsz_data/Győr-Moson-Sopron_Mosonmagyaróvár.csv',
        sep=';',
        skiprows=4,  # the first four lines of the csv are metadata
        na_values=['EOR', -999],  # 'EOR' (End Of Record) carries no data, -999 marks a missing value
        skipinitialspace=True,  # ignore the spaces that follow each delimiter
        low_memory=False,  # avoids the warning about mixed types
    )
    .rename(columns=str.strip)  # header names are padded with whitespace
    .assign(Time=lambda frame: pd.to_datetime(frame['Time'], format='%Y%m%d%H%M'))
    .set_index('Time')  # the timestamp becomes the index and leaves the columns
    .dropna(how='all', axis=1)  # discard columns that contain no values at all
)

df

### Electricity load data

Checking electricity load data that will be used

In [2]:
# Load the electricity load export and index it by timezone-naive timestamps.
df: pd.DataFrame = (
    pd.read_csv('data/mavir_data/mavir_1.csv', sep=';')
    .rename(columns=str.strip)  # header names may be padded with whitespace
    .assign(
        # parse as UTC, then drop the timezone information from the result
        Time=lambda frame: pd.to_datetime(frame['Időpont'], utc=True).dt.tz_localize(None)
    )
    .set_index('Time')
    .drop(columns='Időpont')  # the raw timestamp column is no longer needed
)

df

Unnamed: 0_level_0,Nettó terv rendszerterhelés,Nettó terhelés,MAVIR becslés,Nettó terv rendszertermelés,Nettó rendszerterhelés tény - üzemirányítási,Bruttó hitelesített rendszerterhelés tény,Bruttó terv rendszerterhelés,Nettó tény rendszerterhelés - net.ker.elsz.meres,Bruttó tény rendszerterhelés,Nettó MAVIR rendszerterhelés becslés
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-12-31 23:00:00,4217.698,4441.649,4445.00,4205.410,4441.649,4727.00,4610.685,4334.268,4728.877,
2015-01-01 00:00:00,4038.799,4270.718,4283.00,4033.771,4270.718,4562.75,4438.101,4167.426,4561.364,3871.412
2015-01-01 01:00:00,3833.700,4068.940,4112.50,3828.110,4068.940,4363.75,4226.770,3966.348,4364.469,3734.557
2015-01-01 02:00:00,3523.887,3754.788,3847.25,3509.965,3754.788,4054.00,3914.933,3653.326,4054.418,3436.674
2015-01-01 03:00:00,3334.091,3510.361,3704.00,3327.267,3510.361,3809.75,3743.815,3406.167,3810.251,3264.344
...,...,...,...,...,...,...,...,...,...,...
2019-05-10 13:00:00,5137.336,5171.769,5523.00,5101.825,5171.769,5400.25,5383.933,5138.393,5404.372,5092.751
2019-05-10 14:00:00,5142.928,5167.305,5570.50,5114.371,5167.305,5396.00,5407.511,5114.985,5401.816,5143.517
2019-05-10 15:00:00,5134.088,5190.775,5549.00,5099.356,5190.775,5426.00,5413.481,5120.101,5426.095,5125.815
2019-05-10 16:00:00,5066.371,5152.173,5454.00,5026.913,5152.173,5386.00,5363.663,5041.042,5387.883,5025.547


### Checking meta data

I'm deciding which start date to use for the new dataset. It looks like a lot of new stations were set up in 2014, so somewhere between 2015 and 2017 seems like a good year to start.
- I have to keep in mind that the COVID pandemic started around 2020, so I want a good amount of data from before that too.

In [None]:
# Load the station metadata table and order it by the 'StartDate' column.
# `df` is annotated with `pd.DataFrame` rather than chain-assigned
# (`df = pd.DataFrame = ...`): the chained form rebinds `pd.DataFrame` itself
# to the loaded frame, which breaks every later use of the class in this kernel.
df: pd.DataFrame = (
    pd.read_csv('data/omsz_meta.csv', sep=';')
    .sort_values('StartDate')
)

df