## Imports

In [1]:
import pandas as pd
import numpy as np

### Format station data

Testing formatting and cutting of individual station data

In [None]:
# Load a single station export and index it by observation time.
df: pd.DataFrame = pd.read_csv(
    'data/omsz_data/Győr-Moson-Sopron_Mosonmagyaróvár.csv',
    skiprows=4,               # the first four lines are metadata, not table rows
    sep=';',
    skipinitialspace=True,    # ignore the spaces that follow each ';' delimiter
    na_values=['EOR', -999],  # 'EOR' only marks End Of Record, -999 encodes a missing value
    low_memory=False,         # parse the file in one pass to avoid the mixed-dtype warning
)

df = (
    df.rename(columns=str.strip)  # header names may carry padding whitespace
      .assign(Time=lambda frame: pd.to_datetime(frame['Time'], format='%Y%m%d%H%M'))
      .set_index('Time')          # the timestamp becomes the index and leaves the columns
      .dropna(axis='columns', how='all')  # discard columns that hold no values at all
)

df

### Electricity load data

Checking electricity load data that will be used

In [4]:
def format_mavir(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a time-indexed copy of a raw MAVIR load table.

    The caller's frame is left untouched, so the function can be called again
    on the same raw input without failing on an already-removed 'Időpont'
    column.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw table whose (possibly whitespace-padded) column names include
        'Időpont', holding timestamps that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.DataFrame
        A new frame indexed by a timezone-naive index named 'Time', without
        the 'Időpont' column and without any row containing a missing value.

    Raises
    ------
    KeyError
        If there is no 'Időpont' column after stripping the column names.
    """
    formatted = dataframe.copy()
    formatted.columns = formatted.columns.str.strip()

    # Parse as UTC, shift by a fixed hour, then drop the timezone information.
    # NOTE(review): the +1h offset is constant and does not follow daylight
    # saving time - confirm this matches the time base of the other datasets.
    shifted = pd.to_datetime(formatted['Időpont'], utc=True) + pd.Timedelta(hours=1)
    formatted['Time'] = shifted.dt.tz_localize(None)

    return (
        formatted
        .set_index('Time')        # 'Time' becomes the index and leaves the columns
        .drop(columns='Időpont')  # the raw timestamp text is no longer needed
        .dropna(axis=0)           # drop every row that has at least one NaN
    )

# Read both MAVIR exports and normalise each one with format_mavir.
df1, df2 = (
    format_mavir(pd.read_csv(csv_path, sep=';'))
    for csv_path in ('data/mavir_data/mavir_1.csv', 'data/mavir_data/mavir_2.csv')
)

# Stack the two parts row-wise, first file first.
df = pd.concat([df1, df2], axis=0)
df

Unnamed: 0_level_0,Nettó terv rendszerterhelés,Nettó terhelés,MAVIR becslés,Nettó terv rendszertermelés,Nettó rendszerterhelés tény - üzemirányítási,Bruttó hitelesített rendszerterhelés tény,Bruttó terv rendszerterhelés,Nettó tény rendszerterhelés - net.ker.elsz.meres,Bruttó tény rendszerterhelés,Nettó MAVIR rendszerterhelés becslés
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-01 01:00:00,4038.799,4270.718,4283.00,4033.771,4270.718,4562.75,4438.101,4167.426,4561.364,3871.412
2015-01-01 02:00:00,3833.700,4068.940,4112.50,3828.110,4068.940,4363.75,4226.770,3966.348,4364.469,3734.557
2015-01-01 03:00:00,3523.887,3754.788,3847.25,3509.965,3754.788,4054.00,3914.933,3653.326,4054.418,3436.674
2015-01-01 04:00:00,3334.091,3510.361,3704.00,3327.267,3510.361,3809.75,3743.815,3406.167,3810.251,3264.344
2015-01-01 05:00:00,3244.132,3426.489,3646.75,3238.058,3426.489,3728.50,3653.310,3323.994,3726.526,3195.405
...,...,...,...,...,...,...,...,...,...,...
2023-09-16 19:00:00,4897.714,4917.640,5153.50,4872.768,4917.639,5116.00,5158.700,4770.906,5094.324,4865.750
2023-09-16 20:00:00,4773.311,4836.893,4950.00,4722.568,4836.893,5037.75,5041.603,4712.392,5012.292,4659.000
2023-09-16 21:00:00,4525.007,4556.295,4658.25,4492.654,4556.295,4752.75,4779.990,4430.148,4731.554,4368.250
2023-09-16 22:00:00,4256.993,4308.079,4377.75,4251.202,4308.079,4505.50,4498.640,4173.954,4482.914,4099.250


### Checking meta data

I'm deciding which start date to use for the new dataset. It looks like a lot of new stations were set up in 2014, so somewhere between 2015 and 2017 seems like a good year to start.
- I have to keep in mind that the COVID pandemic started around 2020, so I want a good amount of data before that too.

In [None]:
# Load the station metadata table and order it by the 'StartDate' column.
# This must be an annotated assignment (`df: pd.DataFrame = ...`): the chained form
# `df = pd.DataFrame = ...` also rebinds `pd.DataFrame` to the loaded frame, which
# breaks every later `pd.DataFrame(...)` call or annotation in the same kernel.
df: pd.DataFrame = pd.read_csv(
    'data/omsz_meta.csv',
    sep=';',
).sort_values('StartDate')

df