## Imports

In [None]:
import pandas as pd
import numpy as np

### Format station data

Testing formatting and cutting of individual station data

In [None]:
# Load one station's CSV export and index it by measurement time.
df: pd.DataFrame = pd.read_csv(
    'data/omsz_data/Győr-Moson-Sopron_Mosonmagyaróvár.csv',
    skiprows=4,                 # the first 4 lines are skipped (metadata header of the csv)
    sep=';',
    skipinitialspace=True,      # ignore the spaces that follow each ';' delimiter
    na_values=['EOR', -999],    # 'EOR' (End Of Record) is irrelevant, -999 marks a missing value
    low_memory=False,           # parse the file in one pass to avoid the mixed-types warning
)
df.columns = df.columns.str.strip()  # header names may carry surrounding whitespace
df['Time'] = pd.to_datetime(df['Time'], format='%Y%m%d%H%M')  # e.g. 201501010100
# Move 'Time' from a column into the index, then discard columns that hold no data at all.
df = df.set_index('Time').dropna(how='all', axis=1)

df

### Electricity load data

Checking electricity load data that will be used

In [None]:
def format_mavir(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Normalise a raw MAVIR export so that it is indexed by naive timestamps.

    The frame is modified in place and the same object is returned.

    Steps:
      1. Strip surrounding whitespace from the column names.
      2. Parse the 'Időpont' column as UTC, shift it by a fixed +1 hour and
         drop the timezone information. The offset is constant, so it does
         not follow daylight saving time.
      3. Use the result as the index (named 'Time') and remove 'Időpont'.
      4. Drop every row that contains at least one missing value.

    Args:
        dataframe: Raw frame that contains an 'Időpont' column (possibly with
            whitespace around the header name).

    Returns:
        The same ``dataframe`` object, reformatted.

    Raises:
        KeyError: If there is no 'Időpont' column after stripping the headers.
    """
    dataframe.columns = dataframe.columns.str.strip()
    utc_time = pd.to_datetime(dataframe['Időpont'], utc=True)
    dataframe['Time'] = (utc_time + pd.Timedelta(hours=1)).dt.tz_localize(None)
    # set_index moves the column into the index, so 'Time' needs no separate drop.
    dataframe.set_index('Time', inplace=True)
    dataframe.drop(columns='Időpont', inplace=True)
    dataframe.dropna(axis=0, inplace=True)
    return dataframe

# The load data comes as two separate exports; format each one as it is read,
# then stack them into a single frame.
df1: pd.DataFrame = format_mavir(pd.read_csv('data/mavir_data/mavir_1.csv', sep=';'))
df2: pd.DataFrame = format_mavir(pd.read_csv('data/mavir_data/mavir_2.csv', sep=';'))

df = pd.concat([df1, df2])
df

In [11]:
# Columns of the combined MAVIR file that are not needed for this check.
unused_columns = [
    'Nettó terv rendszerterhelés',
    'Bruttó hitelesített rendszerterhelés tény',
    'Nettó tény rendszerterhelés - net.ker.elsz.meres',
    'Bruttó terv rendszerterhelés',
    'Bruttó tény rendszerterhelés',
    'Nettó rendszerterhelés tény - üzemirányítási',
    'Nettó terv rendszertermelés',
    'Nettó MAVIR rendszerterhelés becslés',
]

# NOTE(review): no parse_dates is passed, so the 'Time' index is presumably
# left as strings rather than datetimes — confirm this is what later cells expect.
df = pd.read_csv('data/mavir_data/mavir.csv', sep=';', index_col='Time')
df = df.drop(columns=unused_columns)
df

Unnamed: 0_level_0,Nettó terhelés,MAVIR becslés
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 01:00:00,4270.718,4283.00
2015-01-01 02:00:00,4068.940,4112.50
2015-01-01 03:00:00,3754.788,3847.25
2015-01-01 04:00:00,3510.361,3704.00
2015-01-01 05:00:00,3426.489,3646.75
...,...,...
2023-09-16 19:00:00,4917.640,5153.50
2023-09-16 20:00:00,4836.893,4950.00
2023-09-16 21:00:00,4556.295,4658.25
2023-09-16 22:00:00,4308.079,4377.75


### Checking meta data

I'm deciding which start date to use for the new dataset. It looks like a lot of new stations were set up in 2014, so somewhere between 2015 and 2017 seems like a good year to start
- I have to keep in mind, the COVID pandemic started around 2020, so I want a good amount of data before that too

In [None]:
# Load the station metadata and order it by each station's start date.
# `df: pd.DataFrame = ...` is a type annotation. The chained assignment
# `df = pd.DataFrame = ...` would instead overwrite the `pd.DataFrame` class
# with the loaded frame, so later `pd.DataFrame(...)` calls in the same kernel
# would fail.
df: pd.DataFrame = pd.read_csv('data/omsz_meta.csv',
                               sep=';',
                               )
df.sort_values('StartDate', inplace=True)

df