## Imports

In [None]:
import pandas as pd
import numpy as np

### Format station data

Testing formatting and cutting of individual station data

In [None]:
# Load one station's csv, index it by timestamp, keep only the rows from 2015
# onwards and the columns of interest, then write the result to 'test.csv'.
df: pd.DataFrame = pd.read_csv(
    'data/omsz_data/Szabolcs-Szatmár-Bereg_Nyírtass.csv',
    skiprows=4,  # skip metadata of csv
    sep=';',  # separator
    skipinitialspace=True,  # skip the spaces that follow each separator
    na_values=['EOR', -999],  # End Of Record is irrelevant, -999 means missing value
    low_memory=False,  # avoids the warning about mixed types
)
df.columns = df.columns.str.strip()  # remove whitespace around the column names
df['Time'] = pd.to_datetime(df['Time'], format='%Y%m%d%H%M')  # convert to datetime
df = df.set_index('Time')  # datetime index; 'Time' is no longer a regular column
df = df.dropna(how='all', axis=1)  # remove columns with all NaN values
# print(df.dtypes)
# print(df.describe())

# Slice and drop in a single chain that returns a new frame. Calling
# drop(inplace=True) on the result of a slice mutates an object that may be
# tracked as a copy of the original and can raise SettingWithCopyWarning.
df = df.loc['2015-01-01 00:00:00':].drop(
    columns=[
        'StationNumber', 't', 'tn', 'tx', 'v', 'p', 'fs', 'fsd', 'fx', 'fxd',
        'fxdat', 'fd', 'et5', 'et10', 'et20', 'et50', 'et100', 'tsn', 'suv',
    ],
    errors='ignore',  # 'suv' column doesn't exist in this particular instance
)
# still deciding if i should keep the 'we' column
df.to_csv('test.csv', sep=';')

df

### Electricity load data

Checking electricity load data that will be used

In [None]:
# Read the electricity load csv and index it by a timezone-naive timestamp
# built from the 'Időpont' column.
df: pd.DataFrame = pd.read_csv('data/mavir_data/mavir_1.csv', sep=';')
df.columns = df.columns.str.strip()

df = (
    df
    # parse as UTC, then drop the timezone information from the values
    .assign(Time=pd.to_datetime(df['Időpont'], utc=True).dt.tz_localize(None))
    .set_index('Time')  # moves 'Time' out of the columns and into the index
    .drop(columns='Időpont')  # the raw timestamp column is no longer needed
)

df

### Checking meta data

I'm deciding which start date to use for the new dataset. It looks like a lot of new stations were set up in 2014, so somewhere between 2015 and 2017 seems like a good year to start.
- I have to keep in mind that the COVID pandemic started around 2020, so I want a good amount of data before that too

In [None]:
# Load the metadata csv and order it by 'StartDate' (ascending).
# NOTE: this must be an annotation (`df: pd.DataFrame = ...`), not a chained
# assignment (`df = pd.DataFrame = ...`); the latter rebinds `pd.DataFrame` to
# the loaded frame and breaks every later use of the class in the session.
df: pd.DataFrame = pd.read_csv('data/omsz_meta.csv', sep=';')
df = df.sort_values('StartDate')

df