# Exploratory Data Analysis

### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# https://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score

# One seed drives both the global NumPy RNG and the dedicated RandomState
# instance that is handed to estimators later in the notebook.
SEED = 1320210409
np.random.seed(SEED)
randomstate = np.random.RandomState(SEED)

# The data

## Features

All features are hourly and a country-wide average.
- **Time** _[YYYY-MM-DD HH:MM:SS]_
- **el_load:** electricity load _[MW]_
- **prec:** rainfall amount _[mm]_
- **temp:** temperature _[°C]_
- **rhum:** relative humidity _[%]_
- **grad:** global radiation _[J/cm²]_
- **pres:** momentary sea level air pressure _[hPa]_
- **wind:** average wind speed _[m/s]_
- **Vel_tviz:** Velence water temperature in Agárd _[°C]_
- **Bal_tviz:** Balaton water temperature in Siófok _[°C]_
- **holiday:** 1 or 0 depending on if it's a holiday
- **weekend:** 1 or 0 depending on if it's a weekend
- **covid:** 1 or 0 depending on covid restrictions in Hungary (estimate)

### The goal

I want to predict Hungary's electricity load for the **next couple of hours** using this dataset, or its differently aggregated counterpart (country, region, county or station).

In [None]:
# Read the prepared dataset; the 'Time' column is parsed as datetimes and
# becomes the index of the frame.
df = pd.read_csv('data/final_dataframe.csv', sep=';', index_col='Time', parse_dates=['Time'])

# Print column dtypes and non-null counts.
df.info()

df

No null entries, I have dealt with those in the _data_organization_ notebook.

In [None]:
# Derive calendar columns from the datetime index.
# Maps new column name -> attribute of the index it is read from.
calendar_parts = {
    'hour': 'hour',
    'weekday': 'weekday',
    'dayofmonth': 'day',
    'dayofyear': 'dayofyear',
    'month': 'month',
    'year': 'year',
}
for column, attribute in calendar_parts.items():
    df[column] = getattr(df.index, attribute)

df

## Features

- **Time** _[YYYY-MM-DD HH:MM:SS]_
- **el_load:** electricity load _[MW]_
- **prec:** rainfall amount _[mm]_
- **temp:** temperature _[°C]_
- **rhum:** relative humidity _[%]_
- **grad:** global radiation _[J/cm²]_
- **pres:** momentary sea level air pressure _[hPa]_
- **wind:** average wind speed _[m/s]_
- **Vel_tviz:** Velence water temperature in Agárd _[°C]_
- **Bal_tviz:** Balaton water temperature in Siófok _[°C]_
- **holiday:** 1 or 0 depending on if it's a holiday
- **weekend:** 1 or 0 depending on if it's a weekend
- **covid:** 1 or 0 depending on covid restrictions in Hungary (estimate)

In [None]:
group_by = ['hour', 'weekday', 'dayofmonth', 'dayofyear', 'month', 'year']

def plot_feature(dataframe: pd.DataFrame, groupes: list, feature: str, desc: str, color: str):
    """Plot the mean of one feature per group value, one subplot per grouping column.

    The subplots are laid out on a two-row grid. When the number of grouping
    columns is odd, the spare panel is hidden instead of a group being dropped.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing ``feature`` and every column named in ``groupes``.
    groupes : list
        Column names to group by; each one gets its own subplot.
    feature : str
        Column whose per-group mean is plotted.
    desc : str
        Human-readable description shown in the figure title.
    color : str
        Matplotlib color used for every line.

    Raises
    ------
    ValueError
        If ``groupes`` is empty.
    """
    group_len = len(groupes)
    if group_len == 0:
        raise ValueError("groupes must contain at least one column name")

    # Two rows; round the column count up so an odd number of groups still fits.
    n_cols = (group_len + 1) // 2
    fig, axes = plt.subplots(2, n_cols, figsize=(20, 7))
    fig.suptitle(f"Feature: {feature} ({desc})")

    panels = axes.flatten()
    for panel, group in zip(panels, groupes):
        grouped = dataframe.groupby(group)[feature].mean()
        panel.set_title(f"Grouped by {group}", fontsize=10)
        # No markers for dayofyear: it can have up to 366 points per line.
        marker = 'o' if group != 'dayofyear' else None
        panel.plot(grouped, color=color, marker=marker)

    # Hide the unused panel left over when the number of groups is odd.
    for spare in panels[group_len:]:
        spare.set_visible(False)

#### Electricity load

In [None]:
plot_feature(df, group_by, 'el_load', 'Electricity load', 'black')

- daily average rises during the day, it hits its peak at 18-19
- lower during the weekend
- we don't learn too much from the day of the month at this time
- during the year, load is higher in winter, probably since there's less sunlight
- we can see the effects of covid between 2020-2022

#### Precipitation

In [None]:
plot_feature(df, group_by, 'prec', 'Precipitation', 'blue')

- precipitation is higher during the summer as expected
- it's higher during the weekend, but that's probably up to chance
- it's higher during the afternoon and evening
- other groups tell us nothing

#### Temperature

In [None]:
plot_feature(df, group_by, 'temp', 'Temperature', 'red')

- temperature is higher during the summer as expected
- it's also higher during the day as expected
- seemingly, it's rising slowly as the years go on, with some outliers

#### Relative humidity

In [None]:
plot_feature(df, group_by, 'rhum', 'Relative humidity', 'green')

- humidity is lower during the day, hitting its low in the afternoon
- the weekday graph is deceiving, since the values are so close to each other
- it's lower during the summer overall

#### Global radiation

In [None]:
plot_feature(df, group_by, 'grad', 'Global radiation', 'orange')

- global radiation is higher during the summer and the day as expected
- it's slowly increasing as the years go on, 2023 being an outlier since we only have data for 8 months there

#### Momentary sea level air pressure

In [None]:
plot_feature(df, group_by, 'pres', 'Momentary sea level air pressure', 'purple')

- air pressure fluctuates heavily during the day, being higher in the morning, but low in the afternoon
- it's higher during the winter
- other groups tell us nothing

#### Average wind speed

In [None]:
plot_feature(df, group_by, 'wind', 'Average wind speed', 'brown')

- wind speed is higher during the day, hitting its peak in the afternoon
- it's higher during the winter and spring

#### Water temperature for Balaton and Velence

In [None]:
plot_feature(df, group_by, 'Vel_tviz', 'Velence water temperature in Agárd', 'cyan')

plot_feature(df, group_by, 'Bal_tviz', 'Balaton water temperature in Siófok', 'lightblue')

- the 2 water temperature graphs are really similar, so I'll write about them together
- water temperature is higher during the summer as expected
- they hit their peak in the afternoon for obvious reasons

## Correlation matrix

In [None]:
# Limit the features used: the holiday/weekend/covid flag columns are left
# out of the correlation matrix.
flag_columns = ['holiday', 'weekend', 'covid']
corr = df.drop(columns=flag_columns).corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap='coolwarm')

- temperature and water temperatures are highly correlated as expected
- the 2 water temperatures correlate highly, but I will keep these features separate for now
- dayofyear and month are highly correlated, but that's to be expected
- relative humidity and global radiation display inverse correlation, which is interesting

## Automatic feature selection

I'll be doing a 1-hour-ahead forecast using only the current hour's values to find the best features I can; using more past hours would make this combinatorial problem too big.

In [None]:
# One-step-ahead setup: row t (all columns, including the current el_load)
# is used to predict el_load at row t+1.
# NOTE(review): this assumes consecutive rows are exactly one step apart —
# confirm the index has no gaps.
X = df.to_numpy(dtype=np.float64)[:-1]
y = df['el_load'].to_numpy(dtype=np.float64)[1:]

# One row per subset size: the CV score plus a boolean flag per column
# telling whether that column is part of the selected subset.
scores_df = pd.DataFrame(columns=['score'] + list(df.columns))

# X never changes inside the loop, so build the folds once and reuse them.
cv_splits = list(TimeSeriesSplit(n_splits=4).split(X))

# Grow the subset one feature at a time: the features chosen for size i-1 are
# pinned via fixed_features, so each run only searches for the next feature.
fixed_features = None
for i in range(1, len(df.columns)+1):
    sfs = SFS(
        RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=randomstate),
        k_features=i,
        forward=True,
        scoring='neg_root_mean_squared_error',
        cv=cv_splits,
        fixed_features=fixed_features,
    )

    # Only the fitted attributes are needed, not the transformed array.
    sfs.fit(X, y)
    fixed_features = tuple(sfs.k_feature_idx_)
    feature_idxs = [col_idx in fixed_features for col_idx in range(len(df.columns))]
    # The scorer returns negative RMSE; flip the sign to store plain RMSE.
    scores_df.loc[i] = [-sfs.k_score_] + feature_idxs
    print(f"Finished {i} feature(s), score: {-sfs.k_score_}")

scores_df.to_csv('data/feature_selection.csv', sep=';')
scores_df