#### Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import warnings
import sys
import pandas as pd

sys.path.append("../")
from pipeline import data
from pipeline.config import CONF
from pipeline.data import plots
from pipeline.data import io
from pipeline.data import inspection
from pipeline.data import preprocess

# To suppress all warnings
warnings.filterwarnings("ignore")

# black is a code formatter (see https://github.com/psf/black).
# It will automatically format the code you write in the cells imposing consistent Python style.
%load_ext jupyter_black
# matplotlib style file
# Template for style file: https://matplotlib.org/stable/tutorials/introductory/customizing.html#customizing-with-style-sheets
plt.style.use("../matplotlib_style.txt")
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.expand_frame_repr", False)  # Prevent wrapping

## Raw data

### Load raw data

In [29]:
# Loader options: the project configuration plus the data stage to read.
raw_load_options = dict(CONF=CONF, data_type="raw")

# Unpack the raw tables in the order in which the loader returns them.
(
    Installed_Capacity_Germany,
    Prices_Europe,
    Realised_Supply_Germany,
    Realised_Demand_Germany,
    Weather_Data_Germany,
    Weather_Data_Germany_2022,
) = data.load_data(**raw_load_options)

Loaded Installed_Capacity_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/raw_data/Installed_Capacity_Germany.csv' successfully.
Loaded Prices_Europe from '/graphics/scratch2/students/nguyenlo/seminar-ml/raw_data/Prices_Europe.csv' successfully.
Loaded Realised_Supply_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/raw_data/Realised_Supply_Germany.csv' successfully.
Loaded Realised_Demand_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/raw_data/Reaslised_Demand_Germany.csv' successfully.
Loaded Weather_Data_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/raw_data/Weather_Data_Germany.csv' successfully.
Loaded Weather_Data_Germany_2022 from '/graphics/scratch2/students/nguyenlo/seminar-ml/raw_data/Weather_Data_Germany_2022.csv' successfully.


### Inspect raw data

In [30]:
# Generate profile reports for the raw tables, but only when inspection is
# switched on in the configuration.
inspection_enabled = CONF.data.inspect
if inspection_enabled:
    data.save_data_inspection(
        CONF=CONF,
        data_type="raw",
        Installed_Capacity_Germany=Installed_Capacity_Germany,
        Prices_Europe=Prices_Europe,
        Realised_Supply_Germany=Realised_Supply_Germany,
        Realised_Demand_Germany=Realised_Demand_Germany,
        Weather_Data_Germany=Weather_Data_Germany,
        Weather_Data_Germany_2022=Weather_Data_Germany_2022,
    )

### Raw data processing

#### Merge weather data

In [31]:
# Merge Weather_Data_Germany with Weather_Data_Germany_2022:
# the 2022 rows of the original table are replaced by the dedicated 2022 table.

# Flag the rows whose "time" falls in 2022; these are the rows being replaced.
is_2022_row = Weather_Data_Germany["time"].dt.year == 2022

# Keep every non-2022 row, append the 2022 table and renumber the index from 0.
Weather_Data_Germany = pd.concat(
    [Weather_Data_Germany.loc[~is_2022_row], Weather_Data_Germany_2022],
    ignore_index=True,
)

#### Fill NaN

In [32]:
# Run data.process_na_values (driven by CONF) over every table, in the same
# order as before, and rebind each name to its processed result.
(
    Installed_Capacity_Germany,
    Prices_Europe,
    Realised_Supply_Germany,
    Realised_Demand_Germany,
    Weather_Data_Germany,
) = [
    data.process_na_values(table, CONF)
    for table in (
        Installed_Capacity_Germany,
        Prices_Europe,
        Realised_Supply_Germany,
        Realised_Demand_Germany,
        Weather_Data_Germany,
    )
]

#### Split train, val, test

In [33]:
# Column handed to preprocess.split_data for the non-weather tables:
# the last entry of io.DATE_COLUMNS.
split_column = io.DATE_COLUMNS[-1]
# NOTE(review): the weather table uses the FIRST weather date column, whereas the
# other tables use the LAST date column — confirm this difference is intended.
weather_split_column = io.DATE_COLUMNS_WEATHER[0]

Installed_Capacity_Germany = preprocess.split_data(
    df=Installed_Capacity_Germany, column_name=split_column
)
Prices_Europe = preprocess.split_data(df=Prices_Europe, column_name=split_column)
Realised_Supply_Germany = preprocess.split_data(
    df=Realised_Supply_Germany, column_name=split_column
)
Realised_Demand_Germany = preprocess.split_data(
    df=Realised_Demand_Germany, column_name=split_column
)
Weather_Data_Germany = preprocess.split_data(
    df=Weather_Data_Germany, column_name=weather_split_column
)

#### Normalize data

In [34]:
# Columns excluded from normalisation for the non-weather tables.
date_columns = io.DATE_COLUMNS
# For the weather table the coordinate columns are excluded as well.
weather_ignored_columns = io.DATE_COLUMNS_WEATHER + ["longitude", "latitude"]

Installed_Capacity_Germany = preprocess.normalize_data(
    df=Installed_Capacity_Germany, ignore_features=date_columns
)
Prices_Europe = preprocess.normalize_data(
    df=Prices_Europe, ignore_features=date_columns
)
Realised_Supply_Germany = preprocess.normalize_data(
    df=Realised_Supply_Germany, ignore_features=date_columns
)
Realised_Demand_Germany = preprocess.normalize_data(
    df=Realised_Demand_Germany, ignore_features=date_columns
)
Weather_Data_Germany = preprocess.normalize_data(
    df=Weather_Data_Germany, ignore_features=weather_ignored_columns
)

#### Aggregate weather data

In [35]:
# Aggregate the weather table by its date columns.
# The key columns are taken from io.DATE_COLUMNS_WEATHER — the constant already used
# by the split, normalisation, plotting and inspection steps — instead of a
# hard-coded copy of the names, so a change in pipeline.data.io cannot leave this
# cell out of sync. list(...) passes a fresh list so the shared constant itself
# cannot be modified by the callee.
# NOTE(review): presumably this collapses the per-location rows
# (longitude/latitude) into one row per key — confirm in
# preprocess.aggregate_weather_data.
Weather_Data_Germany = preprocess.aggregate_weather_data(
    Weather_Data_Germany, list(io.DATE_COLUMNS_WEATHER)
)

#### Save data

In [37]:
# Persist the preprocessed tables. Weather_Data_Germany_2022 is not passed
# separately; only the five tables below are saved.
data.save_data(
    CONF=CONF,
    data_type="preprocessed",
    Installed_Capacity_Germany=Installed_Capacity_Germany,
    Prices_Europe=Prices_Europe,
    Realised_Supply_Germany=Realised_Supply_Germany,
    Realised_Demand_Germany=Realised_Demand_Germany,
    Weather_Data_Germany=Weather_Data_Germany,
)

Saved Installed_Capacity_Germany to '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Installed_Capacity_Germany.csv' successfully.
Saved Prices_Europe to '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Prices_Europe.csv' successfully.
Saved Realised_Supply_Germany to '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Realised_Supply_Germany.csv' successfully.
Saved Realised_Demand_Germany to '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Reaslised_Demand_Germany.csv' successfully.
Saved Weather_Data_Germany to '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Weather_Data_Germany.csv' successfully.


## Processed data

### Load processed data

In [38]:
# Loader options: the project configuration plus the data stage to read.
processed_load_options = dict(CONF=CONF, data_type="preprocessed")

# Re-read the preprocessed tables from disk, replacing the in-memory versions.
# Five tables are unpacked, in the order in which the loader returns them.
(
    Installed_Capacity_Germany,
    Prices_Europe,
    Realised_Supply_Germany,
    Realised_Demand_Germany,
    Weather_Data_Germany,
) = data.load_data(**processed_load_options)

Loaded Installed_Capacity_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Installed_Capacity_Germany.csv' successfully.
Loaded Prices_Europe from '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Prices_Europe.csv' successfully.
Loaded Realised_Supply_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Realised_Supply_Germany.csv' successfully.
Loaded Realised_Demand_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Reaslised_Demand_Germany.csv' successfully.
Loaded Weather_Data_Germany from '/graphics/scratch2/students/nguyenlo/seminar-ml/preprocessed_data/Weather_Data_Germany.csv' successfully.


### Processed data inspection

In [39]:
# Generate profile reports for the preprocessed tables, but only when
# inspection is switched on in the configuration.
inspection_enabled = CONF.data.inspect
if inspection_enabled:
    data.save_data_inspection(
        CONF=CONF,
        data_type="preprocessed",
        Installed_Capacity_Germany=Installed_Capacity_Germany,
        Prices_Europe=Prices_Europe,
        Realised_Supply_Germany=Realised_Supply_Germany,
        Realised_Demand_Germany=Realised_Demand_Germany,
        Weather_Data_Germany=Weather_Data_Germany,
    )

### Plot processed data

In [49]:
# Plot the tables that rely on the default date arguments of plots.plot_df.
for table_name, table in (
    ("Installed_Capacity_Germany", Installed_Capacity_Germany),
    ("Prices_Europe", Prices_Europe),
    ("Realised_Supply_Germany", Realised_Supply_Germany),
    ("Realised_Demand_Germany", Realised_Demand_Germany),
):
    plots.plot_df(table, table_name, CONF)

# The weather table has its own date columns: the last one is passed as
# date_col and the full list as drop_date_cols.
plots.plot_df(
    Weather_Data_Germany,
    "Weather_Data_Germany",
    CONF,
    date_col=io.DATE_COLUMNS_WEATHER[-1],
    drop_date_cols=io.DATE_COLUMNS_WEATHER,
)

### Exploring data


#### Date resolution

In [41]:
# Print the min, max and resolution of every column in io.DATE_COLUMNS for the
# installed-capacity table.
inspection.date_range_and_resolution(Installed_Capacity_Germany, io.DATE_COLUMNS)

Min Date from: 2019-01-01 00:00:00
Max Date from: 2022-01-01 00:00:00
Resolution Date from: 365 days 00:00:00
Min Date to: 2020-01-01 00:00:00
Max Date to: 2023-01-01 00:00:00
Resolution Date to: 365 days 00:00:00


In [42]:
# Print the min, max and resolution of every column in io.DATE_COLUMNS for the
# European prices table.
inspection.date_range_and_resolution(Prices_Europe, io.DATE_COLUMNS)

Min Date from: 2019-01-01 00:00:00
Max Date from: 2022-12-31 23:00:00
Resolution Date from: 0 days 01:00:00
Min Date to: 2019-01-01 01:00:00
Max Date to: 2023-01-01 00:00:00
Resolution Date to: 0 days 01:00:00


In [43]:
# Print the min, max and resolution of every column in io.DATE_COLUMNS for the
# realised-supply table.
inspection.date_range_and_resolution(Realised_Supply_Germany, io.DATE_COLUMNS)

Min Date from: 2019-01-01 00:00:00
Max Date from: 2022-12-31 23:45:00
Resolution Date from: 0 days 00:15:00
Min Date to: 2019-01-01 00:15:00
Max Date to: 2023-01-01 00:00:00
Resolution Date to: 0 days 00:15:00


In [44]:
# Print the min, max and resolution of every column in io.DATE_COLUMNS for the
# realised-demand table.
inspection.date_range_and_resolution(Realised_Demand_Germany, io.DATE_COLUMNS)

Min Date from: 2019-01-01 00:00:00
Max Date from: 2022-12-31 23:45:00
Resolution Date from: 0 days 00:15:00
Min Date to: 2019-01-01 00:15:00
Max Date to: 2023-01-01 00:00:00
Resolution Date to: 0 days 00:15:00


In [45]:
# Print the min, max and resolution of every column in io.DATE_COLUMNS_WEATHER
# for the weather table.
# NOTE(review): a reported resolution of 0 days for forecast_origin would be
# consistent with several rows sharing the same origin — confirm how the helper
# computes the resolution.
inspection.date_range_and_resolution(Weather_Data_Germany, io.DATE_COLUMNS_WEATHER)

Min forecast_origin: 2019-01-01 00:00:00
Max forecast_origin: 2022-12-31 00:00:00
Resolution forecast_origin: 0 days 00:00:00
Min time: 2019-01-01 00:00:00
Max time: 2022-12-31 23:00:00
Resolution time: 0 days 01:00:00
