#### Import libraries

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import warnings
import sys
import pandas as pd

sys.path.append("../")
from pipeline import data
from pipeline.config import CONF
from pipeline.data import plots
from pipeline.data import io
from pipeline.data import inspection
from pipeline.data import preprocess

# To suppress all warnings
warnings.filterwarnings("ignore")

# black is a code formatter (see https://github.com/psf/black).
# It will automatically format the code you write in the cells imposing consistent Python style.
%load_ext jupyter_black
# matplotlib style file
# Template for style file: https://matplotlib.org/stable/tutorials/introductory/customizing.html#customizing-with-style-sheets
plt.style.use("../matplotlib_style.txt")
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.expand_frame_repr", False)  # Prevent wrapping

## Process raw data

### Load raw data

In [None]:
# Load the raw sources only when raw processing is enabled and an earlier run of
# this cell has not already loaded them.
should_load_raw = CONF.data.process_raw_data and not CONF.data.loaded_raw_data
if should_load_raw:
    raw_frames = data.load_data(CONF=CONF, data_type="raw")
    (
        Installed_Capacity_Germany_Raw,
        Prices_Europe_Raw,
        Realised_Supply_Germany_Raw,
        Realised_Demand_Germany_Raw,
        Weather_Data_Germany_Raw,
        Weather_Data_Germany_2022_Raw,
    ) = raw_frames
    # Flag the raw frames as loaded so re-running this cell skips the load.
    CONF.data.loaded_raw_data = True

In [None]:
if CONF.data.process_raw_data:
    # Work on copies so the *_Raw frames stay untouched by the steps below.
    (
        Installed_Capacity_Germany,
        Prices_Europe,
        Realised_Supply_Germany,
        Realised_Demand_Germany,
        Weather_Data_Germany,
        Weather_Data_Germany_2022,
    ) = (
        raw_frame.copy()
        for raw_frame in (
            Installed_Capacity_Germany_Raw,
            Prices_Europe_Raw,
            Realised_Supply_Germany_Raw,
            Realised_Demand_Germany_Raw,
            Weather_Data_Germany_Raw,
            Weather_Data_Germany_2022_Raw,
        )
    )

### Inspect raw data

##### Inspect missingness

In [None]:
# Generate the profile reports for the raw frames, but only when both raw
# processing and inspection are switched on in the configuration.
if CONF.data.process_raw_data and CONF.data.inspect:
    raw_frames_by_name = {
        "Installed_Capacity_Germany": Installed_Capacity_Germany,
        "Prices_Europe": Prices_Europe,
        "Realised_Supply_Germany": Realised_Supply_Germany,
        "Realised_Demand_Germany": Realised_Demand_Germany,
        "Weather_Data_Germany": Weather_Data_Germany,
        "Weather_Data_Germany_2022": Weather_Data_Germany_2022,
    }
    data.save_data_inspection(**raw_frames_by_name, CONF=CONF, data_type="raw")

##### Inspect resolution

In [None]:
# Installed_Capacity_Germany is only bound when CONF.data.process_raw_data is set;
# without this guard the cell raises NameError when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(Installed_Capacity_Germany, io.DATE_COLUMNS)
    if CONF.data.process_raw_data
    else None
)

In [None]:
# Prices_Europe is only bound when CONF.data.process_raw_data is set; without this
# guard the cell raises NameError when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(Prices_Europe, io.DATE_COLUMNS)
    if CONF.data.process_raw_data
    else None
)

In [None]:
# Realised_Supply_Germany is only bound when CONF.data.process_raw_data is set;
# without this guard the cell raises NameError when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(Realised_Supply_Germany, io.DATE_COLUMNS)
    if CONF.data.process_raw_data
    else None
)

In [None]:
# Realised_Demand_Germany is only bound when CONF.data.process_raw_data is set;
# without this guard the cell raises NameError when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(Realised_Demand_Germany, io.DATE_COLUMNS)
    if CONF.data.process_raw_data
    else None
)

In [None]:
# Weather_Data_Germany is only bound when CONF.data.process_raw_data is set;
# without this guard the cell raises NameError when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(
        Weather_Data_Germany, io.DATE_COLUMNS_WEATHER[1:]
    )
    if CONF.data.process_raw_data
    else None
)

In [None]:
# Weather_Data_Germany_2022 is only bound when CONF.data.process_raw_data is set;
# without this guard the cell raises NameError when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(
        Weather_Data_Germany_2022, io.DATE_COLUMNS_WEATHER[1:]
    )
    if CONF.data.process_raw_data
    else None
)

### Raw data pipeline

##### Merge the 2022 weather data into the main weather frame

In [None]:
if CONF.data.process_raw_data:
    # Drop every row of the original frame whose "time" falls in 2022, then append
    # the dedicated 2022 frame in its place with a fresh 0..n-1 index.
    outside_2022 = Weather_Data_Germany["time"].dt.year != 2022
    Weather_Data_Germany = pd.concat(
        [Weather_Data_Germany[outside_2022], Weather_Data_Germany_2022],
        ignore_index=True,
    )

##### Fill NaN

In [None]:
if CONF.data.process_raw_data:
    # Run the same NaN handling (data.process_na_values with the shared CONF) over
    # each frame; from here on the Processed_* names carry the pipeline output.
    (
        Processed_Installed_Capacity_Germany,
        Processed_Prices_Europe,
        Processed_Realised_Supply_Germany,
        Processed_Realised_Demand_Germany,
        Processed_Weather_Data_Germany,
    ) = (
        data.process_na_values(frame, CONF)
        for frame in (
            Installed_Capacity_Germany,
            Prices_Europe,
            Realised_Supply_Germany,
            Realised_Demand_Germany,
            Weather_Data_Germany,
        )
    )

#### Split into train, validation, and test sets

In [None]:
if CONF.data.process_raw_data:
    # The four market frames are all split on the last entry of io.DATE_COLUMNS.
    market_split_column = io.DATE_COLUMNS[-1]
    (
        Processed_Installed_Capacity_Germany,
        Processed_Prices_Europe,
        Processed_Realised_Supply_Germany,
        Processed_Realised_Demand_Germany,
    ) = (
        preprocess.split_data(df=frame, column_name=market_split_column)
        for frame in (
            Processed_Installed_Capacity_Germany,
            Processed_Prices_Europe,
            Processed_Realised_Supply_Germany,
            Processed_Realised_Demand_Germany,
        )
    )

    # The weather frame has its own date columns and is split on the first of them.
    weather_split_column = io.DATE_COLUMNS_WEATHER[0]
    Processed_Weather_Data_Germany = preprocess.split_data(
        df=Processed_Weather_Data_Germany, column_name=weather_split_column
    )

#### Normalize data

In [None]:
if CONF.data.process_raw_data:
    # Every normalize_data call yields a (frame, scalers) pair; the frame replaces
    # the Processed_* variable and the scalers are kept under <name>_Scalers.
    price_constant = CONF.data.price_normalization_constant
    weather_ignored = io.DATE_COLUMNS_WEATHER + ["longitude", "latitude"]

    # NOTE(review): installed capacity is normalized with the *price* constant --
    # confirm this is intended and not carried over from the prices call below.
    (
        Processed_Installed_Capacity_Germany,
        Processed_Installed_Capacity_Germany_Scalers,
    ) = preprocess.normalize_data(
        df=Processed_Installed_Capacity_Germany,
        ignore_features=io.DATE_COLUMNS,
        constant=price_constant,
    )

    (
        Processed_Prices_Europe,
        Processed_Prices_Europe_Scalers,
    ) = preprocess.normalize_data(
        df=Processed_Prices_Europe,
        ignore_features=io.DATE_COLUMNS,
        constant=price_constant,
    )

    # Supply and demand are normalized without a constant (the helper's default applies).
    (
        Processed_Realised_Supply_Germany,
        Processed_Realised_Supply_Germany_Scalers,
    ) = preprocess.normalize_data(
        df=Processed_Realised_Supply_Germany,
        ignore_features=io.DATE_COLUMNS,
    )

    (
        Processed_Realised_Demand_Germany,
        Processed_Realised_Demand_Germany_Scalers,
    ) = preprocess.normalize_data(
        df=Processed_Realised_Demand_Germany,
        ignore_features=io.DATE_COLUMNS,
    )

    # For weather, the coordinates are excluded in addition to the date columns.
    (
        Processed_Weather_Data_Germany,
        Processed_Weather_Data_Germany_Scalers,
    ) = preprocess.normalize_data(
        df=Processed_Weather_Data_Germany,
        ignore_features=weather_ignored,
    )

In [None]:
if CONF.data.process_raw_data:
    # Save each scaler set under the name of the variable that holds it.
    scalers_by_name = {
        "Processed_Installed_Capacity_Germany_Scalers": Processed_Installed_Capacity_Germany_Scalers,
        "Processed_Prices_Europe_Scalers": Processed_Prices_Europe_Scalers,
        "Processed_Realised_Supply_Germany_Scalers": Processed_Realised_Supply_Germany_Scalers,
        "Processed_Realised_Demand_Germany_Scalers": Processed_Realised_Demand_Germany_Scalers,
        "Processed_Weather_Data_Germany_Scalers": Processed_Weather_Data_Germany_Scalers,
    }
    for scalers_name, scalers_to_save in scalers_by_name.items():
        io.save_scalers(scalers=scalers_to_save, name=scalers_name, CONF=CONF)

#### Aggregate weather data

In [None]:
# At this point Processed_Weather_Data_Germany is only bound when
# CONF.data.process_raw_data is set; without this guard the cell raises NameError
# when raw processing is disabled.
# A conditional expression (instead of an `if` block) keeps the call as the cell's
# trailing expression, so any value it returns is still rendered by the notebook.
(
    inspection.date_range_and_resolution(
        Processed_Weather_Data_Germany, io.DATE_COLUMNS_WEATHER[1:]
    )
    if CONF.data.process_raw_data
    else None
)

In [None]:
if CONF.data.process_raw_data:
    # Columns handed to aggregate_weather_data together with the frame
    # (presumably the keys the aggregation groups by -- confirm against the helper).
    weather_time_columns = ["forecast_origin", "time"]
    Processed_Weather_Data_Germany = preprocess.aggregate_weather_data(
        Processed_Weather_Data_Germany, weather_time_columns
    )

##### Reduce the time resolution of demand and supply data

In [None]:
if CONF.data.process_raw_data:
    # Keep only the rows whose "Date from" timestamp has minute == 0; every other
    # row is dropped (the values are filtered, not aggregated).
    demand_on_the_hour = Processed_Realised_Demand_Germany["Date from"].dt.minute == 0
    supply_on_the_hour = Processed_Realised_Supply_Germany["Date from"].dt.minute == 0

    Processed_Realised_Demand_Germany = Processed_Realised_Demand_Germany[
        demand_on_the_hour
    ]
    Processed_Realised_Supply_Germany = Processed_Realised_Supply_Germany[
        supply_on_the_hour
    ]

#### Save data

In [None]:
if CONF.data.process_raw_data:
    # Hand the five processed frames to data.save_data under their dataset names,
    # tagged as the "preprocessed" data type.
    processed_frames_by_name = {
        "Installed_Capacity_Germany": Processed_Installed_Capacity_Germany,
        "Prices_Europe": Processed_Prices_Europe,
        "Realised_Supply_Germany": Processed_Realised_Supply_Germany,
        "Realised_Demand_Germany": Processed_Realised_Demand_Germany,
        "Weather_Data_Germany": Processed_Weather_Data_Germany,
    }
    data.save_data(**processed_frames_by_name, CONF=CONF, data_type="preprocessed")

## Inspect processed data

### Load processed data

In [None]:
# Load the preprocessed frames. This cell is not gated on
# CONF.data.process_raw_data, so it runs whether or not the raw pipeline above ran.
processed_frames = data.load_data(CONF, data_type="preprocessed")
(
    Processed_Installed_Capacity_Germany,
    Processed_Prices_Europe,
    Processed_Realised_Supply_Germany,
    Processed_Realised_Demand_Germany,
    Processed_Weather_Data_Germany,
) = processed_frames

### Processed data missingness inspection

In [None]:
# Generate the profile reports for the preprocessed frames when inspection is enabled.
if CONF.data.inspect:
    processed_frames_by_name = {
        "Installed_Capacity_Germany": Processed_Installed_Capacity_Germany,
        "Prices_Europe": Processed_Prices_Europe,
        "Realised_Supply_Germany": Processed_Realised_Supply_Germany,
        "Realised_Demand_Germany": Processed_Realised_Demand_Germany,
        "Weather_Data_Germany": Processed_Weather_Data_Germany,
    }
    data.save_data_inspection(
        **processed_frames_by_name, CONF=CONF, data_type="preprocessed"
    )

### Plot processed data's columns

In [None]:
# The four market frames are plotted with plot_df's default date handling.
for market_frame, market_label in (
    (Processed_Installed_Capacity_Germany, "Installed_Capacity_Germany"),
    (Processed_Prices_Europe, "Prices_Europe"),
    (Processed_Realised_Supply_Germany, "Realised_Supply_Germany"),
    (Processed_Realised_Demand_Germany, "Realised_Demand_Germany"),
):
    plots.plot_df(market_frame, market_label, CONF)

# The weather frame has its own date columns: the last one is passed as date_col
# and the full list as drop_date_cols.
plots.plot_df(
    Processed_Weather_Data_Germany,
    "Weather_Data_Germany",
    CONF,
    date_col=io.DATE_COLUMNS_WEATHER[-1],
    drop_date_cols=io.DATE_COLUMNS_WEATHER,
)

#### Processed data's time resolution

In [None]:
# Processed installed-capacity frame, inspected over io.DATE_COLUMNS; presumably
# reports its date range and time resolution (inferred from the helper's name).
inspection.date_range_and_resolution(
    Processed_Installed_Capacity_Germany, io.DATE_COLUMNS
)

In [None]:
# Processed prices frame, inspected over io.DATE_COLUMNS; presumably reports its
# date range and time resolution (inferred from the helper's name).
inspection.date_range_and_resolution(Processed_Prices_Europe, io.DATE_COLUMNS)

In [None]:
# Processed supply frame after the minute == 0 filter, inspected over
# io.DATE_COLUMNS; presumably reports its date range and time resolution
# (inferred from the helper's name).
inspection.date_range_and_resolution(Processed_Realised_Supply_Germany, io.DATE_COLUMNS)

In [None]:
# Processed demand frame after the minute == 0 filter, inspected over
# io.DATE_COLUMNS; presumably reports its date range and time resolution
# (inferred from the helper's name).
inspection.date_range_and_resolution(Processed_Realised_Demand_Germany, io.DATE_COLUMNS)

In [None]:
# Processed (aggregated) weather frame, inspected over every weather date column
# except the first; presumably reports its date range and time resolution
# (inferred from the helper's name).
inspection.date_range_and_resolution(
    Processed_Weather_Data_Germany, io.DATE_COLUMNS_WEATHER[1:]
)

## Data loading