In [2]:
from dataclasses import dataclass
from datetime import datetime
import os
import pickle
import time
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import norm

from analysis.datasets import load_entsoe
from analysis.splits import to_train_validation_test_data

class DataPreprocessor:
    def __init__(self, target_column="power"):
        self.target_column = target_column
        self.max_power_value_rounded = None

    def load_data(self):
        """Loads the dataset."""
        self.df = load_entsoe()
        return self

    def transform_power(self, epsilon=1e-3):
        """Scales the power data using log transformation."""
        max_power_value = self.df[self.target_column].max()
        self.max_power_value_rounded = np.ceil(max_power_value / 1000) * 1000
        self.df[self.target_column] = np.log(self.df[self.target_column] / self.max_power_value_rounded + epsilon)
        return self

    def add_interval_index(self):
        """Creates an interval index feature based on time."""
        self.df['interval_index'] = ((self.df.index.hour * 60 + self.df.index.minute) // 15) + 1
        return self

    def add_lagged_features(self, lag=96):
        """Adds lagged power feature."""
        self.df[f'{self.target_column}_t-{lag}'] = self.df[self.target_column].shift(lag)
        self.df.dropna(inplace=True)
        return self

    def prepare_features(self, selected_features):
        """Selects only the specified features from the DataFrame."""
        selected_features.append(self.target_column)
        self.df = self.df[[feature for feature in selected_features if feature in self.df.columns]]
        return self

    def split_data(self, train_start, train_end, val_start, val_end):
        """Splits dataset into train, validation, and test sets."""
        self.train_X, self.train_y, self.val_X, self.val_y, self.test_X, self.test_y = to_train_validation_test_data(
            self.df, train_start, train_end, val_start, val_end
        )
        return self

    def get_processed_data(self):
        """Returns processed train, validation, and test sets."""
        return self.train_X, self.train_y, self.val_X, self.val_y, self.test_X, self.test_y 
    
class ExperimentMapper:

    @staticmethod
    def map_id_to_config(experiment_id: int):

        config = []

        if experiment_id  == 1:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-01-01",
                "train_end": "2022-03-31",
                "val_start": "2023-01-01",
                "val_end": "2023-03-31",
                "random_state": 42
                }
            ]

        elif experiment_id  == 2:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-04-01",
                "train_end": "2022-06-30",
                "val_start": "2023-04-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]

        elif experiment_id  == 3:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-07-01",
                "train_end": "2022-09-30",
                "val_start": "2023-07-01",
                "val_end": "2023-09-30",
                "random_state": 42
                }
            ]
        elif experiment_id  == 4:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-10-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id  == 5:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-03-31",
                "random_state": 42
                }
            ]
        elif experiment_id  == 6:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-04-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]
        elif experiment_id  == 7:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-07-01",
                "val_end": "2023-09-30",
                "random_state": 42
                }
            ]
        elif experiment_id  == 8:
            config = [
                {
                "selected_features": ["ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]
        elif experiment_id  == 9:
            config = [
                {
                "selected_features": ["ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-07-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id  == 10:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]
        elif experiment_id  == 11:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-07-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id  == 12:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96", "interval_index"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]
        elif experiment_id == 13:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96", "interval_index"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-07-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 14:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-01-01",
                "train_end": "2022-03-31",
                "val_start": "2023-04-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]
        elif experiment_id == 15:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-07-01",
                "train_end": "2022-09-30",
                "val_start": "2023-04-01",
                "val_end": "2023-06-30",
                "random_state": 42
                }
            ]
        elif experiment_id == 16:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-01-01",
                "train_end": "2022-03-31",
                "val_start": "2023-10-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 17:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-01-01",
                "train_end": "2022-03-31",
                "val_start": "2023-07-01",
                "val_end": "2023-09-30",
                "random_state": 42
                }
            ]
        elif experiment_id == 18:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-04-01",
                "train_end": "2022-06-30",
                "val_start": "2023-01-01",
                "val_end": "2023-03-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 19:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-04-01",
                "train_end": "2022-06-30",
                "val_start": "2023-07-01",
                "val_end": "2023-09-30",
                "random_state": 42
                }
            ]
        elif experiment_id == 20:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-04-01",
                "train_end": "2022-06-30",
                "val_start": "2023-10-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 21:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-07-01",
                "train_end": "2022-09-30",
                "val_start": "2023-01-01",
                "val_end": "2023-03-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 22:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-07-01",
                "train_end": "2022-09-30",
                "val_start": "2023-10-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 23:
            config = [
                {
                "selected_features": ["power_t-96"],
                "train_start": "2022-01-01",
                "train_end": "2022-03-31",
                "val_start": "2023-01-01",
                "val_end": "2023-03-31",
                "random_state": 42
                }
            ]
        
        elif experiment_id == 24:
            config = [
                {
                "selected_features": ["power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]

        elif experiment_id == 25:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]

        elif experiment_id == 26:
            config = [
                {
                "selected_features": ["ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 27:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 28:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96", "interval_index"],
                "train_start": "2022-10-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 29:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96", "interval_index"],
                "train_start": "2016-01-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 30:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2016-01-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 31:
            config = [
                {
                "selected_features": ["ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "power_t-96"],
                "train_start": "2016-01-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        elif experiment_id == 32:
            config = [
                {
                "selected_features": ["ws_10m_loc_mean", "ws_100m_loc_mean","power_t-96"],
                "train_start": "2016-01-01",
                "train_end": "2022-12-31",
                "val_start": "2023-01-01",
                "val_end": "2023-12-31",
                "random_state": 42
                }
            ]
        else:
            raise ValueError(f"Experiment ID {experiment_id} is not valid.")
        
        return config
    
    from datetime import datetime

    @staticmethod
    def get_experiment_ids_for_time_range(time_range: str):
        # A method to filter experiment ids based on the time range.
        # We're only interested in experiments that belong to Q4 2022

        valid_ids = []
        for experiment_id in range(1, 30):  # Assuming there are 29 experiments
            config = ExperimentMapper.map_id_to_config(experiment_id)
            for item in config:
                if time_range in item["train_start"] or time_range in item["val_start"]:
                    valid_ids.append(experiment_id)
                    break
        return valid_ids
    

    @staticmethod
    def extract_date_abbreviations_from_config(config):
        """
        Processes training and validation date ranges into readable formats:
        - Qx YEAR for quarters
        - H1/H2 YEAR for half-years
        - FY YEAR for full year
        - YEAR-YEAR if the period spans multiple years
        """

        def months_between(start_date_str, end_date_str):
            start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
            end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
            months_diff = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
            if end_date.day >= start_date.day:
                months_diff += 1
            return months_diff, start_date.year, end_date.year

        def get_period(months_diff, start_date_str, start_year, end_year):
            if start_year != end_year:
                return f"{start_year}-{end_year}"

            start_month = datetime.strptime(start_date_str, "%Y-%m-%d").month

            if months_diff == 3:
                quarter = (start_month - 1) // 3 + 1
                return f"Q{quarter} {start_year}"
            elif months_diff == 6:
                return f"H1 {start_year}" if start_month <= 6 else f"H2 {start_year}"
            elif months_diff == 12:
                return f"FY {start_year}"
            else:
                return f"{start_year}-{end_year}"

        # Extract dates
        train_start = config[0]["train_start"]
        train_end = config[0]["train_end"]
        val_start = config[0]["val_start"]
        val_end = config[0]["val_end"]

        # Compute month diffs and periods
        train_month_diff, train_start_year, train_end_year = months_between(train_start, train_end)
        val_month_diff, val_start_year, val_end_year = months_between(val_start, val_end)

        train_period = get_period(train_month_diff, train_start, train_start_year, train_end_year)
        val_period = get_period(val_month_diff, val_start, val_start_year, val_end_year)

        return f"{train_period} / {val_period}"

    @staticmethod
    def get_feature_string_from_selected_features(config):

        selected_features = config[0]["selected_features"]
        
        if isinstance(selected_features, str):
            selected_features = [selected_features]  # Convert string to list

        # Define the conditions and return the corresponding string
        if set(selected_features) == {"power_t-96", "ws_10m_loc_mean", "ws_100m_loc_mean"}:
            return "power, mean ws"
        elif set(selected_features) == {"power_t-96"}:
            return "power"
        elif set(selected_features) == {"power_t-96", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10"}:
            return "power, ws at 10 loc"
        elif set(selected_features) == {"power_t-96", "ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10"}:
            return "power, all ws"
        elif set(selected_features) == {"power_t-96", "ws_10m_loc_mean", "ws_100m_loc_mean", "ws_10m_loc_1", "ws_10m_loc_2", "ws_10m_loc_3", "ws_10m_loc_4", "ws_10m_loc_5", "ws_10m_loc_6", "ws_10m_loc_7", "ws_10m_loc_8", "ws_10m_loc_9", "ws_10m_loc_10", 
                                        "ws_100m_loc_1", "ws_100m_loc_2", "ws_100m_loc_3", "ws_100m_loc_4", "ws_100m_loc_5", "ws_100m_loc_6", "ws_100m_loc_7", "ws_100m_loc_8", "ws_100m_loc_9", "ws_100m_loc_10", "interval_index"}:
            return "power, all ws, time bin"
        else:
            return "Unknown features"
    
@dataclass
class Experiment_Baseline:
    X_train: np.ndarray
    y_train: np.ndarray
    X_validation: np.ndarray
    y_validation: np.ndarray
    beta_0: None | float = None
    beta_1: None | np.ndarray = None
    sigma_sq: None | float = None
    intercept: bool = True

    def perform(self):
        model = LinearRegression(fit_intercept=self.intercept)
        model.fit(self.X_train, self.y_train)

        if self.intercept:
            self.beta_0 = model.intercept_  # Intercept (β₀) when fit_intercept=True
        else:
            self.beta_0 = 0.0  # Set beta_0 to 0 manually when fit_intercept=False

        self.beta_1 = model.coef_    # Coefficient for P_t-96 (β₁)

        # Calculate sigma^2 (variance of residuals)
        y_pred = model.predict(self.X_train)
        residuals = self.y_train - y_pred
        self.sigma_sq = (residuals ** 2).sum() / (len(self.X_train) - 2)

        return self
    
    def calculate_crps(self):
        """Calculates the Continuous Ranked Probability Score (CRPS)."""
        start_time = time.time()
        
        # Initialize CRPS statistics
        crps_values = []
        
        crps_mean = 0
        crps_min = 10000
        crps_max = 0
        counter = 0

        for i, y in enumerate(self.y_validation):  # Iterate over the validation set
            # Select the row corresponding to the i-th observation from X_validation
            mu = self.beta_0 + np.dot(self.X_validation.iloc[i, :], self.beta_1)  # Use iloc for row selection
            
            sigma = np.sqrt(self.sigma_sq)  # Predicted standard deviation (using variance)
            
            # CDF and PDF of standard normal distribution
            z = (y - mu) / sigma
            
            # Calculate PDF and CDF values for z
            pdf_z = norm.pdf(z)  # Standard normal PDF at z
            cdf_z = norm.cdf(z)  # Standard normal CDF at z

            # CRPS formula for normal distribution
            crps = sigma * (z * (2 * cdf_z - 1) + 2 * pdf_z - 1 / np.sqrt(np.pi))
            crps_values.append(crps)
            # Update CRPS statistics
            #crps_mean += crps
            #crps_min = min(crps_min, crps)
            #crps_max = max(crps_max, crps)

            counter += 1

            # Print progress every 5000 iterations
            if counter % 5000 == 0:
                end_time = time.time()
                elapsed = end_time - start_time
                print("Elapsed time:", elapsed)
                print("Counter:", counter)
                start_time = time.time()
        
        # Calculate the average CRPS value
        #crps_mean /= len(self.y_validation)
        crps_mean = np.mean(crps_values)
        crps_min = np.min(crps_values)
        crps_max = np.max(crps_values)
        crps_median = np.median(crps_values)
        # Return the CRPS statistics
        print("CRPS calculation finished")
        return crps_mean, crps_median, crps_min, crps_max


    def calculate_nll(self):
        """Calculates the Negative Log-Likelihood (NLL)."""
        start_time = time.time()
        nll_values = []
        counter = 0
        sigma = np.sqrt(self.sigma_sq)  # Predicted standard deviation (using variance)
        for i, y in enumerate(self.y_validation):  # Iterate over the validation set
            # Select the row corresponding to the i-th observation from X_validation
            mu = self.beta_0 + np.dot(self.X_validation.iloc[i, :], self.beta_1)  # Use iloc for row selection
            counter += 1
            # NLL formula for normal distribution
            nll = 0.5 * np.log(2 * np.pi * sigma**2) + ((y - mu)**2) / (2 * sigma**2)
            nll_values.append(nll)

            if counter % 5000 == 0:
                end_time = time.time()
                elapsed = end_time - start_time
                print("elapsed time", elapsed)
                start_time = time.time()
                
        nll_mean = np.mean(nll_values)
        nll_min = np.min(nll_values)
        nll_max = np.max(nll_values)
        nll_median = np.median(nll_values)
        return nll_mean, nll_median, nll_min, nll_max
        

class ExperimentStorage:
    def __init__(self, file_path):
        self.file_path = file_path

    def save(self, experiment: Experiment_Baseline):
        """Save the ExperimentTracker object to a file."""

        directory = os.path.dirname(self.file_path)
        if not os.path.exists(directory):
            os.makedirs(directory)

        with open(self.file_path, "wb") as f:
            pickle.dump(experiment, f)

    def load(self):
        """Load the Experiment object from a file."""
        if os.path.exists(self.file_path):
            with open(self.file_path, "rb") as f:
                return pickle.load(f)


In [3]:
from datetime import datetime
import time
import warnings

def run_baseline_model(experiment_ids, storage_path="experiments/", fit_intercept=False):
    """
    Run multiple experiments with different configurations.
    
    Parameters:
    - experiment_configs (list of dict): Each dictionary should contain:
        - selected_features
        - train_start
        - train_end
        - val_start
        - val_end
        - random_state (optional)
    """

    if isinstance(experiment_ids, int):
        experiment_ids = [experiment_ids]

    experiment_mapper = ExperimentMapper()

    for experiment_id in experiment_ids:

        config_list = experiment_mapper.map_id_to_config(experiment_id)

        for config in config_list:

            print(f"Running experiment {experiment_id}...")
            start_time = time.time()

            # Extract experiment-specific parameters
            selected_features = config["selected_features"]
            train_start = config["train_start"]
            train_end = config["train_end"]
            val_start = config["val_start"]
            val_end = config["val_end"]
            random_state = config.get("random_state", 42)  # Default random state if not provided

            # Preprocess the data
            print("- Preprocessing data...")
            preprocessor = DataPreprocessor()
            preprocessor.load_data()
            preprocessor.transform_power()
            preprocessor.add_interval_index()
            preprocessor.add_lagged_features()
            preprocessor.prepare_features(selected_features)
            print("- Splitting data into train, validation, test...")
            preprocessor.split_data(train_start, train_end, val_start, val_end)
            train_X, train_y, validation_X, validation_y, test_X, test_y = preprocessor.get_processed_data()
            display(train_X.head(3))


            print("- Running model training")
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=FutureWarning)

                experiment = Experiment_Baseline(X_train=train_X, y_train=train_y, X_validation=validation_X, y_validation=validation_y, intercept=fit_intercept)
                experiment = experiment.perform()

            end_time = time.time()
            elapsed = end_time - start_time
            print(f"⏱️ Experiment {experiment_id} completed in {elapsed:.2f} seconds")

            print("- Saving experiment results...")
            experiment_filename = f"{storage_path}/experiment_{experiment_id}.pkl"
            storage = ExperimentStorage(experiment_filename)
            storage.save(experiment)
            print(f"Experiment saved to: {experiment_filename}")

    print("All experiments completed and saved.")

In [4]:
import pandas as pd


def calculate_scores_baseline(experiment_id, storage_path="experiments/", all_scores=False):
    
    experiment_filename = f"{storage_path}/experiment_{experiment_id}.pkl"
    storage = ExperimentStorage(experiment_filename)
    experiment = storage.load()

    crps_mean, crps_median, crps_min, crps_max = experiment.calculate_crps()
    mean_nll, median_nll, min_nll, max_nll  = experiment.calculate_nll()

    scores = {
        'Metric': ['nll', 'crps'],
        'Mean': [mean_nll, crps_mean],
        'Median': [median_nll, crps_median],
        'Min': [min_nll, crps_min],
        'Max': [max_nll, crps_max]
    }

    df = pd.DataFrame(scores)


    file_name = f"{storage_path}/experiment_results_{experiment_id}.pkl"
    storage = ExperimentStorage(file_name)
    storage.save(df)

    return df

In [81]:
id = 32
run_baseline_model([id], "C:/Users/Minu/Documents/Baseline/experiments", fit_intercept=True)

Running experiment 32...
- Preprocessing data...
- Splitting data into train, validation, test...
# of training observations: 245376 | 77.76%
# of validation observations: 35040 | 11.10%
# of test observations: 35133 | 11.13%


Unnamed: 0_level_0,ws_10m_loc_mean,ws_100m_loc_mean,power_t-96
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-02 00:00:00,2.83,5.19,-2.465104
2016-01-02 00:15:00,2.825,5.16,-2.499602
2016-01-02 00:30:00,2.82,5.13,-2.485377


- Running model training
⏱️ Experiment 32 completed in 2.92 seconds
- Saving experiment results...
Experiment saved to: C:/Users/Minu/Documents/Baseline/experiments/experiment_32.pkl
All experiments completed and saved.


In [83]:
calculate_scores_baseline(32, "C:/Users/Minu/Documents/Baseline/experiments")

Elapsed time: 0.8329226970672607
Counter: 5000
Elapsed time: 0.8288474082946777
Counter: 10000
Elapsed time: 0.8326573371887207
Counter: 15000
Elapsed time: 0.8185572624206543
Counter: 20000
Elapsed time: 0.8296365737915039
Counter: 25000
Elapsed time: 0.8348667621612549
Counter: 30000
Elapsed time: 0.8325257301330566
Counter: 35000
CRPS calculation finished
elapsed time 0.25159668922424316
elapsed time 0.2624778747558594
elapsed time 0.2553431987762451
elapsed time 0.26299452781677246
elapsed time 0.2705802917480469
elapsed time 0.2640252113342285
elapsed time 0.2775447368621826


Unnamed: 0,Metric,Mean,Median,Min,Max
0,nll,0.875028,0.510165,0.305783,21.421837
1,crps,0.315546,0.212015,0.126578,3.214322


In [None]:
nll_mean = []
crps_mean = []

experiment_mapper_t = ExperimentMapper()
for i in range(24, 30):
    file = f"C:/Users/Minu/Documents/Baseline/experiments/experiment_results_{i}.pkl"

    with open(file, "rb") as f:
        df = pickle.load(f)
        nll_mean.append((df['Metric'].iloc[0], df['Mean'].iloc[0]))
        crps_mean.append((df['Metric'].iloc[1], df['Mean'].iloc[1]))
    
# Create dataframes with extra columns
nll_means_df = pd.DataFrame(nll_mean, columns=['Metric', 'Mean'])
crps_means_df = pd.DataFrame(crps_mean, columns=['Metric', 'Mean'])

# Set index to start at 1
nll_means_df.index = range(24, 30)
crps_means_df.index = range(24, 30)

[{'selected_features': ['power_t-96'], 'train_start': '2022-10-01', 'train_end': '2022-12-31', 'val_start': '2023-01-01', 'val_end': '2023-12-31', 'random_state': 42}]


TypeError: ExperimentMapper.extract_date_abbreviations_from_config() takes 1 positional argument but 2 were given

In [30]:
nll_means_df

Unnamed: 0,Metric,Mean
24,nll,1.501098
25,nll,0.926073
26,nll,0.925112
27,nll,0.92368
28,nll,0.924472
29,nll,0.731674


In [31]:
crps_means_df

Unnamed: 0,Metric,Mean
24,crps,0.594598
25,crps,0.312445
26,crps,0.281502
27,crps,0.280219
28,crps,0.280346
29,crps,0.273208


## log(P(t)/ P_max) = -3.4 - 0.10 * ws_10 + 0.34 * ws_100 + 0.14 * log(P(t-96) / P_max)

In [98]:
from scipy.optimize import minimize
y = exp.y_validation.values
# Define the log-likelihood function for the model
def log_likelihood(params, y, lag=96):
    beta_0, beta_1, sigma = params
    
    # Compute the residuals (errors)
    residuals = y[lag:] - beta_0 - beta_1 * y[lag-96:-96]
    
    # Log-likelihood for normal errors
    log_likelihood_value = -len(residuals)/2 * np.log(2 * np.pi * sigma**2) - np.sum(residuals**2) / (2 * sigma**2)
    
    return -log_likelihood_value  # Return negative log-likelihood for minimization

# Initial guess for parameters: beta_0 = 0, beta_1 = 0, sigma = 1
initial_guess = [0, 0, 1]

# Perform the optimization to maximize the log-likelihood
result = minimize(log_likelihood, initial_guess, args=(y), bounds=[(None, None), (None, None), (0.001, None)])

# Extract the estimated parameters
beta_0_est, beta_1_est, sigma_est = result.x

# Output the results
print("Estimated beta_0:", beta_0_est)
print("Estimated beta_1:", beta_1_est)
print("Estimated sigma:", sigma_est)

Estimated beta_0: -1.062821888023514
Estimated beta_1: 0.43036274221059917
Estimated sigma: 1.0606888484543233


## Compare Baseline model with TabPFN

In [5]:
import os
import glob
import re
import pickle
import pandas as pd

def analyze_3(method="tabpfn"):

    if method == "tabpfn":
        dir = "C:/Users/Minu/Documents/TabPFN/experiments"
        nll_name = "nll_5000"
        crps_name = "crps_5000"
    else:
        dir = "C:/Users/Minu/Documents/Baseline/experiments"
        nll_name = "nll"
        crps_name = "crps"

    # Step 1: Load experiment files

    files = glob.glob(os.path.join(dir, 'experiment_results_*'))

    ids = [int(re.search(r'\d+', s).group()) for s in files if re.search(r'\d+', s)]

    # Step 2: Define feature categories to track
    feature_groups = {
        "power, all ws": [],
        "power, all ws, time bin": [],
        'power, mean ws': [],
        "power, ws at 10 loc": []
    }

    # Step 3: Filter and categorize IDs
    for id in ids:
        config = ExperimentMapper.map_id_to_config(id)
        dates = ExperimentMapper.extract_date_abbreviations_from_config(config)
        
        # Clean and split date string
        dates_split = dates.strip().split(" / ")
        if not dates_split or len(dates_split) < 2:
            continue

        if dates.startswith("Q4"):
            feature = ExperimentMapper.get_feature_string_from_selected_features(config)
            feature = feature.strip()
            if feature in feature_groups:
                feature_groups[feature].append(id)
        

    # Step 4: Load metrics (CRPS, NLL) per feature group
    results_summary = {}

    for feature_name, id_list in feature_groups.items():
        nlls = []
        crps = []
        for id in id_list:
            file_path = os.path.join(dir, f"experiment_results_{id}.pkl")
            if not os.path.exists(file_path):
                continue

            with open(file_path, 'rb') as f:
                data = pickle.load(f)
                try:
                    nll = data.loc[data['Metric'] == nll_name, 'Mean'].values[0]
                    crps_val = data.loc[data['Metric'] == crps_name, 'Mean'].values[0]
                    nlls.append(nll)
                    crps.append(crps_val)
                except IndexError:
                    continue

        # Calculate and store averages
        if nlls and crps:
            results_summary[feature_name] = {
                "average_nll": round(sum(nlls) / len(nlls), 5),
                "average_crps": round(sum(crps) / len(crps), 5),
                "count": len(nlls)
            }


    # Rename features for presentation
    pretty_names = {
        "power, mean ws": "Power, mean wind speed",
        "power, ws at 10 loc": "Power, 10 wind speeds",
        "power, all ws": "Power, all wind speeds",
        "power, all ws, time bin": "Power, all wind speeds, time"
    }

    # Build DataFrame
    df_data = []
    for feature, stats in results_summary.items():
        df_data.append({
            "Feature model": pretty_names.get(feature, feature),
            "CRPS": stats["average_crps"],
            "NLL": stats["average_nll"]
        })

    df = pd.DataFrame(df_data)
    df = df[["Feature model", "CRPS", "NLL"]]  # Reorder columns
    df = df.set_index("Feature model")
    df = df.round(3)
    
    feature_order = [
        "Power, mean wind speed",
        "Power, 10 wind speeds",
        "Power, all wind speeds",
        "Power, all wind speeds, time"
    ]
    df.index = pd.CategoricalIndex(df.index, categories=feature_order, ordered=True)
    df = df.sort_index()

    return df

In [17]:
import os
import glob
import re
import pickle
import pandas as pd

def analyze_3(method="tabpfn"):

    if method == "tabpfn":
        dir = "C:/Users/Minu/Documents/TabPFN/experiments"
        nll_name = "nll_5000"
        crps_name = "crps_5000"
        compute_2016 = False
    else:
        dir = "C:/Users/Minu/Documents/Baseline/experiments"
        nll_name = "nll"
        crps_name = "crps"
        compute_2016 = True

    # Step 1: Load experiment files

    def categorize_ID_to_feature_group():
        files = glob.glob(os.path.join(dir, 'experiment_results_*'))
        ids = [int(re.search(r'\d+', s).group()) for s in files if re.search(r'\d+', s)]

        feature_groups = {
            "power, all ws": [],
            "power, all ws, time bin": [],
            'power, mean ws': [],
            "power, ws at 10 loc": []
        }

        feature_groups_2016 = {
            "power, all ws": [],
            "power, all ws, time bin": [],
            'power, mean ws': [],
            "power, ws at 10 loc": []
        } if compute_2016 else None

        for id in ids:
            config = ExperimentMapper.map_id_to_config(id)
            dates = ExperimentMapper.extract_date_abbreviations_from_config(config)
            dates_split = dates.strip().split(" / ")
            if not dates_split or len(dates_split) < 2:
                continue

            feature = ExperimentMapper.get_feature_string_from_selected_features(config).strip()
            if feature not in feature_groups:
                continue

            # Add to overall group if it's Q4 (original behavior)
            if dates.startswith("Q4"):
                feature_groups[feature].append(id)

            # Add to 2016 group if any date starts with 2016
            if dates.startswith("2016"):
                feature_groups_2016[feature].append(id)

        return feature_groups, feature_groups_2016


        
    def load_metrics_per_feature_group(feature_groups, feature_groups_2016):
        results_summary = {}

        for feature_name in feature_groups:
            id_list_all = feature_groups[feature_name]
            id_list_2016 = feature_groups_2016[feature_name] if feature_groups_2016 else []

            def compute_metrics(id_list):
                nlls = []
                crps = []
                for id in id_list:
                    file_path = os.path.join(dir, f"experiment_results_{id}.pkl")
                    if not os.path.exists(file_path):
                        continue
                    with open(file_path, 'rb') as f:
                        data = pickle.load(f)
                        try:
                            nll = data.loc[data['Metric'] == nll_name, 'Mean'].values[0]
                            crps_val = data.loc[data['Metric'] == crps_name, 'Mean'].values[0]
                            nlls.append(nll)
                            crps.append(crps_val)
                        except IndexError:
                            continue
                return nlls, crps

            nlls_all, crps_all = compute_metrics(id_list_all)
            nlls_2016, crps_2016 = compute_metrics(id_list_2016) if compute_2016 else ([], [])

            if nlls_all and crps_all:
                results_summary[feature_name] = {
                    "average_nll": round(sum(nlls_all) / len(nlls_all), 5),
                    "average_crps": round(sum(crps_all) / len(crps_all), 5),
                    "average_nll_2016": round(sum(nlls_2016) / len(nlls_2016), 5) if nlls_2016 else None if nlls_2016 else None,
                    "average_crps_2016": round(sum(crps_2016) / len(crps_2016), 5) if crps_2016 else None if nlls_2016 else None,
                    "count": len(nlls_all),
                    "count_2016": len(nlls_2016) if nlls_2016 else None
                }
        return results_summary


    def load_result_df(results_summary):
        pretty_names = {
            "power, mean ws": "Power, mean wind speed",
            "power, ws at 10 loc": "Power, 10 wind speeds",
            "power, all ws": "Power, all wind speeds",
            "power, all ws, time bin": "Power, all wind speeds, time"
        }

        df_data = []
        for feature, stats in results_summary.items():
            row = {
                "Feature model": pretty_names.get(feature, feature),
                "CRPS": stats["average_crps"],
                "NLL": stats["average_nll"],
            }

            if compute_2016:
                row["CRPS_2016"] = stats["average_crps_2016"]
                row["NLL_2016"] = stats["average_nll_2016"]

            df_data.append(row)

        df = pd.DataFrame(df_data)
        columns = ["Feature model", "CRPS", "NLL"]
        if compute_2016:
            columns += ["CRPS_2016", "NLL_2016"]

        df = df[columns]
        df = df.set_index("Feature model")
        df = df.round(3)

        feature_order = [
            "Power, mean wind speed",
            "Power, 10 wind speeds",
            "Power, all wind speeds",
            "Power, all wind speeds, time"
        ]
        df.index = pd.CategoricalIndex(df.index, categories=feature_order, ordered=True)
        df = df.sort_index()
        return df
    
    feature_groups, feature_groups_2016 = categorize_ID_to_feature_group()
    results_summary = load_metrics_per_feature_group(feature_groups, feature_groups_2016)
    df = load_result_df(results_summary)
    return df


In [20]:
result = analyze_3("tabpfn")
result

Unnamed: 0_level_0,CRPS,NLL
Feature model,Unnamed: 1_level_1,Unnamed: 2_level_1
"Power, mean wind speed",0.343,1.774
"Power, 10 wind speeds",0.203,0.723
"Power, all wind speeds",0.204,0.72
"Power, all wind speeds, time",0.2,0.699


In [26]:
result = analyze_3("baseline")
result

Unnamed: 0_level_0,CRPS,NLL,CRPS_2016,NLL_2016
Feature model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Power, mean wind speed",0.312,0.926,0.316,0.875
"Power, 10 wind speeds",0.282,0.925,0.274,0.735
"Power, all wind speeds",0.28,0.924,0.274,0.733
"Power, all wind speeds, time",0.28,0.924,0.273,0.732


In [43]:
result = analyze_3("tabpfn")
result

Unnamed: 0_level_0,CRPS,NLL
Feature model,Unnamed: 1_level_1,Unnamed: 2_level_1
"Power, mean wind speed",0.343,1.774
"Power, 10 wind speeds",0.203,0.723
"Power, all wind speeds",0.204,0.72
"Power, all wind speeds, time",0.2,0.699


## Function to extract CRPS, NLL matrix for different features from NGBoost (C:\Users\Minu\Documents\NGboost\q4_train\caseXX + Merged_sheet)

In [27]:
import pandas as pd

def analyze_3_ngboost(filepath):
    # Read the Excel file
    df = pd.read_excel(filepath)

    # Ensure 'loss_function' is treated as string
    df['loss_function'] = df['loss_function'].astype(str)

    # List of feature_abbr values we're interested in
    target_features = {
        "p, ws_mean, ws_10_loc",
        "p, ws_mean, ws_10_loc, t_index",
        "p, ws_mean",
        "p, ws_10_loc"
    }

    # Filter rows
    filtered = df[
        df['loss_function'].str.contains('LogScore', case=False, na=False) &
        df['feature_abbr'].isin(target_features)
    ]

    # Rename the feature_abbr values
    rename_map = {
        "p, ws_10_loc": "Power, 10 wind speeds",
        "p, ws_mean": "Power, mean wind speed",
        "p, ws_mean, ws_10_loc, t_index": "Power, all wind speeds, time",
        "p, ws_mean, ws_10_loc": "Power, all wind speeds"
    }
    filtered['feature_abbr'] = filtered['feature_abbr'].map(rename_map)

    # Extract only the required columns and include renamed feature_abbr
    result = filtered[['feature_abbr', 'CRPS_gaussian_mean', 'NLL_mean']]

     # Custom sort
    feature_order = [
        "Power, mean wind speed",
        "Power, 10 wind speeds",
        "Power, all wind speeds",
        "Power, all wind speeds, time"
    ]
    result['feature_abbr'] = pd.Categorical(result['feature_abbr'], categories=feature_order, ordered=True)
    result = result.sort_values('feature_abbr').reset_index(drop=True)
    result = result.round(3)

    return result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['feature_abbr'] = filtered['feature_abbr'].map(rename_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['feature_abbr'] = pd.Categorical(result['feature_abbr'], categories=feature_order, ordered=True)


In [45]:
ngboost_matrix = analyze_3_ngboost("C:/Users/Minu/Documents/NGboost/q4_train/Merged_sheet.xlsx")
ngboost_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['feature_abbr'] = filtered['feature_abbr'].map(rename_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['feature_abbr'] = pd.Categorical(result['feature_abbr'], categories=feature_order, ordered=True)


Unnamed: 0,feature_abbr,CRPS_gaussian_mean,NLL_mean
0,"Power, mean wind speed",0.284,0.737
1,"Power, 10 wind speeds",0.196,0.357
2,"Power, all wind speeds",0.196,0.353
3,"Power, all wind speeds, time",0.194,0.34


In [46]:
analyze_3_ngboost("C:/Users/Minu/Documents/NGboost/full_year/Merged_sheet.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['feature_abbr'] = filtered['feature_abbr'].map(rename_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['feature_abbr'] = pd.Categorical(result['feature_abbr'], categories=feature_order, ordered=True)


Unnamed: 0,feature_abbr,CRPS_gaussian_mean,NLL_mean
0,"Power, mean wind speed",0.27,0.544
1,"Power, 10 wind speeds",0.182,0.171
2,"Power, all wind speeds",0.183,0.174
3,"Power, all wind speeds, time",0.181,0.169


In [47]:
def final_table_q4():
    # Load all results
    ngboost_matrix_q4 = analyze_3_ngboost("C:/Users/Minu/Documents/NGboost/q4_train/Merged_sheet.xlsx")
    ngboost_matrix_ft = analyze_3_ngboost("C:/Users/Minu/Documents/NGboost/full_year/Merged_sheet.xlsx")
    tabpfn_result_q4 = analyze_3("tabpfn")
    baseline_result_q4 = analyze_3("baseline")

    # Prepare CRPS
    ngboost_crps_q4 = ngboost_matrix_q4.set_index('feature_abbr')[['CRPS_gaussian_mean']].rename(columns={'CRPS_gaussian_mean': 'NGBoost (Q4 training)'})
    ngboost_crps_ft = ngboost_matrix_ft.set_index('feature_abbr')[['CRPS_gaussian_mean']].rename(columns={'CRPS_gaussian_mean': 'NGBoost (full training)'})
    
    tabpfn_crps_q4 = tabpfn_result_q4[['CRPS']].rename(columns={'CRPS': 'TabPFN (Q4 training)'})
    baseline_crps_q4 = baseline_result_q4[['CRPS']].rename(columns={'CRPS': 'Baseline (Q4 training)'})
    baseline_crps_ft = baseline_result_q4[['CRPS_2016']].rename(columns={'CRPS_2016': 'Baseline (full training)'})

    crps_matrix = pd.concat([baseline_crps_q4, baseline_crps_ft, ngboost_crps_q4, ngboost_crps_ft, tabpfn_crps_q4], axis=1)

    # Prepare NLL
    ngboost_nll_q4 = ngboost_matrix_q4.set_index('feature_abbr')[['NLL_mean']].rename(columns={'NLL_mean': 'NGBoost (Q4 training)'})
    ngboost_nll_ft = ngboost_matrix_ft.set_index('feature_abbr')[['NLL_mean']].rename(columns={'NLL_mean': 'NGBoost (full training)'})

    tabpfn_nll_q4 = tabpfn_result_q4[['NLL']].rename(columns={'NLL': 'TabPFN (Q4 training)'})
    baseline_nll_q4 = baseline_result_q4[['NLL']].rename(columns={'NLL': 'Baseline (Q4 training)'})
    baseline_nll_ft = baseline_result_q4[['NLL_2016']].rename(columns={'NLL_2016': 'Baseline (full training)'})

    nll_matrix = pd.concat([baseline_nll_q4, baseline_nll_ft, ngboost_nll_q4, ngboost_nll_ft, tabpfn_nll_q4], axis=1)

    return crps_matrix.round(3), nll_matrix.round(3)


In [48]:
crps_matrix, nll_matrix = final_table_q4()
crps_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['feature_abbr'] = filtered['feature_abbr'].map(rename_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['feature_abbr'] = pd.Categorical(result['feature_abbr'], categories=feature_order, ordered=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['feature_abbr'] = filter

Unnamed: 0,Baseline (Q4 training),Baseline (full training),NGBoost (Q4 training),NGBoost (full training),TabPFN (Q4 training)
"Power, mean wind speed",0.312,0.316,0.284,0.27,0.343
"Power, 10 wind speeds",0.282,0.274,0.196,0.182,0.203
"Power, all wind speeds",0.28,0.274,0.196,0.183,0.204
"Power, all wind speeds, time",0.28,0.273,0.194,0.181,0.2


In [49]:
nll_matrix

Unnamed: 0,Baseline (Q4 training),Baseline (full training),NGBoost (Q4 training),NGBoost (full training),TabPFN (Q4 training)
"Power, mean wind speed",0.926,0.875,0.737,0.544,1.774
"Power, 10 wind speeds",0.925,0.735,0.357,0.171,0.723
"Power, all wind speeds",0.924,0.733,0.353,0.174,0.72
"Power, all wind speeds, time",0.924,0.732,0.34,0.169,0.699


In [63]:
from analysis.latex import export_final_comparison_to_latex
%load_ext autoreload
%autoreload 2

export_final_comparison_to_latex(crps_matrix, nll_matrix)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
LaTeX document has been saved to: C:/Users/Minu/OneDrive/Arbeit/HTWG/Master/Masterarbeit/thesis_teamprojekt_templates-master/chapters/overall_comparison_of_crps_nll.tex
