In [56]:
import pandas as pd

def train_test_split():
    lp_20ips_2016 = pd.read_csv("LoadProfile_20IPs_2016.csv", sep=";", skiprows=1)
    lp_30ips_2017 = pd.read_csv("LoadProfile_30IPs_2017.csv", sep=";", skiprows=1)

    lp_20ips_2016["Time stamp"] = lp_20ips_2016["Time stamp"].str.replace(r'[^0-9.: ]', '', regex=True).str.strip()
    lp_20ips_2016["Time stamp"] = pd.to_datetime(lp_20ips_2016.loc[:, "Time stamp"], format='%d.%m.%Y %H:%M:%S')

    lp_30ips_2017["Time stamp"] = lp_30ips_2017["Time stamp"].str.replace(r'[^0-9.: ]', '', regex=True).str.strip()
    lp_30ips_2017["Time stamp"] = pd.to_datetime(lp_30ips_2017.loc[:, "Time stamp"], format='%d.%m.%Y %H:%M:%S')

    train_2016_full = lp_20ips_2016.iloc[:, :-7]
    train_2017_full = lp_30ips_2017.iloc[:, :-8]
    test_2016_full = pd.concat([lp_20ips_2016["Time stamp"], lp_20ips_2016.iloc[:, -7:] ], axis=1)
    test_2017_full = pd.concat([lp_30ips_2017["Time stamp"], lp_30ips_2017.iloc[:, -8:] ], axis=1)

    train_2016_training_data = train_2016_full[train_2016_full["Time stamp"].dt.month <= 8]
    train_2016_val_data = train_2016_full[train_2016_full["Time stamp"].dt.month > 8]

    test_2016_training_data = test_2016_full[test_2016_full["Time stamp"].dt.month <= 8]
    test_2016_test_data = test_2016_full[test_2016_full["Time stamp"].dt.month > 8]

    train_2017_training_data = train_2017_full[train_2017_full["Time stamp"].dt.month <= 8]
    train_2017_val_data = train_2017_full[train_2017_full["Time stamp"].dt.month > 8]

    test_2017_training_data = test_2017_full[test_2017_full["Time stamp"].dt.month <= 8]
    test_2017_test_data = test_2017_full[test_2017_full["Time stamp"].dt.month > 8]

    train_2016_training_data.to_csv("tune/2016_train.csv", index=False)
    train_2016_val_data.to_csv("tune/2016_val.csv", index=False)
    test_2016_training_data.to_csv("test/2016_train.csv", index=False)
    test_2016_test_data.to_csv("test/2016_test.csv", index=False)

    train_2017_training_data.to_csv("tune/2017_train.csv", index=False)
    train_2017_val_data.to_csv("tune/2017_val.csv", index=False)
    test_2017_training_data.to_csv("test/2017_train.csv", index=False)
    test_2017_test_data.to_csv("test/2017_test.csv", index=False)

In [28]:
train_test_split()

In [8]:
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from sktime.datasets import load_airline
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.theta import ThetaForecaster
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error
from sktime.split import temporal_train_test_split

class Forecaster(ABC):
    def __init__(self, year: int, ig: int, fh: int):
        self.train_data, self.val_data = self.load_data(year, ig)
        self.fh = fh

    @staticmethod
    def load_data(year: int, ig: int) -> tuple[pd.DataFrame, pd.DataFrame]:
        if year == 2017:
            ig_str = f"LG {ig:02d}"
        else:
            ig_str = f"LG {ig:01d}"

        train = pd.read_csv(f"tune/{year}_train.csv")
        train_data = train[["Time stamp", ig_str]]
        train_data.loc[:, ig_str] = train_data[ig_str].astype(float)
        train_data.columns = ["Time stamp", "target"]

        val = pd.read_csv(f"tune/{year}_val.csv")
        val_data = val[["Time stamp", ig_str]]
        val_data.loc[:, ig_str] = val_data[ig_str].astype(float)
        val_data.columns = ["Time stamp", "target"]

        return train_data, val_data

    @abstractmethod
    def preprocess(self):
        raise

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def predict(self):
        pass

    @abstractmethod
    def update(self, value: float):
        pass

    @abstractmethod
    def validate(self):
        pass


class SimpleForecaster(Forecaster):
    def __init__(self, year, ig, peak_threshold: float = 0.85, fh: int = 1):
        super().__init__(year, ig, fh)
        self.peak_threshold = peak_threshold

        self.mean = self.train_data["target"].mean()
        self.std = self.train_data["target"].std()
        self.peak_value = self.train_data["target"].max() * self.peak_threshold
        self.normalized_peak_vaue = (self.peak_value - self.mean) / self.std

        self.forecaster = ThetaForecaster(sp=48) 
        self.sk_fh = ForecastingHorizon(self.fh, is_relative=False)

    def preprocess(self):
        """Data cleaning and normalization"""
        self.train_data.loc[:, "target"] = self.train_data.loc[:, "target"].interpolate()
        self.val_data.loc[:, "target"] = self.val_data.loc[:, "target"].interpolate()

        self.train_data = self.train_data.dropna(subset=["target"])
        self.val_data = self.val_data.dropna(subset=["target"])

        self.train_data.loc[:, "target_normalized"] = (self.train_data.loc[:, "target"] - self.mean) / self.std
        self.train_data.loc[:, "is_peak"] = (self.train_data.loc[:, "target_normalized"] >= self.normalized_peak_vaue).astype(int)

    def train(self):
        """Fit the internal model(s)"""
        self.forecaster.fit(self.train_data["target"], fh=self.sk_fh)

    def predict(self):
        """Predict the next value(s)"""
        return self.forecaster.predict(self.sk_fh)

    def update(self, value: float):
        """Update the model with the new value (if required)"""
        pass
    
    def validate(self):
        """Validate the model on the validation data"""
        errors = []

        for y_target in self.val_data.loc[:100, "target"]:
            y_hat = self.predict()

            # de-normalize
            y_hat_denormalized = y_hat * self.std + self.mean
            
            # TODO plug in actual validation error metric
            error = mean_absolute_percentage_error(np.array([y_target]), y_hat_denormalized)

            errors.append(error)

            self.update(y_target)
        
        return np.mean(errors)


forecaster = SimpleForecaster(
    year=2017,
    ig=1,
)
forecaster.preprocess()
forecaster.train()
forecaster.validate()


np.float64(1.3352307434976363)