In [1]:
%config InlineBackend.figure_format = "svg"

# Import metrics
from sktime.performance_metrics.forecasting import MeanAbsoluteScaledError

# Import models and data splitting from sktime
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.compose import ColumnEnsembleForecaster
from sktime.forecasting.naive import NaiveForecaster

# Data pipelines
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Series decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Data acquisition, processing and visualization tools
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np

## Loading data

In [2]:
# Location of the training CSV, relative to the notebook's working directory
datapath = "data/train.csv"

# Read the file, using its "date" column as the row index
df = pd.read_csv(filepath_or_buffer=datapath, index_col="date")

# Display the loaded frame as the cell output
df

date,tavg,tmin,tmax,wdir,wspd,pres
2018-01-01,3.1,0.6,5.4,48.0,6.5,1011.7
2018-01-02,5.0,2.3,7.7,335.0,11.4,1009.5
2018-01-03,4.0,0.7,6.6,223.0,12.2,1007.9
2018-01-04,4.7,2.2,10.4,190.0,8.0,1005.8
2018-01-05,6.4,3.7,9.5,144.0,6.3,1007.3
...,...,...,...,...,...,...
2022-09-23,12.0,6.8,17.3,300.0,5.3,1020.6
2022-09-24,12.2,5.5,18.2,98.0,8.3,1017.7
2022-09-25,13.7,11.0,16.5,73.0,6.1,1014.5
2022-09-26,14.6,12.4,18.1,327.0,7.9,1008.4


## Creating data pipelines

As we saw already in the exploratory data analysis notebook, there are two steps needed for preprocessing our data:
- Linear interpolation for handling missing values
- Classical decomposition using `statsmodels`

Unfortunately, these steps are not natively supported by either `sklearn` or `sktime`, so we will build our own custom transformers:

In [4]:
class LinearInterpolator(BaseEstimator, TransformerMixin):
    """Stateless transformer that fills missing values by linear interpolation."""

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself.

        ``X`` and ``y`` are accepted only to match the scikit-learn
        transformer signature; neither is read.
        """
        return self

    def transform(self, X):
        """Return the result of ``X.interpolate(method="linear")``."""
        return X.interpolate(method="linear")
    
class TSADecomposer(BaseEstimator, TransformerMixin):
    """Replace every column of a frame with its trend and seasonal components.

    Each input column is decomposed with ``seasonal_decompose`` and contributes
    two output columns: ``<name>_t`` (trend) and ``<name>_s`` (seasonal). The
    residual component is not kept.

    Parameters
    ----------
    model
        Forwarded unchanged to ``seasonal_decompose(model=...)``.
    period
        Forwarded unchanged to ``seasonal_decompose(period=...)``.
    extrapolate_trend
        Forwarded unchanged to ``seasonal_decompose(extrapolate_trend=...)``.
    """

    def __init__(self, model, period, extrapolate_trend):
        self.model = model
        self.period = period
        self.extrapolate_trend = extrapolate_trend

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Decompose every column of ``X`` and return the components.

        The result is a new DataFrame that shares the index of ``X`` and holds,
        for each original column in order, its ``_t`` column followed by its
        ``_s`` column. ``X`` itself is not modified.

        As a side effect, the column names seen and produced by this call are
        recorded on the instance (see the ``get_*_cols`` accessors); they are
        overwritten on every call.
        """
        # Snapshot the input columns up front so the loop never iterates over
        # a frame that is being modified
        self.original_cols = list(X.columns)
        self.trend_cols = []
        self.seasonal_cols = []

        # Collect the components separately instead of writing them into X,
        # so the caller's frame is left untouched
        components = {}
        for key in self.original_cols:
            decomposed = seasonal_decompose(
                X[key].values,
                model=self.model,
                period=self.period,
                extrapolate_trend=self.extrapolate_trend,
            )

            trend_name, seasonal_name = key + "_t", key + "_s"
            components[trend_name] = decomposed.trend
            components[seasonal_name] = decomposed.seasonal

            self.trend_cols.append(trend_name)
            self.seasonal_cols.append(seasonal_name)

        # Dict insertion order yields the column order <a>_t, <a>_s, <b>_t, ...
        decomposed_frame = pd.DataFrame(components, index=X.index)
        self.new_cols = decomposed_frame.columns
        return decomposed_frame

    def get_original_cols(self):
        """Return the input column names recorded by the last ``transform`` call."""
        return self.original_cols

    def get_trend_cols(self):
        """Return the ``_t`` column names recorded by the last ``transform`` call."""
        return self.trend_cols

    def get_seasonal_cols(self):
        """Return the ``_s`` column names recorded by the last ``transform`` call."""
        return self.seasonal_cols

    def get_new_cols(self):
        """Return the column index of the frame produced by the last ``transform`` call."""
        return self.new_cols

With the classes defined, let's create our pipelines for training:

In [None]:
# Individual preprocessing steps: fill the gaps, then decompose each series
interpolator = LinearInterpolator()
decomposer = TSADecomposer(
    model="additive",
    period=365,
    extrapolate_trend="freq",
)

# Training preprocessor: interpolation followed by decomposition
train_preprocessor = Pipeline(
    steps=[
        ("interpolator", interpolator),
        ("decomposer", decomposer),
    ]
)

# Testing preprocessor (currently disabled)
# test_preprocessor = interpolator

## Training and cross-validation

For this forecasting problem, we will use the naive method for both the trend and the seasonal components:
- The naive method forecasts the trend by repeating its last observed value
- The seasonal naive method forecasts each point with the value observed at the same position in the previous period

In [None]:
# Run the preprocessing pipeline once. `decomposer` only records the names of
# the trend/seasonal columns it generates while transforming data, and the
# ensemble below needs those names to route each column to the right forecaster.
df_decomposed = train_preprocessor.fit_transform(df)
trend_cols = decomposer.get_trend_cols()
seasonal_cols = decomposer.get_seasonal_cols()

# Trend forecaster: naive "last value" strategy
trend_forecaster = NaiveForecaster(strategy="last")

# Seasonal forecaster: naive "last value" strategy with a seasonal period of 365
seasonal_forecaster = NaiveForecaster(strategy="last", sp=365)

# Forecasting model: one naive forecaster per group of decomposed columns
forecaster = ColumnEnsembleForecaster(forecasters=[
    ("trend", trend_forecaster, trend_cols),
    ("seasonal", seasonal_forecaster, seasonal_cols)
])
forecaster

In [None]:
# Validation setup: number of folds and the forecasting horizon (steps 1..14)
nfolds = 50
forecast_size = 14
fh = np.arange(1, forecast_size + 1)

# Training window length; presumably chosen so the splitter yields `nfolds`
# folds with its default step length — TODO confirm
window_length = len(df) - nfolds - forecast_size + 1

# NOTE(review): `forecast_cols` is not defined in any cell visible here —
# confirm it is set before this cell runs.
y_all = df[forecast_cols]
splitter = list(
    SlidingWindowSplitter(fh=fh, window_length=window_length).split(y_all)
)
n_splits = len(splitter)

# Scores are accumulated as running means over the folds
metric = MeanAbsoluteScaledError()
train_score, val_score = 0, 0
bar = tqdm(splitter, desc="Cross-validating")
for train_idx, val_idx in bar:

    # TODO: Fix y_true both in training and validation

    # Slice the current fold
    y_train = y_all.iloc[train_idx]
    y_val = y_all.iloc[val_idx]

    # Training performance: hold out the tail of the training window,
    # fit on the rest and score the forecast against the held-out tail
    y_fit = y_train.iloc[:-forecast_size]
    y_holdout = y_train.iloc[-forecast_size:]
    forecaster = forecaster.fit(y_fit)
    y_train_pred = forecaster.predict(fh=fh)
    train_score += metric(y_holdout, y_train_pred, y_train=y_fit) / n_splits

    # Validation performance: refit on the whole training window
    # and score the forecast against the validation slice
    forecaster = forecaster.fit(y_train)
    y_val_pred = forecaster.predict(fh=fh)
    val_score += metric(y_val, y_val_pred, y_train=y_train) / n_splits

print(f"Train score: {train_score} | Validation score: {val_score}")