In [1]:
%config InlineBackend.figure_format = "svg"

# Import metrics
from sktime.performance_metrics.forecasting import MeanAbsoluteScaledError

# Import models and data splitting from sktime
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.compose import ColumnEnsembleForecaster
from sktime.forecasting.naive import NaiveForecaster

# Data acquisition, processing and visualization tools
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np

## Loading data

In [2]:
# Read the training set into a DataFrame and show it.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks
# like an index that was written to the CSV -- confirm, and consider
# `index_col=0` if no other cell relies on that column.
datapath = "data/train.csv"
df = pd.read_csv(filepath_or_buffer=datapath)
df

Unnamed: 0,date,tavg_t,tavg_s,tavg_r,tmin_t,tmin_s,tmin_r,tmax_t,tmax_s,tmax_r,wdir_t,wdir_s,wdir_r,wspd_t,wspd_s,wspd_r,pres_t,pres_s,pres_r
0,2018-01-01,13.273631,-8.723072,-1.450558,9.213128,-7.468275,-1.144852,17.793790,-9.720526,-2.673264,162.989534,39.801348,-154.790882,7.994651,-1.282647,-0.212004,1017.023757,3.011254,-8.335010
1,2018-01-02,13.271941,-8.045036,-0.226905,9.210267,-6.731484,-0.178783,17.792590,-9.721875,-0.370716,163.066633,67.997983,103.935384,7.996125,3.768236,-0.364361,1017.024462,1.450017,-8.974479
2,2018-01-03,13.270252,-9.666780,0.396528,9.207407,-8.153816,-0.353591,17.791391,-11.924320,0.732929,163.143732,1.956262,57.900006,7.997598,2.227339,1.975063,1017.025168,2.968506,-12.093673
3,2018-01-04,13.268562,-8.889182,0.320620,9.204546,-7.336422,0.331876,17.790192,-9.646107,2.255915,163.220831,34.086048,-7.306879,7.999071,2.263976,-2.263047,1017.025873,0.066885,-11.292758
4,2018-01-05,13.266873,-8.149885,1.283012,9.201685,-6.377549,0.875863,17.788993,-10.225922,1.936929,163.297930,-4.365536,-14.932394,8.000545,3.358147,-5.058691,1017.026579,-2.435064,-7.291515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,2022-09-23,13.424207,2.170566,-3.594773,8.863514,1.860880,-3.924394,17.894557,2.534992,-3.129550,209.431416,-6.521823,97.090407,10.331325,0.269717,-5.301042,1019.300327,-0.507622,1.807295
1727,2022-09-24,13.426178,2.226829,-3.453007,8.864937,2.199335,-5.564273,17.897173,2.549264,-2.246436,209.388917,43.626951,-155.015868,10.332928,0.389013,-2.421941,1019.303410,-0.455800,-1.147610
1728,2022-09-25,13.428149,2.543366,-2.271515,8.866361,3.098777,-0.965138,17.899788,2.724740,-4.124529,209.346417,4.640383,-140.986800,10.334532,-2.170431,-2.064101,1019.306494,-0.224307,-4.582186
1729,2022-09-26,13.430119,1.999383,-0.829503,8.867785,1.035916,2.496299,17.902404,3.279560,-3.081964,209.303918,65.587787,52.108295,10.336136,-2.270725,-0.165411,1019.309577,-2.932321,-7.977256


In [3]:
# Get the decomposition columns.
# Each component is identified by the *suffix* of its column name:
#   "_t" -> trend, "_s" -> seasonal, "_r" -> residual.
# A suffix test is used instead of a substring test so that a column which
# merely contains "_t"/"_s"/"_r" somewhere in its name is not picked up.
TREND_SUFFIX, SEASONAL_SUFFIX, RESID_SUFFIX = "_t", "_s", "_r"
trend_cols = [col for col in df.columns if col.endswith(TREND_SUFFIX)]
seasonal_cols = [col for col in df.columns if col.endswith(SEASONAL_SUFFIX)]
resid_cols = [col for col in df.columns if col.endswith(RESID_SUFFIX)]

# Only trend and seasonal components are forecast; residuals are left out.
forecast_cols = trend_cols + seasonal_cols

# Get original columns: strip only the trailing suffix so that base names
# containing an underscore are kept whole.
original_cols = [col[: -len(TREND_SUFFIX)] for col in trend_cols]
original_cols

['tavg', 'tmin', 'tmax', 'wdir', 'wspd', 'pres']

## Training and cross-validation

For this forecasting problem, we use a naive method for both the trend and the seasonal components:
- The naive trend forecaster predicts every future step with the last observed value of the trend.
- The seasonal naive forecaster predicts each future step with the value observed at the corresponding position in the previous seasonal period.

In [4]:
# Trend component: repeat the last observed value.
trend_forecaster = NaiveForecaster(strategy="last")

# Seasonal component: repeat the value from one seasonal period earlier.
# sp=365 rows is one year for the one-row-per-day records shown in the data.
seasonal_forecaster = NaiveForecaster(strategy="last", sp=365)

# Route each group of columns to its own forecaster.
component_forecasters = [
    ("trend", trend_forecaster, trend_cols),
    ("seasonal", seasonal_forecaster, seasonal_cols),
]
forecaster = ColumnEnsembleForecaster(forecasters=component_forecasters)
forecaster

In [7]:
# Cross-validation setup: sliding training windows, each followed by a
# `forecast_size`-step forecasting horizon.
nfolds = 50
forecast_size = 14
fh = np.arange(1, forecast_size + 1)

# Window length derived from the number of rows, the requested fold count and
# the horizon (the recorded run reports 50 folds for nfolds = 50).
window_length = len(df) - nfolds - forecast_size + 1

# Select the forecast columns once and materialise every (train, val) split.
y_all = df[forecast_cols]
cv = SlidingWindowSplitter(fh=fh, window_length=window_length)
splitter = list(cv.split(y_all))
n_splits = len(splitter)

# Running averages of the MASE over all folds.
metric = MeanAbsoluteScaledError()
train_score = val_score = 0
bar = tqdm(splitter, desc="Cross-validating")
for train_idx, val_idx in bar:

    # TODO: Fix y_true both in training and validation

    # Positional slices for this fold
    y_train = y_all.iloc[train_idx]
    y_val = y_all.iloc[val_idx]

    # Training performance: hold out the last `forecast_size` rows of the
    # training window, fit on the rest and score the forecast of the tail.
    y_fit = y_train.iloc[:-forecast_size]
    y_holdout = y_train.iloc[-forecast_size:]
    forecaster = forecaster.fit(y_fit)
    y_train_pred = forecaster.predict(fh=fh)
    train_score += metric(y_holdout, y_train_pred, y_train=y_fit) / n_splits

    # Validation performance: refit on the full window and score the forecast
    # against the rows that follow it.
    forecaster = forecaster.fit(y_train)
    y_val_pred = forecaster.predict(fh=fh)
    val_score += metric(y_val, y_val_pred, y_train=y_train) / n_splits

print(f"Train score: {train_score} | Validation score: {val_score}")

Cross-validating: 100%|██████████| 50/50 [00:07<00:00,  6.82it/s]

0.007732234787111072 0.007729431802435505 (14, 12)



