In [1]:
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload

In [2]:
# Mode 2: reload all modules (except those excluded with %aimport) before
# executing each cell.
%autoreload 2

## Set up

In [16]:
import os
import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.multioutput import RegressorChain
from sklearn.base import BaseEstimator

import catboost as cb
import lightgbm as lgbm
import xgboost as xgb

import tensorflow as tf

from utils import load_config
from src.helpers import ContiguousGroupKFold

## Constants

In [4]:
# Load the project configuration through the helper imported from `utils`.
# The result is indexed by key below (e.g. config['final_data']).
config = load_config()

In [5]:
# Directory taken from the 'final_data' entry of the loaded config.
DATA_DIR = config['final_data']
# Path of the training CSV inside DATA_DIR.
TRAIN = os.path.join(DATA_DIR, 'train.csv')

## Modelling

- Direct regressor: predicts the target and its prediction interval directly
  - Gradient-boosted trees (GBT) with quantile loss
  - Deep learning with quantile loss
- Ensemble regressor: uses a Monte Carlo simulation to generate the prediction interval
  - GBT with ensemble
  - Deep learning with ensemble
- CV ensemble modelling: k-fold cross-validation

In [6]:
# Number of cross-validation folds.
# NOTE(review): presumably meant to drive the fold splitter used further down — confirm.
CV_SPLIT = 5

#### Load data

In [10]:
# Read the training set from the CSV file at TRAIN into a DataFrame.
df = pd.read_csv(TRAIN)

  df = pd.read_csv(TRAIN)


### Train Test Split
- Before generating rolling and lag features, we split the data to prevent data leakage.
- Based on previous analysis, the dataset exhibits annual seasonality but no significant long-term trend.
- Therefore, it is acceptable to use chronological or block-wise splits without always reserving the most recent data for validation.
- This approach is appropriate for seasonally-repeating time series, where the assumption of trend-driven data drift does not hold.
- In such cases, the model's ability to generalize across seasonal cycles is more important than strict recency.

In [14]:
# Build the year-grouped folds and print which years land in the train and
# validation side of each fold. The number of folds comes from CV_SPLIT so the
# constant declared above and the splitter cannot drift apart.
cgkf = ContiguousGroupKFold(CV_SPLIT)
for idx, (train_ids, val_ids) in enumerate(cgkf.split(df, groups=df.year)):
    print(f'Years in fold {idx + 1}')
    # train_ids / val_ids are used positionally (iloc) to look up the years.
    print('Train:', *df.iloc[train_ids].year.unique())
    print('Validation:', *df.iloc[val_ids].year.unique())
    print('-------------------------------------------------------------------------')

Years in fold 1
Train: 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
Validation: 1990 1991 1992
-------------------------------------------------------------------------
Years in fold 2
Train: 1990 1991 1992 1996 1997 1998 1999 2000 2001 2002 2003 2004
Validation: 1993 1994 1995
-------------------------------------------------------------------------
Years in fold 3
Train: 1990 1991 1992 1993 1994 1995 1999 2000 2001 2002 2003 2004
Validation: 1996 1997 1998
-------------------------------------------------------------------------
Years in fold 4
Train: 1990 1991 1992 1993 1994 1995 1996 1997 1998 2002 2003 2004
Validation: 1999 2000 2001
-------------------------------------------------------------------------
Years in fold 5
Train: 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
Validation: 2002 2003 2004
-------------------------------------------------------------------------


In [53]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class RollingFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Append rolling-window aggregates of selected columns to a DataFrame.

    For every column in ``columns`` and every name in ``functions`` a new
    column ``{col}_rolling_{func}_{window}`` is added to a copy of the input.
    The first ``window - 1`` rows have no complete window; they are
    back-filled with the value of the first complete window, which means those
    rows receive a value computed from later rows.

    Parameters
    ----------
    window : int, default=3
        Window passed to ``Series.rolling``.
    functions : list of str, optional
        Aggregations to compute. Supported: 'mean', 'sum', 'std', 'min',
        'max'. A falsy value (``None`` or an empty list) falls back to
        ``['mean']``.
    columns : list of str, optional
        Columns to aggregate. ``None`` means every column of the frame passed
        to ``transform``.

    Raises
    ------
    ValueError
        From ``transform`` when ``functions`` contains an unsupported name.
    """

    # Each name is resolved as a method on the pandas rolling object.
    _SUPPORTED_FUNCTIONS = ('mean', 'sum', 'std', 'min', 'max')

    def __init__(self, window=3, functions=None, columns=None):
        self.window = window
        self.functions = functions or ['mean']
        self.columns = columns  # list of columns to apply rolling functions on

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the rolling feature columns appended."""
        X_transformed = X.copy()
        # Snapshot the column list so columns added below are never re-processed.
        columns = list(X.columns) if self.columns is None else self.columns
        for col in columns:
            for func in self.functions:
                if func not in self._SUPPORTED_FUNCTIONS:
                    raise ValueError(f"Unsupported function: {func}")
                new_col_name = f"{col}_rolling_{func}_{self.window}"
                rolling = X_transformed[col].rolling(self.window)
                # bfill() fills the leading NaNs of the incomplete windows.
                X_transformed[new_col_name] = getattr(rolling, func)().bfill()
        return X_transformed

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform`` because ``fit`` learns nothing."""
        return self.transform(X)


In [None]:

class LagFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Append window aggregates of selected columns to a DataFrame.

    NOTE(review): despite the class name, the body computes rolling-window
    aggregates and names the new columns ``{col}_rolling_{func}_{window}``,
    exactly as ``RollingFeaturesTransformer`` does. A shift-based lag is
    presumably what is intended here — TODO confirm and implement.

    Parameters
    ----------
    window : int, default=3
        Window passed to ``Series.rolling``.
    functions : list of str, optional
        Aggregations to compute. Supported: 'mean', 'sum', 'std', 'min',
        'max'. A falsy value (``None`` or an empty list) falls back to
        ``['mean']``.
    columns : list of str, optional
        Columns to aggregate. ``None`` means every column of the frame passed
        to ``transform``.

    Raises
    ------
    ValueError
        From ``transform`` when ``functions`` contains an unsupported name.
    """

    # Each name is resolved as a method on the pandas rolling object.
    _SUPPORTED_FUNCTIONS = ('mean', 'sum', 'std', 'min', 'max')

    def __init__(self, window=3, functions=None, columns=None):
        self.window = window
        # Previously `functions` was accepted but never stored, so transform()
        # failed with AttributeError and get_params() could not read it.
        self.functions = functions or ['mean']
        self.columns = columns  # list of columns to apply rolling functions on

    def fit(self, X, y=None):
        """No-op: the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the aggregate columns appended."""
        X_transformed = X.copy()
        # Snapshot the column list so columns added below are never re-processed.
        columns = list(X.columns) if self.columns is None else self.columns
        for col in columns:
            for func in self.functions:
                if func not in self._SUPPORTED_FUNCTIONS:
                    raise ValueError(f"Unsupported function: {func}")
                new_col_name = f"{col}_rolling_{func}_{self.window}"
                rolling = X_transformed[col].rolling(self.window)
                # bfill() fills the leading NaNs of the incomplete windows.
                X_transformed[new_col_name] = getattr(rolling, func)().bfill()
        return X_transformed

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform`` because ``fit`` learns nothing."""
        return self.transform(X)


In [54]:
# Rolling mean and sum of 'tp' over a window of 3.
rft = RollingFeaturesTransformer(
    window=3,
    functions=['mean', 'sum'],
    columns=['tp'],
)

In [55]:
# Quick check on the single 'tp' column: the displayed frame gains the
# tp_rolling_mean_3 and tp_rolling_sum_3 columns.
rft.fit_transform(df[['tp']])

Unnamed: 0,tp,tp_rolling_mean_3,tp_rolling_sum_3
0,6.853062e-05,0.000028,0.000085
1,1.560804e-05,0.000028,0.000085
2,5.732055e-07,0.000028,0.000085
3,8.079637e-05,0.000032,0.000097
4,1.040163e-05,0.000031,0.000092
...,...,...,...
28504,1.788707e-05,0.000109,0.000326
28505,1.555908e-04,0.000156,0.000468
28506,2.325291e-04,0.000135,0.000406
28507,1.206086e-04,0.000170,0.000509


In [56]:
# Single-step pipeline wrapping the rolling-feature transformer under the name 'rf'.
p = Pipeline(steps=[('rf', rft)])

In [57]:
# Run the pipeline on two columns; only 'tp' is listed in the transformer's
# `columns`, so 't2m' passes through without derived features.
p.fit_transform(df[['tp', 't2m']])

Unnamed: 0,tp,t2m,tp_rolling_mean_3,tp_rolling_sum_3
0,6.853062e-05,294.95035,0.000028,0.000085
1,1.560804e-05,295.11826,0.000028,0.000085
2,5.732055e-07,295.49854,0.000028,0.000085
3,8.079637e-05,296.61980,0.000032,0.000097
4,1.040163e-05,295.95758,0.000031,0.000092
...,...,...,...,...
28504,1.788707e-05,278.38266,0.000109,0.000326
28505,1.555908e-04,277.25555,0.000156,0.000468
28506,2.325291e-04,275.85210,0.000135,0.000406
28507,1.206086e-04,273.75018,0.000170,0.000509
