# Forecasting with Machine Learning Models.

## 0 Libraries Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from pathlib import Path
from warnings import simplefilter
from typing import List, Optional, Tuple
import os, sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import periodogram
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

simplefilter("ignore")

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)
%config InlineBackend.figure_format = 'retina'

#from utils.forecasting_function_utilities import *

In [3]:
os.getcwd()

'/content'

In [4]:
os.chdir('/content/drive/MyDrive/Colab_Notebooks/TimeSeries')

## 1 Feature Preparation for ML models

In [5]:
# Read

ts_train = pd.read_parquet('jena_train.parquet')
ts_val= pd.read_parquet('jena_val.parquet')
ts_test= pd.read_parquet('jena_test.parquet')

ts_train['type'] = 'train'
ts_val['type'] = 'val'
ts_test['type'] = 'test'
#
ts_train.head()

Unnamed: 0,p (mbar),T_degC,Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg),type
2009-01-01 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3,train
2009-01-01 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1,train
2009-01-01 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6,train
2009-01-01 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0,train
2009-01-01 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3,train


In [6]:
full_df = pd.concat([ts_train,ts_val, ts_test], axis = 0)
full_df.head()

Unnamed: 0,p (mbar),T_degC,Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg),type
2009-01-01 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3,train
2009-01-01 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1,train
2009-01-01 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6,train
2009-01-01 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0,train
2009-01-01 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3,train


In [7]:
full_df_Temp = full_df.loc[:,['T_degC', 'type']]
del ts_train, ts_val, ts_test, full_df

<b> The forecast horizon is the time for which you are making a forecast. We often describe a forecast by the number of time steps in its horizon: a "1-step" forecast or "5-step" forecast, say. The forecast horizon describes the target.

For the Jena dataset the task is to forecast the temperature the next 24 hours, as we have 6 values each hour, we are going to forecast the next 144 samples </b>

In order to forecast time series with ML algorithms, we need to transform the series into a dataframe we can use with those algorithms. (Unless, of course, you are only using deterministic features like trend and seasonality.)

We saw the first half of this process in Lesson 4 when we created a feature set out of lags. The second half is preparing the target. How we do this depends on the forecasting task.

Each row in a dataframe represents a single forecast. The time index of the row is the first time in the forecast horizon, but we arrange values for the entire horizon in the same row. For multistep forecasts, this means we are requiring a model to produce multiple outputs, one for each step.

In [8]:
full_df_Temp.head()

Unnamed: 0,T_degC,type
2009-01-01 00:10:00,-8.02,train
2009-01-01 00:20:00,-8.41,train
2009-01-01 00:30:00,-8.51,train
2009-01-01 00:40:00,-8.31,train
2009-01-01 00:50:00,-8.27,train


In [9]:
full_df_Temp.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 420122 entries, 2009-01-01 00:10:00 to 2016-12-29 19:10:00
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   T_degC  420122 non-null  float64
 1   type    420122 non-null  object 
dtypes: float64(1), object(1)
memory usage: 9.6+ MB


In [10]:
# Convert to float32 to save space
full_df_Temp['T_degC'] = full_df_Temp['T_degC'].astype('float32')
full_df_Temp.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 420122 entries, 2009-01-01 00:10:00 to 2016-12-29 19:10:00
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   T_degC  420122 non-null  float32
 1   type    420122 non-null  object 
dtypes: float32(1), object(1)
memory usage: 8.0+ MB


In [11]:
def make_lags(df, columns_for_lags, nlags, lead_time=1):

    df_lags = pd.DataFrame()
    for col in columns_for_lags:
        for i in range(lead_time, nlags + lead_time):
            df_lags[f'{col}_lag_{i}'] = df[col].shift(i).fillna(0.0)

    return df_lags

In [12]:
#If we want to capture weekly seasonality we need to include at least 6*24*7 lags
df_w_lags = make_lags(full_df_Temp,['T_degC'], 6*24)
df_w_lags.head()

Unnamed: 0,T_degC_lag_1,T_degC_lag_2,T_degC_lag_3,T_degC_lag_4,T_degC_lag_5,T_degC_lag_6,T_degC_lag_7,T_degC_lag_8,T_degC_lag_9,T_degC_lag_10,...,T_degC_lag_135,T_degC_lag_136,T_degC_lag_137,T_degC_lag_138,T_degC_lag_139,T_degC_lag_140,T_degC_lag_141,T_degC_lag_142,T_degC_lag_143,T_degC_lag_144
2009-01-01 00:10:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:20:00,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:30:00,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:40:00,-8.51,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:50:00,-8.31,-8.51,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
def add_rolling_features(df, columns_for_features, windows, features):
    '''
    Ex:

    test_roll_feat = add_rolling_features(full_df_Temp,['T_degC'],[3,6,12,24], ['mean','median'])
    '''

    for col in columns_for_features:
        rolling_df = pd.concat(
            [df[col]
             .shift(1).rolling(w).agg({f"{col}_rolling_{w}_{agg}": agg for agg in features}) for w in windows
            ],
            axis=1,
        )
    return rolling_df.fillna(0)

In [14]:
roll_feat = add_rolling_features(full_df_Temp,['T_degC'],[3,6,12,24], ['mean','median'])
roll_feat.head()

Unnamed: 0,T_degC_rolling_3_mean,T_degC_rolling_3_median,T_degC_rolling_6_mean,T_degC_rolling_6_median,T_degC_rolling_12_mean,T_degC_rolling_12_median,T_degC_rolling_24_mean,T_degC_rolling_24_median
2009-01-01 00:10:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:40:00,-8.313334,-8.41,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:50:00,-8.41,-8.41,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def add_seasonal_ewma(df, columns_for_features, alphas,features):

    for col in columns_for_features:
        ewm_df = pd.concat(
            [df[col]
             .shift(1).ewm(a).agg({f"{col}_ewm_{a}_{agg}": agg for agg in features}) for a in alphas
            ],
            axis=1,
        )
    return ewm_df.fillna(0)

In [16]:
ewm_feat = add_seasonal_ewma(full_df_Temp,['T_degC'],[0.3,0.5,0.8], ['mean'])
ewm_feat.head()

Unnamed: 0,T_degC_ewm_0.3_mean,T_degC_ewm_0.5_mean,T_degC_ewm_0.8_mean
2009-01-01 00:10:00,0.0,0.0,0.0
2009-01-01 00:20:00,-8.02,-8.02,-8.02
2009-01-01 00:30:00,-8.336875,-8.3125,-8.29
2009-01-01 00:40:00,-8.471705,-8.449231,-8.423985
2009-01-01 00:50:00,-8.346963,-8.35525,-8.358089


In [17]:
# Join Together

full_df_features = pd.concat([full_df_Temp, df_w_lags,roll_feat,ewm_feat], axis = 1)
full_df_features.index.name = 'timestamp'
full_df_features.head()

Unnamed: 0_level_0,T_degC,type,T_degC_lag_1,T_degC_lag_2,T_degC_lag_3,T_degC_lag_4,T_degC_lag_5,T_degC_lag_6,T_degC_lag_7,T_degC_lag_8,...,T_degC_rolling_3_median,T_degC_rolling_6_mean,T_degC_rolling_6_median,T_degC_rolling_12_mean,T_degC_rolling_12_median,T_degC_rolling_24_mean,T_degC_rolling_24_median,T_degC_ewm_0.3_mean,T_degC_ewm_0.5_mean,T_degC_ewm_0.8_mean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-01-01 00:10:00,-8.02,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-01-01 00:20:00,-8.41,train,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.02,-8.02,-8.02
2009-01-01 00:30:00,-8.51,train,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.336875,-8.3125,-8.29
2009-01-01 00:40:00,-8.31,train,-8.51,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,...,-8.41,0.0,0.0,0.0,0.0,0.0,0.0,-8.471705,-8.449231,-8.423985
2009-01-01 00:50:00,-8.27,train,-8.31,-8.51,-8.41,-8.02,0.0,0.0,0.0,0.0,...,-8.41,0.0,0.0,0.0,0.0,0.0,0.0,-8.346963,-8.35525,-8.358089


In [18]:
def extract_time_features(df):
  '''
  Returns columns with Year, Month and Day as temporal features from a timestamp column

  '''
  df['timestamp'] = pd.to_datetime(df['timestamp'], unit = 'ns')
  df['timestamp']= df['timestamp'].dt.tz_localize(None)
  #df['timestamp']= df['timestamp'].dt.tz_convert(None)
  df['timestamp_Month'] = df['timestamp'].dt.month
  df['timestamp_Year'] = df['timestamp'].dt.year
  df['timestamp_Day'] = df['timestamp'].dt.day
  df['timestamp_Hour'] = df['timestamp'].dt.hour
  return df

In [19]:
full_df_features = full_df_features.reset_index()
full_df_features.head()

Unnamed: 0,timestamp,T_degC,type,T_degC_lag_1,T_degC_lag_2,T_degC_lag_3,T_degC_lag_4,T_degC_lag_5,T_degC_lag_6,T_degC_lag_7,...,T_degC_rolling_3_median,T_degC_rolling_6_mean,T_degC_rolling_6_median,T_degC_rolling_12_mean,T_degC_rolling_12_median,T_degC_rolling_24_mean,T_degC_rolling_24_median,T_degC_ewm_0.3_mean,T_degC_ewm_0.5_mean,T_degC_ewm_0.8_mean
0,2009-01-01 00:10:00,-8.02,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2009-01-01 00:20:00,-8.41,train,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.02,-8.02,-8.02
2,2009-01-01 00:30:00,-8.51,train,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.336875,-8.3125,-8.29
3,2009-01-01 00:40:00,-8.31,train,-8.51,-8.41,-8.02,0.0,0.0,0.0,0.0,...,-8.41,0.0,0.0,0.0,0.0,0.0,0.0,-8.471705,-8.449231,-8.423985
4,2009-01-01 00:50:00,-8.27,train,-8.31,-8.51,-8.41,-8.02,0.0,0.0,0.0,...,-8.41,0.0,0.0,0.0,0.0,0.0,0.0,-8.346963,-8.35525,-8.358089


In [20]:
full_df_features = extract_time_features(full_df_features)
full_df_features.head()

Unnamed: 0,timestamp,T_degC,type,T_degC_lag_1,T_degC_lag_2,T_degC_lag_3,T_degC_lag_4,T_degC_lag_5,T_degC_lag_6,T_degC_lag_7,...,T_degC_rolling_12_median,T_degC_rolling_24_mean,T_degC_rolling_24_median,T_degC_ewm_0.3_mean,T_degC_ewm_0.5_mean,T_degC_ewm_0.8_mean,timestamp_Month,timestamp_Year,timestamp_Day,timestamp_Hour
0,2009-01-01 00:10:00,-8.02,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,2009,1,0
1,2009-01-01 00:20:00,-8.41,train,-8.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-8.02,-8.02,-8.02,1,2009,1,0
2,2009-01-01 00:30:00,-8.51,train,-8.41,-8.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-8.336875,-8.3125,-8.29,1,2009,1,0
3,2009-01-01 00:40:00,-8.31,train,-8.51,-8.41,-8.02,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-8.471705,-8.449231,-8.423985,1,2009,1,0
4,2009-01-01 00:50:00,-8.27,train,-8.31,-8.51,-8.41,-8.02,0.0,0.0,0.0,...,0.0,0.0,0.0,-8.346963,-8.35525,-8.358089,1,2009,1,0


In [22]:

import re
import warnings
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from pandas.tseries import offsets
from pandas.tseries.frequencies import to_offset


def _calculate_fourier_terms(
    seasonal_cycle: np.ndarray, max_cycle: int, n_fourier_terms: int
):
    """Calculates Fourier Terms given the seasonal cycle and max_cycle"""
    sin_X = np.empty((len(seasonal_cycle), n_fourier_terms), dtype="float64")
    cos_X = np.empty((len(seasonal_cycle), n_fourier_terms), dtype="float64")
    for i in range(1, n_fourier_terms + 1):
        sin_X[:, i - 1] = np.sin((2 * np.pi * seasonal_cycle * i) / max_cycle)
        cos_X[:, i - 1] = np.cos((2 * np.pi * seasonal_cycle * i) / max_cycle)
    return np.hstack([sin_X, cos_X])


def add_fourier_features(
    df: pd.DataFrame,
    column_to_encode: str,
    max_value: Optional[int] = None,
    n_fourier_terms: int = 1,
    use_32_bit: bool = False,
) -> Tuple[pd.DataFrame, List]:
    """Adds Fourier Terms for the specified seasonal cycle column, like month, week, hour, etc.

    Args:
        df (pd.DataFrame): The dataframe which has the seasonal cyycles which has to be encoded
        column_to_encode (str): The column name which has the seasonal cycle
        max_value (int): The maximum value the seasonal cycle can attain. for eg. for month, max_value is 12.
            If not given, it will be inferred from the data, but if the data does not have at least a
            single full cycle, the inferred max value will not be appropriate. Defaults to None
        n_fourier_terms (int): Number of fourier terms to be added. Defaults to 1
        use_32_bit (bool, optional): Flag to use float32 or int32 to reduce memory. Defaults to False.
    Raises:
        warnings.warn: Raises a warning if max_value is None

    Returns:
        [Tuple[pd.DataFrame, List]]: Returns a tuple of the new dataframe and a list of features which were added
    """
    assert (
        column_to_encode in df.columns
    ), "`column_to_encode` should be a valid column name in the dataframe"
    assert is_numeric_dtype(
        df[column_to_encode]
    ), "`column_to_encode` should have numeric values."
    if max_value is None:
        max_value = df[column_to_encode].max()
        raise warnings.warn(
            "Inferring max cycle as {} from the data. This may not be accuracte if data is less than a single seasonal cycle."
        )
    fourier_features = _calculate_fourier_terms(
        df[column_to_encode].astype(int).values,
        max_cycle=max_value,
        n_fourier_terms=n_fourier_terms,
    )
    feature_names = [
        f"{column_to_encode}_sin_{i}" for i in range(1, n_fourier_terms + 1)
    ] + [f"{column_to_encode}_cos_{i}" for i in range(1, n_fourier_terms + 1)]

    if len(feature_names) == fourier_features.shape[1]:
      for i, col in enumerate(feature_names):
        df[col] = fourier_features[:,i]
    if use_32_bit:
        df[feature_names] = df[feature_names].astype("float32")
    return df, feature_names


def bulk_add_fourier_features(
    df: pd.DataFrame,
    columns_to_encode: List[str],
    max_values: List[int],
    n_fourier_terms: int = 1,
    use_32_bit: bool = False,
) -> Tuple[pd.DataFrame, List]:
    """Adds Fourier Terms for all the specified seasonal cycle columns, like month, week, hour, etc.

    Args:
        df (pd.DataFrame): The dataframe which has the seasonal cyycles which has to be encoded
        columns_to_encode (List[str]): The column names which has the seasonal cycle
        max_values (List[int]): The list of maximum values the seasonal cycles can attain in the
            same order as the columns to encode. for eg. for month, max_value is 12.
            If not given, it will be inferred from the data, but if the data does not have at least a
            single full cycle, the inferred max value will not be appropriate. Defaults to None
        n_fourier_terms (int): Number of fourier terms to be added. Defaults to 1
        use_32_bit (bool, optional): Flag to use float32 or int32 to reduce memory. Defaults to False.
    Raises:
        warnings.warn: Raises a warning if max_value is None

    Returns:
        [Tuple[pd.DataFrame, List]]: Returns a tuple of the new dataframe and a list of features which were added
    """
    assert len(columns_to_encode) == len(
        max_values
    ), "`columns_to_encode` and `max_values` should be of same length."
    added_features = []
    for column_to_encode, max_value in zip(columns_to_encode, max_values):
        df, features = add_fourier_features(
            df,
            column_to_encode,
            max_value,
            n_fourier_terms=n_fourier_terms,
            use_32_bit=use_32_bit,
        )
        added_features += features
    return df, added_features

In [23]:
 # Fourier Features
full_df_features, fourier_features = bulk_add_fourier_features(
    full_df_features,
     ['timestamp_Month', 'timestamp_Hour'],
     max_values = [12,24],
     n_fourier_terms=5,
     use_32_bit=True
 )
full_df_features.columns

Index(['timestamp', 'T_degC', 'type', 'T_degC_lag_1', 'T_degC_lag_2',
       'T_degC_lag_3', 'T_degC_lag_4', 'T_degC_lag_5', 'T_degC_lag_6',
       'T_degC_lag_7',
       ...
       'timestamp_Hour_sin_1', 'timestamp_Hour_sin_2', 'timestamp_Hour_sin_3',
       'timestamp_Hour_sin_4', 'timestamp_Hour_sin_5', 'timestamp_Hour_cos_1',
       'timestamp_Hour_cos_2', 'timestamp_Hour_cos_3', 'timestamp_Hour_cos_4',
       'timestamp_Hour_cos_5'],
      dtype='object', length=182)

In [24]:
# Save the featured engineered dataset
full_df_features.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420122 entries, 0 to 420121
Columns: 182 entries, timestamp to timestamp_Hour_cos_5
dtypes: datetime64[us](1), float32(165), float64(11), int32(4), object(1)
memory usage: 333.9 MB


In [None]:
full_df_features[full_df_features["type"] == "train"].drop(columns="type").to_parquet('jena_train_features.parquet')
full_df_features[full_df_features["type"] == "val"].drop(columns="type").to_parquet('jena_val_features.parquet')
full_df_features[full_df_features["type"] == "test"].drop(columns="type").to_parquet('jena_test_features.parquet')


## 2 Target Preparation for ML models

## 3 Ensembling and Stacking

## 4 Global forecasting models