In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.stattools import adfuller

warnings.filterwarnings("ignore")

In [2]:
file_path = 'us_births_dataset.tsf'

encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
for encoding in encodings:
    try:
        df = pd.read_csv(file_path, encoding=encoding, sep='\t')
        break
    except UnicodeDecodeError:
        print(f"Failed to decode using {encoding} encoding. Trying another encoding...")
        continue

In [3]:
df = df.iloc[14:].rename(columns={'# Dataset Information': 'val'})
df['dt'] = df['val'].apply(lambda x: x.split(':')[1])
df['val'] = df['val'].apply(lambda x: x.split(':')[2])
res = pd.DataFrame()
res['val'] = df.loc[14]['val'].split(',')
res['val'] = res['val'].astype(int)
res['dt'] = pd.date_range(start=df['dt'].loc[14], periods=len(res), freq='D')
df_train = res[res['dt']<'1988']
df_val = res[res['dt']>='1988']

## Tabularizing the data itself is not hard

In [4]:
train_raw = pd.DataFrame()

for i in range(30):
    name = f'feature_{i}' if i<29 else 'y'
    train_raw[name] = df_train['val'].shift(29-i)
    
train_raw = train_raw.dropna().astype(int)

## Let's get the baseline straight away -- just predict the YoY values

In [5]:
baseline = res[(res['dt']<'1988')&(res['dt']>='1987')]
# hack to get around leap year
before = baseline[baseline['dt']<='1987-02-28']
after = baseline[baseline['dt']>'1987-02-28']
leap = baseline[baseline['dt']=='1987-02-28']
baseline = pd.concat([before, leap, after])

pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    preds = baseline['val'].iloc[:n]
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds)}""")

Prediction period: 7 days, MAPE: 0.08566350097603315
Prediction period: 30 days, MAPE: 0.08876079721836848
Prediction period: 90 days, MAPE: 0.1063334702841136
Prediction period: 180 days, MAPE: 0.13028834209142345
Prediction period: 366 days, MAPE: 0.13830846637763883


In [6]:
%%time
model = LinearRegression()
model.fit(train_raw.drop(columns=['y']), train_raw['y'])

CPU times: total: 734 ms
Wall time: 897 ms


# Another interesting question concerns how quickly the model's performance degrades

In [7]:
from typing import List
# We will keep the slider as our X, and push the predictions to a list of predictions
# It is slow because we keep reinitializing the arra


def make_rolling_predictions(df_train: pd.DataFrame, period_len: int) -> List[float]:
    preds = []
    slider = train_raw.drop(columns=['feature_0']).iloc[-1].to_numpy()
    for i in range(period_len):
        pred = model.predict(slider[i:].reshape(1, -1))[0]
        slider = np.append(slider, pred)
        preds.append(pred)
    return preds

In [8]:
# Week
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    preds = make_rolling_predictions(train_raw, n)
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds)}""")

Prediction period: 7 days, MAPE: 0.040616854752418545
Prediction period: 30 days, MAPE: 0.04128496022374253
Prediction period: 90 days, MAPE: 0.04073536140554237
Prediction period: 180 days, MAPE: 0.05380131757016138
Prediction period: 366 days, MAPE: 0.08288249899920987


## It seems that the first month is quite volatile, but after half a year performance starts degrading on average

# Will boosting be better?

In [23]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from xgboost import XGBRegressor

In [20]:
val_xgboost = train_raw.iloc[-365:]
train_xgboost = train_raw.iloc[:-365]

x_train = train_xgboost.drop(columns=['y'])
y_train = train_xgboost['y']

x_val = val_xgboost.drop(columns=['y'])
y_val = val_xgboost['y']

In [31]:
def make_rolling_predictions_xgboost(df_train: pd.DataFrame, period_len: int, model) -> List[float]:
    preds = []
    slider = df_train.iloc[-1].to_numpy()
    for i in range(period_len):
        pred = model.predict(slider[i:].reshape(1, -1))[0]
        slider = np.append(slider, pred)
        preds.append(pred)
    return preds

In [37]:
%%time

#XGBoost tuning

space={
    'max_depth': hp.quniform("max_depth", 3, 8, 1),
    'gamma': hp.uniform ('gamma', 1,9),
    'reg_alpha' : hp.quniform('reg_alpha', 40,150,1),
    'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 50, 500),
    'learning_rate': hp.uniform('learning_rate', 0.05,0.2),
}

def objective(space):
    hyperparams = {
        'eval_metric': "mape",
        'n_estimators': int(space['n_estimators']), 
        'max_depth': int(space['max_depth']), 
        'gamma': space['gamma'],
        'reg_alpha': int(space['reg_alpha']),
        'min_child_weight': int(space['min_child_weight']),
        'colsample_bytree': int(space['colsample_bytree']),
        'learning_rate': space['learning_rate'],
    }
    clf=XGBRegressor(**hyperparams)
    clf.fit(x_train, y_train)
    
    preds = make_rolling_predictions_xgboost(x_val, 365, clf)
    opt_metric = mean_absolute_percentage_error(y_val, preds)
    print(f"""Prediction period: 365 days, MAPE: {opt_metric}""")
    print ("SCORE:", opt_metric)
    return {'loss': opt_metric, 'status': STATUS_OK }

trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 500,
                        trials = trials)
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

Prediction period: 365 days, MAPE: 0.14460696997019232                                                                 
SCORE:                                                                                                                 
0.14460696997019232                                                                                                    
Prediction period: 365 days, MAPE: 0.14773007413932399                                                                 
SCORE:                                                                                                                 
0.14773007413932399                                                                                                    
Prediction period: 365 days, MAPE: 0.12944790454066346                                                                 
SCORE:                                                                                                                 
0.12944790454066346                     

0.06512164874732555                                                                                                    
Prediction period: 365 days, MAPE: 0.08750592945966836                                                                 
SCORE:                                                                                                                 
0.08750592945966836                                                                                                    
Prediction period: 365 days, MAPE: 0.06802289812830628                                                                 
SCORE:                                                                                                                 
0.06802289812830628                                                                                                    
Prediction period: 365 days, MAPE: 0.07426775788141995                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.10477582392649709                                                                                                    
Prediction period: 365 days, MAPE: 0.12172584942639555                                                                 
SCORE:                                                                                                                 
0.12172584942639555                                                                                                    
Prediction period: 365 days, MAPE: 0.1082761658934094                                                                  
SCORE:                                                                                                                 
0.1082761658934094                                                                                                     
Prediction period: 365 days, MAPE: 0.111

Prediction period: 365 days, MAPE: 0.07000595419230182                                                                 
SCORE:                                                                                                                 
0.07000595419230182                                                                                                    
Prediction period: 365 days, MAPE: 0.06751545272120266                                                                 
SCORE:                                                                                                                 
0.06751545272120266                                                                                                    
Prediction period: 365 days, MAPE: 0.10227634642023425                                                                 
SCORE:                                                                                                                 
0.10227634642023425                     

0.06804296334718003                                                                                                    
Prediction period: 365 days, MAPE: 0.0837422923403941                                                                  
SCORE:                                                                                                                 
0.0837422923403941                                                                                                     
Prediction period: 365 days, MAPE: 0.08250566379282002                                                                 
SCORE:                                                                                                                 
0.08250566379282002                                                                                                    
Prediction period: 365 days, MAPE: 0.10872190899008449                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.12644655973871496                                                                                                    
Prediction period: 365 days, MAPE: 0.07303310001851858                                                                 
SCORE:                                                                                                                 
0.07303310001851858                                                                                                    
Prediction period: 365 days, MAPE: 0.07105071814410867                                                                 
SCORE:                                                                                                                 
0.07105071814410867                                                                                                    
Prediction period: 365 days, MAPE: 0.086

Prediction period: 365 days, MAPE: 0.08545396959920916                                                                 
SCORE:                                                                                                                 
0.08545396959920916                                                                                                    
Prediction period: 365 days, MAPE: 0.13403346549321676                                                                 
SCORE:                                                                                                                 
0.13403346549321676                                                                                                    
Prediction period: 365 days, MAPE: 0.1282640892374834                                                                  
SCORE:                                                                                                                 
0.1282640892374834                      

0.0871654428428772                                                                                                     
Prediction period: 365 days, MAPE: 0.0698199668136274                                                                  
SCORE:                                                                                                                 
0.0698199668136274                                                                                                     
Prediction period: 365 days, MAPE: 0.09771302727626383                                                                 
SCORE:                                                                                                                 
0.09771302727626383                                                                                                    
Prediction period: 365 days, MAPE: 0.07439380897171521                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.10159066030601908                                                                                                    
Prediction period: 365 days, MAPE: 0.07046813664245163                                                                 
SCORE:                                                                                                                 
0.07046813664245163                                                                                                    
Prediction period: 365 days, MAPE: 0.08766447559162911                                                                 
SCORE:                                                                                                                 
0.08766447559162911                                                                                                    
Prediction period: 365 days, MAPE: 0.091

Prediction period: 365 days, MAPE: 0.07819005984233997                                                                 
SCORE:                                                                                                                 
0.07819005984233997                                                                                                    
Prediction period: 365 days, MAPE: 0.08450815809520403                                                                 
SCORE:                                                                                                                 
0.08450815809520403                                                                                                    
Prediction period: 365 days, MAPE: 0.10352439406977249                                                                 
SCORE:                                                                                                                 
0.10352439406977249                     

0.09484045175089723                                                                                                    
Prediction period: 365 days, MAPE: 0.07257223385829853                                                                 
SCORE:                                                                                                                 
0.07257223385829853                                                                                                    
Prediction period: 365 days, MAPE: 0.09112827657713148                                                                 
SCORE:                                                                                                                 
0.09112827657713148                                                                                                    
Prediction period: 365 days, MAPE: 0.06478319096679724                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.06762950908973776                                                                                                    
Prediction period: 365 days, MAPE: 0.09344185972275924                                                                 
SCORE:                                                                                                                 
0.09344185972275924                                                                                                    
Prediction period: 365 days, MAPE: 0.07296754097829933                                                                 
SCORE:                                                                                                                 
0.07296754097829933                                                                                                    
Prediction period: 365 days, MAPE: 0.141

Prediction period: 365 days, MAPE: 0.08435270148391447                                                                 
SCORE:                                                                                                                 
0.08435270148391447                                                                                                    
Prediction period: 365 days, MAPE: 0.07221345914095999                                                                 
SCORE:                                                                                                                 
0.07221345914095999                                                                                                    
Prediction period: 365 days, MAPE: 0.0708983453484731                                                                  
SCORE:                                                                                                                 
0.0708983453484731                      

0.08963622868157757                                                                                                    
Prediction period: 365 days, MAPE: 0.06195630899579082                                                                 
SCORE:                                                                                                                 
0.06195630899579082                                                                                                    
Prediction period: 365 days, MAPE: 0.06811810228109966                                                                 
SCORE:                                                                                                                 
0.06811810228109966                                                                                                    
Prediction period: 365 days, MAPE: 0.06924459137447962                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.13764684055045945                                                                                                    
Prediction period: 365 days, MAPE: 0.05501557595261017                                                                 
SCORE:                                                                                                                 
0.05501557595261017                                                                                                    
Prediction period: 365 days, MAPE: 0.07532079292351813                                                                 
SCORE:                                                                                                                 
0.07532079292351813                                                                                                    
Prediction period: 365 days, MAPE: 0.075

Prediction period: 365 days, MAPE: 0.0632338417046759                                                                  
SCORE:                                                                                                                 
0.0632338417046759                                                                                                     
Prediction period: 365 days, MAPE: 0.07379458479705378                                                                 
SCORE:                                                                                                                 
0.07379458479705378                                                                                                    
Prediction period: 365 days, MAPE: 0.09928710819314684                                                                 
SCORE:                                                                                                                 
0.09928710819314684                     

0.08883683985359034                                                                                                    
Prediction period: 365 days, MAPE: 0.09319662554940332                                                                 
SCORE:                                                                                                                 
0.09319662554940332                                                                                                    
Prediction period: 365 days, MAPE: 0.05749753226962692                                                                 
SCORE:                                                                                                                 
0.05749753226962692                                                                                                    
Prediction period: 365 days, MAPE: 0.10213444837396432                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.11452623495896246                                                                                                    
Prediction period: 365 days, MAPE: 0.09723291050940461                                                                 
SCORE:                                                                                                                 
0.09723291050940461                                                                                                    
Prediction period: 365 days, MAPE: 0.08161663662514108                                                                 
SCORE:                                                                                                                 
0.08161663662514108                                                                                                    
Prediction period: 365 days, MAPE: 0.068

Prediction period: 365 days, MAPE: 0.07703928446803956                                                                 
SCORE:                                                                                                                 
0.07703928446803956                                                                                                    
Prediction period: 365 days, MAPE: 0.06714560047031554                                                                 
SCORE:                                                                                                                 
0.06714560047031554                                                                                                    
Prediction period: 365 days, MAPE: 0.08293625020072942                                                                 
SCORE:                                                                                                                 
0.08293625020072942                     

0.08630918541055675                                                                                                    
Prediction period: 365 days, MAPE: 0.10005280786851113                                                                 
SCORE:                                                                                                                 
0.10005280786851113                                                                                                    
Prediction period: 365 days, MAPE: 0.06941471646958285                                                                 
SCORE:                                                                                                                 
0.06941471646958285                                                                                                    
Prediction period: 365 days, MAPE: 0.07010886136274358                                                                 
SCORE:                                  

SCORE:                                                                                                                 
0.07619663766599535                                                                                                    
Prediction period: 365 days, MAPE: 0.06758819385588874                                                                 
SCORE:                                                                                                                 
0.06758819385588874                                                                                                    
Prediction period: 365 days, MAPE: 0.06533034953602743                                                                 
SCORE:                                                                                                                 
0.06533034953602743                                                                                                    
Prediction period: 365 days, MAPE: 0.082

Prediction period: 365 days, MAPE: 0.069937269594544                                                                   
SCORE:                                                                                                                 
0.069937269594544                                                                                                      
Prediction period: 365 days, MAPE: 0.08967925971390167                                                                 
SCORE:                                                                                                                 
0.08967925971390167                                                                                                    
Prediction period: 365 days, MAPE: 0.08704095973493044                                                                 
SCORE:                                                                                                                 
0.08704095973493044                     

0.0701558958540125                                                                                                     
Prediction period: 365 days, MAPE: 0.0960138122362339                                                                  
SCORE:                                                                                                                 
0.0960138122362339                                                                                                     
100%|█████████████████████████████████████████████| 500/500 [08:40<00:00,  1.04s/trial, best loss: 0.05413192035198688]
The best hyperparameters are :  

{'colsample_bytree': 0.604424274212893, 'gamma': 4.908344535766618, 'learning_rate': 0.09482545161462563, 'max_depth': 3.0, 'min_child_weight': 10.0, 'n_estimators': 477.2564738821956, 'reg_alpha': 126.0, 'reg_lambda': 0.14229415737818346}
CPU times: total: 1h 35min 47s
Wall time: 8min 40s


In [44]:
%%time
import xgboost as xgb

params = {
    'colsample_bytree': 0.604424274212893, 
    'gamma': 4.908344535766618, 
    'learning_rate': 0.09482545161462563, 
    'max_depth': 3, 
    'min_child_weight': 10.0, 
    'n_estimators': 377, 
    'reg_alpha': 126.0, 
    'reg_lambda': 0.14229415737818346
}

model = xgb.XGBRegressor(**params)
model.fit(train_raw.drop(columns=['y']), train_raw['y'])

CPU times: total: 6.17 s
Wall time: 533 ms


In [45]:
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    preds = make_rolling_predictions(train_raw, n)
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds)}""")

Prediction period: 7 days, MAPE: 0.040659403692688154
Prediction period: 30 days, MAPE: 0.03086103675251898
Prediction period: 90 days, MAPE: 0.03179675321045949
Prediction period: 180 days, MAPE: 0.03780194984594605
Prediction period: 366 days, MAPE: 0.06081115515936373


# Common technique with forecasting is making a time series stationary. Will it work here?

In [15]:
# https://analyzingalpha.com/make-time-series-stationary-python

t_stat, p_value, _, _, critical_values, _  = adfuller(res['val'].values, autolag='AIC')
print(f'ADF Statistic: {t_stat:.2f}')
print(f'p-value: {p_value:.2f}')
for key, value in critical_values.items():
    print('Critial Values:')
    print(f'   {key}, {value:.2f}')

ADF Statistic: -3.35
p-value: 0.01
Critial Values:
   1%, -3.43
Critial Values:
   5%, -2.86
Critial Values:
   10%, -2.57


### Since it is already stationary, we will skip this

# Let's try some enterprise-grade libraries

In [124]:
# Example of enterprise-grade code
# https://github.com/EnterpriseQualityCoding/FizzBuzzEnterpriseEdition

### Prophet

In [129]:
from prophet import Prophet

train_prophet = df_train.rename(columns={'val': 'y', 'dt': 'ds'})
train_prophet['ds'] = train_prophet['ds'].dt.tz_localize(None)
m = Prophet(seasonality_mode='multiplicative').fit(train_prophet)
future = m.make_future_dataframe(periods=366)
fcst = m.predict(future)
preds = fcst['yhat'][-366:]

11:15:05 - cmdstanpy - INFO - Chain [1] start processing
11:15:07 - cmdstanpy - INFO - Chain [1] done processing


In [137]:
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds[:n])}""")

Prediction period: 7 days, MAPE: 0.07064687509902642
Prediction period: 30 days, MAPE: 0.03405022642598421
Prediction period: 90 days, MAPE: 0.030315835205533676
Prediction period: 180 days, MAPE: 0.035795417191863396
Prediction period: 366 days, MAPE: 0.03923938972163525


### Orbit

Note: could not install orbit on windows locally, ran experiment on Google collab

In [None]:
import orbit
from orbit.models import DLT, LGT

In [None]:
%%time

lgt = LGT(
    response_col='y',
    date_col='ds',
    seasonality=365,
)
lgt.fit(df=train_prophet, point_method='mean')
preds = lgt.predict(df=df_val)
preds = preds['prediction']

In [None]:
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds[:n])}""")

In [None]:
Prediction period: 7 days, MAPE: 0.11142835532232606
Prediction period: 30 days, MAPE: 0.09469582851267362
Prediction period: 90 days, MAPE: 0.08957789656238524
Prediction period: 180 days, MAPE: 0.09654671277155341
Prediction period: 366 days, MAPE: 0.10335305100458006

In [None]:
dlt = DLT(
    response_col='y',
    date_col='ds',
    estimator='stan-map',
    seasonality=365,
    global_trend_option='linear',
    # for prediction uncertainty
    n_bootstrap_draws=1000,
)

dlt.fit(train_prophet)
test_df = dlt.make_future_df(periods=366)
predicted_df = dlt.predict(test_df)
preds = predicted_df['prediction']

In [None]:
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds[:n])}""")

In [None]:
Prediction period: 7 days, MAPE: 0.1253211184178577
Prediction period: 30 days, MAPE: 0.10249061121421463
Prediction period: 90 days, MAPE: 0.0978171992098667
Prediction period: 180 days, MAPE: 0.10230472017225993
Prediction period: 366 days, MAPE: 0.1094757246466234

# Kite

Note: ran on google collab, could not install on windows

In [None]:
from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

In [None]:
metadata = MetadataParam(
    time_col="ds",
    value_col="y",
    freq="D",
)

forecaster = Forecaster()
result = forecaster.run_forecast_config(
    df=df,
    config=ForecastConfig(
        model_template=ModelTemplateEnum.AUTO.name,
        forecast_horizon=366,
        coverage=0.95,
        metadata_param=metadata
    )
)
preds = result.forecast.df
preds = preds[preds['actual'].isna()]['forecast']

In [None]:
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds[:n])}""")

In [None]:
Prediction period: 7 days, MAPE: 0.03870756034638719
Prediction period: 30 days, MAPE: 0.01827488650558939
Prediction period: 90 days, MAPE: 0.016650699439004827
Prediction period: 180 days, MAPE: 0.02934550398573212
Prediction period: 366 days, MAPE: 0.03319412447428402

### Deep Learning

In [16]:
from sktime.forecasting.ltsf import LTSFLinearForecaster
# https://www.sktime.net/en/latest/api_reference/auto_generated/sktime.forecasting.ltsf.LTSFLinearForecaster.html
dl_forecaster = LTSFLinearForecaster(
    seq_len=180, 
    pred_len=180,
    num_epochs=50, 
    batch_size=24,
    lr=0.0003
)

In [17]:
%%time

horizon = list(range(366))
dl_forecaster.fit(df_train.set_index('dt'), fh=horizon)
preds = dl_forecaster.predict() 

CPU times: total: 2min 9s
Wall time: 22.8 s


In [18]:
pred_lens = [7, 30, 90, 180, 366]
for n in pred_lens:
    print(f"""Prediction period: {n} days, MAPE: {mean_absolute_percentage_error(df_val['val'].iloc[:n], preds.iloc[:n])}""")

Prediction period: 7 days, MAPE: 0.14480080867199027
Prediction period: 30 days, MAPE: 0.10108456423567207
Prediction period: 90 days, MAPE: 0.09461629589589346
Prediction period: 180 days, MAPE: 0.0947847687264964
Prediction period: 366 days, MAPE: 0.09191100032539415
