In [59]:
!pip install -U scikit-learn

^C


In [None]:
!pip install sktime --ignore-installed llvmlite

In [None]:
!pip install pandas_market_calendars

In [None]:
!pip install -U scikit-optimize

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Read the feature table, then the train/test design matrices, then the label tables.
feature_df = pd.read_csv('features.csv')
X_train, X_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
y_train, y_test = (pd.read_csv(csv_name) for csv_name in ('train_labels.csv', 'test_labels.csv'))

# Quick sanity check on the dimensions of what was loaded.
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

In [48]:
import pandas_market_calendars as mcal

# Trading-day frequency: Monday-Friday weeks minus the NYSE regular and ad-hoc
# holidays exposed by the pandas_market_calendars NYSE calendar.
nyse = mcal.get_calendar('NYSE')
bday_us = pd.offsets.CustomBusinessDay(
    holidays=nyse.adhoc_holidays,
    calendar=nyse.regular_holidays,
    weekmask="1111100",
)

# For every table: parse the 'date' column, promote it to the index and conform
# the rows to the trading-day frequency defined above.
feature_df, X_train, X_test, y_train, y_test = (
    table.assign(date=pd.to_datetime(table['date'], format='%Y-%m-%d'))
         .set_index('date')
         .asfreq(bday_us)
    for table in (feature_df, X_train, X_test, y_train, y_test)
)

# Only the 'ERP_Sign' column of the label tables is used as the target.
y_train = y_train['ERP_Sign']
y_test = y_test['ERP_Sign']

print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

X_train: (3416, 1200), X_test: (855, 1200), y_train: (3416,), y_test: (855,)


In [50]:
# Full target series: the training labels followed by the test labels.
# Its index is what the prediction/probability frames below are aligned to.
data_series = pd.concat([y_train, y_test])

In [51]:
# Number of consecutive lags of each feature used as regressors at every horizon
max_lags = 10
# Forecast horizons h; for horizon h the models use lags h .. h + max_lags - 1
fcst_horizon = [5, 10, 15, 20]
# Base feature names; lagged regressor columns are addressed as '<name>L<lag>'
model_vars = feature_df.columns

In [98]:
from sklearn.linear_model import LogisticRegression

# Three logistic-regression variants sharing the same iteration budget and seed.
# Unpenalised model: the string 'none' is no longer accepted by current scikit-learn
# releases (the notebook upgrades scikit-learn above); the supported spelling is None.
logit_clf = LogisticRegression(penalty=None, solver='lbfgs', max_iter=10000, random_state=42)
# L2-penalised (ridge) model.
ridge_clf = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000, random_state=42)
# L1-penalised (lasso) model; 'saga' is used because 'lbfgs' does not support an L1 penalty.
lasso_clf = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, random_state=42)

In [7]:
import inspect

from sklearn import metrics

# Define (negative) log-loss as the loss function, aka scorer.
# `labels=[0, 1]` is forwarded to metrics.log_loss so that a validation fold
# containing a single class can still be scored.
#
# scikit-learn replaced make_scorer's `needs_proba=True` flag with
# `response_method='predict_proba'` and later removed the old flag. Older versions
# would silently forward an unknown `response_method` to the metric, so the right
# keyword is chosen by inspecting the installed signature rather than by try/except.
if 'response_method' in inspect.signature(metrics.make_scorer).parameters:
    _proba_kwargs = {'response_method': 'predict_proba'}
else:
    _proba_kwargs = {'needs_proba': True}

log_loss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, labels=[0, 1], **_proba_kwargs)

In [None]:
from skopt.space import Real, Integer
from sklearn.model_selection import TimeSeriesSplit
from skopt import BayesSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# Short labels paired with their estimators. The labels (not the estimators' reprs)
# are used for result rows, dictionary keys, column keys, plot titles and file names,
# so that everything produced here can be matched against the 'clf' column.
named_clfs = (
    ('logit_clf', logit_clf),
    ('ridge_clf', ridge_clf),
    ('lasso_clf', lasso_clf),
)

# One row per (horizon, classifier); metric columns are filled in inside the loop.
fcst_train_df = pd.DataFrame(
    [(f'h{h}', clf_name) for h in fcst_horizon for clf_name, _ in named_clfs],
    columns=['fh', 'clf']
)
fcst_test_df = fcst_train_df.copy()

# Convergence is tracked for the training search only. It starts as True and is
# switched to False when the search emits a ConvergenceWarning.
fcst_train_df['Convergence'] = True

# Empty frames on the full (train + test) date index; one column is added per model.
fcst_pred_df = pd.DataFrame().reindex(data_series.index)
fcst_prob_df = pd.DataFrame().reindex(data_series.index)

# Best hyper-parameters and fitted coefficients, keyed by (horizon label, classifier label).
fcst_params_dict = {}
fcst_coefs_dict = {}

# Define the parameter grid for cross-validation to search through
# NOTE(review): C has no effect on the unpenalised logit_clf, so its search only
# repeats the same fit; kept so that all three models follow the same procedure.
param_grid = {
    'C': Real(0.01, 100, prior='log-uniform')
}

i = 0
for h in fcst_horizon:
    p = max_lags + h
    # Training starts p - 1 trading days after the first label date
    # (presumably the first date with all required lags available; confirm against the data prep).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Create an expanding window splitter for cross-validation
    cv = TimeSeriesSplit(n_splits=10)

    for clf_name, clf in named_clfs:
        model_key = (f'h{h}', clf_name)

        search = BayesSearchCV(estimator=clf, search_spaces=param_grid, n_iter=60, scoring=log_loss, cv=cv, random_state=42)
        with warnings.catch_warnings(record=True) as caught_warnings:
            warnings.filterwarnings('always', category=ConvergenceWarning, module='sklearn')
            search.fit(X=X_train_h, y=y_train_h)
        # record=True captures every warning category, so only genuine
        # ConvergenceWarnings are allowed to flag the row.
        if any(issubclass(w.category, ConvergenceWarning) for w in caught_warnings):
            fcst_train_df.loc[i, 'Convergence'] = False

        fcst_params_dict[model_key] = search.best_params_
        # With the default refit=True the search has already refitted the best
        # configuration on the full training window, so no second fit is needed.
        clf_fitted = search.best_estimator_
        # Two-column table: lagged feature name and its fitted coefficient.
        fcst_coefs_dict[model_key] = pd.DataFrame({
            'features': X_train_h.columns.to_numpy(),
            'coef': np.ravel(clf_fitted.coef_),
        })

        y_pred_train = clf_fitted.predict(X_train_h)
        y_prob_train = clf_fitted.predict_proba(X_train_h)[:, 1]
        y_pred_test = clf_fitted.predict(X_test_h)
        y_prob_test = clf_fitted.predict_proba(X_test_h)[:, 1]

        # Same set of metrics for the in-sample and out-of-sample predictions.
        for scores_df, y_true, y_hat, y_score in (
            (fcst_train_df, y_train_h, y_pred_train, y_prob_train),
            (fcst_test_df, y_test, y_pred_test, y_prob_test),
        ):
            scores_df.loc[i, 'Accuracy'] = accuracy_score(y_true, y_hat)
            scores_df.loc[i, 'F1 Score'] = f1_score(y_true, y_hat)
            scores_df.loc[i, 'PR AUC'] = average_precision_score(y_true, y_score)
            scores_df.loc[i, 'Precision'] = precision_score(y_true, y_hat)
            scores_df.loc[i, 'Sensitivity'] = recall_score(y_true, y_hat)
            scores_df.loc[i, 'Specificity'] = recall_score(y_true, y_hat, pos_label=0)
            scores_df.loc[i, 'ROC AUC'] = roc_auc_score(y_true, y_score)

        # Convert np arrays to datetime series for time series plotting
        y_pred = pd.concat([pd.Series(y_pred_train, index=y_train_h.index), pd.Series(y_pred_test, index=y_test.index)])
        y_prob = pd.concat([pd.Series(y_prob_train, index=y_train_h.index), pd.Series(y_prob_test, index=y_test.index)])

        fcst_pred_df[model_key] = y_pred
        fcst_prob_df[model_key] = y_prob

        # Plot the actual ERP sign against the predicted classes, then against
        # the predicted probabilities; each figure is saved and closed.
        for plot_kind, fitted_series, fitted_label, y_axis_label in (
            ('binary', y_pred, 'y_pred', 'ERP Sign (Binary)'),
            ('prob', y_prob, 'y_prob', 'Pr(ERP Sign)'),
        ):
            plot_series(y_train_h, y_test, fitted_series, labels=['y_train', 'y_test', fitted_label], x_label='Date', y_label=y_axis_label)
            plt.title(f'erp_{plot_kind}_{clf_name}_h{h}')
            plt.tight_layout()
            plt.savefig(f'erp_{plot_kind}_{clf_name}_h{h}.png')
            plt.close()

        i += 1

In [None]:
from skopt.space import Real, Integer
from sklearn.model_selection import TimeSeriesSplit
from skopt import BayesSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# One row per horizon for the elastic-net model; metric columns are filled in the loop.
elnet_train_df = pd.DataFrame(
    [(f'h{h}', 'elnet_clf') for h in fcst_horizon], columns=['fh', 'clf']
)
elnet_test_df = elnet_train_df.copy()

# Convergence is tracked for the training search only. It starts as True and is
# switched to False when the search emits a ConvergenceWarning.
elnet_train_df['Convergence'] = True

# Empty frames on the full (train + test) date index; one column is added per horizon.
elnet_pred_df = pd.DataFrame().reindex(data_series.index)
elnet_prob_df = pd.DataFrame().reindex(data_series.index)

# Best hyper-parameters and fitted coefficients, keyed by (horizon label, 'elnet_clf').
elnet_params_dict = {}
elnet_coefs_dict = {}

# Define the parameter grid for cross-validation to search through
param_grid = {
    'C': Real(0.01, 100, prior='log-uniform'),
    'l1_ratio': Real(0.01, 1)
}

# The estimator template does not depend on the horizon; the search works on clones of it.
elnet_clf = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000, random_state=42)

i = 0
for h in fcst_horizon:
    p = max_lags + h
    # Training starts p - 1 trading days after the first label date
    # (presumably the first date with all required lags available; confirm against the data prep).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Create an expanding window splitter for cross-validation
    cv = TimeSeriesSplit(n_splits=10)

    model_key = (f'h{h}', 'elnet_clf')

    search = BayesSearchCV(estimator=elnet_clf, search_spaces=param_grid, n_iter=60, scoring=log_loss, cv=cv, random_state=42)
    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.filterwarnings('always', category=ConvergenceWarning, module='sklearn')
        search.fit(X=X_train_h, y=y_train_h)
    # record=True captures every warning category, so only genuine
    # ConvergenceWarnings are allowed to flag the row.
    if any(issubclass(w.category, ConvergenceWarning) for w in caught_warnings):
        elnet_train_df.loc[i, 'Convergence'] = False

    elnet_params_dict[model_key] = search.best_params_

    # With the default refit=True the search has already refitted the best
    # configuration on the full training window, so no second fit is needed.
    elnet_fitted = search.best_estimator_
    # Two-column table: lagged feature name and its fitted coefficient.
    elnet_coefs_dict[model_key] = pd.DataFrame({
        'features': X_train_h.columns.to_numpy(),
        'coef': np.ravel(elnet_fitted.coef_),
    })

    y_pred_train = elnet_fitted.predict(X_train_h)
    y_prob_train = elnet_fitted.predict_proba(X_train_h)[:, 1]
    y_pred_test = elnet_fitted.predict(X_test_h)
    y_prob_test = elnet_fitted.predict_proba(X_test_h)[:, 1]

    # Same set of metrics for the in-sample and out-of-sample predictions.
    for scores_df, y_true, y_hat, y_score in (
        (elnet_train_df, y_train_h, y_pred_train, y_prob_train),
        (elnet_test_df, y_test, y_pred_test, y_prob_test),
    ):
        scores_df.loc[i, 'Accuracy'] = accuracy_score(y_true, y_hat)
        scores_df.loc[i, 'F1 Score'] = f1_score(y_true, y_hat)
        scores_df.loc[i, 'PR AUC'] = average_precision_score(y_true, y_score)
        scores_df.loc[i, 'Precision'] = precision_score(y_true, y_hat)
        scores_df.loc[i, 'Sensitivity'] = recall_score(y_true, y_hat)
        scores_df.loc[i, 'Specificity'] = recall_score(y_true, y_hat, pos_label=0)
        scores_df.loc[i, 'ROC AUC'] = roc_auc_score(y_true, y_score)

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([pd.Series(y_pred_train, index=y_train_h.index), pd.Series(y_pred_test, index=y_test.index)])
    y_prob = pd.concat([pd.Series(y_prob_train, index=y_train_h.index), pd.Series(y_prob_test, index=y_test.index)])

    elnet_pred_df[model_key] = y_pred
    elnet_prob_df[model_key] = y_prob

    # Plot the actual ERP sign against the predicted classes, then against
    # the predicted probabilities; each figure is saved and closed.
    for plot_kind, fitted_series, fitted_label, y_axis_label in (
        ('binary', y_pred, 'y_pred', 'ERP Sign (Binary)'),
        ('prob', y_prob, 'y_prob', 'Pr(ERP Sign)'),
    ):
        plot_series(y_train_h, y_test, fitted_series, labels=['y_train', 'y_test', fitted_label], x_label='Date', y_label=y_axis_label)
        plt.title(f'erp_{plot_kind}_elnet_clf_h{h}')
        plt.tight_layout()
        plt.savefig(f'erp_{plot_kind}_elnet_clf_h{h}.png')
        plt.close()

    i += 1

In [None]:
# Metric tables are stacked row-wise with a fresh 0..n-1 index.
combined_train_df, combined_test_df = (
    pd.concat(list(table_pair), ignore_index=True)
    for table_pair in (
        (fcst_train_df, elnet_train_df),
        (fcst_test_df, elnet_test_df),
    )
)

# Prediction and probability frames are joined side by side on the date index.
combined_pred_df, combined_prob_df = (
    pd.concat(list(frame_pair), axis=1)
    for frame_pair in (
        (fcst_pred_df, elnet_pred_df),
        (fcst_prob_df, elnet_prob_df),
    )
)

In [None]:
# Persist the combined tables; each output file name sits next to the frame it stores.
csv_outputs = (
    ('erp_logit_train_results.csv', combined_train_df),
    ('erp_logit_test_results.csv', combined_test_df),
    ('erp_logit_pred_estimates.csv', combined_pred_df),
    ('erp_logit_prob_estimates.csv', combined_prob_df),
)
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path)

In [None]:
# Best hyper-parameters found by each search: one row per dictionary key,
# one column per tuned parameter.
fcst_params_df = pd.DataFrame.from_dict(fcst_params_dict, orient='index')
elnet_params_df = pd.DataFrame.from_dict(elnet_params_dict, orient='index')

# Keep the row index built from the dictionary keys. Replacing it with 0..n-1
# (ignore_index=True) would leave the exported rows without any indication of
# which horizon/classifier they belong to.
combined_params_df = pd.concat([fcst_params_df, elnet_params_df])
combined_params_df.to_csv('erp_logit_parameters.csv')

In [None]:
# Write one coefficient table per horizon: rows are the lagged features used at
# that horizon, columns are the classifiers fitted for it.
for h in (5, 10):
    horizon_key = f'h{h}'

    # Collected afresh for every horizon so that one horizon's file never carries
    # another horizon's columns (the lag columns differ between horizons).
    coef_columns = []
    for coefs_dict in (fcst_coefs_dict, elnet_coefs_dict):
        # Select entries by the horizon part of the key; the classifier part of
        # the key is used verbatim as the column name.
        for (fh, clf_label), coefs_df in coefs_dict.items():
            if fh != horizon_key:
                continue
            # set_axis returns a relabelled copy, leaving the stored frame untouched.
            labelled = coefs_df.set_axis(['features', clf_label], axis=1)
            coef_columns.append(labelled.set_index('features')[clf_label])

    if not coef_columns:
        raise KeyError(f'no coefficients recorded for horizon {horizon_key}')

    combined_coefs_df = pd.concat(coef_columns, axis=1)
    combined_coefs_df.to_csv(f'erp_logit_h{h}_coefficients.csv')