In [1]:
!pip install -U scikit-learn

^C


In [None]:
!pip install sktime --ignore-installed llvmlite

In [None]:
!pip install pandas_market_calendars

In [None]:
!pip install -U scikit-optimize

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

In [3]:
# Raw inputs: the feature table plus the train/test design matrices and label tables.
# The 'date' column is still an ordinary column here; it becomes the index in the next cell.
feature_df = pd.read_csv('features.csv')

X_train, X_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
y_train, y_test = pd.read_csv('train_labels.csv'), pd.read_csv('test_labels.csv')

# Quick sanity check on the dimensions of each split
shape_report = f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}'
print(shape_report)

X_train: (3416, 1201), X_test: (855, 1201), y_train: (3416, 2), y_test: (855, 2)


In [4]:
import pandas_market_calendars as mcal

# NYSE trading-day offset: Monday-Friday week, skipping the exchange's regular and
# ad-hoc holidays.
nyse = mcal.get_calendar('NYSE')
bday_us = pd.offsets.CustomBusinessDay(
    weekmask="1111100",
    calendar=nyse.regular_holidays,
    holidays=nyse.adhoc_holidays,
)

# Parse each frame's 'date' column and promote it to the index. This mutates the
# frames in place, so the cell cannot be re-run without reloading the CSVs first.
for df in (feature_df, X_train, X_test, y_train, y_test):
    parsed_dates = pd.to_datetime(df['date'], format = '%Y-%m-%d')
    df['date'] = parsed_dates
    df.set_index('date', inplace=True)

# Conform every frame to the trading-day frequency; the label tables are reduced
# to their 'ERP_Sign' column, i.e. they become Series.
feature_df, X_train, X_test = (
    frame.asfreq(bday_us) for frame in (feature_df, X_train, X_test)
)
y_train, y_test = (
    labels.asfreq(bday_us)['ERP_Sign'] for labels in (y_train, y_test)
)

shape_report = f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}'
print(shape_report)

X_train: (3416, 1200), X_test: (855, 1200), y_train: (3416,), y_test: (855,)


In [5]:
# Full label history: training labels followed by test labels. Its index is the date
# axis that the per-model prediction/probability tables are reindexed to.
data_series = pd.concat([y_train, y_test])

In [31]:
# Number of consecutive lags of each variable fed to a model (lags h .. h + max_lags - 1
# for horizon h).
max_lags = 10
# Forecast horizons; each one gets its own model and its own 'h{horizon}' label.
# presumably measured in trading days -- TODO confirm against the feature construction
fcst_horizon = [5, 10]
# Base variable names; the lagged design-matrix columns are named '{variable}L{lag}'.
model_vars = feature_df.columns

In [27]:
# Logistic-regression keyword arguments indexed by forecast-horizon label ('fh');
# only the rows belonging to the 'elnet_clf' classifier are kept.
logit_params_df = (
    pd.read_csv('erp_logit_parameters.csv', index_col=0)
    .set_index('fh')
    .loc[lambda params: params['clf'] == 'elnet_clf']
    .drop(columns='clf')
)

# {horizon label: {parameter name: value}}, unpacked into LogisticRegression(**...) later on
logit_params_dict = logit_params_df.to_dict(orient='index')

In [14]:
# XGBoost keyword arguments, one row per index label of the CSV's first column
# (looked up as 'h{horizon}' later on); the stray 'Unnamed: 1' column is discarded.
xgb_params_df = (
    pd.read_csv('erp_xgb_parameters.csv', index_col=0)
    .drop(columns='Unnamed: 1')
)

# {row label: {parameter name: value}}, unpacked into XGBClassifier(**...) later on
xgb_params_dict = xgb_params_df.to_dict(orient='index')

In [16]:
# k-nearest-neighbours keyword arguments, one row per index label of the CSV's first
# column (looked up as 'h{horizon}' later on); the stray 'Unnamed: 1' column is discarded.
knn_params_df = (
    pd.read_csv('erp_knn_parameters.csv', index_col=0)
    .drop(columns='Unnamed: 1')
)

# {row label: {parameter name: value}}, unpacked into KNeighborsClassifier(**...) later on
knn_params_dict = knn_params_df.to_dict(orient='index')

In [32]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# Hard-voting ensemble (logit + XGBoost + kNN), fitted once per forecast horizon.
#
# Score tables: one row per horizon, identified by ('fh', 'clf'); the metric columns
# are created as the loop below fills them in.
voting_h_train_df = pd.DataFrame(
    [(f'h{h}', 'voting_h_clf') for h in fcst_horizon], columns=['fh','clf']
)

voting_h_test_df = pd.DataFrame(
    [(f'h{h}', 'voting_h_clf') for h in fcst_horizon], columns=['fh','clf']
)

# Predicted labels for every horizon, aligned on the full train + test date index
voting_h_pred_df = pd.DataFrame().reindex(data_series.index)

for i, h in enumerate(fcst_horizon):
    # Horizon h uses lags h .. h + max_lags - 1 of every model variable
    p = max_lags + h
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]

    # The training sample starts p - 1 trading days after the first training date
    # (presumably the first date on which all selected lags are available -- confirm).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Base learners, configured with the stored parameters of this horizon
    log_clf = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=10000, random_state=42,
        **logit_params_dict[f'h{h}']
    )
    xgb_clf = XGBClassifier(
        learning_rate=0.01, objective='binary:logistic', random_state=42,
        **xgb_params_dict[f'h{h}']
    )
    knn_clf = KNeighborsClassifier(**knn_params_dict[f'h{h}'])
    estimators_v = [('log', log_clf), ('xgb', xgb_clf), ('knn', knn_clf)]

    # Hard voting predicts the majority label and exposes no predict_proba, which is
    # why this cell reports no probability-based metrics (PR AUC / ROC AUC).
    voting_h_clf = VotingClassifier(estimators=estimators_v, voting='hard')
    voting_h_clf.fit(X_train_h, y_train_h)

    y_pred_train = voting_h_clf.predict(X_train_h)
    y_pred_test = voting_h_clf.predict(X_test_h)

    # Same metrics, in the same column order, for the in-sample and out-of-sample split
    for scores_df, y_true, y_hat in (
        (voting_h_train_df, y_train_h, y_pred_train),
        (voting_h_test_df, y_test, y_pred_test),
    ):
        scores_df.loc[i, 'Accuracy'] = accuracy_score(y_true, y_hat)
        scores_df.loc[i, 'F1 Score'] = f1_score(y_true, y_hat)
        scores_df.loc[i, 'Precision'] = precision_score(y_true, y_hat)
        scores_df.loc[i, 'Sensitivity'] = recall_score(y_true, y_hat)
        # Specificity is the recall of the negative class (label 0)
        scores_df.loc[i, 'Specificity'] = recall_score(y_true, y_hat, pos_label=0)

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([
        pd.Series(y_pred_train, index=y_train_h.index),
        pd.Series(y_pred_test, index=y_test.index),
    ])

    voting_h_pred_df[(f'h{h}', 'voting_h_clf')] = y_pred

    # Plot the actual ERP sign against the predicted labels; close the figure
    # explicitly so figures do not pile up across horizons.
    fig, ax = plot_series(y_train_h, y_test, y_pred, labels=['y_train', 'y_test', 'y_pred'], x_label='Date', y_label='ERP Sign (Binary)')
    ax.set_title(f'erp_binary_voting_h_clf_h{h}')
    fig.tight_layout()
    fig.savefig(f'erp_binary_voting_h_clf_h{h}.png')
    plt.close(fig)


In [39]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# Soft-voting ensemble (logit + XGBoost + kNN), fitted once per forecast horizon.
#
# Score tables: one row per horizon, identified by ('fh', 'clf'); the metric columns
# are created as the loop below fills them in.
voting_s_train_df = pd.DataFrame(
    [(f'h{h}', 'voting_s_clf') for h in fcst_horizon], columns=['fh','clf']
)

voting_s_test_df = pd.DataFrame(
    [(f'h{h}', 'voting_s_clf') for h in fcst_horizon], columns=['fh','clf']
)

# Predicted labels / probabilities per horizon, aligned on the full train + test date index
voting_s_pred_df = pd.DataFrame().reindex(data_series.index)
voting_s_prob_df = pd.DataFrame().reindex(data_series.index)

for i, h in enumerate(fcst_horizon):
    # Horizon h uses lags h .. h + max_lags - 1 of every model variable
    p = max_lags + h
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]

    # The training sample starts p - 1 trading days after the first training date
    # (presumably the first date on which all selected lags are available -- confirm).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Base learners, configured with the stored parameters of this horizon
    log_clf = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=10000, random_state=42,
        **logit_params_dict[f'h{h}']
    )
    xgb_clf = XGBClassifier(
        learning_rate=0.01, objective='binary:logistic', random_state=42,
        **xgb_params_dict[f'h{h}']
    )
    knn_clf = KNeighborsClassifier(**knn_params_dict[f'h{h}'])
    estimators_v = [('log', log_clf), ('xgb', xgb_clf), ('knn', knn_clf)]

    # Soft voting averages the base learners' class probabilities
    voting_s_clf = VotingClassifier(estimators=estimators_v, voting='soft')
    voting_s_clf.fit(X_train_h, y_train_h)

    # Labels plus the probability of the second class column
    # (label 1, assuming the target is coded 0/1)
    y_pred_train = voting_s_clf.predict(X_train_h)
    y_prob_train = voting_s_clf.predict_proba(X_train_h)[:, 1]
    y_pred_test = voting_s_clf.predict(X_test_h)
    y_prob_test = voting_s_clf.predict_proba(X_test_h)[:, 1]

    # Same metrics, in the same column order, for the in-sample and out-of-sample split
    for scores_df, y_true, y_hat, p_hat in (
        (voting_s_train_df, y_train_h, y_pred_train, y_prob_train),
        (voting_s_test_df, y_test, y_pred_test, y_prob_test),
    ):
        scores_df.loc[i, 'Accuracy'] = accuracy_score(y_true, y_hat)
        scores_df.loc[i, 'F1 Score'] = f1_score(y_true, y_hat)
        scores_df.loc[i, 'PR AUC'] = average_precision_score(y_true, p_hat)
        scores_df.loc[i, 'Precision'] = precision_score(y_true, y_hat)
        scores_df.loc[i, 'Sensitivity'] = recall_score(y_true, y_hat)
        # Specificity is the recall of the negative class (label 0)
        scores_df.loc[i, 'Specificity'] = recall_score(y_true, y_hat, pos_label=0)
        scores_df.loc[i, 'ROC AUC'] = roc_auc_score(y_true, p_hat)

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([
        pd.Series(y_pred_train, index=y_train_h.index),
        pd.Series(y_pred_test, index=y_test.index),
    ])
    y_prob = pd.concat([
        pd.Series(y_prob_train, index=y_train_h.index),
        pd.Series(y_prob_test, index=y_test.index),
    ])

    voting_s_pred_df[(f'h{h}', 'voting_s_clf')] = y_pred
    voting_s_prob_df[(f'h{h}', 'voting_s_clf')] = y_prob

    # Plot the actual ERP sign against (1) the predicted labels and (2) the predicted
    # probabilities; each figure is closed explicitly so figures do not pile up.
    for tag, fitted, fitted_label, y_axis_label in (
        ('binary', y_pred, 'y_pred', 'ERP Sign (Binary)'),
        ('prob', y_prob, 'y_prob', 'Pr(ERP Sign)'),
    ):
        fig, ax = plot_series(y_train_h, y_test, fitted, labels=['y_train', 'y_test', fitted_label], x_label='Date', y_label=y_axis_label)
        ax.set_title(f'erp_{tag}_voting_s_clf_h{h}')
        fig.tight_layout()
        fig.savefig(f'erp_{tag}_voting_s_clf_h{h}.png')
        plt.close(fig)

In [40]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# Stacked ensemble (logit + XGBoost + kNN + Gaussian NB) with a logistic-regression
# meta-learner, fitted once per forecast horizon.
#
# Score tables: one row per horizon, identified by ('fh', 'clf'); the metric columns
# are created as the loop below fills them in.
stack_logit_train_df = pd.DataFrame(
    [(f'h{h}', 'stack_logit_clf') for h in fcst_horizon], columns=['fh','clf']
)

stack_logit_test_df = pd.DataFrame(
    [(f'h{h}', 'stack_logit_clf') for h in fcst_horizon], columns=['fh','clf']
)

# Predicted labels / probabilities per horizon, aligned on the full train + test date index
stack_logit_pred_df = pd.DataFrame().reindex(data_series.index)
stack_logit_prob_df = pd.DataFrame().reindex(data_series.index)

for i, h in enumerate(fcst_horizon):
    # Horizon h uses lags h .. h + max_lags - 1 of every model variable
    p = max_lags + h
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]

    # The training sample starts p - 1 trading days after the first training date
    # (presumably the first date on which all selected lags are available -- confirm).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Base learners, configured with the stored parameters of this horizon
    log_clf = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=10000, random_state=42,
        **logit_params_dict[f'h{h}']
    )
    xgb_clf = XGBClassifier(
        learning_rate=0.01, objective='binary:logistic', random_state=42,
        **xgb_params_dict[f'h{h}']
    )
    knn_clf = KNeighborsClassifier(**knn_params_dict[f'h{h}'])
    nb_clf = GaussianNB(priors=[0.5, 0.5])  # fixed, equal class priors
    estimators_s = [('log', log_clf), ('xgb', xgb_clf), ('knn', knn_clf), ('nb', nb_clf)]

    # NOTE(review): an integer cv means (stratified) K-fold, so the meta-features of
    # early dates come from base models fitted on later dates -- confirm that this
    # look-ahead is acceptable for time-ordered data.
    stack_logit_clf = StackingClassifier(
        estimators=estimators_s,
        final_estimator=LogisticRegression(random_state=42), cv=10
    )
    stack_logit_clf.fit(X_train_h, y_train_h)

    # Labels plus the probability of the second class column
    # (label 1, assuming the target is coded 0/1)
    y_pred_train = stack_logit_clf.predict(X_train_h)
    y_prob_train = stack_logit_clf.predict_proba(X_train_h)[:, 1]
    y_pred_test = stack_logit_clf.predict(X_test_h)
    y_prob_test = stack_logit_clf.predict_proba(X_test_h)[:, 1]

    # Same metrics, in the same column order, for the in-sample and out-of-sample split
    for scores_df, y_true, y_hat, p_hat in (
        (stack_logit_train_df, y_train_h, y_pred_train, y_prob_train),
        (stack_logit_test_df, y_test, y_pred_test, y_prob_test),
    ):
        scores_df.loc[i, 'Accuracy'] = accuracy_score(y_true, y_hat)
        scores_df.loc[i, 'F1 Score'] = f1_score(y_true, y_hat)
        scores_df.loc[i, 'PR AUC'] = average_precision_score(y_true, p_hat)
        scores_df.loc[i, 'Precision'] = precision_score(y_true, y_hat)
        scores_df.loc[i, 'Sensitivity'] = recall_score(y_true, y_hat)
        # Specificity is the recall of the negative class (label 0)
        scores_df.loc[i, 'Specificity'] = recall_score(y_true, y_hat, pos_label=0)
        scores_df.loc[i, 'ROC AUC'] = roc_auc_score(y_true, p_hat)

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([
        pd.Series(y_pred_train, index=y_train_h.index),
        pd.Series(y_pred_test, index=y_test.index),
    ])
    y_prob = pd.concat([
        pd.Series(y_prob_train, index=y_train_h.index),
        pd.Series(y_prob_test, index=y_test.index),
    ])

    stack_logit_pred_df[(f'h{h}', 'stack_logit_clf')] = y_pred
    stack_logit_prob_df[(f'h{h}', 'stack_logit_clf')] = y_prob

    # Plot the actual ERP sign against (1) the predicted labels and (2) the predicted
    # probabilities; each figure is closed explicitly so figures do not pile up.
    for tag, fitted, fitted_label, y_axis_label in (
        ('binary', y_pred, 'y_pred', 'ERP Sign (Binary)'),
        ('prob', y_prob, 'y_prob', 'Pr(ERP Sign)'),
    ):
        fig, ax = plot_series(y_train_h, y_test, fitted, labels=['y_train', 'y_test', fitted_label], x_label='Date', y_label=y_axis_label)
        ax.set_title(f'erp_{tag}_stack_logit_clf_h{h}')
        fig.tight_layout()
        fig.savefig(f'erp_{tag}_stack_logit_clf_h{h}.png')
        plt.close(fig)


In [41]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# Stacked ensemble (logit + XGBoost + kNN + Gaussian NB) with a random-forest
# meta-learner, fitted once per forecast horizon.
#
# Score tables: one row per horizon, identified by ('fh', 'clf'); the metric columns
# are created as the loop below fills them in.
stack_rf_train_df = pd.DataFrame(
    [(f'h{h}', 'stack_rf_clf') for h in fcst_horizon], columns=['fh','clf']
)

stack_rf_test_df = pd.DataFrame(
    [(f'h{h}', 'stack_rf_clf') for h in fcst_horizon], columns=['fh','clf']
)

# Predicted labels / probabilities per horizon, aligned on the full train + test date index
stack_rf_pred_df = pd.DataFrame().reindex(data_series.index)
stack_rf_prob_df = pd.DataFrame().reindex(data_series.index)

for i, h in enumerate(fcst_horizon):
    # Horizon h uses lags h .. h + max_lags - 1 of every model variable
    p = max_lags + h
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]

    # The training sample starts p - 1 trading days after the first training date
    # (presumably the first date on which all selected lags are available -- confirm).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Base learners, configured with the stored parameters of this horizon
    log_clf = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=10000, random_state=42,
        **logit_params_dict[f'h{h}']
    )
    xgb_clf = XGBClassifier(
        learning_rate=0.01, objective='binary:logistic', random_state=42,
        **xgb_params_dict[f'h{h}']
    )
    knn_clf = KNeighborsClassifier(**knn_params_dict[f'h{h}'])
    nb_clf = GaussianNB(priors=[0.5, 0.5])  # fixed, equal class priors
    estimators_s = [('log', log_clf), ('xgb', xgb_clf), ('knn', knn_clf), ('nb', nb_clf)]

    # NOTE(review): an integer cv means (stratified) K-fold, so the meta-features of
    # early dates come from base models fitted on later dates -- confirm that this
    # look-ahead is acceptable for time-ordered data.
    stack_rf_clf = StackingClassifier(
        estimators=estimators_s,
        final_estimator=RandomForestClassifier(random_state=42), cv=10
    )
    stack_rf_clf.fit(X_train_h, y_train_h)

    # Labels plus the probability of the second class column
    # (label 1, assuming the target is coded 0/1)
    y_pred_train = stack_rf_clf.predict(X_train_h)
    y_prob_train = stack_rf_clf.predict_proba(X_train_h)[:, 1]
    y_pred_test = stack_rf_clf.predict(X_test_h)
    y_prob_test = stack_rf_clf.predict_proba(X_test_h)[:, 1]

    # Same metrics, in the same column order, for the in-sample and out-of-sample split
    for scores_df, y_true, y_hat, p_hat in (
        (stack_rf_train_df, y_train_h, y_pred_train, y_prob_train),
        (stack_rf_test_df, y_test, y_pred_test, y_prob_test),
    ):
        scores_df.loc[i, 'Accuracy'] = accuracy_score(y_true, y_hat)
        scores_df.loc[i, 'F1 Score'] = f1_score(y_true, y_hat)
        scores_df.loc[i, 'PR AUC'] = average_precision_score(y_true, p_hat)
        scores_df.loc[i, 'Precision'] = precision_score(y_true, y_hat)
        scores_df.loc[i, 'Sensitivity'] = recall_score(y_true, y_hat)
        # Specificity is the recall of the negative class (label 0)
        scores_df.loc[i, 'Specificity'] = recall_score(y_true, y_hat, pos_label=0)
        scores_df.loc[i, 'ROC AUC'] = roc_auc_score(y_true, p_hat)

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([
        pd.Series(y_pred_train, index=y_train_h.index),
        pd.Series(y_pred_test, index=y_test.index),
    ])
    y_prob = pd.concat([
        pd.Series(y_prob_train, index=y_train_h.index),
        pd.Series(y_prob_test, index=y_test.index),
    ])

    stack_rf_pred_df[(f'h{h}', 'stack_rf_clf')] = y_pred
    stack_rf_prob_df[(f'h{h}', 'stack_rf_clf')] = y_prob

    # Plot the actual ERP sign against (1) the predicted labels and (2) the predicted
    # probabilities; each figure is closed explicitly so figures do not pile up.
    for tag, fitted, fitted_label, y_axis_label in (
        ('binary', y_pred, 'y_pred', 'ERP Sign (Binary)'),
        ('prob', y_prob, 'y_prob', 'Pr(ERP Sign)'),
    ):
        fig, ax = plot_series(y_train_h, y_test, fitted, labels=['y_train', 'y_test', fitted_label], x_label='Date', y_label=y_axis_label)
        ax.set_title(f'erp_{tag}_stack_rf_clf_h{h}')
        fig.tight_layout()
        fig.savefig(f'erp_{tag}_stack_rf_clf_h{h}.png')
        plt.close(fig)


In [42]:
# Score tables are stacked row-wise (one row per horizon and ensemble). The hard-voting
# table has no 'PR AUC' / 'ROC AUC' columns, so those entries come out as NaN.
train_score_tables = [voting_h_train_df, voting_s_train_df, stack_logit_train_df, stack_rf_train_df]
test_score_tables = [voting_h_test_df, voting_s_test_df, stack_logit_test_df, stack_rf_test_df]
combined_train_df = pd.concat(train_score_tables, ignore_index=True)
combined_test_df = pd.concat(test_score_tables, ignore_index=True)

# Estimates are joined column-wise on the shared date index. Hard voting contributes
# predicted labels only, hence it is absent from the probability table.
label_tables = [voting_h_pred_df, voting_s_pred_df, stack_logit_pred_df, stack_rf_pred_df]
probability_tables = [voting_s_prob_df, stack_logit_prob_df, stack_rf_prob_df]
combined_pred_df = pd.concat(label_tables, axis=1)
combined_prob_df = pd.concat(probability_tables, axis=1)

In [43]:
# Write the combined tables to disk; each frame's index becomes the first CSV column.
csv_outputs = {
    'erp_ensemble_train_results.csv': combined_train_df,
    'erp_ensemble_test_results.csv': combined_test_df,
    'erp_ensemble_pred_estimates.csv': combined_pred_df,
    'erp_ensemble_prob_estimates.csv': combined_prob_df,
}
for csv_path, table in csv_outputs.items():
    table.to_csv(csv_path)