In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Raw feature table (its columns later serve as the base variable names).
feature_df = pd.read_csv('features.csv')

# Train/test design matrices and their label tables, read in the same order
# as the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'train_labels.csv', 'test_labels.csv')
)

# Sanity check on the split sizes before any re-indexing happens.
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

X_train: (3416, 1201), X_test: (855, 1201), y_train: (3416, 2), y_test: (855, 2)


In [None]:
import pandas_market_calendars as mcal

# Business-day offset built from the NYSE calendar's regular and ad-hoc holidays
# (weekmask "1111100" = Monday to Friday).
nyse = mcal.get_calendar('NYSE')
bday_us = pd.offsets.CustomBusinessDay(
    weekmask="1111100",
    holidays=nyse.adhoc_holidays,
    calendar=nyse.regular_holidays,
)

# Parse the 'date' column of every frame and promote it to the index (in place).
# NOTE: this mutates the loaded frames, so the cell cannot be re-run without
# re-running the loading cell first.
for df in (feature_df, X_train, X_test, y_train, y_test):
    df['date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
    df.set_index('date', inplace=True)

# Attach the trading-day frequency to every index; for the label tables keep
# only the 'ERP_Sign' column, which turns them into Series.
feature_df, X_train, X_test = (
    frame.asfreq(bday_us) for frame in (feature_df, X_train, X_test)
)
y_train, y_test = (
    labels.asfreq(bday_us)['ERP_Sign'] for labels in (y_train, y_test)
)

print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

X_train: (3416, 1200), X_test: (855, 1200), y_train: (3416,), y_test: (855,)


In [None]:
# Full target series: training labels followed by test labels, stacked row-wise.
data_series = pd.concat([y_train, y_test], axis=0)

In [None]:
# Number of consecutive lags of each feature that a model receives.
max_lags = 10
# Forecast horizons h; the modelling loop selects lags h .. h + max_lags - 1 for each
# (presumably measured in trading days -- confirm against how the lag columns were built).
fcst_horizon = [5, 10, 15, 20]
# Longest horizon. max() instead of the last element, so the value stays correct
# even if the list above is reordered or extended out of order.
max_horizon = max(fcst_horizon)
# Base variable names (the columns of feature_df). The modelling loop combines each
# name with a lag suffix as f'{name}L{lag}' to pick the lagged columns of X_train / X_test.
model_vars = feature_df.columns

In [None]:
from sklearn import metrics

import inspect

# Define (negative) log-loss as the loss function, aka scorer.
# greater_is_better=False marks the metric as a loss, so the scorer sign-flips it;
# labels=[0, 1] is forwarded to metrics.log_loss.
#
# scikit-learn 1.4 deprecated `needs_proba` in favour of `response_method`, and 1.6
# removed it (there the stray keyword would be forwarded to metrics.log_loss and fail
# only when the scorer is called). Pick whichever keyword the installed version accepts.
if 'response_method' in inspect.signature(metrics.make_scorer).parameters:
    _proba_kwargs = {'response_method': 'predict_proba'}
else:
    _proba_kwargs = {'needs_proba': True}

log_loss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, labels=[0, 1], **_proba_kwargs)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series
import seaborn as sns
from sklearn.tree import plot_tree

def _binary_clf_metrics(y_true, y_pred, y_prob):
    """Compute the evaluation metrics for one set of binary predictions.

    Parameters
    ----------
    y_true : array-like
        Observed class labels.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like
        Predicted score for the positive class (second column of ``predict_proba``).

    Returns
    -------
    dict
        Metric name -> value, in the column order of the results tables.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'PR AUC': average_precision_score(y_true, y_prob),
        'Precision': precision_score(y_true, y_pred),
        'Sensitivity': recall_score(y_true, y_pred),
        # Specificity is the recall of the negative class.
        'Specificity': recall_score(y_true, y_pred, pos_label=0),
        'ROC AUC': roc_auc_score(y_true, y_prob),
    }


def _save_series_plot(y_fit, y_holdout, y_hat, hat_label, y_label, name):
    """Plot actual train/test labels against ``y_hat`` and save the figure as '<name>.png'.

    The figure is closed after saving so matplotlib releases its memory.
    """
    plot_series(y_fit, y_holdout, y_hat, labels=['y_train', 'y_test', hat_label], x_label='Date', y_label=y_label)
    plt.title(name)
    plt.tight_layout()
    plt.savefig(f'{name}.png')
    plt.close()


# One metrics record per forecast horizon; turned into DataFrames after the loop.
train_records = []
test_records = []

# Predicted labels / probabilities per horizon, aligned on the full (train + test) date index.
tree_pred_df = pd.DataFrame(index=data_series.index)
tree_prob_df = pd.DataFrame(index=data_series.index)

# Top-20 feature importances per (horizon, classifier) key.
tree_features_dict = {}

for h in fcst_horizon:
    p = max_lags + h
    # Training rows before this date are dropped for this horizon.
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon:
    # lags h .. p-1 of every model variable.
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # NOTE(review): the tree is fitted with default hyper-parameters (no depth limit),
    # so in-sample metrics are expected to be optimistic -- compare against the test table.
    tree_clf = DecisionTreeClassifier(random_state=42)
    tree_clf.fit(X=X_train_h, y=y_train_h)

    # In-sample performance
    y_pred_train = tree_clf.predict(X_train_h)
    y_prob_train = tree_clf.predict_proba(X_train_h)[:, 1]
    train_records.append({
        'fh': f'h{h}',
        'clf': 'tree_clf',
        **_binary_clf_metrics(y_train_h, y_pred_train, y_prob_train),
    })

    # Out-of-sample performance
    y_pred_test = tree_clf.predict(X_test_h)
    y_prob_test = tree_clf.predict_proba(X_test_h)[:, 1]
    test_records.append({
        'fh': f'h{h}',
        'clf': 'tree_clf',
        **_binary_clf_metrics(y_test, y_pred_test, y_prob_test),
    })

    # Extract the top 20 features by importance
    feature_importance = pd.Series(tree_clf.feature_importances_, index=X_train_h.columns)
    best_features = feature_importance.sort_values(ascending=False).head(20)
    tree_features_dict[(f'h{h}', 'tree_clf')] = best_features

    # Convert np arrays to date-indexed series for time series plotting
    y_pred = pd.concat([
        pd.Series(y_pred_train, index=y_train_h.index),
        pd.Series(y_pred_test, index=y_test.index),
    ])
    y_prob = pd.concat([
        pd.Series(y_prob_train, index=y_train_h.index),
        pd.Series(y_prob_test, index=y_test.index),
    ])

    tree_pred_df[(f'h{h}', 'tree_clf')] = y_pred
    tree_prob_df[(f'h{h}', 'tree_clf')] = y_prob

    # Plot the actual ERP sign (binary) against the predicted class labels
    _save_series_plot(y_train_h, y_test, y_pred, 'y_pred', 'ERP Sign (Binary)', f'erp_binary_tree_clf_h{h}')

    # Plot the actual ERP sign (binary) against the predicted probabilities
    _save_series_plot(y_train_h, y_test, y_prob, 'y_prob', 'Pr(ERP Sign)', f'erp_prob_tree_clf_h{h}')

    # Plot the top 20 features by importance.
    # Every figure in this loop is saved and then closed so matplotlib releases memory.
    plt.figure(figsize=(20,28))
    sns.barplot(x=best_features.values, y=best_features.index)
    plt.title(f'feature_importance_tree_h{h}')
    plt.tight_layout()
    plt.savefig(f'feature_importance_tree_h{h}.png')
    plt.close()

    # Plot the fitted tree. feature_names is passed as a plain list because some
    # scikit-learn releases reject a pandas Index in parameter validation.
    plt.figure(figsize=(25,20))
    plot_tree(tree_clf, feature_names=list(X_train_h.columns), class_names=['0', '1'], filled=True)
    plt.savefig(f'decision_tree_h{h}.png')
    plt.close()

# Results tables: one row per horizon, columns 'fh', 'clf', then the metrics.
tree_train_df = pd.DataFrame(train_records)
tree_test_df = pd.DataFrame(test_records)

In [None]:
# Persist the metric tables and the per-date predictions / probabilities,
# one CSV per frame, written in the order listed.
for csv_name, frame in (
    ('erp_tree_train_results.csv', tree_train_df),
    ('erp_tree_test_results.csv', tree_test_df),
    ('erp_tree_pred_estimates.csv', tree_pred_df),
    ('erp_tree_prob_estimates.csv', tree_prob_df),
):
    frame.to_csv(csv_name)