In [None]:
!pip install -U scikit-learn

In [None]:
!pip install sktime --ignore-installed llvmlite

In [None]:
!pip install pandas_market_calendars

In [None]:
!pip install -U scikit-optimize

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

In [11]:
# Read the raw feature table plus the pre-split train/test design matrices
# and their label tables from the working directory.
feature_df = pd.read_csv('features.csv')

X_train, X_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
y_train, y_test = pd.read_csv('train_labels.csv'), pd.read_csv('test_labels.csv')

# Quick sanity check on the dimensions of what was just loaded.
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

X_train: (3416, 1201), X_test: (855, 1201), y_train: (3416, 2), y_test: (855, 2)


In [12]:
import pandas_market_calendars as mcal

# Turn the 'date' column of every table into a DatetimeIndex.
# NOTE: this mutates the frames in place, so the cell cannot be re-run
# without first re-loading the CSVs (the 'date' column is gone afterwards).
for frame in (feature_df, X_train, X_test, y_train, y_test):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame.set_index('date', inplace=True)

# Custom business-day offset built from the NYSE calendar: Monday-Friday
# weekmask with both the regular and the ad-hoc exchange holidays removed.
nyse = mcal.get_calendar('NYSE')
bday_us = pd.offsets.CustomBusinessDay(
    holidays=nyse.adhoc_holidays,
    calendar=nyse.regular_holidays,
    weekmask="1111100",
)

# Conform every table to that trading-day frequency; for the label tables
# keep only the 'ERP_Sign' column, which turns them into Series.
feature_df, X_train, X_test = (
    table.asfreq(bday_us) for table in (feature_df, X_train, X_test)
)
y_train, y_test = (
    labels.asfreq(bday_us)['ERP_Sign'] for labels in (y_train, y_test)
)

print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}')

X_train: (3416, 1200), X_test: (855, 1200), y_train: (3416,), y_test: (855,)


In [13]:
# Full label series (train followed by test); its index is reused later as
# the common index of the prediction / probability tables.
data_series = pd.concat(objs=[y_train, y_test], axis=0)

In [14]:
# Forecasting set-up shared by all model cells below.
fcst_horizon = [5, 10, 15, 20]        # forecast horizons, used in the 'L{lag}' column names
max_lags = 10                         # number of consecutive lags fed to a model per variable
max_horizon = fcst_horizon[len(fcst_horizon) - 1]   # last entry of the horizon list
model_vars = feature_df.columns       # base variable names used to build lagged column names

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Untuned base estimators; the hyper-parameter searches below take these as
# templates. All share the same seed so repeated runs are comparable.
tree_clf, rf_clf = (
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
)
svm_clf = SVC(
    gamma='scale',
    probability=True,    # needed so that predict_proba is available
    max_iter=10000,      # hard cap on solver iterations
    random_state=42,
)

In [15]:
import inspect

from sklearn import metrics

# Scorer based on log-loss computed from predicted class probabilities.
# greater_is_better=False makes the scorer return the *negated* loss, so a
# hyper-parameter search that maximises the score minimises the log-loss.
# labels=[0, 1] is forwarded to metrics.log_loss so a CV fold containing a
# single class can still be scored.
#
# scikit-learn 1.4 deprecated (and later releases removed) the `needs_proba`
# flag in favour of `response_method`. The first cell installs the latest
# scikit-learn, so pick whichever spelling the installed version supports.
_scorer_kwargs = {'greater_is_better': False, 'labels': [0, 1]}
if 'response_method' in inspect.signature(metrics.make_scorer).parameters:
    _scorer_kwargs['response_method'] = 'predict_proba'
else:
    _scorer_kwargs['needs_proba'] = True

# NOTE: the name `log_loss` is kept because later cells pass it as `scoring=`.
log_loss = metrics.make_scorer(metrics.log_loss, **_scorer_kwargs)

In [None]:
from skopt.space import Real, Integer
from sklearn.model_selection import TimeSeriesSplit
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series
import seaborn as sns

# Random-forest experiment: for every forecast horizon, tune a forest with
# Bayesian search on an expanding-window CV, score it in- and out-of-sample,
# store predictions / probabilities / best parameters / top features, and
# save three diagnostic figures to disk.

# One row per horizon; metric columns are added inside the loop.
rf_train_df = pd.DataFrame(
    [(f'h{h}', 'rf_clf') for h in fcst_horizon], columns=['fh','clf']
)

rf_test_df = pd.DataFrame(
    [(f'h{h}', 'rf_clf') for h in fcst_horizon], columns=['fh','clf']
)

# Empty frames on the full (train + test) date index; one column per horizon
# is added inside the loop.
rf_pred_df = pd.DataFrame().reindex(data_series.index)
rf_prob_df = pd.DataFrame().reindex(data_series.index)

rf_params_dict = {}
rf_features_dict = {}

# Define the parameter space for the Bayesian search to explore
param_grid = {
    'n_estimators': Integer(1, 150), 
    'min_samples_leaf': Integer(1, 4),
    'min_samples_split': Integer(2, 10),
    'max_depth': Integer(10, 120),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'ccp_alpha': Real(1e-5, 0.35)
}

# (column name, metric function, scored on probabilities?, extra kwargs).
# The order fixes the column order of the result tables.
metric_specs = [
    ('Accuracy', accuracy_score, False, {}),
    ('F1 Score', f1_score, False, {}),
    ('PR AUC', average_precision_score, True, {}),
    ('Precision', precision_score, False, {}),
    ('Sensitivity', recall_score, False, {}),
    ('Specificity', recall_score, False, {'pos_label': 0}),
    ('ROC AUC', roc_auc_score, True, {}),
]

for i, h in enumerate(fcst_horizon):
    p = max_lags + h
    run_key = (f'h{h}', 'rf_clf')

    # Skip the first p-1 business days of the training sample
    # (presumably because lags up to p-1 are not available there - TODO confirm).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon:
    # lags h .. p-1 of every model variable.
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Create an expanding window splitter for cross-validation
    cv = TimeSeriesSplit(n_splits=10)

    search = BayesSearchCV(estimator=rf_clf, search_spaces=param_grid, n_iter=60, scoring=log_loss, cv=cv, random_state=42)
    search.fit(X=X_train_h, y=y_train_h)
    rf_params_dict[run_key] = search.best_params_

    # The search has already refit the best configuration on the full
    # training window (search.predict below relies on that), and the forest
    # is seeded, so fitting a second identical forest would only cost time.
    rf_fitted = search.best_estimator_

    y_pred_train = search.predict(X_train_h)
    y_prob_train = search.predict_proba(X_train_h)[:, 1]

    y_pred_test = search.predict(X_test_h)
    y_prob_test = search.predict_proba(X_test_h)[:, 1]

    # In-sample and out-of-sample scores for this horizon
    for col, metric_fn, on_prob, extra in metric_specs:
        rf_train_df.loc[i, col] = metric_fn(y_train_h, y_prob_train if on_prob else y_pred_train, **extra)
        rf_test_df.loc[i, col] = metric_fn(y_test, y_prob_test if on_prob else y_pred_test, **extra)

    # Extract the top 20 features by importance
    feature_importance = pd.Series(rf_fitted.feature_importances_, index=X_train_h.columns)
    best_features = feature_importance.sort_values(ascending=False).head(20)
    rf_features_dict[run_key] = best_features

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([pd.Series(y_pred_train).set_axis(y_train_h.index), pd.Series(y_pred_test).set_axis(y_test.index)])
    y_prob = pd.concat([pd.Series(y_prob_train).set_axis(y_train_h.index), pd.Series(y_prob_test).set_axis(y_test.index)])

    rf_pred_df[run_key] = y_pred
    rf_prob_df[run_key] = y_prob

    # Plot the actual ERP sign with the predicted binary values
    plot_series(y_train_h, y_test, y_pred, labels=['y_train', 'y_test', 'y_pred'], x_label='Date', y_label='ERP Sign (Binary)')
    plt.title(f'erp_binary_rf_clf_h{h}')
    plt.tight_layout()
    plt.savefig(f'erp_binary_rf_clf_h{h}.png')
    plt.close()

    # Plot the actual ERP sign with the predicted probabilities
    plot_series(y_train_h, y_test, y_prob, labels=['y_train', 'y_test', 'y_prob'], x_label='Date', y_label='Pr(ERP Sign)')
    plt.title(f'erp_prob_rf_clf_h{h}')
    plt.tight_layout()
    plt.savefig(f'erp_prob_rf_clf_h{h}.png')
    plt.close()

    # Plot the top 20 features by importance.
    # Each figure is saved and closed so matplotlib releases its memory.
    plt.figure(figsize=(20,28))
    sns.barplot(x=best_features.values, y=best_features.index)
    plt.title(f'feature_importance_rf_h{h}')
    plt.tight_layout()
    plt.savefig(f'feature_importance_rf_h{h}.png')
    plt.close()

In [None]:
from skopt.space import Real, Integer
from sklearn.model_selection import TimeSeriesSplit
from skopt import BayesSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score, roc_auc_score
from sktime.utils.plotting import plot_series

# SVM experiment: for every forecast horizon, tune C with Bayesian search on
# an expanding-window CV, score the model in- and out-of-sample, store
# predictions / probabilities / best parameters, and save two figures.

# One row per horizon; metric columns are added inside the loop.
svm_train_df = pd.DataFrame(
    [(f'h{h}', 'svm_clf') for h in fcst_horizon], columns=['fh','clf']
)

svm_test_df = pd.DataFrame(
    [(f'h{h}', 'svm_clf') for h in fcst_horizon], columns=['fh','clf']
)

# Empty frames on the full (train + test) date index; one column per horizon
# is added inside the loop.
svm_pred_df = pd.DataFrame().reindex(data_series.index)
svm_prob_df = pd.DataFrame().reindex(data_series.index)

svm_params_dict = {}

# Define the parameter space for the Bayesian search to explore
param_grid = {
    'C': Real(0.01, 1000, prior='log-uniform')
}

# (column name, metric function, scored on probabilities?, extra kwargs).
# The order fixes the column order of the result tables.
metric_specs = [
    ('Accuracy', accuracy_score, False, {}),
    ('F1 Score', f1_score, False, {}),
    ('PR AUC', average_precision_score, True, {}),
    ('Precision', precision_score, False, {}),
    ('Sensitivity', recall_score, False, {}),
    ('Specificity', recall_score, False, {'pos_label': 0}),
    ('ROC AUC', roc_auc_score, True, {}),
]

for i, h in enumerate(fcst_horizon):
    p = max_lags + h
    run_key = (f'h{h}', 'svm_clf')

    # Skip the first p-1 business days of the training sample
    # (presumably because lags up to p-1 are not available there - TODO confirm).
    time_start = y_train.index[0] + p*bday_us - 1*bday_us
    mask_Xstart = X_train.index >= time_start
    mask_ystart = y_train.index >= time_start

    # Select only X and y columns and observations for the relevant forecast horizon:
    # lags h .. p-1 of every model variable.
    lag_cols = [f'{x}L{lag}' for x in model_vars for lag in range(h, p)]
    X_train_h = X_train.loc[mask_Xstart, lag_cols]
    X_test_h = X_test[lag_cols]
    y_train_h = y_train.loc[mask_ystart]

    # Create an expanding window splitter for cross-validation
    cv = TimeSeriesSplit(n_splits=10)

    search = BayesSearchCV(estimator=svm_clf, search_spaces=param_grid, n_iter=60, scoring=log_loss, cv=cv, random_state=42)

    # Record every warning raised while the search runs. Only warnings whose
    # category is ConvergenceWarning count as a convergence failure; any other
    # warning recorded here (deprecations etc.) is ignored for this flag.
    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.filterwarnings('always', category=ConvergenceWarning, module='sklearn')
        search.fit(X=X_train_h, y=y_train_h)

    # True  -> no fit performed inside search.fit hit the max_iter cap
    # False -> at least one of them emitted a ConvergenceWarning
    # Always written, so the column exists with an explicit value per horizon.
    svm_train_df.loc[i, 'Convergence'] = not any(
        issubclass(w.category, ConvergenceWarning) for w in caught_warnings
    )

    svm_params_dict[run_key] = search.best_params_

    # The search has already refit the best configuration on the full
    # training window (search.predict below relies on that), and the SVC is
    # seeded, so a second fit would only repeat the expensive training.
    # The name is kept in case later cells refer to it.
    svm_fitted = search.best_estimator_

    y_pred_train = search.predict(X_train_h)
    y_prob_train = search.predict_proba(X_train_h)[:, 1]

    y_pred_test = search.predict(X_test_h)
    y_prob_test = search.predict_proba(X_test_h)[:, 1]

    # In-sample and out-of-sample scores for this horizon
    for col, metric_fn, on_prob, extra in metric_specs:
        svm_train_df.loc[i, col] = metric_fn(y_train_h, y_prob_train if on_prob else y_pred_train, **extra)
        svm_test_df.loc[i, col] = metric_fn(y_test, y_prob_test if on_prob else y_pred_test, **extra)

    # Convert np arrays to datetime series for time series plotting
    y_pred = pd.concat([pd.Series(y_pred_train).set_axis(y_train_h.index), pd.Series(y_pred_test).set_axis(y_test.index)])
    y_prob = pd.concat([pd.Series(y_prob_train).set_axis(y_train_h.index), pd.Series(y_prob_test).set_axis(y_test.index)])

    svm_pred_df[run_key] = y_pred
    svm_prob_df[run_key] = y_prob

    # Plot the actual ERP sign with the predicted binary values
    plot_series(y_train_h, y_test, y_pred, labels=['y_train', 'y_test', 'y_pred'], x_label='Date', y_label='ERP Sign (Binary)')
    plt.title(f'erp_binary_svm_clf_h{h}')
    plt.tight_layout()
    plt.savefig(f'erp_binary_svm_clf_h{h}.png')
    plt.close()

    # Plot the actual ERP sign with the predicted probabilities
    plot_series(y_train_h, y_test, y_prob, labels=['y_train', 'y_test', 'y_prob'], x_label='Date', y_label='Pr(ERP Sign)')
    plt.title(f'erp_prob_svm_clf_h{h}')
    plt.tight_layout()
    plt.savefig(f'erp_prob_svm_clf_h{h}.png')
    plt.close()

In [None]:
# Write the random-forest score tables and the per-date prediction /
# probability estimates to CSV files in the working directory.
for out_df, out_name in (
    (rf_train_df, 'erp_rf_train_results.csv'),
    (rf_test_df, 'erp_rf_test_results.csv'),
    (rf_pred_df, 'erp_rf_pred_estimates.csv'),
    (rf_prob_df, 'erp_rf_prob_estimates.csv'),
):
    out_df.to_csv(out_name)

In [19]:
# Write the SVM score tables and the per-date prediction / probability
# estimates to CSV files in the working directory.
for out_df, out_name in (
    (svm_train_df, 'erp_svm_train_results.csv'),
    (svm_test_df, 'erp_svm_test_results.csv'),
    (svm_pred_df, 'erp_svm_pred_estimates.csv'),
    (svm_prob_df, 'erp_svm_prob_estimates.csv'),
):
    out_df.to_csv(out_name)

In [None]:
# Tabulate the tuned random-forest hyper-parameters: each dictionary key
# becomes a row and each hyper-parameter a column, then save to CSV.
rf_params_df = pd.DataFrame.from_dict(data=rf_params_dict, orient='index')
rf_params_df.to_csv(path_or_buf='erp_rf_parameters.csv')

In [20]:
# Tabulate the tuned SVM hyper-parameters: each dictionary key becomes a row
# and each hyper-parameter a column, then save to CSV.
svm_params_df = pd.DataFrame.from_dict(data=svm_params_dict, orient='index')
svm_params_df.to_csv(path_or_buf='erp_svm_parameters.csv')

In [None]:
# Reshape the stored top-feature Series and export them to CSV.
#
# For every horizon the Series (feature name -> importance) is turned into a
# two-column frame and transposed, so each horizon contributes two rows:
# 'h{h}_rf_feature' (names) and 'h{h}_rf_importance' (values), with one
# column per rank position.
#
# The pieces are collected in a list and concatenated once instead of growing
# a DataFrame inside the loop (which re-copies all earlier rows on every
# iteration and starts from an empty frame).
feature_frames = []
for h in fcst_horizon:
    ranked = rf_features_dict[(f'h{h}', 'rf_clf')]
    tidy = ranked.rename_axis(f'h{h}_rf_feature').reset_index(name=f'h{h}_rf_importance')
    feature_frames.append(tidy.T)

# pd.concat rejects an empty list; fall back to an empty frame in that case.
feature_importance_df = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

feature_importance_df.to_csv('erp_rf_feature_importances.csv')