In [1]:
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import figure
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [2]:
# Settings:
pd.set_option('display.width', 190)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:.4f}'.format
plt.style.use('default')
np.set_printoptions(threshold = 30, edgeitems = 30, precision = 2, suppress = False)


In [3]:
df_path = "../merged_data/features_USRECD.csv"
features = ["BCI", "BCIp", "BCIg", 'IE_SP_Comp', 'IE_SP_Dividend', 'IE_SP_Earnings', 'IE_Consumer_CPI', 'IE_Long_Interest', 'IE_Real_Price', 'IE_Real_Dividend', 'IE_Return_Price', 'IE_Real_Earnings',
                'IE_Scaled_Earnings', 'IE_Monthly_Returns', 'IE_Real_Returns', "YC_10_Year", "YC_3_Month", "YC_3_Month_Bond", "YC_Spread", "YC_Rec_Prob"]
model_names = ["Logistic Regression", "Penalized SVM", "Random Forest"]
get_models = [lambda: linear_model.LogisticRegression(), lambda: svm.SVC(kernel='linear', class_weight='balanced', probability=True), 
          lambda: RandomForestClassifier()]


In [4]:
# Read the data and do a little bit of wrangling:
df = pd.read_csv(df_path)
df.Date = pd.to_datetime(df.Date)
df = df.set_index("Date", drop=True)
df = df.drop(columns="Unnamed: 0")
df.head()

Unnamed: 0_level_0,BCI,BCIp,BCIg,USRECD,IE_SP_Comp,IE_SP_Dividend,IE_SP_Earnings,IE_Consumer_CPI,IE_Long_Interest,IE_Real_Price,IE_Real_Dividend,IE_Return_Price,IE_Real_Earnings,IE_Scaled_Earnings,IE_Monthly_Returns,IE_Real_Returns,YC_10_Year,YC_3_Month,YC_3_Month_Bond,YC_Spread,YC_Rec_Prob
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1967-02-09,4.6052,6.587,3.4751,0,4.4362,1.0578,1.7084,3.4935,1.5217,6.5522,3.1739,11.4502,3.8238,8.7218,0.0,2.5153,1.5217,1.5518,1.5776,1.2065,-1.1432
1967-02-16,4.6052,6.5863,3.4751,0,4.4362,1.0578,1.7084,3.4935,1.5217,6.5522,3.1739,11.4502,3.8238,8.7218,0.0,2.5153,1.5217,1.5518,1.5776,1.2065,-1.1432
1967-02-23,4.6012,6.5774,3.4751,0,4.4362,1.0578,1.7084,3.4935,1.5217,6.5522,3.1739,11.4502,3.8238,8.7218,0.0,2.5153,1.5217,1.5518,1.5776,1.2065,-1.1432
1967-03-02,4.6032,6.582,3.4751,0,4.47,1.0613,1.7011,3.4935,1.5326,6.586,3.1772,11.4869,3.8177,8.7185,0.01,2.5153,1.5326,1.5173,1.5427,1.2692,-1.2586
1967-03-09,4.6042,6.5852,3.4751,0,4.47,1.0613,1.7011,3.4935,1.5326,6.586,3.1772,11.4869,3.8177,8.7185,0.01,2.5153,1.5326,1.5173,1.5427,1.2692,-1.2586


In [5]:
# Split into training and test sets and hold out the test set until the end, so that it remains "unseen".
lag_of_y = 21 # This is the lag we introduce to the target variable so that we assess the indicator's 
              # ability to predict the target variable this many steps into the future.
              # With BCI, a lag of 21 data points corresponds to about half a year.
        
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:-lag_of_y, df.columns != "USRECD"], \
    df.iloc[lag_of_y:, df.columns == "USRECD"], test_size=0.1, shuffle=False)

In [6]:
# for feature in features:
#     plt.figure()
#     X_train[feature].hist(bins = 50)
#     plt.xlabel(feature,fontsize=15)
#     plt.ylabel("Frequency",fontsize=15)
#     plt.show()

In [7]:
# Do a time series cross-validation on the test set by splitting it to k folds and doing a "rolling"
# validation against a validation fold, then averaging out the metrics.
splits = 4 # This is the number of splits/folds in the rolling validation.
tscv = TimeSeriesSplit(n_splits=splits)

for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29 ... 471 472 473 474 475
 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493
 494 495 496 497 498 499 500] TEST: [ 501  502  503  504  505  506  507  508  509  510  511  512  513  514
  515  516  517  518  519  520  521  522  523  524  525  526  527  528
  529  530 ...  971  972  973  974  975  976  977  978  979  980  981  982
  983  984  985  986  987  988  989  990  991  992  993  994  995  996
  997  998  999 1000]
TRAIN: [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29 ...  971  972  973  974  975  976  977  978  979  980  981  982
  983  984  985  986  987  988  989  990  991  992  993  994  995  996
  997  998  999 1000] TEST: [1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014
 1015 1016 1017 1018 1019 1020 102

## Validation

In [8]:
AUC_ROCs = dict()
ACCs = dict()
for model_name, get_model in zip(model_names, get_models):
    print(model_name)
    AUC_ROCs[model_name] = 0
    ACCs[model_name] = 0
    for train_index, test_index in tscv.split(X_train): # Rolling cross-validation happens inside this loop.
        X_train_fold, X_validation_fold = X_train.iloc[train_index[:-lag_of_y], X_train.columns != "USRECD"], \
            X_train.iloc[test_index[:-lag_of_y], X_train.columns != "USRECD"]
        y_train_fold, y_validation_fold = y_train.iloc[train_index[lag_of_y:], y_train.columns == "USRECD"], \
            y_train.iloc[test_index[lag_of_y:], y_train.columns == "USRECD"]
            
        scalers = dict()
        for feature in features:
            scalers[feature] = StandardScaler()
            scalers[feature].fit(X_train_fold[[feature]])
            X_train_fold[feature] = scalers[feature].transform(X_train_fold[[feature]])
            X_validation_fold[feature] = scalers[feature].transform(X_validation_fold[[feature]])
            
        model = get_model()
        model.fit(X_train_fold[features], y_train_fold["USRECD"])
        positive_probs = [p[1] for p in model.predict_proba(X_validation_fold[features])]
        AUC_ROC = metrics.roc_auc_score(y_validation_fold, positive_probs)
        AUC_ROCs[model_name] += AUC_ROC
        predictions = model.predict(X_validation_fold[features])
        ACC = accuracy_score(y_validation_fold, predictions)
        ACCs[model_name] += ACC
        print(AUC_ROC, ACC)
        
    AUC_ROCs[model_name] /= splits
    ACCs[model_name] /= splits

Logistic Regression
0.8996436403508772 0.4112734864300626
0.7099099099099099 0.9269311064718163
0.995045045045045 0.7014613778705637
0.8209923908178272 0.837160751565762
Penalized SVM
0.7395833333333333 0.302713987473904
0.806048906048906 0.9269311064718163
0.978957528957529 0.46346555323590816
0.6914764371123473 0.837160751565762
Random Forest
0.6559347587719297 0.8016701461377871
0.8904118404118404 0.8872651356993737
0.98005148005148 0.9478079331941545
0.82118421893983 0.8392484342379958


In [9]:
for model_name in model_names:
    print(model_name)
    print(f"AUC ROC: {AUC_ROCs[model_name]}")
    print(f"accuracy: {ACCs[model_name]}")

Logistic Regression
AUC ROC: 0.8563977465309149
accuracy: 0.7192066805845512
Penalized SVM
AUC ROC: 0.804016551363029
accuracy: 0.6325678496868476
Random Forest
AUC ROC: 0.8368955745437701
accuracy: 0.8689979123173278


## Test

In [10]:
# random guess
total = y_train.shape[0]
metrics.roc_auc_score(y_train.USRECD, np.zeros(total)), accuracy_score(y_train.USRECD, np.zeros(total))

(0.5, 0.8560575769692124)

In [11]:
y_test.value_counts()

USRECD
0         274
1           4
dtype: int64

In [12]:
X_train = X_train.copy()
X_test = X_test.copy()

all_scalers = dict()
for feature in features:
    all_scalers[feature] = StandardScaler()
    all_scalers[feature].fit(X_train[[feature]])
    X_train[feature] = all_scalers[feature].transform(X_train[[feature]])
    X_test[feature] = all_scalers[feature].transform(X_test[[feature]])

In [13]:
for feature in features:
    print(all_scalers[feature].mean_)

[4.8]
[6.44]
[3.61]
[5.88]
[2.29]
[3.06]
[4.71]
[1.81]
[6.78]
[3.18]
[12.51]
[3.96]
[9.69]
[0.01]
[2.96]
[1.81]
[1.07]
[1.1]
[1.57]
[-2.89]


In [14]:
for model_name, get_model in zip(model_names, get_models):
    print(model_name)
    model = get_model()
    model.fit(X_train[features], y_train["USRECD"])
    positive_probs = [p[1] for p in model.predict_proba(X_test[features])]
    AUC_ROC = metrics.roc_auc_score(y_test, positive_probs)
    print(AUC_ROC)
    predictions = model.predict(X_test[features])
    ACC = accuracy_score(y_test, predictions)
    print(ACC)

Logistic Regression
0.6541970802919709
0.7661870503597122
Penalized SVM
0.7901459854014597
0.7877697841726619
Random Forest
0.9260948905109488
0.9856115107913669
