In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.set_option("display.max_columns", 100)
import matplotlib.ticker as mtick

In [None]:
# Base name of the return columns used in this notebook ("trr_5", "trr_5_fwd", "trr_5_ar", ...).
# NOTE(review): this variable is not referenced in the cells visible here — confirm it is still needed.
target_variable = "trr_5"

In [None]:
# Resolve the results directory relative to the directory the kernel started in.
# A bare relative os.chdir() would climb two more levels every time this cell is
# re-run, so the starting directory is remembered on first execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.normpath(os.path.join(_NOTEBOOK_START_DIR, "../../results")))

In [None]:
# Record the pandas version the notebook was executed with.
print(pd.__version__)

In [None]:
def filter_market_caps(results_df, min_market_cap_percentile = 0.6, max_market_cap_percentile = None):
    """Keep, for every date, only the rows whose market cap lies inside a percentile band.

    Both thresholds are quantiles of ``market_cap_usd`` computed per ``date`` on
    the unfiltered input, so the upper bound is not affected by the lower cut.
    Bounds are inclusive.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs the columns ``date``, ``gvkey`` and ``market_cap_usd``.
    min_market_cap_percentile : float, default 0.6
        Per-date quantile used as the lower bound.
    max_market_cap_percentile : float or None, default None
        Per-date quantile used as the upper bound; ``None`` disables the upper cut.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) sorted by ``date`` and ``gvkey``
        with a fresh ``RangeIndex``. Rows with a missing market cap or date
        never satisfy the comparison and are dropped.
    """
    filtered = results_df.copy()
    caps_by_date = filtered.groupby("date")["market_cap_usd"]

    # Broadcast each date's threshold back onto its rows. This avoids
    # groupby.apply with a per-group lambda, which depends on the grouping
    # column being handed to the applied function.
    lower_bound = filtered["date"].map(caps_by_date.quantile(min_market_cap_percentile))
    keep = filtered["market_cap_usd"] >= lower_bound

    if max_market_cap_percentile is not None:
        upper_bound = filtered["date"].map(caps_by_date.quantile(max_market_cap_percentile))
        keep &= filtered["market_cap_usd"] <= upper_bound

    return filtered[keep].sort_values(["date", "gvkey"]).reset_index(drop=True)

In [None]:
def add_quantiles(results_df, quantiles=10):
    """Add per-date quantile buckets for the conviction and class-probability columns.

    For every ``date`` the columns ``conviction``, ``pred_2`` and ``pred_0`` are
    cut with ``pd.qcut`` into ``quantiles`` buckets (integer labels, duplicate
    bin edges dropped) and stored as ``conviction_quantile``, ``top_quantile``
    and ``bottom_quantile`` respectively.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs the columns ``date``, ``conviction``, ``pred_2`` and ``pred_0``.
    quantiles : int, default 10
        Number of buckets requested from ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the three bucket columns,
        rows grouped by ascending ``date`` (original order kept within a date)
        and a fresh ``RangeIndex``. Rows with a missing ``date`` belong to no
        group and receive NaN buckets.
    """
    def bucket(values):
        return pd.qcut(values, quantiles, labels=False, duplicates="drop")

    by_date = results_df.groupby("date")
    # transform() keeps the original row alignment, so the group frame never
    # has to be mutated inside groupby.apply.
    bucket_columns = {
        "conviction_quantile": by_date["conviction"].transform(bucket),
        "top_quantile": by_date["pred_2"].transform(bucket),
        "bottom_quantile": by_date["pred_0"].transform(bucket),
    }

    with_buckets = results_df.assign(**bucket_columns)
    # A stable sort reproduces the date-grouped row order of the previous
    # groupby.apply implementation.
    return with_buckets.sort_values("date", kind="stable").reset_index(drop=True)

In [None]:
def set_time_period(results_df, first_date, last_date):
    """Return a copy of the rows dated strictly between ``first_date`` and ``last_date``.

    Both bounds are converted with ``pd.Timestamp`` and are exclusive.
    """
    lower, upper = pd.Timestamp(first_date), pd.Timestamp(last_date)
    dates = results_df["date"]
    inside_window = (dates > lower) & (dates < upper)
    return results_df[inside_window].copy()

In [None]:
# Lower market-cap percentile cut-offs per universe.
# NOTE(review): neither constant is referenced in the cells visible here — confirm they are still used.
min_market_cap_percentile_us = 0.6
min_market_cap_percentile_global = 0.65

In [None]:
def prepare_results(df, exchange_codes = None, currencies = None, quantiles = 20, 
                    min_date = "2020-01-01", max_date="2023-12-31", n_gvkeys = 500, svm=False,
                    min_market_cap_percentile = 0.6,
                   use_percentile_cap = False, min_volume_usd_5 = 1000, lower_rank = None, max_market_cap_percentile=None):
    """Filter a prediction frame to the investable universe and add quantile buckets.

    Steps, in order:

    1. Add ``conviction`` if missing (``pred_1`` when ``svm`` is true, otherwise
       ``pred_2 - pred_0``).
    2. Optionally restrict to ``exchange_codes`` and/or ``currencies``.
    3. Add ``trr_5_fwd_ar = exp(trr_5_fwd) - 1``.
    4. Restrict the universe, either by per-date market-cap percentiles
       (``use_percentile_cap``) or by a minimum ``volume_usd_5`` plus a per-date
       market-cap rank window (``lower_rank`` .. ``n_gvkeys``).
    5. Add per-date quantile columns via ``add_quantiles``.
    6. Keep only rows strictly between ``min_date`` and ``max_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction results; the input is not modified.
    exchange_codes, currencies : collection or None
        Allowed values of ``exchange_code`` / ``currency``; ``None`` disables the filter.
    quantiles : int
        Number of buckets passed to ``add_quantiles``.
    min_date, max_date : str or timestamp-like
        Exclusive bounds passed to ``set_time_period``.
    n_gvkeys : int
        Largest per-date market-cap rank that is kept.
    svm : bool
        Selects how a missing ``conviction`` column is derived.
    min_market_cap_percentile, max_market_cap_percentile : float or None
        Passed to ``filter_market_caps`` when ``use_percentile_cap`` is true.
    use_percentile_cap : bool
        Choose the percentile filter instead of the rank filter.
    min_volume_usd_5 : float
        Minimum ``volume_usd_5`` for the rank-based branch.
    lower_rank : int or None
        Smallest rank kept; only applied when ranks are computed from ``market_cap_usd``.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()

    if "conviction" not in df.columns:
        print("conviction not in columns")
        if svm:
            df["conviction"] = df["pred_1"]
        else:
            df["conviction"] = df["pred_2"] - df["pred_0"]

    # `is not None` rather than `!= None`: an array-like filter would otherwise
    # be compared element-wise and fail the truth test.
    if exchange_codes is not None:
        df = df[df["exchange_code"].isin(exchange_codes)]
    if currencies is not None:
        df = df[df["currency"].isin(currencies)]

    # assign() returns a new frame, so derived columns are never written into a
    # filtered slice of the previous frame.
    df = df.assign(trr_5_fwd_ar=np.exp(df["trr_5_fwd"]) - 1)

    if use_percentile_cap:
        df = filter_market_caps(df, min_market_cap_percentile, max_market_cap_percentile)
    elif "market_cap_usd" in df.columns:
        df = df[df["volume_usd_5"] >= min_volume_usd_5]
        # Rank 1 is the largest company on each date; ties are broken by row order.
        market_cap_rank = (
            df.groupby("date")["market_cap_usd"]
            .rank(ascending=False, method="first")
            .astype(int)
        )
        df = df.assign(market_cap_rank=market_cap_rank)
        df = df[df["market_cap_rank"] <= n_gvkeys]
        if lower_rank is not None:
            df = df[df["market_cap_rank"] >= lower_rank]
    elif "market_cap_rank" in df.columns:
        df = df[df["volume_usd_5"] >= min_volume_usd_5]
        df = df[df["market_cap_rank"] <= n_gvkeys]
    else:
        print("No market cap or rank in df")

    df = add_quantiles(df, quantiles=quantiles)
    df = set_time_period(df, min_date, max_date)

    return df
    

In [None]:
# US lookup table: parse the date column and add trr_5_ar = exp(trr_5) - 1.
us_lookup = pd.read_parquet("../data/lookup/us_lookup.parquet", engine="pyarrow").assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    trr_5_ar=lambda frame: np.exp(frame["trr_5"]) - 1,
)

In [None]:
# European lookup table: parse the date column and add trr_5_ar = exp(trr_5) - 1.
eu_lookup = pd.read_parquet("../data/lookup/eu_lookup.parquet", engine="pyarrow").assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    trr_5_ar=lambda frame: np.exp(frame["trr_5"]) - 1,
)

In [None]:
# Japanese lookup table: parse the date column and add trr_5_ar = exp(trr_5) - 1.
jp_lookup = pd.read_parquet("../data/lookup/jp_lookup.parquet", engine="pyarrow").assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    trr_5_ar=lambda frame: np.exp(frame["trr_5"]) - 1,
)

In [None]:
# Column names of a per-stock results frame (identifiers, forward return and its class,
# class probabilities pred_0..pred_2, sector codes and market data).
# NOTE(review): this list is not referenced in the cells visible here — confirm it is still needed.
result_cols = ['date', 'gvkey', 'company_name', 'currency', 'exchange_code',
       'trr_5_fwd', 'trr_5_fwd_class', 'pred_0', 'pred_1', 'pred_2',
       'pred_class', 'market_cap_rank', 'train_file', 'split_year', 'gsector',
       'ggroup', 'gind', 'gsubind', 'market_cap_usd', 'trr_5', 'volume_usd_5', 'volatility_5',
       'price_close_usd']

In [None]:
def load_digest(path, region_name, model_name, keep_groups = True):
    """Read one digest parquet file and tag it with its region and model.

    The ``date`` column is converted to datetimes, ``region`` and ``model``
    columns are filled with the given labels, the index is reset, and a
    ``min_max_quantile`` column holding ``(min_quantile, max_quantile)`` tuples
    is added. With ``keep_groups=False`` only rows whose ``all`` column equals
    ``True`` are returned.
    """
    frame = pd.read_parquet(path, engine="pyarrow")
    frame["date"] = pd.to_datetime(frame["date"])
    frame["region"] = region_name
    frame["model"] = model_name
    frame = frame.reset_index(drop=True)

    quantile_pairs = zip(frame["min_quantile"], frame["max_quantile"])
    frame["min_max_quantile"] = list(quantile_pairs)

    if keep_groups:
        return frame
    return frame[frame["all"] == True]

In [None]:
# When True, the per-region / per-model digest frames are deleted (`del`) right after they
# have been concatenated in the loading cells below, so only the combined frames remain.
delete_part_files = True

In [None]:
# "all_ensemble" digests (catboost, xgb, logreg, rf), one file per region.
all_ensemble_us_digest, all_ensemble_eu_digest, all_ensemble_jp_digest = (
    load_digest(digest_path, region_code, "all_ensemble")
    for digest_path, region_code in [
        ("digests/['catboost', 'xgb', 'logreg', 'rf']_ensemble_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/['catboost', 'xgb', 'logreg', 'rf']_ensemble_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/['catboost', 'xgb', 'logreg', 'rf']_ensemble_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
all_ensemble_digest = pd.concat([all_ensemble_us_digest, all_ensemble_eu_digest, all_ensemble_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del all_ensemble_us_digest, all_ensemble_eu_digest, all_ensemble_jp_digest

In [None]:
# "brf_ensemble" digests (catboost, xgb, rf), one file per region.
brf_ensemble_us_digest, brf_ensemble_eu_digest, brf_ensemble_jp_digest = (
    load_digest(digest_path, region_code, "brf_ensemble")
    for digest_path, region_code in [
        ("digests/['catboost', 'xgb', 'rf']_ensemble_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/['catboost', 'xgb', 'rf']_ensemble_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/['catboost', 'xgb', 'rf']_ensemble_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
brf_ensemble_digest = pd.concat([brf_ensemble_us_digest, brf_ensemble_eu_digest, brf_ensemble_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del brf_ensemble_us_digest, brf_ensemble_eu_digest, brf_ensemble_jp_digest

In [None]:
# "b_ensemble" digests (catboost, xgb), one file per region.
b_ensemble_us_digest, b_ensemble_eu_digest, b_ensemble_jp_digest = (
    load_digest(digest_path, region_code, "b_ensemble")
    for digest_path, region_code in [
        ("digests/['catboost', 'xgb']_ensemble_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/['catboost', 'xgb']_ensemble_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/['catboost', 'xgb']_ensemble_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
b_ensemble_digest = pd.concat([b_ensemble_us_digest, b_ensemble_eu_digest, b_ensemble_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del b_ensemble_us_digest, b_ensemble_eu_digest, b_ensemble_jp_digest
    

In [None]:
# "crf_ensemble" digests (catboost, rf), one file per region.
crf_ensemble_us_digest, crf_ensemble_eu_digest, crf_ensemble_jp_digest = (
    load_digest(digest_path, region_code, "crf_ensemble")
    for digest_path, region_code in [
        ("digests/['catboost', 'rf']_ensemble_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/['catboost', 'rf']_ensemble_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/['catboost', 'rf']_ensemble_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
crf_ensemble_digest = pd.concat([crf_ensemble_us_digest, crf_ensemble_eu_digest, crf_ensemble_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del crf_ensemble_us_digest, crf_ensemble_eu_digest, crf_ensemble_jp_digest
    

In [None]:
# Stack the four ensemble variants into a single frame.
ensemble_digests = pd.concat(
    [
        all_ensemble_digest,
        brf_ensemble_digest,
        b_ensemble_digest,
        crf_ensemble_digest,
    ]
)

# The per-ensemble frames are no longer bound after this point when the flag is set.
if delete_part_files:
    del all_ensemble_digest
    del brf_ensemble_digest
    del b_ensemble_digest
    del crf_ensemble_digest

In [None]:
# "catboost" digests, one file per region.
catboost_us_digest, catboost_eu_digest, catboost_jp_digest = (
    load_digest(digest_path, region_code, "catboost")
    for digest_path, region_code in [
        ("digests/catboost_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/catboost_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/catboost_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
catboost_digest = pd.concat([catboost_us_digest, catboost_eu_digest, catboost_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del catboost_us_digest, catboost_eu_digest, catboost_jp_digest



In [None]:
# "xgb" digests, one file per region.
xgb_us_digest, xgb_eu_digest, xgb_jp_digest = (
    load_digest(digest_path, region_code, "xgb")
    for digest_path, region_code in [
        ("digests/xgb_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/xgb_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/xgb_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
xgb_digest = pd.concat([xgb_us_digest, xgb_eu_digest, xgb_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del xgb_us_digest, xgb_eu_digest, xgb_jp_digest



In [None]:
# "logreg" digests, one file per region.
logreg_us_digest, logreg_eu_digest, logreg_jp_digest = (
    load_digest(digest_path, region_code, "logreg")
    for digest_path, region_code in [
        ("digests/logreg_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/logreg_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/logreg_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
logreg_digest = pd.concat([logreg_us_digest, logreg_eu_digest, logreg_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del logreg_us_digest, logreg_eu_digest, logreg_jp_digest


In [None]:
# "rf" digests, one file per region.
rf_us_digest, rf_eu_digest, rf_jp_digest = (
    load_digest(digest_path, region_code, "rf")
    for digest_path, region_code in [
        ("digests/rf_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/rf_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/rf_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
rf_digest = pd.concat([rf_us_digest, rf_eu_digest, rf_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del rf_us_digest, rf_eu_digest, rf_jp_digest


In [None]:
# "svm" digests, one file per region.
svm_us_digest, svm_eu_digest, svm_jp_digest = (
    load_digest(digest_path, region_code, "svm")
    for digest_path, region_code in [
        ("digests/svm_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "us"),
        ("digests/svm_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "eu"),
        ("digests/svm_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_digest.parquet", "jp"),
    ]
)
svm_digest = pd.concat([svm_us_digest, svm_eu_digest, svm_jp_digest])

# Drop the regional parts once they are merged.
if delete_part_files:
    del svm_us_digest, svm_eu_digest, svm_jp_digest


In [None]:
# Combine the ensemble digests with every single-model digest.
all_digests = pd.concat(
    [
        ensemble_digests,
        catboost_digest,
        xgb_digest,
        logreg_digest,
        rf_digest,
        svm_digest,
    ]
)

# Only `all_digests` stays bound after this point when the flag is set.
if delete_part_files:
    del ensemble_digests
    del catboost_digest
    del xgb_digest
    del logreg_digest
    del rf_digest
    del svm_digest

In [None]:
# Sanity check of the combined frame built in the previous cell: every
# (model, region) pair loaded above should appear with a non-zero row count.
# This cell used to rebind `all_digests` to `all_ensemble_digest`. That name has
# already been deleted when `delete_part_files` is True (NameError on a
# top-to-bottom run), and the rebinding would have dropped the single-model rows
# that the model-comparison plots below select.
all_digests.groupby(["model", "region"]).size()

In [None]:
# Keep only the digest rows whose `all` flag equals True.
all_digests_nogroup = all_digests[all_digests["all"] == True]

In [None]:
# NOTE(review): this rebinding overrides the `all == True` filter applied in the cell above, so on a
# top-to-bottom run `all_digests_nogroup` is the unfiltered frame. Confirm which variant is intended.
all_digests_nogroup = all_digests

In [None]:
def _capped_friday_lookup(lookup, min_percentile=0.78, max_percentile=1, max_abs_trr_5=0.3):
    """Restrict a lookup frame to the benchmark universe used by the plots below.

    Keeps rows inside the per-date market-cap percentile band (via
    ``filter_market_caps``), with ``|trr_5| <= max_abs_trr_5`` (bounds
    inclusive) and dated on a Friday (``dayofweek == 4``), and guarantees the
    ``trr_5_ar = exp(trr_5) - 1`` column is present.
    """
    capped = filter_market_caps(lookup, min_percentile, max_percentile)
    capped = capped[capped["trr_5"].between(-max_abs_trr_5, max_abs_trr_5)]
    capped = capped[capped["date"].dt.dayofweek == 4]
    # Computed here so the per-date means below never depend on which earlier
    # cell last touched the lookup frame.
    return capped.assign(trr_5_ar=np.exp(capped["trr_5"]) - 1)


# The lookup frames loaded near the top of the notebook are reused. Re-reading
# the parquet files here discarded their `trr_5_ar` column, so the groupby on
# that column failed.
us_lookup_mc_cap = _capped_friday_lookup(us_lookup)
us_lookup_mc_cap_avg = us_lookup_mc_cap.groupby("date")["trr_5_ar"].mean()

eu_lookup_mc_cap = _capped_friday_lookup(eu_lookup)
eu_lookup_mc_cap_avg = eu_lookup_mc_cap.groupby("date")["trr_5_ar"].mean()

jp_lookup_mc_cap = _capped_friday_lookup(jp_lookup)
jp_lookup_mc_cap_avg = jp_lookup_mc_cap.groupby("date")["trr_5_ar"].mean()

# Historical

In [None]:
# Yearly average of the long-short return per model (US, top market-cap band,
# models trained on all dates), plotted against the average stock return.
fig, ax = plt.subplots(figsize=(14, 8))

current_min_max_quantile = (0.78,1)
plot_feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"
resample_freq = "YS"

digest_mask = (
    (all_digests_nogroup["region"] == "us")
    & (all_digests_nogroup["min_max_quantile"] == current_min_max_quantile)
    & (all_digests_nogroup["train_file"] == "all_dates")
)
current_digest = all_digests_nogroup[digest_mask]

# Benchmark: cross-sectional mean return per date, then averaged per period.
recent_lookup = us_lookup_mc_cap[us_lookup_mc_cap["date"] > pd.Timestamp("1980-01-01")]
avg_trr_5_ar = recent_lookup.groupby("date")["trr_5_ar"].mean().reset_index()
avg_trr_5_ar.set_index("date")["trr_5_ar"].resample(resample_freq).mean().plot(ax=ax, label="Average Return")

# One line per model, in legend order.
model_labels = {
    "logreg": "Logistic Regression",
    "catboost": "CatBoost",
    "xgb": "XGBoost",
    "rf": "Random Forest",
    "svm": "Support Vector Machine",
    "all_ensemble": "Ensemble (ex. SVM)",
}
for model_name, model_label in model_labels.items():
    model_rows = current_digest[current_digest["model"] == model_name]
    model_rows.set_index("date")[plot_feature].resample(resample_freq).mean().plot(ax=ax, label=model_label)

ax.axvline(pd.Timestamp("2003-01-01"), color="black", linestyle="--", lw=2)

ax.tick_params(axis='both', which='major', labelsize=16)
ax.legend(fontsize=16)
ax.grid()
ax.set_xlabel("")
plt.tight_layout()
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))



In [None]:
# Save the figure created in the previous cell.
# NOTE(review): dpi presumably only affects rasterised artists in a PDF — confirm 3000 is needed.
fig.savefig("../figures/all_models_avg_returns_all_trained_min078.pdf", dpi=3000)

# Models

In [None]:
# Mean total / long / short return per model (US, top market-cap band, models
# trained on all dates), with the average stock return as a dashed benchmark.
fig, ax = plt.subplots(1,1, figsize=(12,8))

min_date = pd.Timestamp("2003-01-01")
max_date = pd.Timestamp("2023-12-31")

current_min_max_quantile = (0.78,1)
top_n_quantiles = 4
bar_width = 0.20

current_digest = all_digests_nogroup[(all_digests_nogroup["region"] == "us") & 
                                     (all_digests_nogroup["min_max_quantile"] == current_min_max_quantile) &
                                    (all_digests_nogroup["train_file"] == "all_dates")]

# Date bounds are exclusive here; every bar below uses this same window.
current_digest = current_digest[(current_digest["date"] > min_date) & (current_digest["date"] < max_date)]

# Bar order and tick labels come from one mapping so they cannot drift apart.
model_labels = {
    "logreg": "Logistic Regression",
    "catboost": "CatBoost",
    "xgb": "XGBoost",
    "rf": "Random Forest",
    "svm": "Support Vector Machine",
    "all_ensemble": "Ensemble (ex. SVM)",
    "brf_ensemble": "Boosting + RF Ensemble",
    "b_ensemble": "Boosting Ensemble",
    "crf_ensemble": "CatBoost + RF Ensemble",
}

# (digest column, x offset inside the model's group, legend label, colour)
bar_specs = [
    (f"top_minus_bottom_{top_n_quantiles}_trr_5_fwd_ar_mean", -0.2, "Total", "tab:purple"),
    (f"top_{top_n_quantiles}_trr_5_fwd_ar_mean", 0.0, "Longs", "tab:blue"),
    (f"bottom_{top_n_quantiles}_trr_5_fwd_ar_mean", 0.2, "Shorts", "tab:orange"),
]

for i, model_name in enumerate(model_labels):
    current_digest_model = current_digest[current_digest["model"] == model_name]
    for column, offset, legend_label, color in bar_specs:
        # Only the first group carries legend labels, so each series is listed once.
        legend_kwargs = {"label": legend_label} if i == 0 else {}
        ax.bar(height=current_digest_model[column].mean(), x=i + offset, width=bar_width,
               color=color, edgecolor="black", **legend_kwargs)

# Benchmark: mean over dates of the cross-sectional mean Friday return.
avg_trr_5_ar = us_lookup_mc_cap[us_lookup_mc_cap["date"].dt.dayofweek == 4]
avg_trr_5_ar = avg_trr_5_ar[(avg_trr_5_ar["date"] >= min_date) & (avg_trr_5_ar["date"] <= max_date)].groupby("date")["trr_5_ar"].mean().mean()
ax.axhline(avg_trr_5_ar, color="black", linestyle="--")

ax.axhline(0, color="black")

ax.tick_params(axis='both', which='major', labelsize=18, labelbottom=True)
ax.set_xticks(range(len(model_labels)))
ax.set_xticklabels(list(model_labels.values()), rotation=45, ha='right')
ax.set_ylim(top=0.004)
ax.grid(axis="y")
ax.legend(fontsize=15)
plt.tight_layout()
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=2))


In [None]:
# NOTE(review): this cell and the next save the same figure object under two file names
# ("top10pct" / "top2p5pct"); presumably only the one matching the current
# `top_n_quantiles` should be run — confirm.
fig.savefig("../figures/all_models_returns_bar_after_2003_min078_top10pct.pdf", dpi=3000)

In [None]:
# NOTE(review): saves the same figure object as the previous cell under a second file name
# ("top2p5pct"); presumably only the one matching the current `top_n_quantiles`
# should be run — confirm.
fig.savefig("../figures/all_models_returns_bar_after_2003_min078_top2p5pct.pdf", dpi=3000)

# Market cap

In [None]:
# Adjacent (min, max) market-cap percentile bands; matched against the digests'
# `min_max_quantile` tuples and used as x-axis groups in the cells below.
quantile_ranges = [(0, 0.12), (0.12, 0.34), (0.34, 0.56), (0.56, 0.78), (0.78, 0.89), (0.89,  1.0)]

In [None]:
# Long-short return over time, one line per market-cap percentile band.
fig, ax = plt.subplots(figsize=(10, 8))

# NOTE(review): the frame originally referenced here (`all_ensemble_us_digest_cap`) is not
# defined anywhere in this notebook. It is reconstructed as the US rows of the
# "all_ensemble" model — confirm this is the intended source.
us_ensemble_digest = all_digests_nogroup[(all_digests_nogroup["region"] == "us") &
                                         (all_digests_nogroup["model"] == "all_ensemble")]

quantile_returns = (
    us_ensemble_digest[us_ensemble_digest["min_max_quantile"].isin(quantile_ranges)]
    .groupby(["min_max_quantile", "date"])[["top_minus_bottom_4_trr_5_fwd_ar_mean", "n_stocks_in_group"]]
    .agg({"top_minus_bottom_4_trr_5_fwd_ar_mean" : "mean", "n_stocks_in_group" : "median"})
    .reset_index()
)

for quantile_range in quantile_ranges:
    quantile_returns[quantile_returns["min_max_quantile"] == quantile_range].plot(x="date", y="top_minus_bottom_4_trr_5_fwd_ar_mean", ax=ax, label=str(quantile_range))

plt.xticks(rotation=45, ha='right')
ax.tick_params(axis='both', which='major', labelsize=14)
# The x-axis is the date; the quantile ranges are the legend entries.
ax.set_xlabel("")
ax.legend(title="Market Cap Quantile Ranges")
plt.tight_layout()

In [None]:
# Per-date cross-sectional means of return, volume, volatility and market cap
# for each market-cap percentile band.
cap_frames = []
for quantile_range in quantile_ranges:
    min_percentile, max_percentile = quantile_range
    # Restrict the universe to this band. The previous version reused one
    # pre-filtered frame for every band, so all bands had identical means.
    current_cap = filter_market_caps(us_lookup, min_percentile, max_percentile)
    # NOTE(review): same return clip (|trr_5| <= 0.3) and Friday-only filter as
    # `us_lookup_mc_cap` — confirm these should apply to every band.
    current_cap = current_cap[current_cap["trr_5"].between(-0.3, 0.3)]
    current_cap = current_cap[current_cap["date"].dt.dayofweek == 4]
    # Make sure the arithmetic return column exists regardless of how the lookup was loaded.
    current_cap = current_cap.assign(trr_5_ar=np.exp(current_cap["trr_5"]) - 1)
    current_cap = current_cap.groupby("date")[["trr_5", "trr_5_ar", "volume_usd_5", "volatility_5", "market_cap_usd"]].mean().reset_index()
    current_cap["min_max_quantile"] = str(quantile_range)
    cap_frames.append(current_cap)

# Concatenate once instead of growing the frame inside the loop.
cap_means = pd.concat(cap_frames)
        

In [None]:
# Scale the 5-day volatility by sqrt(50) to obtain the annualised figure plotted below.
# NOTE(review): 50 is presumably the number of 5-day periods per year (about 250 / 5) — confirm.
cap_means["volatility_5_adj_annual"] = cap_means["volatility_5"] * np.sqrt(50)

In [None]:
# VOLATILITY OF STOCKS THEMSELVES:
# 60-day rolling mean of the annualised 5-day volatility, one line per
# market-cap percentile band.

fig, ax = plt.subplots(figsize=(14, 8))

rolling_window = "60D"

for quantile_range in quantile_ranges:
    band_volatility = cap_means[cap_means["min_max_quantile"] == str(quantile_range)][["date", "volatility_5_adj_annual"]]
    band_volatility.rolling(rolling_window, on="date").mean().plot(x="date", y="volatility_5_adj_annual", ax=ax, label=str(quantile_range), lw=2)


plt.xticks(rotation=45, ha='right')
ax.tick_params(axis='both', which='major', labelsize=16)
# Axis label spelling fixed (was "Volatilitity").
ax.set_ylabel("5-day Volatility (Annualised)", fontsize = 18)
ax.set_xlabel("")
ax.legend(fontsize=16)
plt.tight_layout()
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

In [None]:
# Save the rolling-volatility figure created in the previous cell.
fig.savefig("../figures/volatility_market_cap_quantile_ranges_cap_rmean_60D_historical_annualised.pdf", dpi=3000)

In [None]:
# Mean total / long / short return of the "all_ensemble" model per market-cap
# percentile band (US, trained on all dates), annotated with the average number
# of stocks and the band's mean stock return as a dashed line.
fig, ax = plt.subplots(figsize=(13, 8))

min_date = pd.Timestamp("2003-01-01")
top_n_quantiles = 4
bar_width = 0.2
hatch=None

current_digest = all_digests_nogroup[(all_digests_nogroup["train_file"] == "all_dates") &
                                     (all_digests_nogroup["model"] == "all_ensemble")]
current_digest = current_digest[current_digest["region"] == "us"]
current_digest = current_digest[current_digest["date"] >= min_date]

total_col = f"top_minus_bottom_{top_n_quantiles}_trr_5_fwd_ar_mean"
longs_col = f"top_{top_n_quantiles}_trr_5_fwd_ar_mean"
shorts_col = f"bottom_{top_n_quantiles}_trr_5_fwd_ar_mean"

grouped_df = current_digest[current_digest["min_max_quantile"].isin(quantile_ranges)].groupby(["min_quantile", "max_quantile"])[[total_col, longs_col, shorts_col, "n_stocks_in_group"]].mean()

n_stocks_in_group = grouped_df["n_stocks_in_group"]

# (digest column, x offset inside the band's group, legend label, colour)
bar_specs = [
    (total_col, -0.2, "Total", "tab:purple"),
    (longs_col, 0.0, "Longs", "tab:blue"),
    (shorts_col, 0.2, "Shorts", "tab:orange"),
]

# The groupby output is sorted by (min_quantile, max_quantile); this is assumed
# to match the order of `quantile_ranges`, which supplies the tick labels and
# the benchmark lookup below.
for i in range(len(grouped_df)):
    group_row = grouped_df.iloc[i]
    bars = {}
    for column, offset, legend_label, color in bar_specs:
        # Only the first group carries legend labels, so each series is listed once.
        legend_kwargs = {"label": legend_label} if i == 0 else {}
        bars[legend_label] = ax.bar(height=group_row[column], x=i + offset, width=bar_width,
                                    color=color, edgecolor="black", hatch=hatch, **legend_kwargs)
    bar_all, bar_longs, bar_shorts = bars["Total"], bars["Longs"], bars["Shorts"]

    # Place the stock count above the taller of the total and long bars.
    text_height = max(bar_longs.patches[0].get_height(), bar_all.patches[0].get_height())

    ax.annotate(text = f"n = {int(n_stocks_in_group.iloc[i])}", 
           xy = (bar_longs.patches[0].get_x() + bar_longs.patches[0].get_width() / 2, 
            text_height + 0.0012), ha='center', va='center',
           size=16
           )

    # Benchmark uses the arithmetic return (trr_5_ar) so it is comparable with
    # the arithmetic-return bars; the log return trr_5 was used before.
    mean_return = cap_means[(cap_means["date"] >= min_date) & (cap_means["min_max_quantile"] == str(quantile_ranges[i]))]["trr_5_ar"].mean()

    ax.hlines(mean_return,bar_longs.patches[0].get_x() - bar_width, bar_longs.patches[0].get_x() + 2*bar_width, color="black", linestyle="--", lw=2)

ax.axhline(0, color="black")

plt.xticks(rotation=0)
ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xlabel("Market Cap Quantile Ranges", fontsize = 20)
ax.legend(fontsize=16)
ax.set_xticks(range(len(quantile_ranges)))
ax.set_xticklabels(quantile_ranges)
plt.tight_layout()
ax.grid(axis="y")
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))


In [None]:
# NOTE(review): this and the three following cells save the same figure object under different
# names ("after_2003" / "all_dates", with and without "longs_shorts"); presumably only the
# one matching the settings of the plotting cell should be run — confirm.
fig.savefig("../figures/all_returns_us_market_cap_quantile_ranges_cap_after_2003_longs_shorts_bar.pdf", dpi=3000)

In [None]:
# NOTE(review): the file name says "all_dates", but the plotting cell above restricts the digest
# to dates on or after its `min_date` — confirm before running.
fig.savefig("../figures/all_returns_us_market_cap_quantile_ranges_cap_all_dates_longs_shorts_bar.pdf", dpi=3000)

In [None]:
# NOTE(review): the file name omits "longs_shorts", but the plotting cell above always draws the
# long and short bars — confirm before running.
fig.savefig("../figures/all_returns_us_market_cap_quantile_ranges_cap_after_2003_bar.pdf", dpi=3000)

In [None]:
# NOTE(review): the file name says "all_dates" and omits "longs_shorts"; the plotting cell above
# applies a `min_date` cut and draws long/short bars — confirm before running.
fig.savefig("../figures/all_returns_us_market_cap_quantile_ranges_cap_all_dates_bar.pdf", dpi=3000)

# Regions

In [None]:
# Mean long-short return of the "all_ensemble" model per region and cumulative
# market-cap percentile band (models trained on all dates, 2005 onwards).
current_quantile_ranges = [(0,1), (0.12,1), (0.34,1), (0.56,1), (0.78,1), (0.89,1)]

ensemble_all_dates = (all_digests_nogroup["train_file"] == "all_dates") & (all_digests_nogroup["model"] == "all_ensemble")
current_digest = all_digests_nogroup[ensemble_all_dates]
current_digest = current_digest[current_digest["date"] >= pd.Timestamp("2005-01-01")]
current_digest = current_digest[current_digest["min_max_quantile"].isin(current_quantile_ranges)]

top_n_quantiles = 4
n_stock_annot = True
hatch = ""
bar_width = 0.2
text_offset = 0.002

fig, ax = plt.subplots(figsize=(15, 8))

return_column = f"top_minus_bottom_{top_n_quantiles}_trr_5_fwd_ar_mean"

# (region code, legend label, colour, x offset inside the band's group)
region_styles = [
    ("us", "US", "tab:red", -0.2),
    ("eu", "Europe", "tab:blue", 0.0),
    ("jp", "Japan", "tab:green", 0.2),
]

for i, quantile_range in enumerate(current_quantile_ranges):

    current_current_digest = current_digest[current_digest["min_max_quantile"] == quantile_range]

    for region_code, region_label, region_color, offset in region_styles:
        region_rows = current_current_digest[current_current_digest["region"] == region_code]
        # Only the first group carries legend labels, so each region is listed once.
        legend_kwargs = {"label": region_label} if i == 0 else {}
        region_bar = ax.bar(height = region_rows[return_column].mean(),
           x=i + offset, width=bar_width, color=region_color, edgecolor="black", hatch=hatch, **legend_kwargs)

        if n_stock_annot:
            # Average number of stocks in the group, written vertically above the bar.
            n_stocks_in_group = region_rows["n_stocks_in_group"].mean()
            rect = region_bar.patches[0]
            ax.annotate(text = f"n = {int(n_stocks_in_group)}", 
                       xy = (rect.get_x() + rect.get_width() / 2, 
                        rect.get_height() + text_offset), ha='center', va='center',
                       size=16, rotation=90
                       )


ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xticks(range(len(current_quantile_ranges)))
ax.set_xticklabels(current_quantile_ranges)
ax.legend(fontsize=18)
ax.set_xlabel("Market Cap Quantile Ranges", fontsize = 20)

ax.set_ylim(top=0.021)
plt.tight_layout()
ax.grid(axis="y")
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))


In [None]:
fig.savefig("../figures/all_returns_regions_mc_cap_ranges_after_2005_top_10pct_bar.pdf", dpi=3000)

# Returns in bear/bull and rec/exp

In [None]:
# NBER recession dates, with the date column parsed to datetimes.
nber_rec_dates = pd.read_csv("../time_periods/model_train_ready/nber_recession_dates.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [None]:
# NBER expansion dates, with the date column parsed to datetimes.
nber_exp_dates = pd.read_csv("../time_periods/model_train_ready/nber_expansion_dates.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [None]:
# S&P 500 bear-market dates, with the date column parsed to datetimes.
sp500_bear_dates = pd.read_csv("../time_periods/model_train_ready/bear_dates_sp500.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [None]:
# S&P 500 bull-market dates, with the date column parsed to datetimes.
sp500_bull_dates = pd.read_csv("../time_periods/model_train_ready/bull_dates_sp500.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [None]:
# Maps `train_file` identifiers found in the digests to the display names used on plot axes.
# NOTE(review): the two "..._long_3_6_" keys end with an underscore — presumably matching
# the stored `train_file` values; confirm.
train_file_dict = {
    "nber_recession_dates" : "NBER Recession",
    "nber_expansion_dates" : "NBER Expansion",
    "bear_dates_sp500" : "Qualitative Bear",
    "bull_dates_sp500" : "Qualitative Bull",
    "non_bear_dates_sp500" : "Qualitative Non-Bear",
    "flat_dates_sp500" : "Qualitative Flat",
    "markov_rec" : "Markov Recession",
    "markov_exp" : "Markov Expansion",
    "return_filter_bear_m_short_2_3" : "Negative Filter (ST)",
    "return_filter_bear_m_long_3_6_" : "Negative Filter (LT)",
    "return_filter_bull_m_short_2_3" : "Positive Filter (ST)",
    "return_filter_bull_m_long_3_6_" : "Positive Filter (LT)",
    "EPU_rec_2yr" : "EPU Recession",
    "EPU_exp_2yr" : "EPU Expansion",
    "all_dates" : "All Dates",
}

In [None]:
import matplotlib.patches as patches

In [None]:
# Bar chart of the mean forward abnormal return per training-date set for the US
# "all_ensemble" model: total (top minus bottom), longs (top) and shorts (bottom).
top_n_quantiles = 4

current_digest = all_digests_nogroup[(all_digests_nogroup["model"] == "all_ensemble")]
current_digest = current_digest[current_digest["region"] == "us"]
current_digest = current_digest[current_digest["min_max_quantile"] == (0.78, 1)]

# Evaluation window, inclusive on both ends.
current_digest = current_digest[current_digest["date"] >= pd.Timestamp("2020-02-21")]
current_digest = current_digest[current_digest["date"] <= pd.Timestamp("2020-03-20")]

# Set to True to annotate every bar with the mean stock count of its group.
n_stock_annot = False

total_column = f"top_minus_bottom_{top_n_quantiles}_trr_5_fwd_ar_mean"
long_column = f"top_{top_n_quantiles}_trr_5_fwd_ar_mean"
short_column = f"bottom_{top_n_quantiles}_trr_5_fwd_ar_mean"

fig, ax = plt.subplots(figsize=(15, 8))
grouped_df = current_digest.groupby(["train_file"])[[total_column, long_column, short_column, "n_stocks_in_group"]].mean()
grouped_df.sort_values(total_column, ascending=False, inplace=True)

# Swap raw identifiers for display labels. An identifier missing from the mapping keeps
# its raw name instead of becoming NaN, which would break the substring tests below.
grouped_df.index = grouped_df.index.map(lambda name: train_file_dict.get(name, name))

bar_width = 0.20

# (column, x offset within the group, legend label, colour) for the three bars of a group.
bar_specs = (
    (total_column, -0.2, "Total", "tab:purple"),
    (long_column, 0, "Longs", "tab:blue"),
    (short_column, 0.2, "Shorts", "tab:orange"),
)

for i, train_file in enumerate(grouped_df.index):
    # Hatch pattern encodes the kind of training set; the second test overrides the first.
    hatch = None
    if " Bear" in train_file or "Recession" in train_file or "Negative" in train_file:
        hatch = "\\\\"
    if "Flat" in train_file or "All" in train_file:
        hatch = '..'

    group_row = grouped_df[grouped_df.index == train_file]

    # Only the first group carries legend labels so each series is listed once.
    bar_all, bar_longs, bar_shorts = (
        ax.bar(
            height=group_row[column],
            x=i + offset,
            width=bar_width,
            color=color,
            edgecolor="black",
            hatch=hatch,
            **({"label": label} if i == 0 else {}),
        )
        for column, offset, label, color in bar_specs
    )

if n_stock_annot:
    n_stocks_in_group = grouped_df["n_stocks_in_group"]
    bars_per_group = len(bar_specs)

    # ax.patches holds the bars in insertion order (three per group), so the group of
    # patch i is i // bars_per_group. Indexing by i alone mislabels the bars and runs
    # past the end of n_stocks_in_group.
    for i, bar in enumerate(ax.patches):
        ax.annotate(text = f"n = {int(n_stocks_in_group.iloc[i // bars_per_group])}",
                   xy = (bar.get_x() + bar.get_width() / 2,
                    bar.get_height() + 0.0015), ha='center', va='center',
                   size=12
                   )

ax.axhline(0, color="black", lw=1)

plt.xticks(rotation=45, ha='right')
ax.tick_params(axis='both', which='major', labelsize=20, labelbottom=True)
ax.set_xticks(range(len(grouped_df.index)))
ax.set_xticklabels(grouped_df.index, rotation=45, ha='right')
ax.legend(fontsize=16)

# The legend entries inherit the hatch of the first group; clear it so they show colour only.
leg = ax.get_legend()
for legend_handle in leg.legend_handles[:3]:
    legend_handle.set_hatch("")

# Dashed reference line: mean over the digest's dates of the per-date mean "trr_5_ar",
# taken from the rows of us_lookup_mc_cap whose date has dayofweek == 4.
avg_trr_5_ar = us_lookup_mc_cap[us_lookup_mc_cap["date"].dt.dayofweek == 4]
avg_trr_5_ar = avg_trr_5_ar[avg_trr_5_ar["date"].isin(current_digest["date"].unique())].groupby("date")["trr_5_ar"].mean().mean()
ax.axhline(avg_trr_5_ar, color="black", linestyle="--", lw=2)

plt.tight_layout()
ax.grid(axis="y")
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=2))

In [None]:
fig.savefig("../figures/train_file_ensemble_returns_us_nber_rec_over_078_after_2003_bar.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/train_file_ensemble_returns_us_nber_exp_over_078_after_2003_bar.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/train_file_ensemble_returns_us_qual_bear_over_078_after_2003_bar.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/train_file_ensemble_returns_us_qual_bull_over_078_after_2003_bar.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/train_file_ensemble_returns_us_all_dates_over_078_after_2003_bar.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/train_file_ensemble_returns_us_covid_over_078_bar.pdf", dpi=3000)

# Adaptive model

In [None]:
import seaborn as sb
from matplotlib.patches import Rectangle

In [None]:
test_date_files = os.listdir("../time_periods/model_test_ready/")

In [None]:
# Test-date files for positive regimes: the name mentions a bull/expansion style tag
# and is not one of the negated "non_exp" / "non_bull" variants.
bull_rec_test_files = [
    file_name
    for file_name in test_date_files
    if any(tag in file_name for tag in ("bull", "exp", "non_bear", "non_rec"))
    and not any(tag in file_name for tag in ("non_exp", "non_bull"))
]

In [None]:
# Test-date files for negative regimes: the name mentions a bear/recession style tag
# and is not one of the negated "non_rec" / "non_bear" variants.
bear_rec_test_files = [
    file_name
    for file_name in test_date_files
    if any(tag in file_name for tag in ("bear", "rec", "non_bull", "non_exp"))
    and not any(tag in file_name for tag in ("non_rec", "non_bear"))
]

In [None]:
# Unique "train_file" identifiers present in whatever `current_digest` holds when this cell runs.
# NOTE(review): `current_digest` is reassigned by several plotting cells in this notebook, so the
# result depends on execution order — confirm the cell is run against the intended digest.
train_file_names = current_digest["train_file"].unique()

In [None]:
# Training sets for positive regimes: the identifier mentions a bull/expansion style tag
# and is not one of the negated "non_exp" / "non_bull" variants.
bull_rec_train_files = [
    train_name
    for train_name in train_file_names
    if any(tag in train_name for tag in ("bull", "exp", "non_bear", "non_rec"))
    and not any(tag in train_name for tag in ("non_exp", "non_bull"))
]

In [None]:
# Training sets for negative regimes: the identifier mentions a bear/recession style tag
# and is not one of the negated "non_rec" / "non_bear" variants.
bear_rec_train_files = [
    train_name
    for train_name in train_file_names
    if any(tag in train_name for tag in ("bear", "rec", "non_bull", "non_exp"))
    and not any(tag in train_name for tag in ("non_rec", "non_bear"))
]

In [None]:
test_date_files

In [None]:
# Maps test-date file names (without the ".csv" extension) to the column labels used in
# the heatmaps. Every key appears once: the literal previously repeated
# 'return_filter_bull_m_short_2_3', which Python accepts silently by keeping the last value.
test_file_name_dict = {
    'nber_recession_dates_class_lstm_ba4da75c' : "Recession Class LSTM",
    'nber_non_recession_dates_class_lstm_ba4da75c' : "Non-Recession Class LSTM",
    'bear_dates_qbear_class_lstm_9046df4a' : "Bear Class LSTM",
    'bull_dates_qbull_class_lstm_f241ab59' : "Bull Class LSTM",
    'non_bear_dates_qbear_class_lstm_9046df4a' : "Non-Bear Class LSTM",
    # NOTE(review): "bqull" looks like a typo but the key has to match the file name — confirm.
    'non_bull_dates_bqull_class_lstm_f241ab59' : "Non-Bull Class LSTM",
    'markov_rec_dates_test_all_years_order1_4_10_5yr_avg' : "Markov Recession",
    'markov_exp_dates_test_all_years_order1_4_10_5yr_avg' : "Markov Expansion",
    'return_filter_bull_m_long_3_6_' : "Positive Filter (LT)",
    'return_filter_bull_m_short_2_3' : "Positive Filter (ST)",
    'bear_lstm_mc_change_class' : "Negative MC Change LSTM",
    'bull_lstm_mc_change_class' : "Positive MC Change LSTM",
    'return_filter_bear_m_long_3_6_12' : "Negative Filter (LT)",
    'return_filter_bear_m_short_2_3' : "Negative Filter (ST)",
    'return_filter_bull_m_long_3_6_12' : "Positive Filter (LT)",
}

In [None]:
# Maps training-set identifiers and training-date file names (without ".csv") to the
# labels used on heatmap rows and columns. Several spellings share one label.
train_name_dict = {
    # NBER business-cycle dates
    "nber_recession_dates": "NBER Recession",
    "nber_expansion_dates": "NBER Expansion",
    # S&P 500 qualitative regimes
    "bear_dates_sp500": "Qualitative Bear",
    "bull_dates_sp500": "Qualitative Bull",
    "non_bear_dates_sp500": "Qualitative Non-Bear",
    "flat_dates_sp500": "Qualitative Flat",
    # Return filters, with and without the trailing "12"
    "return_filter_bear_m_long_3_6_": "Negative Filter (LT)",
    "return_filter_bear_m_long_3_6_12": "Negative Filter (LT)",
    "return_filter_bear_m_short_2_3": "Negative Filter (ST)",
    "return_filter_bull_m_long_3_6_": "Positive Filter (LT)",
    "return_filter_bull_m_long_3_6_12": "Positive Filter (LT)",
    "return_filter_bull_m_short_2_3": "Positive Filter (ST)",
    # Markov regimes, short identifier and full file name
    "markov_rec": "Markov Recession",
    "markov_exp": "Markov Expansion",
    "markov_rec_dates_train_2020_order1_4_10_smooth_5yr_avg": "Markov Recession",
    "markov_exp_dates_train_2020_order1_4_10_smooth_5yr_avg": "Markov Expansion",
    # EPU regimes
    "EPU_rec_2yr": "EPU Recession",
    "EPU_exp_2yr": "EPU Expansion",
    # No date restriction
    "all_dates": "All Dates",
}

### Test on predicted dates

In [None]:
# Heatmap of the mean `feature` value for every training set (rows) evaluated on every
# predicted test-date file (columns), plus row mean, all-dates and qualitative-bear columns.
current_digest = all_digests_nogroup[all_digests_nogroup["model"] == "all_ensemble"]
current_digest = current_digest[current_digest["region"] == "us"]
current_digest = current_digest[current_digest["min_max_quantile"] == (0.78, 1)]
current_digest = current_digest[current_digest["date"] >= pd.Timestamp("2003-01-01")]

#current_digest = current_digest[~((current_digest["date"] >= pd.Timestamp("2019-12-31")) & (current_digest["date"] <= pd.Timestamp("2020-05-31")))]
current_digest = current_digest[current_digest["date"] <= pd.Timestamp("2019-12-31")]

feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"

# When True, the "filter" training sets swap between the bear and bull row groups.
flip_filters = True

current_bear_rec_train_files = bear_rec_train_files
current_bull_rec_train_files = bull_rec_train_files

current_bear_rec_test_files = bear_rec_test_files
current_bull_rec_test_files = bull_rec_test_files

# When True, us_lookup_mc_cap_avg is subtracted date by date before averaging.
subtract_average = False

# When True, each test-file mean is multiplied by the number of distinct dates behind it.
times_dates = False

if flip_filters:
    bear_filters = [x for x in current_bear_rec_train_files if "filter" in x]
    bull_filters = [x for x in current_bull_rec_train_files if "filter" in x]
    current_bear_rec_train_files = [x for x in current_bear_rec_train_files if "filter" not in x] + bull_filters
    current_bull_rec_train_files = [x for x in current_bull_rec_train_files if "filter" not in x] + bear_filters

current_test_date_files = sorted(current_bear_rec_test_files)# + ["return_filter_bull_m_long_3_6_12.csv", "return_filter_bull_m_short_2_3.csv"]
#current_test_date_files = sorted(current_bull_rec_test_files)# + ["return_filter_bear_m_long_3_6_12.csv", "return_filter_bear_m_short_2_3.csv"]

current_train_file_names = sorted(current_bear_rec_train_files) + sorted(current_bull_rec_train_files) + ["flat_dates_sp500", "all_dates"]

current_test_file_names = [x.split(".csv")[0] for x in current_test_date_files]

results_df = pd.DataFrame(index=current_train_file_names, columns=current_test_file_names + ["Mean of Row", "All Dates"
                                                                                             , "Qualitative Bear"
                                                                                             #, "Qualitative Bull"
                                                                                            ])


def _read_date_file(path):
    """Read a date-list CSV and parse its "date" column to datetimes."""
    date_frame = pd.read_csv(path)
    date_frame["date"] = pd.to_datetime(date_frame["date"])
    return date_frame


# Every date file is the same for all training sets, so read each one once up front
# instead of once per row of the heatmap.
test_dates_by_file = {
    test_file_path: _read_date_file(f"../time_periods/model_test_ready/{test_file_path}")
    for test_file_path in current_test_date_files
}
qual_bear_dates = _read_date_file("../time_periods/model_train_ready/bear_dates_sp500.csv")
qual_bull_dates = _read_date_file("../time_periods/model_train_ready/bull_dates_sp500.csv")

for train_file_name in current_train_file_names:
    current_current_digest = current_digest[current_digest["train_file"] == train_file_name]

    current_row = []
    for test_file_path in current_test_date_files:
        current_test_dates = test_dates_by_file[test_file_path]
        on_test_dates = current_current_digest["date"].isin(current_test_dates["date"])

        if subtract_average:
            avg_return = (current_current_digest[on_test_dates].set_index("date")[feature] - us_lookup_mc_cap_avg[us_lookup_mc_cap_avg.index.isin(current_test_dates["date"])]).mean()
        else:
            avg_return = current_current_digest[on_test_dates][feature].mean()

        if times_dates:
            avg_return = avg_return*current_current_digest[on_test_dates]["date"].nunique()
        current_row.append(avg_return)

    # "Mean of Row": unweighted mean over the test-file columns (NaN when there are none).
    current_row.append(float(sum(current_row)/len(current_row)) if current_row else float("nan"))

    # "All Dates": mean over every date left in the digest for this training set.
    if subtract_average:
        avg_return_all_dates = (current_current_digest.set_index("date")[feature] - us_lookup_mc_cap_avg[us_lookup_mc_cap_avg.index.isin(current_current_digest["date"])]).mean()
    else:
        avg_return_all_dates = current_current_digest[feature].mean()
    current_row.append(avg_return_all_dates)

    # "Qualitative Bear": mean over the S&P 500 bear dates (never average-adjusted).
    avg_qual_bear_returns = current_current_digest[current_current_digest["date"].isin(qual_bear_dates["date"])][feature].mean()
    current_row.append(avg_qual_bear_returns)

    # Counterpart for the bull dates. It is only computed here: append it to current_row
    # when the "Qualitative Bull" column above is enabled.
    avg_qual_bull_returns = current_current_digest[current_current_digest["date"].isin(qual_bull_dates["date"])][feature].mean()

    results_df.loc[train_file_name] = current_row
results_df = results_df.astype(float)

results_df.rename(columns = test_file_name_dict, index=train_name_dict, inplace=True)

fig, ax = plt.subplots(1,1, figsize=(10,9))

ax = sb.heatmap(results_df, cmap="flare", annot=True, cbar=False, square=False, annot_kws={"fontsize": 11}, fmt='.2%')

# Outlines around the three row groups.
# NOTE(review): the coordinates are hard-coded and appear to assume 9 columns and row
# groups of 6, 7 and 2 rows — adjust them if the file lists change.
ax.add_patch(Rectangle((0.02, 0.02), 8.96, 5.94, fill=False, edgecolor='tab:red', lw=2))
ax.add_patch(Rectangle((0.02, 6.02), 8.96, 6.95, fill=False, edgecolor='tab:blue', lw=2))
ax.add_patch(Rectangle((0.02, 13.02), 8.96, 1.93, fill=False, edgecolor='tab:green', lw=2))

ax.set_ylabel('Train dates', fontsize = 15)
ax.set_xlabel('Test dates', fontsize = 15)

plt.xticks(rotation=30, ha='right')
ax.tick_params(axis='both', which='major', labelsize=12, labelbottom=True)
plt.tight_layout()

In [None]:
fig.savefig("../figures/ensemble_us_returns_train_and_predict_bear_dates_after_2003_before_2020_filterflip_with_qual_bear.pdf", dpi=3000)


In [None]:
fig.savefig("../figures/ensemble_us_returns_train_and_predict_bear_dates_after_2003_excl_2020firsthalf_filterflip_with_qual_bear.pdf", dpi=3000)


In [None]:
fig.savefig("../figures/ensemble_eu_returns_train_and_predict_bear_dates_after_2005_no_filterflip_with_qual_bear.pdf", dpi=3000)


In [None]:
fig.savefig("../figures/ensemble_jp_returns_train_and_predict_bear_dates_after_2005_no_filterflip_with_qual_bear_top_2p5pct.pdf", dpi=3000)


In [None]:
fig.savefig("../figures/ensemble_eu_returns_train_and_predict_bull_dates_after_2003_filterflip_with_qual_bull.pdf", dpi=3000)


In [None]:
fig.savefig("../figures/ensemble_jp_returns_train_and_predict_bull_dates_after_2003_filterflip_with_qual_bull.pdf", dpi=3000)


In [None]:
train_date_files = os.listdir("../time_periods/model_train_ready/")

In [None]:
# Add the two Markov training-date files by name. The cells that read these files load any
# path containing "markov" from ../time_periods/model_train_ready_before_test/.
# Names already in the list are skipped so that re-running this cell does not append duplicates.
_markov_train_date_files = [
    "markov_rec_dates_train_2020_order1_4_10_smooth_5yr_avg.csv",
    "markov_exp_dates_train_2020_order1_4_10_smooth_5yr_avg.csv",
]
train_date_files = train_date_files + [
    file_name for file_name in _markov_train_date_files if file_name not in train_date_files
]
                                       

In [None]:
# Training-date files for positive regimes: the name mentions a bull/expansion style tag
# and is not one of the negated "non_exp" / "non_bull" variants.
bull_rec_train_date_files = [
    file_name
    for file_name in train_date_files
    if any(tag in file_name for tag in ("bull", "exp", "non_bear", "non_rec"))
    and not any(tag in file_name for tag in ("non_exp", "non_bull"))
]

In [None]:
# Training-date files for negative regimes: the name mentions a bear/recession style tag
# and is not one of the negated "non_rec" / "non_bear" variants.
bear_rec_train_date_files = [
    file_name
    for file_name in train_date_files
    if any(tag in file_name for tag in ("bear", "rec", "non_bull", "non_exp"))
    and not any(tag in file_name for tag in ("non_rec", "non_bear"))
]

### Test on training dates

In [None]:
# Heatmap of the mean `feature` value for every training set (rows) evaluated on the
# dates of every training-date file (columns).
current_digest = all_digests_nogroup[all_digests_nogroup["model"] == "all_ensemble"]
current_digest = current_digest[current_digest["region"] == "eu"]
current_digest = current_digest[current_digest["min_max_quantile"] == (0.78, 1)]
current_digest = current_digest[current_digest["date"] >= pd.Timestamp("2003-01-01")]
#current_digest = current_digest[current_digest["date"] <= pd.Timestamp("2020-01-01")]


feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"

# When True, the "filter" training sets swap between the bear and bull row groups.
flip_filters = False

current_bear_rec_train_files = bear_rec_train_files
current_bull_rec_train_files = bull_rec_train_files

current_bear_rec_train_date_files = bear_rec_train_date_files
current_bull_rec_train_date_files = bull_rec_train_date_files

if flip_filters:
    bear_filters = [x for x in current_bear_rec_train_files if "filter" in x]
    bull_filters = [x for x in current_bull_rec_train_files if "filter" in x]
    current_bear_rec_train_files = [x for x in current_bear_rec_train_files if "filter" not in x] + bull_filters
    current_bull_rec_train_files = [x for x in current_bull_rec_train_files if "filter" not in x] + bear_filters

# Columns: bear files, then bull files, then the flat and all-dates files.
# To restrict the columns to one side, use only one of the two sorted lists:
#current_train_date_files = sorted(current_bear_rec_train_date_files)
#current_train_date_files = sorted(current_bull_rec_train_date_files)
current_train_date_files = sorted(current_bear_rec_train_date_files) + sorted(current_bull_rec_train_date_files)

current_train_file_names = sorted(current_bear_rec_train_files) + sorted(current_bull_rec_train_files) + ["flat_dates_sp500", "all_dates"]

current_train_date_files = current_train_date_files + ["flat_dates_sp500.csv", "all_dates.csv"]

current_train_date_file_names = [x.split(".csv")[0] for x in current_train_date_files]

results_df = pd.DataFrame(index=current_train_file_names, columns=current_train_date_file_names)

# The date files are the same for every training set, so read each one once up front
# instead of once per cell of the heatmap.
train_dates_by_file = {}
for train_date_file_path in current_train_date_files:
    if "markov" in train_date_file_path:
        current_train_dates = pd.read_csv(f"../time_periods/model_train_ready_before_test/{train_date_file_path}")
    else:
        current_train_dates = pd.read_csv(f"../time_periods/model_train_ready/{train_date_file_path}")
    current_train_dates["date"] = pd.to_datetime(current_train_dates["date"])
    train_dates_by_file[train_date_file_path] = current_train_dates

for train_file_name in current_train_file_names:
    current_current_digest = current_digest[current_digest["train_file"] == train_file_name]

    current_row = []
    for train_date_file_path in current_train_date_files:
        current_train_dates = train_dates_by_file[train_date_file_path]

        avg_return = current_current_digest[current_current_digest["date"].isin(current_train_dates["date"])][feature].mean()

        current_row.append(avg_return)

    results_df.loc[train_file_name] = current_row
results_df = results_df.astype(float)

results_df.rename(columns = train_name_dict, index=train_name_dict, inplace=True)

fig, ax = plt.subplots(1,1, figsize=(10,9))

ax = sb.heatmap(results_df, cmap="flare", annot=True, cbar=False, square=False, annot_kws={"fontsize": 10}, fmt='.2%')

# Thin coloured strips along the left and bottom edges marking the three groups.
# NOTE(review): the coordinates are hard-coded and appear to assume a 15 x 15 grid with
# groups of 6, 7 and 2 entries — adjust them if the file lists change.
ax.add_patch(Rectangle((0.02, 0.02), 0.02, 5.92, fill=False, edgecolor='tab:red', lw=2))
ax.add_patch(Rectangle((0.02, 6.02), 0.02, 6.93, fill=False, edgecolor='tab:blue', lw=2))
ax.add_patch(Rectangle((0.02, 13.02), 0.02, 1.93, fill=False, edgecolor='tab:green', lw=2))

ax.add_patch(Rectangle((0.02, 14.96), 5.92, 0.02, fill=False, edgecolor='tab:red', lw=2))
ax.add_patch(Rectangle((6, 14.96), 7, 0.02, fill=False, edgecolor='tab:blue', lw=2))
ax.add_patch(Rectangle((13.02, 14.96), 1.93, 0.02, fill=False, edgecolor='tab:green', lw=2))


ax.set_ylabel('Train dates', fontsize = 15)
ax.set_xlabel('Test dates', fontsize = 15)
plt.xticks(rotation=45, ha='right')
ax.tick_params(axis='both', which='major', labelsize=12, labelbottom=True)
plt.tight_layout()

In [None]:
fig.savefig("../figures/ensemble_us_returns_train_and_test_dates_after_2003_no_filterflip.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/ensemble_eu_returns_train_and_test_dates_after_2003_no_filterflip_top_2p5pct.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/ensemble_jp_returns_train_and_test_dates_after_2003_no_filterflip_top_2p5pct.pdf", dpi=3000)

### Mean of quadrants

In [None]:
# Positive, negative, flat heatmap, TRAIN DATES
#
# 3 x 3 heatmap: rows are groups of training sets, columns are groups of date files.
# Each cell is the unweighted mean, over the column group's date files, of the mean
# `feature` value of the row group's digest rows that fall on that file's dates.

feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"

#train_file_names = current_digest["train_file"].unique()

include_filters = False
flip_filters = False

# Digest selection: all-ensemble model, US, (0.78, 1) range, 2005-01-01 .. 2020-01-01.
is_ensemble = all_digests_nogroup["model"] == "all_ensemble"
is_region = all_digests_nogroup["region"] == "us"
is_quantile_range = all_digests_nogroup["min_max_quantile"] == (0.78, 1)
is_in_window = (all_digests_nogroup["date"] >= pd.Timestamp("2005-01-01")) & (
    all_digests_nogroup["date"] <= pd.Timestamp("2020-01-01")
)
current_digest = all_digests_nogroup[is_ensemble & is_region & is_quantile_range & is_in_window]

# Optionally move the "filter" training sets to the opposite group.
if flip_filters:
    bear_filters = [name for name in bear_rec_train_files if "filter" in name]
    bull_filters = [name for name in bull_rec_train_files if "filter" in name]
    current_bear_rec_train_files = [name for name in bear_rec_train_files if "filter" not in name] + bull_filters
    current_bull_rec_train_files = [name for name in bull_rec_train_files if "filter" not in name] + bear_filters
else:
    current_bear_rec_train_files = bear_rec_train_files
    current_bull_rec_train_files = bull_rec_train_files

# Optionally drop the "filter" date files from both groups.
if include_filters:
    current_bear_rec_train_date_files = bear_rec_train_date_files
    current_bull_rec_train_date_files = bull_rec_train_date_files
else:
    current_bear_rec_train_date_files = [name for name in bear_rec_train_date_files if "filter" not in name]
    current_bull_rec_train_date_files = [name for name in bull_rec_train_date_files if "filter" not in name]

categories = [
    "Neg.",
    "Pos.",
    # "Flat",
    "All",
]

current_train_file_categories = [
    sorted(current_bear_rec_train_date_files),
    sorted(current_bull_rec_train_date_files),
    # ["flat_dates_sp500.csv"],
    ["all_dates.csv"],
]

results_df = pd.DataFrame(index=categories, columns=categories)

for row_label, current_train_file_category_train in zip(categories, current_train_file_categories):
    # Training-set identifiers of the row group: the date-file names up to the first dot.
    current_train_file_category_train_names = [
        file_name.split(".")[0] for file_name in current_train_file_category_train
    ]
    current_current_digest = current_digest[
        current_digest["train_file"].isin(current_train_file_category_train_names)
    ]

    current_row = []
    for train_date_file_paths_test in current_train_file_categories:
        per_file_means = []
        for train_date_file_path in train_date_file_paths_test:
            # Markov date files are kept in a separate folder.
            if "markov" in train_date_file_path:
                current_train_dates = pd.read_csv(f"../time_periods/model_train_ready_before_test/{train_date_file_path}")
            else:
                current_train_dates = pd.read_csv(f"../time_periods/model_train_ready/{train_date_file_path}")
            current_train_dates["date"] = pd.to_datetime(current_train_dates["date"])

            on_file_dates = current_current_digest["date"].isin(current_train_dates["date"])
            per_file_means.append(current_current_digest.loc[on_file_dates, feature].mean())

        current_row.append(sum(per_file_means) / len(train_date_file_paths_test))

    results_df.loc[row_label] = current_row

results_df = results_df.astype(float)

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax = sb.heatmap(
    results_df,
    cmap="flare",
    annot=True,
    cbar=False,
    square=False,
    annot_kws={"fontsize": 22},
    fmt='.2%',
)

ax.set_ylabel('Train dates', fontsize=21)
ax.set_xlabel('Test dates', fontsize=21)
plt.xticks(rotation=45, ha='right')
ax.tick_params(axis='both', which='major', labelsize=20, labelbottom=True)
plt.tight_layout()

In [None]:
fig.savefig("../figures/mean_returns_train_dates_and_test_dates_small_no_filters_jp_after_2005_top_2p5pct.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/mean_returns_train_dates_and_test_dates_small_no_filters_eu_after_2005_top_2p5pct.pdf", dpi=3000)

In [None]:
fig.savefig("../figures/mean_returns_train_dates_and_test_dates_small_no_filters_us_after_2005_top_2p5pct.pdf", dpi=3000)

In [None]:
# Positive, negative, flat heatmap TEST DATES
# NOTE(review): despite the "TEST DATES" header, the date files read in the loop below come
# from ../time_periods/model_train_ready/ and ../time_periods/model_train_ready_before_test/.
# Confirm whether the test-date files were meant to be used here.


# The first assignment is immediately overwritten by the filtered selection below.
current_digest = all_digests_nogroup

# Digest selection: all-ensemble model, region "jp", (0.78, 1) range, dates from 2005-01-01.
current_digest = all_digests_nogroup[all_digests_nogroup["model"] == "all_ensemble"]
current_digest = current_digest[current_digest["region"] == "jp"]
current_digest = current_digest[current_digest["min_max_quantile"] == (0.78, 1)]
current_digest = current_digest[current_digest["date"] >= pd.Timestamp("2005-01-01")]
#current_digest = current_digest[current_digest["date"] <= pd.Timestamp("2020-01-01")]


# Digest column that is averaged into every heatmap cell.
feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"

# When False, date files whose name contains "filter" are dropped from both groups.
include_filters = False

# When True, the "filter" training sets swap between the bear and bull lists.
flip_filters = False

current_bear_rec_train_files = bear_rec_train_files
current_bull_rec_train_files = bull_rec_train_files

current_bear_rec_train_date_files = bear_rec_train_date_files
current_bull_rec_train_date_files = bull_rec_train_date_files



# NOTE(review): this branch only changes current_*_train_files, which are not read again in
# this cell (rows and columns are both built from the *_train_date_files lists), so
# flip_filters has no effect on the figure — confirm that is intended.
if flip_filters:
    bear_filters = [x for x in current_bear_rec_train_files if "filter" in x]
    bull_filters = [x for x in current_bull_rec_train_files if "filter" in x]
    current_bear_rec_train_files = [x for x in current_bear_rec_train_files if "filter" not in x] + bull_filters
    current_bull_rec_train_files = [x for x in current_bull_rec_train_files if "filter" not in x] + bear_filters
    
if not include_filters:
    current_bear_rec_train_date_files = [x for x in current_bear_rec_train_date_files if "filter" not in x]
    current_bull_rec_train_date_files = [x for x in current_bull_rec_train_date_files if "filter" not in x]

# One list of date files per category, in the same order as `categories` below.
current_train_file_categories = [sorted(current_bear_rec_train_date_files), sorted(current_bull_rec_train_date_files), 
                                # ["flat_dates_sp500.csv"], 
                                 ["all_dates.csv"]
                                ]





# Row and column labels of the heatmap.
categories = ["Neg.", "Pos.", 
             # "Flat", 
              "All"
             ]

results_df = pd.DataFrame(index=categories, columns=categories)

# Outer loop: one heatmap row per category of training sets.
for i, current_train_file_category_train in enumerate(current_train_file_categories):
    current_row = []
    # Training-set identifiers are taken to be the date-file names up to the first ".".
    # NOTE(review): a digest "train_file" value that differs from its date-file name would be
    # silently left out of the row — verify against the digest's train_file values.
    current_train_file_category_train_names = [x.split(".")[0] for x in current_train_file_category_train]
    current_current_digest = current_digest[current_digest["train_file"].isin(current_train_file_category_train_names)]
    # Inner loops: one heatmap column per category of date files.
    for train_date_file_paths_test in current_train_file_categories:
        mean_value = 0
        for train_date_file_path in train_date_file_paths_test:
            
            
            
            # Markov date files are read from a separate folder.
            if "markov" in train_date_file_path:
                current_train_dates = pd.read_csv(f"../time_periods/model_train_ready_before_test/{train_date_file_path}")
            else:
                current_train_dates = pd.read_csv(f"../time_periods/model_train_ready/{train_date_file_path}")
            current_train_dates["date"] = pd.to_datetime(current_train_dates["date"])

            # Mean of `feature` over the row group's digest rows that fall on this file's dates.
            avg_return = current_current_digest[current_current_digest["date"].isin(current_train_dates["date"])][feature].mean()
            mean_value += avg_return
        # Unweighted mean of the per-file means; a single NaN per-file mean makes the cell NaN.
        mean_value = mean_value / len(train_date_file_paths_test)
        current_row.append(mean_value)

    results_df.loc[categories[i]] = current_row 
results_df = results_df.astype(float)


fig, ax = plt.subplots(1,1, figsize=(5,5))

ax = sb.heatmap(results_df, cmap="flare", annot=True, cbar=False, square=False, annot_kws={"fontsize": 22}, fmt='.2%')

ax.set_ylabel('Train dates', fontsize = 21)
ax.set_xlabel('Test dates', fontsize = 21)
plt.xticks(rotation=45, ha='right')
ax.tick_params(axis='both', which='major', labelsize=20, labelbottom=True)
plt.tight_layout()

# Reversal

In [None]:
# Grouped bar chart: for each return-filter training set, the mean top-minus-bottom
# forward abnormal return of the "all_ensemble" model in the US, Europe and Japan.
# NOTE(review): unlike the other figures, no "min_max_quantile" filter is applied here, so
# the means run over every quantile range present in the digest — confirm this is intended.
current_digest = all_digests_nogroup[(all_digests_nogroup["model"] == "all_ensemble")]

current_digest = current_digest[current_digest["date"] >= pd.Timestamp("2005-01-01")]

train_files =  ['return_filter_bear_m_long_3_6_',
    'return_filter_bear_m_short_2_3',
    'return_filter_bull_m_long_3_6_',
    'return_filter_bull_m_short_2_3']

# Display labels for the x ticks, in the same order as train_files.
train_file_names = [train_name_dict[x] for x in train_files]

top_n_quantiles = 4

# Not used by this cell; kept because they are notebook-level names.
n_stock_annot = True

fig, ax = plt.subplots(figsize=(15, 8))

hatch = ""

bar_width = 0.2

text_offset = 0.002

return_column = f"top_minus_bottom_{top_n_quantiles}_trr_5_fwd_ar_mean"

# (region code, x offset within the group, legend label, colour) for the three bars of a group.
region_specs = (
    ("us", -0.2, "US", "tab:red"),
    ("eu", 0, "Europe", "tab:blue"),
    ("jp", 0.2, "Japan", "tab:green"),
)

for i, train_file in enumerate(train_files):

    current_current_digest = current_digest[current_digest["train_file"] == train_file]

    # Only the first group carries legend labels so each region is listed once.
    bar_us, bar_eu, bar_jp = (
        ax.bar(
            height=current_current_digest[current_current_digest["region"] == region][return_column].mean(),
            x=i + offset,
            width=bar_width,
            color=color,
            edgecolor="black",
            hatch=hatch,
            **({"label": label} if i == 0 else {}),
        )
        for region, offset, label, color in region_specs
    )


ax.tick_params(axis='both', which='major', labelsize=18)
ax.set_xticks(range(len(train_files)))
ax.set_xticklabels(train_file_names)
ax.legend(fontsize=18)
# The ticks are training sets (return filters), not market-cap ranges.
ax.set_xlabel("Train dates", fontsize = 20)

plt.tight_layout()
ax.grid(axis="y")
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

In [None]:
fig.savefig("../figures/filter_reversal_by_regions.pdf", dpi=3000)

# t-test

In [None]:
# Ensemble predictions for the US, with "date" parsed to datetimes.
all_ensemble_preds_us = pd.read_parquet(
    "digests/['catboost', 'xgb', 'logreg', 'rf']_ensemble_USmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_all_preds_078_after_2003.parquet",
    engine="pyarrow",
).assign(date=lambda frame: pd.to_datetime(frame["date"]))

In [None]:
# Ensemble predictions for Europe, with "date" parsed to datetimes.
all_ensemble_preds_eu = pd.read_parquet(
    "digests/['catboost', 'xgb', 'logreg', 'rf']_ensemble_Europemax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_all_preds_078_after_2003.parquet",
    engine="pyarrow",
).assign(date=lambda frame: pd.to_datetime(frame["date"]))

In [None]:
# Ensemble predictions for Japan, with "date" parsed to datetimes.
all_ensemble_preds_jp = pd.read_parquet(
    "digests/['catboost', 'xgb', 'logreg', 'rf']_ensemble_Japanmax_trr_5_fwd_ar_0.3min_trr_5_fwd_ar_-0.3_all_preds_078_after_2003.parquet",
    engine="pyarrow",
).assign(date=lambda frame: pd.to_datetime(frame["date"]))

In [None]:
from scipy.stats import ttest_ind, ttest_rel, normaltest, skewtest, kurtosistest, skew, kurtosis

In [None]:
# Maps test-date file names (without the ".csv" extension) to display labels.
# Every key appears once: the literal previously repeated
# 'return_filter_bull_m_short_2_3', which Python accepts silently by keeping the last value.
test_file_name_dict = {
    'nber_recession_dates_class_lstm_ba4da75c' : "Recession Class LSTM",
    'nber_non_recession_dates_class_lstm_ba4da75c' : "Non-Recession Class LSTM",
    'bear_dates_qbear_class_lstm_9046df4a' : "Bear Class LSTM",
    'bull_dates_qbull_class_lstm_f241ab59' : "Bull Class LSTM",
    'non_bear_dates_qbear_class_lstm_9046df4a' : "Non-Bear Class LSTM",
    # NOTE(review): "bqull" looks like a typo but the key has to match the file name — confirm.
    'non_bull_dates_bqull_class_lstm_f241ab59' : "Non-Bull Class LSTM",
    'markov_rec_dates_test_all_years_order1_4_10_5yr_avg' : "Markov Recession",
    'markov_exp_dates_test_all_years_order1_4_10_5yr_avg' : "Markov Expansion",
    'return_filter_bull_m_long_3_6_' : "Positive Filter (LT)",
    'return_filter_bull_m_short_2_3' : "Positive Filter (ST)",
    'bear_lstm_mc_change_class' : "Negative MC Change LSTM",
    'bull_lstm_mc_change_class' : "Positive MC Change LSTM",
    'return_filter_bear_m_long_3_6_12' : "Negative Filter (LT)",
    'return_filter_bear_m_short_2_3' : "Negative Filter (ST)",
    'return_filter_bull_m_long_3_6_12' : "Positive Filter (LT)",
}

In [None]:
#Using all predictions:
# Builds paired long-short returns for an "adaptive" strategy (a different
# model's predictions per regime, regimes read from model_test_ready date
# files) and compares them with the same trades taken from one baseline model.


def _paired_long_short_returns(buys, sells, return_col="trr_5_fwd_ar"):
    """Return per-pair long-minus-short returns, indexed by date.

    For every date present in ``buys`` the first k buy rows are paired
    positionally with the first k sell rows of that date, where k is the
    smaller of the two sides' distinct ``gvkey`` counts. Dates without sell
    rows contribute nothing. An empty, date-indexed Series is returned when
    ``buys`` has no dates, so the result never depends on stale kernel state.
    """
    per_date_returns = []
    for date, date_buys in buys.groupby("date"):
        date_sells = sells[sells["date"] == date]
        n_pairs = min(date_buys["gvkey"].nunique(), date_sells["gvkey"].nunique())
        # Both legs carry the same repeated date index of length n_pairs, so the
        # subtraction is element-wise rather than an index join.
        long_leg = date_buys.set_index("date")[return_col].head(n_pairs)
        short_leg = date_sells.set_index("date")[return_col].head(n_pairs)
        per_date_returns.append(long_leg - short_leg)
    if not per_date_returns:
        return pd.Series(dtype="float64", name=return_col, index=pd.DatetimeIndex([], name="date"))
    # One concat at the end instead of growing a Series inside the loop.
    return pd.concat(per_date_returns)


current_preds = all_ensemble_preds_eu.copy()

min_date = pd.Timestamp("2003-01-01")
max_date = pd.Timestamp("2019-12-31")

current_region = "eu"


# Keep only predictions inside the evaluation window (bounds inclusive).
current_preds = current_preds[current_preds["date"] >= min_date]
current_preds = current_preds[current_preds["date"] <= max_date]

#current_preds = current_preds[~((current_preds["date"] >= pd.Timestamp("2019-12-31")) & (current_preds["date"] <= pd.Timestamp("2020-05-31")))]

# Mode switches: all_dates takes a single model on every date; adaptive_model
# walks through (train file, test-date file) pairs in priority order.
all_dates = False
adaptive_model = True

# When True, dates not claimed by any adaptive state fall back to final_train_file.
long_short_every_week = False

compare_with_all_dates = True


current_train_file = "all_dates"
test_date_file_name = ""

buy_conviction_quantiles = [39,38,37,36]
sell_conviction_quantiles = [0,1,2,3]

# current_train_files[i] is the model used on the dates listed in test_date_file_names[i].
current_train_files = [
    #"non_bear_dates_sp500", 
    #"non_bear_dates_sp500", 
    "all_dates",
    "all_dates",
    "all_dates",
    #"non_bear_dates_sp500",
    #"non_bear_dates_sp500",
    #"non_bear_dates_sp500",
    #"nber_expansion_dates",
    #"bull_dates_sp500",
    #"bull_dates_sp500",
    #"bull_dates_sp500"
                      ]

final_train_file = "all_dates"

test_date_file_names = [
    "nber_recession_dates_class_lstm_ba4da75c", 
    #"bull_lstm_mc_change_class"
    "bear_dates_qbear_class_lstm_9046df4a",
    "return_filter_bear_m_short_2_3", 
    #"return_filter_bear_m_long_3_6_12",
    #"non_bull_dates_bqull_class_lstm_f241ab59"
]

comparison_train_file = "all_dates"

# Label written to "test_file" for fallback rows, which no test-date file selected.
fallback_test_file_label = "remaining_dates"


if all_dates:
    current_preds_adaptive = current_preds[current_preds["train_file"] == current_train_files[0]]
    # sort_values returns a new frame; sorting a filtered slice in place triggers
    # chained-assignment warnings.
    current_preds_adaptive_buys = current_preds_adaptive[current_preds_adaptive["conviction_quantile"].isin(buy_conviction_quantiles)].sort_values("date")
    current_preds_adaptive_sells = current_preds_adaptive[current_preds_adaptive["conviction_quantile"].isin(sell_conviction_quantiles)].sort_values("date")


elif adaptive_model:
    for i in range(len(current_train_files)):
        
        test_dates = pd.read_csv(f"../time_periods/model_test_ready/{test_date_file_names[i]}.csv")
        test_dates["date"] = pd.to_datetime(test_dates["date"])
        if i != 0:
            # Earlier states have priority: skip dates that already produced buys.
            test_dates = test_dates[~test_dates["date"].isin(current_preds_adaptive_buys["date"])]
        current_current_preds = current_preds[current_preds["date"].isin(test_dates["date"])]
        print(f"N dates adaptive state {i}:", current_current_preds["date"].nunique())
        # .copy() so the label columns are written to an independent frame, not a slice.
        current_current_preds = current_current_preds[current_current_preds["train_file"] == current_train_files[i]].copy()
        current_current_preds["train_file"] = current_train_files[i]
        current_current_preds["test_file"] = test_date_file_names[i]
        
        current_current_pred_buys = current_current_preds[current_current_preds["conviction_quantile"].isin(buy_conviction_quantiles)]
        current_current_pred_sells = current_current_preds[current_current_preds["conviction_quantile"].isin(sell_conviction_quantiles)]

        if i == 0:
            current_preds_adaptive_buys = current_current_pred_buys
            current_preds_adaptive_sells = current_current_pred_sells
            
        else:
            current_preds_adaptive_buys = pd.concat([current_preds_adaptive_buys, current_current_pred_buys])
            current_preds_adaptive_sells = pd.concat([current_preds_adaptive_sells, current_current_pred_sells])
            
        print(f"Length adaptive state buys {i}:", current_current_pred_buys["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state buys {i}:", current_current_pred_buys["trr_5_fwd_ar"].mean())
        print(f"Length adaptive state sells {i}:", current_current_pred_sells["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state sells {i}:", current_current_pred_sells["trr_5_fwd_ar"].mean())
        
    if long_short_every_week:
        fallback_state = len(current_train_files)
        current_current_preds = current_preds[current_preds["train_file"] == final_train_file]
        current_current_preds = current_current_preds[~current_current_preds["date"].isin(current_preds_adaptive_buys["date"])].copy()
        print(f"N dates adaptive state {fallback_state}:", current_current_preds["date"].nunique())
        # These rows come from final_train_file; previously they were relabelled
        # with the last adaptive state's train/test file, which was incorrect.
        current_current_preds["train_file"] = final_train_file
        current_current_preds["test_file"] = fallback_test_file_label
        
        current_current_pred_buys = current_current_preds[current_current_preds["conviction_quantile"].isin(buy_conviction_quantiles)]
        current_current_pred_sells = current_current_preds[current_current_preds["conviction_quantile"].isin(sell_conviction_quantiles)]


        current_preds_adaptive_buys = pd.concat([current_preds_adaptive_buys, current_current_pred_buys])
        current_preds_adaptive_sells = pd.concat([current_preds_adaptive_sells, current_current_pred_sells])
        
        print(f"Length adaptive state buys {fallback_state}:", current_current_pred_buys["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state buys {fallback_state}:", current_current_pred_buys["trr_5_fwd_ar"].mean())
        print(f"Length adaptive state sells {fallback_state}:", current_current_pred_sells["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state sells {fallback_state}:", current_current_pred_sells["trr_5_fwd_ar"].mean())
        
        
    current_preds_adaptive_buys = current_preds_adaptive_buys.sort_values("date")
    current_preds_adaptive_sells = current_preds_adaptive_sells.sort_values("date")

else:
    # Without this guard the code below would silently reuse buys/sells left in
    # the kernel by an earlier run.
    raise ValueError("Set all_dates or adaptive_model to True to select predictions.")
    

current_pred_returns = _paired_long_short_returns(current_preds_adaptive_buys, current_preds_adaptive_sells)

# The *_lookup_mc_cap_avg objects are defined in another cell of this notebook.
if current_region == "us":
    current_index = us_lookup_mc_cap_avg
elif current_region == "eu":
    current_index = eu_lookup_mc_cap_avg
elif current_region == "jp":
    current_index = jp_lookup_mc_cap_avg

# Baseline: the same buy/sell quantiles taken from a single model on every date.
comparison_preds = current_preds[current_preds["train_file"] == comparison_train_file].copy()

comparison_buys = comparison_preds[comparison_preds["conviction_quantile"].isin(buy_conviction_quantiles)]
comparison_sells = comparison_preds[comparison_preds["conviction_quantile"].isin(sell_conviction_quantiles)]

comparison_returns = _paired_long_short_returns(comparison_buys, comparison_sells)

#comparison_returns = comparison_buys.set_index("date")["trr_5_fwd_ar"] - comparison_sells.set_index("date")["trr_5_fwd_ar"]


if compare_with_all_dates:
    comparison_returns = comparison_returns[comparison_returns.index >= min_date]
    comparison_returns = comparison_returns[comparison_returns.index <= max_date]
else:  
    # Restrict the baseline to the dates the adaptive strategy traded on.
    comparison_returns = comparison_returns[comparison_returns.index.isin(current_pred_returns.index)]

print("Comparison n dates:", comparison_returns.index.nunique())
print("Pred n dates:", current_pred_returns.index.nunique())

print("Comparison investments:", comparison_returns.shape)
print("Pred investments:", current_pred_returns.shape)
print("pct overlap", 100*current_pred_returns.shape[0]/comparison_returns.shape[0])

print("Comparison return:", comparison_returns.mean()*100)
print("Pred return:", current_pred_returns.mean()*100)
print("Rel pred return:", current_pred_returns.mean()*100 - comparison_returns.mean()*100)

print("Comparison std:", comparison_returns.std()*np.sqrt(52))
print("Pred std:", current_pred_returns.std()*np.sqrt(52))

# NOTE(review): the numerator is an un-annualised mean in percent while the
# denominator is a sqrt(52)-scaled std in fractions; confirm the intended units.
print("Comparison sharpe:", (comparison_returns.mean()*100)/(comparison_returns.std()*np.sqrt(52)))
print("Pred sharpe:", (current_pred_returns.mean()*100)/(current_pred_returns.std()*np.sqrt(52)))

# One-sided Welch t-test: is the adaptive mean pair return greater than the baseline's?
print("Ind:", ttest_ind(current_pred_returns.values, comparison_returns.values, equal_var=False, alternative="greater"))


print()
print("Skew:", skew(current_pred_returns.values))
print("Skew test:", skewtest(current_pred_returns.values, alternative="two-sided"))

print("Kurtosis:", kurtosis(current_pred_returns.values))
print("Kurtosis test:", kurtosistest(current_pred_returns.values, alternative="two-sided"))

print("Comparison normal:", normaltest(comparison_returns))
print("Pred normal:", normaltest(current_pred_returns))


In [None]:
# Display names for the training-period date files, keyed by file stem.
train_name_dict = {
    'nber_recession_dates' : "NBER Recession",
    'nber_expansion_dates' : "NBER Expansion",
    'bear_dates_sp500' : "Qualitative Bear",
    'bull_dates_sp500' : "Qualitative Bull",
    'non_bear_dates_sp500' : "Qualitative Non-Bear",
    'return_filter_bear_m_long_3_6_12' : "Negative Filter (LT)",
    # Short-horizon filter: was mislabelled "(LT)", identical to the long-horizon entry.
    'return_filter_bear_m_short_2_3' : "Negative Filter (ST)",
    'return_filter_bull_m_long_3_6_12' : "Positive Filter (LT)",
    'return_filter_bull_m_short_2_3' : "Positive Filter (ST)",
    'EPU_rec_2yr' : "EPU Recession",
    'EPU_exp_2yr' : "EPU Expansion",
}

In [None]:
#Using all predictions, train dates:
# Same paired long-short evaluation as the test-date variant, but the regime
# dates are read from model_train_ready (the training-period date files) and
# the US ensemble predictions are used.


def _paired_long_short_returns(buys, sells, return_col="trr_5_fwd_ar"):
    """Return per-pair long-minus-short returns, indexed by date.

    For every date present in ``buys`` the first k buy rows are paired
    positionally with the first k sell rows of that date, where k is the
    smaller of the two sides' distinct ``gvkey`` counts. Dates without sell
    rows contribute nothing. An empty, date-indexed Series is returned when
    ``buys`` has no dates, so the result never depends on stale kernel state.
    """
    per_date_returns = []
    for date, date_buys in buys.groupby("date"):
        date_sells = sells[sells["date"] == date]
        n_pairs = min(date_buys["gvkey"].nunique(), date_sells["gvkey"].nunique())
        # Both legs carry the same repeated date index of length n_pairs, so the
        # subtraction is element-wise rather than an index join.
        long_leg = date_buys.set_index("date")[return_col].head(n_pairs)
        short_leg = date_sells.set_index("date")[return_col].head(n_pairs)
        per_date_returns.append(long_leg - short_leg)
    if not per_date_returns:
        return pd.Series(dtype="float64", name=return_col, index=pd.DatetimeIndex([], name="date"))
    # One concat at the end instead of growing a Series inside the loop.
    return pd.concat(per_date_returns)


current_preds = all_ensemble_preds_us.copy()

min_date = pd.Timestamp("2003-01-01")
max_date = pd.Timestamp("2023-12-31")

current_region = "us"


# Keep only predictions inside the evaluation window (bounds inclusive).
current_preds = current_preds[current_preds["date"] >= min_date]
current_preds = current_preds[current_preds["date"] <= max_date]

# Mode switches: all_dates takes a single model on every date; adaptive_model
# walks through (train file, date file) pairs in priority order.
all_dates = False
adaptive_model = True

# When True, dates not claimed by any adaptive state fall back to final_train_file.
long_short_every_week = True

compare_with_all_dates = False


current_train_file = "all_dates"
test_date_file_name = ""

buy_conviction_quantiles = [39,38,37,36]
sell_conviction_quantiles = [0,1,2,3]

# current_train_files[i] is the model used on the dates listed in test_date_file_names[i].
current_train_files = [
    #"non_bear_dates_sp500", 
    #"non_bear_dates_sp500", 
    #"all_dates",
    "bear_dates_sp500",
    #"non_bear_dates_sp500",
    #"non_bear_dates_sp500",
    #"non_bear_dates_sp500",
    #"nber_expansion_dates",
    "bull_dates_sp500",
    #"bull_dates_sp500",
    #"bull_dates_sp500"
                      ]

final_train_file = "all_dates"

test_date_file_names = [
    #'nber_recession_dates',
    #'nber_expansion_dates',
    'bear_dates_sp500',
    'bull_dates_sp500',
    #'non_bear_dates_sp500',
    #'return_filter_bear_m_long_3_6_12',
    #'return_filter_bear_m_short_2_3',
    #'return_filter_bull_m_long_3_6_12',
    #'return_filter_bull_m_short_2_3',
    #'EPU_rec_2yr',
    #'EPU_exp_2yr'
]

comparison_train_file = "all_dates"

# Label written to "test_file" for fallback rows, which no date file selected.
fallback_test_file_label = "remaining_dates"


if all_dates:
    current_preds_adaptive = current_preds[current_preds["train_file"] == current_train_files[0]]
    # sort_values returns a new frame; sorting a filtered slice in place triggers
    # chained-assignment warnings.
    current_preds_adaptive_buys = current_preds_adaptive[current_preds_adaptive["conviction_quantile"].isin(buy_conviction_quantiles)].sort_values("date")
    current_preds_adaptive_sells = current_preds_adaptive[current_preds_adaptive["conviction_quantile"].isin(sell_conviction_quantiles)].sort_values("date")


elif adaptive_model:
    for i in range(len(current_train_files)):
        
        test_dates = pd.read_csv(f"../time_periods/model_train_ready/{test_date_file_names[i]}.csv")
        test_dates["date"] = pd.to_datetime(test_dates["date"])
        if i != 0:
            # Earlier states have priority: skip dates that already produced buys.
            test_dates = test_dates[~test_dates["date"].isin(current_preds_adaptive_buys["date"])]
        current_current_preds = current_preds[current_preds["date"].isin(test_dates["date"])]
        print(f"N dates adaptive state {i}:", current_current_preds["date"].nunique())
        # .copy() so the label columns are written to an independent frame, not a slice.
        current_current_preds = current_current_preds[current_current_preds["train_file"] == current_train_files[i]].copy()
        current_current_preds["train_file"] = current_train_files[i]
        current_current_preds["test_file"] = test_date_file_names[i]
        
        current_current_pred_buys = current_current_preds[current_current_preds["conviction_quantile"].isin(buy_conviction_quantiles)]
        current_current_pred_sells = current_current_preds[current_current_preds["conviction_quantile"].isin(sell_conviction_quantiles)]

        if i == 0:
            current_preds_adaptive_buys = current_current_pred_buys
            current_preds_adaptive_sells = current_current_pred_sells
            
        else:
            current_preds_adaptive_buys = pd.concat([current_preds_adaptive_buys, current_current_pred_buys])
            current_preds_adaptive_sells = pd.concat([current_preds_adaptive_sells, current_current_pred_sells])
            
        print(f"Length adaptive state buys {i}:", current_current_pred_buys["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state buys {i}:", current_current_pred_buys["trr_5_fwd_ar"].mean())
        print(f"Length adaptive state sells {i}:", current_current_pred_sells["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state sells {i}:", current_current_pred_sells["trr_5_fwd_ar"].mean())
        
    if long_short_every_week:
        fallback_state = len(current_train_files)
        current_current_preds = current_preds[current_preds["train_file"] == final_train_file]
        current_current_preds = current_current_preds[~current_current_preds["date"].isin(current_preds_adaptive_buys["date"])].copy()
        print(f"N dates adaptive state {fallback_state}:", current_current_preds["date"].nunique())
        # These rows come from final_train_file; previously they were relabelled
        # with the last adaptive state's train/test file, which was incorrect.
        current_current_preds["train_file"] = final_train_file
        current_current_preds["test_file"] = fallback_test_file_label
        
        current_current_pred_buys = current_current_preds[current_current_preds["conviction_quantile"].isin(buy_conviction_quantiles)]
        current_current_pred_sells = current_current_preds[current_current_preds["conviction_quantile"].isin(sell_conviction_quantiles)]


        current_preds_adaptive_buys = pd.concat([current_preds_adaptive_buys, current_current_pred_buys])
        current_preds_adaptive_sells = pd.concat([current_preds_adaptive_sells, current_current_pred_sells])
        
        print(f"Length adaptive state buys {fallback_state}:", current_current_pred_buys["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state buys {fallback_state}:", current_current_pred_buys["trr_5_fwd_ar"].mean())
        print(f"Length adaptive state sells {fallback_state}:", current_current_pred_sells["trr_5_fwd_ar"].shape)
        print(f"Returns adaptive state sells {fallback_state}:", current_current_pred_sells["trr_5_fwd_ar"].mean())
        
        
    current_preds_adaptive_buys = current_preds_adaptive_buys.sort_values("date")
    current_preds_adaptive_sells = current_preds_adaptive_sells.sort_values("date")

else:
    # Without this guard the code below would silently reuse buys/sells left in
    # the kernel by an earlier run.
    raise ValueError("Set all_dates or adaptive_model to True to select predictions.")
    

current_pred_returns = _paired_long_short_returns(current_preds_adaptive_buys, current_preds_adaptive_sells)

# The *_lookup_mc_cap_avg objects are defined in another cell of this notebook.
if current_region == "us":
    current_index = us_lookup_mc_cap_avg
elif current_region == "eu":
    current_index = eu_lookup_mc_cap_avg
elif current_region == "jp":
    current_index = jp_lookup_mc_cap_avg


# Baseline: the same buy/sell quantiles taken from a single model on every date.
comparison_preds = current_preds[current_preds["train_file"] == comparison_train_file].copy()

comparison_buys = comparison_preds[comparison_preds["conviction_quantile"].isin(buy_conviction_quantiles)]
comparison_sells = comparison_preds[comparison_preds["conviction_quantile"].isin(sell_conviction_quantiles)]

comparison_returns = _paired_long_short_returns(comparison_buys, comparison_sells)

if compare_with_all_dates:
    comparison_returns = comparison_returns[comparison_returns.index >= min_date]
    comparison_returns = comparison_returns[comparison_returns.index <= max_date]
else:  
    # Restrict the baseline to the dates the adaptive strategy traded on.
    comparison_returns = comparison_returns[comparison_returns.index.isin(current_pred_returns.index)]

print("Comparison n dates:", comparison_returns.index.nunique())
print("Pred n dates:", current_pred_returns.index.nunique())

print("Comparison investments:", comparison_returns.shape)
print("Pred investments:", current_pred_returns.shape)

print("Comparison return:", comparison_returns.mean()*100)
print("Pred return:", current_pred_returns.mean()*100)
print("Rel pred return:", current_pred_returns.mean()*100 - comparison_returns.mean()*100)

print("Comparison std:", comparison_returns.std()*np.sqrt(52))
print("Pred std:", current_pred_returns.std()*np.sqrt(52))

# NOTE(review): the numerator is an un-annualised mean in percent while the
# denominator is a sqrt(52)-scaled std in fractions; confirm the intended units.
print("Comparison sharpe:", (comparison_returns.mean()*100)/(comparison_returns.std()*np.sqrt(52)))
print("Pred sharpe:", (current_pred_returns.mean()*100)/(current_pred_returns.std()*np.sqrt(52)))

# One-sided Welch t-test: is the adaptive mean pair return greater than the baseline's?
print("Ind:", ttest_ind(current_pred_returns.values, comparison_returns.values, equal_var=False, alternative="greater"))


print()
print("Skew:", skew(current_pred_returns.values))
print("Skew test:", skewtest(current_pred_returns.values, alternative="two-sided"))

print("Kurtosis:", kurtosis(current_pred_returns.values))
print("Kurtosis test:", kurtosistest(current_pred_returns.values, alternative="two-sided"))

print("Comparison normal:", normaltest(comparison_returns))
print("Pred normal:", normaltest(current_pred_returns))


In [None]:
#Using only mean returns on dates
#Predicted
# Works on the per-date digest (one row per date/model/train_file) instead of
# individual predictions; regime dates come from the model_test_ready files.

current_region = "us"

min_date = pd.Timestamp("2003-01-01")
max_date = pd.Timestamp("2023-12-31")

# Narrow the digest to one model, region, quantile band and date window.
current_digest = all_digests_nogroup[all_digests_nogroup["model"] == "all_ensemble"]
current_digest = current_digest[current_digest["region"] == current_region]
current_digest = current_digest[current_digest["min_max_quantile"] == (0.78, 1)]
current_digest = current_digest[current_digest["date"] >= min_date]
current_digest = current_digest[current_digest["date"] <= max_date]

all_dates = True
adaptive_model = True

long_short_every_week = False

compare_with_all_dates = False


current_train_file = "all_dates"
test_date_file_name = ""

feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"

# current_train_files[i] is the model used on the dates listed in test_date_file_names[i].
current_train_files = [
    "non_bear_dates_sp500", 
    "non_bear_dates_sp500", 
    #"all_dates",
    "non_bear_dates_sp500"
                      ]

final_train_file = "non_bear_dates_sp500"

test_date_file_names = [
    "nber_recession_dates_class_lstm_ba4da75c", 
    #"bull_lstm_mc_change_class"
    "return_filter_bear_m_short_2_3", 
    "bear_dates_qbear_class_lstm_9046df4a",
    #"non_bull_dates_bqull_class_lstm_f241ab59"
]

# Baseline model for the comparison; named (rather than hard-coded further
# down) to match the in-sample variant of this cell.
compare_train_file = "non_bear_dates_sp500"



if all_dates:
    current_pred = current_digest[current_digest["train_file"] == current_train_file]

elif adaptive_model:
    for i in range(len(current_train_files)):
        
        test_dates = pd.read_csv(f"../time_periods/model_test_ready/{test_date_file_names[i]}.csv")
        test_dates["date"] = pd.to_datetime(test_dates["date"])
        if i != 0:
            # Earlier states have priority: drop dates that are already covered.
            test_dates = test_dates[~test_dates["date"].isin(current_pred["date"])]
        current_current_digest = current_digest[current_digest["date"].isin(test_dates["date"])]
        # .copy() so the label columns are written to an independent frame, not a slice.
        current_current_digest = current_current_digest[current_current_digest["train_file"] == current_train_files[i]].copy()
        current_current_digest["train_file"] = current_train_files[i]
        current_current_digest["test_file"] = test_date_file_names[i]
        if i == 0:
            current_pred = current_current_digest
        else:
            current_pred = pd.concat([current_pred, current_current_digest])
            
        print(f"Length adaptive state {i}:", current_current_digest[feature].shape)
        print(f"Returns adaptive state {i}:", current_current_digest[feature].mean())
        
    if long_short_every_week:
        # Dates not claimed by any adaptive state fall back to final_train_file.
        current_current_digest = current_digest[current_digest["train_file"] == final_train_file]
        current_current_digest = current_current_digest[~current_current_digest["date"].isin(current_pred["date"])]
        current_pred = pd.concat([current_pred, current_current_digest])
        print(f"Length adaptive state {len(current_train_files)}:", current_current_digest[feature].shape)
        print(f"Returns adaptive state {len(current_train_files)}:", current_current_digest[feature].mean())
        
    # Rebind instead of sorting in place, which could mutate a filtered slice.
    current_pred = current_pred.sort_values("date")
    

    
else:
    # Single test-date file: note that this also narrows current_digest, and
    # therefore the comparison series built below.
    test_dates = pd.read_csv(f"../time_periods/model_test_ready/{test_date_file_name}.csv")
    test_dates["date"] = pd.to_datetime(test_dates["date"])
    current_digest = current_digest[current_digest["date"].isin(test_dates["date"])]

    current_pred = current_digest[current_digest["train_file"] == current_train_file]


current_pred_returns = current_pred.set_index("date")[feature]

# The *_lookup_mc_cap_avg objects are defined in another cell of this notebook.
if current_region == "us":
    current_index = us_lookup_mc_cap_avg
elif current_region == "eu":
    current_index = eu_lookup_mc_cap_avg
elif current_region == "jp":
    current_index = jp_lookup_mc_cap_avg

comparison_returns = current_digest[current_digest["train_file"] == compare_train_file].set_index("date")[feature]

if compare_with_all_dates:
    comparison_returns = comparison_returns[comparison_returns.index >= min_date]
    comparison_returns = comparison_returns[comparison_returns.index <= max_date]
else:  
    # Restrict the baseline to the dates present in the evaluated series.
    comparison_returns = comparison_returns[comparison_returns.index.isin(current_pred_returns.index)]

print("Comparison length:", comparison_returns.shape)
print("Pred length:", current_pred_returns.shape)

print("Comparison return:", comparison_returns.mean()*100)
print("Pred return:", current_pred_returns.mean()*100)
print("Rel pred return:", current_pred_returns.mean()*100 - comparison_returns.mean()*100)

print("Comparison std:", comparison_returns.std())
print("Pred std:", current_pred_returns.std())

print("Comparison normal:", normaltest(comparison_returns))
print("Pred normal:", normaltest(current_pred_returns))

# One-sided Welch t-test: is the evaluated mean return greater than the baseline's?
print("Ind:", ttest_ind(current_pred_returns, comparison_returns, equal_var=False, alternative="greater"))


In [None]:
# Display names for the training-period date files, keyed by file stem.
train_name_dict = {
    'nber_recession_dates' : "NBER Recession",
    'nber_expansion_dates' : "NBER Expansion",
    'bear_dates_sp500' : "Qualitative Bear",
    'bull_dates_sp500' : "Qualitative Bull",
    'non_bear_dates_sp500' : "Qualitative Non-Bear",
    'return_filter_bear_m_long_3_6_12' : "Negative Filter (LT)",
    # Short-horizon filter: was mislabelled "(LT)", identical to the long-horizon entry.
    'return_filter_bear_m_short_2_3' : "Negative Filter (ST)",
    'return_filter_bull_m_long_3_6_12' : "Positive Filter (LT)",
    'return_filter_bull_m_short_2_3' : "Positive Filter (ST)",
    'EPU_rec_2yr' : "EPU Recession",
    'EPU_exp_2yr' : "EPU Expansion",
}

In [None]:
#Using only mean returns on dates
#In-sample classification
# Works on the per-date digest and reads regime dates from model_train_ready
# (the training-period date files), then compares one model against another
# on those dates.

current_region = "jp"

min_date = pd.Timestamp("2003-01-01")
max_date = pd.Timestamp("2023-12-31")

# Narrow the digest to one model, region, quantile band and date window.
current_digest = all_digests_nogroup[all_digests_nogroup["model"] == "all_ensemble"]
current_digest = current_digest[current_digest["region"] == current_region]
current_digest = current_digest[current_digest["min_max_quantile"] == (0.78, 1)]
current_digest = current_digest[current_digest["date"] >= min_date]
current_digest = current_digest[current_digest["date"] <= max_date]

all_dates = False
adaptive_model = True

long_short_every_week = False

compare_with_all_dates = False


current_train_file = "all_dates"
test_date_file_name = ""

feature = "top_minus_bottom_4_trr_5_fwd_ar_mean"

# current_train_files[i] is the model used on the dates listed in test_date_file_names[i].
current_train_files = [
    #'all_dates',
    #'nber_recession_dates',
    #'nber_expansion_dates',
    #'bear_dates_sp500',
    'bull_dates_sp500',
    #'non_bear_dates_sp500',
    #'return_filter_bear_m_long_3_6_12',
    #'return_filter_bear_m_short_2_3',
    #'return_filter_bull_m_long_3_6_12',
    #'return_filter_bull_m_short_2_3',
    #'EPU_rec_2yr',
    #'EPU_exp_2yr'
                      ]

final_train_file = "non_bear_dates_sp500"

test_date_file_names = [
    #'nber_recession_dates',
    'nber_expansion_dates',
    #'bear_dates_sp500',
    #'bull_dates_sp500',
    #'non_bear_dates_sp500',
    #'return_filter_bear_m_long_3_6_12',
    #'return_filter_bear_m_short_2_3',
    #'return_filter_bull_m_long_3_6_12',
    #'return_filter_bull_m_short_2_3',
    #'EPU_rec_2yr',
    #'EPU_exp_2yr'
]

# Baseline model whose returns are compared on the same dates.
compare_train_file = 'bear_dates_sp500'



if all_dates:
    current_pred = current_digest[current_digest["train_file"] == current_train_files[0]]

elif adaptive_model:
    for i in range(len(current_train_files)):
        
        test_dates = pd.read_csv(f"../time_periods/model_train_ready/{test_date_file_names[i]}.csv")
        test_dates["date"] = pd.to_datetime(test_dates["date"])
        if i != 0:
            # Earlier states have priority: drop dates that are already covered.
            test_dates = test_dates[~test_dates["date"].isin(current_pred["date"])]
        current_current_digest = current_digest[current_digest["date"].isin(test_dates["date"])]
        # .copy() so the label columns are written to an independent frame, not a slice.
        current_current_digest = current_current_digest[current_current_digest["train_file"] == current_train_files[i]].copy()
        current_current_digest["train_file"] = current_train_files[i]
        current_current_digest["test_file"] = test_date_file_names[i]
        if i == 0:
            current_pred = current_current_digest
        else:
            current_pred = pd.concat([current_pred, current_current_digest])
            
        print(f"Length adaptive state {i}:", current_current_digest[feature].shape)
        print(f"Returns adaptive state {i}:", current_current_digest[feature].mean())
        
    if long_short_every_week:
        # Dates not claimed by any adaptive state fall back to final_train_file.
        current_current_digest = current_digest[current_digest["train_file"] == final_train_file]
        current_current_digest = current_current_digest[~current_current_digest["date"].isin(current_pred["date"])]
        current_pred = pd.concat([current_pred, current_current_digest])
        print(f"Length adaptive state {len(current_train_files)}:", current_current_digest[feature].shape)
        print(f"Returns adaptive state {len(current_train_files)}:", current_current_digest[feature].mean())
        
    # Rebind instead of sorting in place, which could mutate a filtered slice.
    current_pred = current_pred.sort_values("date")
    

    
else:
    # Single test-date file: note that this also narrows current_digest, and
    # therefore the comparison series built below.
    test_dates = pd.read_csv(f"../time_periods/model_test_ready/{test_date_file_name}.csv")
    test_dates["date"] = pd.to_datetime(test_dates["date"])
    current_digest = current_digest[current_digest["date"].isin(test_dates["date"])]

    current_pred = current_digest[current_digest["train_file"] == current_train_file]


current_pred_returns = current_pred.set_index("date")[feature]

# The *_lookup_mc_cap_avg objects are defined in another cell of this notebook.
if current_region == "us":
    current_index = us_lookup_mc_cap_avg
elif current_region == "eu":
    current_index = eu_lookup_mc_cap_avg
elif current_region == "jp":
    current_index = jp_lookup_mc_cap_avg

comparison_returns = current_digest[current_digest["train_file"] == compare_train_file].set_index("date")[feature]

if compare_with_all_dates:
    comparison_returns = comparison_returns[comparison_returns.index >= min_date]
    comparison_returns = comparison_returns[comparison_returns.index <= max_date]
else:  
    # Restrict the baseline to the dates present in the evaluated series.
    comparison_returns = comparison_returns[comparison_returns.index.isin(current_pred_returns.index)]

print("Comparison length:", comparison_returns.shape)
print("Pred length:", current_pred_returns.shape)

#print("Comparison return:", comparison_returns.mean()*100)
#print("Pred return:", current_pred_returns.mean()*100)
print("Rel pred return:", current_pred_returns.mean()*100 - comparison_returns.mean()*100)

#print("Comparison std:", comparison_returns.std())
#print("Pred std:", current_pred_returns.std())

#print("Comparison normal:", normaltest(comparison_returns))
#print("Pred normal:", normaltest(current_pred_returns))

# One-sided Welch t-test: is the evaluated mean return greater than the baseline's?
print("Ind:", ttest_ind(current_pred_returns, comparison_returns, equal_var=False, alternative="greater"))

### Test indicators

In [None]:
# Raw US indicator table: parse dates, then discard whatever index it was stored with.
indicators = (
    pd.read_parquet("../data/indicators/US/all_indicators_raw_outer.parquet", engine="pyarrow")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .reset_index(drop=True)
)

# us_top_500 table, with its date column parsed the same way.
us_top_500 = pd.read_parquet("../data/indicators/US/us_top_500.parquet", engine="pyarrow").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# Outer-join on date so rows present in only one table are kept; index the result by date.
data = indicators.merge(us_top_500, on=["date"], how="outer").set_index("date")

In [None]:
# Period-over-period relative change of each macro series, computed on the
# non-missing observations only and aligned back onto data's date index.
for macro_col in ("unemployment", "initial_claims"):
    observed = data[macro_col].dropna()
    data[f"{macro_col}_change"] = observed.pct_change()

# Relative change of the last market_cap_usd value in each week ending Friday.
weekly_market_cap = data["market_cap_usd"].resample("W-FRI").last()
data["trr_w_fri"] = weekly_market_cap.pct_change()

In [None]:
# Display names for the out-of-sample (test) regime date files, keyed by file stem.
test_file_name_dict = {
    # LSTM regime classifiers
    "nber_recession_dates_class_lstm_ba4da75c": "Recession Class LSTM",
    "nber_non_recession_dates_class_lstm_ba4da75c": "Non-Recession Class LSTM",
    "bear_dates_qbear_class_lstm_9046df4a": "Bear Class LSTM",
    "bull_dates_qbull_class_lstm_f241ab59": "Bull Class LSTM",
    "non_bear_dates_qbear_class_lstm_9046df4a": "Non-Bear Class LSTM",
    "non_bull_dates_bqull_class_lstm_f241ab59": "Non-Bull Class LSTM",
    # Markov models
    "markov_rec_dates_test_all_years_order1_4_10_5yr_avg": "Markov Recession",
    "markov_exp_dates_test_all_years_order1_4_10_5yr_avg": "Markov Expansion",
    # Positive return filters
    "return_filter_bull_m_long_3_6_12": "Positive Filter (LT)",
    "return_filter_bull_m_short_2_3": "Positive Filter (ST)",
    # Market-cap change LSTMs
    "bear_lstm_mc_change_class": "Negative MC Change LSTM",
    "bull_lstm_mc_change_class": "Positive MC Change LSTM",
    # Negative return filters
    "return_filter_bear_m_long_3_6_12": "Negative Filter (LT)",
    "return_filter_bear_m_short_2_3": "Negative Filter (ST)",
}

In [None]:
# Test whether a series (macro feature or regional index) is lower on the
# regime-flagged dates than over the whole window.

current_region = "us"

test_macros = True

test_train_files = False

min_date = pd.Timestamp("1980-01-01")
max_date = pd.Timestamp("2023-12-31")

feature = "trr_w_fri"



test_date_file_names = [
    #"nber_recession_dates_class_lstm_ba4da75c", 
    #"markov_rec_dates_test_all_years_order1_4_10_5yr_avg",
    #"bull_lstm_mc_change_class"
    #"return_filter_bear_m_short_2_3", 
    "bear_dates_qbear_class_lstm_9046df4a"
]

# NOTE: this replaces the hand-picked list above with every known test file.
test_date_file_names = list(test_file_name_dict.keys())

train_date_file_names = [
    "nber_recession_dates", 
]

# Choose which directory and which file list supply the flagged dates.
if test_train_files:
    dates_dir = "../time_periods/model_train_ready"
    date_file_names = train_date_file_names
else:
    dates_dir = "../time_periods/model_test_ready"
    date_file_names = test_date_file_names

# Union of the flagged dates; each file only contributes dates not seen in earlier files.
for i, date_file_name in enumerate(date_file_names):
    current_test_dates = pd.read_csv(f"{dates_dir}/{date_file_name}.csv")
    current_test_dates["date"] = pd.to_datetime(current_test_dates["date"])

    if i == 0:
        test_dates = current_test_dates
    else:
        already_seen = current_test_dates["date"].isin(test_dates["date"])
        current_test_dates = current_test_dates[~already_seen]
        test_dates = pd.concat([test_dates, current_test_dates])

    print(f"Length test file {i}:", current_test_dates.shape)


# Series under test: a column of `data`, or a regional lookup defined in another cell.
if test_macros:
    current_index = data[feature].dropna()
elif current_region == "us":
    current_index = us_lookup_mc_cap_avg
elif current_region == "eu":
    current_index = eu_lookup_mc_cap_avg
elif current_region == "jp":
    current_index = jp_lookup_mc_cap_avg

# Clip to the analysis window (bounds inclusive).
in_window = (current_index.index >= min_date) & (current_index.index <= max_date)
current_index = current_index[in_window]

on_flagged_dates = current_index.index.isin(test_dates["date"])
current_chosen_values = current_index[on_flagged_dates].copy()

comparison_values = current_index.copy()

#print("Comparison length:", comparison_values.shape)
#print("Pred length:", current_chosen_values.shape)

#print("Comparison mean value:", comparison_values.mean()*100)
#print("Pred mean value:", current_chosen_values.mean()*100)
#print("Rel mean value:", current_chosen_values.mean()*100 - comparison_values.mean()*100)

#print("Comparison std:", comparison_values.std())
#print("Pred std:", current_chosen_values.std())

#print("Comparison normal:", normaltest(comparison_values))
#print("Pred normal:", normaltest(current_chosen_values))


# One-sided Welch t-test: flagged-date mean less than the full-window mean.
print("Ind:", ttest_ind(current_chosen_values, comparison_values, equal_var=False, alternative = "less"))

In [None]:
# Maps the base name of each train-period date file (the CSV name without its
# extension) to the human-readable label printed in the t-test report.
# "(LT)" / "(ST)" follow the long / short variant named in the key.
train_name_dict = {
    'nber_recession_dates' : "NBER Recession",
    'nber_expansion_dates' : "NBER Expansion",
    'bear_dates_sp500' : "Qualitative Bear",
    'bull_dates_sp500' : "Qualitative Bull",
    'non_bear_dates_sp500' : "Qualitative Non-Bear",
    'return_filter_bear_m_long_3_6_12' : "Negative Filter (LT)",
    # Was "(LT)", which made the short filter indistinguishable from the long one.
    'return_filter_bear_m_short_2_3' : "Negative Filter (ST)",
    'return_filter_bull_m_long_3_6_12' : "Positive Filter (LT)",
    'return_filter_bull_m_short_2_3' : "Positive Filter (ST)",
    'EPU_rec_2yr' : "EPU Recession",
    'EPU_exp_2yr' : "EPU Expansion",
}

In [None]:
# Direction tag ("negative" or "positive") for each train-period date file.
# The t-test report cell reads this tag to decide which one-sided alternative
# ("less" or "greater") to test for that period.
train_name_alternative_dict = dict([
    ("nber_recession_dates", "negative"),
    ("nber_expansion_dates", "positive"),
    ("bear_dates_sp500", "negative"),
    ("bull_dates_sp500", "positive"),
    ("non_bear_dates_sp500", "positive"),
    ("return_filter_bear_m_long_3_6_12", "negative"),
    ("return_filter_bear_m_short_2_3", "negative"),
    ("return_filter_bull_m_long_3_6_12", "positive"),
    ("return_filter_bull_m_short_2_3", "positive"),
    ("EPU_rec_2yr", "negative"),
    ("EPU_exp_2yr", "positive"),
])

In [None]:
# Direction tag ("negative" or "positive") keyed by test-period date file name.
# Presumably used like the train-period direction mapping to pick the one-sided
# t-test alternative; confirm against the cells that read it.
# NOTE(review): "bqull" in 'non_bull_dates_bqull_class_lstm_f241ab59' looks like
# a misspelling of "qbull"; the key is kept as-is in case it matches the actual
# file name on disk. Verify before renaming.
test_file_name_alternative_dict = dict([
    ("nber_recession_dates_class_lstm_ba4da75c", "negative"),
    ("nber_non_recession_dates_class_lstm_ba4da75c", "positive"),
    ("bear_dates_qbear_class_lstm_9046df4a", "negative"),
    ("bull_dates_qbull_class_lstm_f241ab59", "positive"),
    ("non_bear_dates_qbear_class_lstm_9046df4a", "positive"),
    ("non_bull_dates_bqull_class_lstm_f241ab59", "negative"),
    ("markov_rec_dates_test_all_years_order1_4_10_5yr_avg", "negative"),
    ("markov_exp_dates_test_all_years_order1_4_10_5yr_avg", "positive"),
    ("return_filter_bull_m_long_3_6_12", "positive"),
    ("return_filter_bull_m_short_2_3", "positive"),
    ("bear_lstm_mc_change_class", "negative"),
    ("bull_lstm_mc_change_class", "positive"),
    ("return_filter_bear_m_long_3_6_12", "negative"),
    ("return_filter_bear_m_short_2_3", "negative"),
])

In [None]:
#Print one-sided Welch t-test results for every train-period indicator

current_region = "us"

test_macros = True

min_date = pd.Timestamp("1967-01-01")
max_date = pd.Timestamp("2019-12-31")

feature = "trr_w_fri"

# When True, "negative" periods are tested for a lower mean than the full
# sample and "positive" periods for a higher one; False swaps the two tails.
negative_is_less = True

test_date_file_names = list(train_name_dict.keys())


# The tested series does not depend on the indicator, so select and window it
# once here instead of rebuilding it on every loop iteration.
if test_macros:
    current_index = data[feature].dropna()
elif current_region == "us":
    current_index = us_lookup_mc_cap_avg
elif current_region == "eu":
    current_index = eu_lookup_mc_cap_avg
elif current_region == "jp":
    current_index = jp_lookup_mc_cap_avg
else:
    # Without this guard an unknown region would fall through and reuse whatever
    # `current_index` an earlier cell run left behind (or fail with a NameError).
    raise ValueError(f"Unknown current_region: {current_region!r}")

# Restrict the series to [min_date, max_date]; both bounds are inclusive.
in_window = (current_index.index >= min_date) & (current_index.index <= max_date)
current_index = current_index[in_window]

# Every indicator is compared against the whole windowed series, which
# includes the indicator's own observations.
comparison_values = current_index.copy()

print(feature)
print(min_date, max_date)
print("Comparison length:", comparison_values.shape)
print("Comparison mean value:", comparison_values.mean()*100)
print()

for i, date_file_name in enumerate(test_date_file_names):

    current_test_dates = pd.read_csv(f"../time_periods/model_train_ready/{date_file_name}.csv")
    current_test_dates["date"] = pd.to_datetime(current_test_dates["date"])

    # Observations of the series that fall on this indicator's dates.
    current_chosen_values = current_index[current_index.index.isin(current_test_dates["date"])]

    print(train_name_dict[date_file_name])
    print("Pred % chosen:", 100*current_chosen_values.shape[0]/comparison_values.shape[0])

    print("Rel mean value:", current_chosen_values.mean()*100 - comparison_values.mean()*100)

    # "less" when the period's direction tag and the negative_is_less switch
    # agree (negative & True, or positive & False); "greater" otherwise.
    is_negative_period = train_name_alternative_dict[date_file_name] == "negative"
    alternative = "less" if is_negative_period == bool(negative_is_less) else "greater"

    print(alternative)

    # Run the test once and report both fields from the same result.
    test_result = ttest_ind(current_chosen_values, comparison_values, equal_var=False, alternative = alternative)
    print("T-value:", test_result.statistic)
    print("P-value:", test_result.pvalue)
    print()