In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
import sys

# Show up to 100 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 100

In [None]:
# Name of the target variable for this analysis.
# NOTE(review): not referenced by any of the visible cells — confirm whether a later cell uses it.
target_variable = "trr_5"

In [None]:
# Move into the results directory, two levels above the directory the notebook
# was started in. The start directory is remembered on first execution so that
# re-running this cell is idempotent instead of climbing two more levels each time.
if "_notebook_start_dir" not in globals():
    _notebook_start_dir = os.getcwd()
os.chdir(os.path.join(_notebook_start_dir, "../../results"))

In [None]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

In [None]:
def filter_market_caps(results_df, min_market_cap_percentile = 0.6, max_market_cap_percentile = None):
    """Keep, for every date, only the rows whose market cap lies in a percentile band.

    The percentile thresholds are computed per ``date`` on the unfiltered input,
    so the upper bound is not affected by the rows removed by the lower bound.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``date``, ``gvkey`` and ``market_cap_usd``.
        The input frame is not modified.
    min_market_cap_percentile : float
        Per-date quantile (0..1) of ``market_cap_usd``; rows below it are dropped.
    max_market_cap_percentile : float or None
        Optional per-date quantile; rows above it are dropped. ``None`` disables
        the upper bound.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, sorted by ``date`` then ``gvkey``.
        Rows with a missing date or a missing market cap never pass the filter.
    """
    caps_by_date = results_df.groupby("date")["market_cap_usd"]
    market_caps = results_df["market_cap_usd"]

    # Broadcast each date's threshold back onto the rows of that date. This avoids
    # groupby.apply on the full frame, whose handling of the grouping column
    # changed across pandas versions (the "date" column is no longer passed through).
    lower_bounds = results_df["date"].map(caps_by_date.quantile(min_market_cap_percentile))
    keep = market_caps >= lower_bounds

    if max_market_cap_percentile is not None:
        upper_bounds = results_df["date"].map(caps_by_date.quantile(max_market_cap_percentile))
        keep &= market_caps <= upper_bounds

    # Group rows by date (stable, so the within-date input order is kept) and
    # renumber them before the final sort; this reproduces the index labels the
    # previous groupby/apply/reset_index implementation produced.
    filtered = (
        results_df[keep]
        .sort_values("date", kind="mergesort")
        .reset_index(drop=True)
    )
    return filtered.sort_values(["date", "gvkey"])

In [None]:
def add_quantiles(results_df, quantiles=10):
    """Add per-date quantile bucket columns for the model scores.

    For every ``date`` the rows are bucketed with ``pd.qcut`` (integer bucket
    codes, duplicate bin edges dropped) and three columns are added:

    * ``conviction_quantile`` – buckets of ``conviction``
    * ``top_quantile``        – buckets of ``pred_2``
    * ``bottom_quantile``     – buckets of ``pred_0``

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain ``date``, ``conviction``, ``pred_2`` and ``pred_0``.
        The input frame is not modified.
    quantiles : int
        Number of quantile buckets requested from ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by date group (within-date input order kept) with a
        fresh 0..n-1 index. Rows with a missing date are dropped, as groupby
        does not form a group for them.
    """
    quantile_sources = {
        "conviction_quantile": "conviction",
        "top_quantile": "pred_2",
        "bottom_quantile": "pred_0",
    }

    # Iterate the groups explicitly instead of groupby.apply: iteration always
    # yields the full sub-frame including the "date" column, whereas apply's
    # treatment of the grouping column differs between pandas versions.
    per_date_frames = []
    for _, date_group in results_df.groupby("date"):
        date_group = date_group.copy()
        for quantile_col, score_col in quantile_sources.items():
            date_group[quantile_col] = pd.qcut(
                date_group[score_col], quantiles, labels=False, duplicates="drop"
            )
        per_date_frames.append(date_group)

    if not per_date_frames:
        # Nothing to bucket: return an empty frame that still has the new columns.
        empty = results_df.iloc[0:0].copy().reset_index(drop=True)
        for quantile_col in quantile_sources:
            empty[quantile_col] = float("nan")
        return empty

    return pd.concat(per_date_frames, ignore_index=True)

In [None]:
def set_time_period(results_df, first_date, last_date):
    """Return a copy of the rows dated strictly between the two bounds.

    Both bounds are exclusive: rows whose ``date`` equals ``first_date`` or
    ``last_date`` are dropped. The bounds may be anything ``pd.Timestamp``
    accepts. The input frame is left untouched.
    """
    window_start = pd.Timestamp(first_date)
    window_end = pd.Timestamp(last_date)

    dates = results_df["date"]
    inside_window = (dates > window_start) & (dates < window_end)

    return results_df.loc[inside_window].copy()

In [None]:
def prepare_results(df, exchange_codes = None, currencies = None, quantiles = 20, 
                    min_date = "2020-01-01", max_date="2023-12-31", n_gvkeys = 500, svm=False,
                    min_market_cap_percentile = 0.6,
                   use_percentile_cap = False, min_volume_usd_5 = 1000, lower_rank = None, max_market_cap_percentile=None):
    """Filter a prediction frame to the investable universe and add quantile buckets.

    Steps, in order:

    1. Add ``conviction`` if it is missing (``pred_1`` when ``svm`` is true,
       otherwise ``pred_2 - pred_0``).
    2. Optionally restrict to the given ``exchange_codes`` / ``currencies``.
    3. Add ``trr_5_fwd_ar = exp(trr_5_fwd) - 1``.
    4. Select the universe:
       * ``use_percentile_cap=True``: per-date market-cap percentile band via
         ``filter_market_caps``; ``min_volume_usd_5``, ``n_gvkeys`` and
         ``lower_rank`` are not applied in this mode.
       * otherwise: drop rows with ``volume_usd_5 < min_volume_usd_5`` and keep
         market-cap ranks in ``[lower_rank, n_gvkeys]``. The rank is computed
         per date from ``market_cap_usd`` when that column exists, else an
         existing ``market_cap_rank`` column is used. If neither column exists
         a message is printed and no universe filter is applied.
    5. Add per-date quantile columns via ``add_quantiles``.
    6. Keep rows dated strictly between ``min_date`` and ``max_date`` via
       ``set_time_period``.

    The input frame is not modified; a new frame is returned.
    """
    df = df.copy()

    if "conviction" not in df.columns:
        print("conviction not in columns")
        if svm:
            df["conviction"] = df["pred_1"]
        else:
            df["conviction"] = df["pred_2"] - df["pred_0"]

    # `is not None` rather than `!= None`: the latter is evaluated element-wise
    # for array-like arguments and then fails inside the `if`.
    if exchange_codes is not None:
        df = df[df["exchange_code"].isin(exchange_codes)]
    if currencies is not None:
        df = df[df["currency"].isin(currencies)]

    # assign() builds a new frame, so no SettingWithCopyWarning is raised when
    # df is a filtered slice at this point.
    df = df.assign(trr_5_fwd_ar=np.exp(df["trr_5_fwd"]) - 1)

    if use_percentile_cap:
        df = filter_market_caps(df, min_market_cap_percentile, max_market_cap_percentile)
    elif "market_cap_usd" in df.columns:
        df = df[df["volume_usd_5"] >= min_volume_usd_5]
        # Rank 1 = largest market cap on that date; ties are broken by row order.
        # NOTE(review): astype(int) raises if market_cap_usd contains NaN.
        ranks = df.groupby("date")["market_cap_usd"].rank(ascending=False, method="first").astype(int)
        df = df.assign(market_cap_rank=ranks)
        df = df[df["market_cap_rank"] <= n_gvkeys]
        if lower_rank is not None:
            df = df[df["market_cap_rank"] >= lower_rank]
    elif "market_cap_rank" in df.columns:
        df = df[df["volume_usd_5"] >= min_volume_usd_5]
        df = df[df["market_cap_rank"] <= n_gvkeys]
        # Honour lower_rank here as well; it used to be silently ignored when
        # only a precomputed rank column was available.
        if lower_rank is not None:
            df = df[df["market_cap_rank"] >= lower_rank]
    else:
        print("No market cap or rank in df")

    df = add_quantiles(df, quantiles=quantiles)
    df = set_time_period(df, min_date, max_date)

    return df

In [None]:
# Names of all entries in the current working directory (the results folder
# entered by the chdir cell). The ensemble cell further down reassigns
# result_dirs from os.listdir("results") before using it.
result_dirs = os.listdir()

In [None]:
# Load the US lookup table, index it by (date, gvkey) and drop the descriptive
# columns that are not needed here.
us_lookup = (
    pd.read_parquet("../data/lookup/us_lookup.parquet", engine="pyarrow")
    .assign(date=lambda lookup: pd.to_datetime(lookup["date"]))
    .set_index(["date", "gvkey"])
    .drop(columns=["currency", "country_hq", "exchange_code", "company_name"])
)

In [None]:
# Columns to keep in a result frame. Not used between this cell and the
# ensemble cell further down, which reassigns result_cols to a shorter list
# before selecting columns with it.
result_cols = ['date', 'gvkey', 'company_name', 'currency', 'exchange_code',
       'trr_5_fwd', 'trr_5_fwd_class', 'pred_0', 'pred_1', 'pred_2',
       'pred_class', 'market_cap_rank', 'train_file', 'split_year', 'gsector',
       'ggroup', 'gind', 'gsubind', 'market_cap_usd', 'trr_5', 'volume_usd_5', 'volatility_5',
       'price_close_usd']

In [None]:
# Ensemble model, all predictions
#
# For every region, test-split year and training regime ("train file"):
#   1. read the per-model result files whose directory name matches,
#   2. average the class probabilities of all models per (date, gvkey),
#   3. join the region's lookup table, clip extreme forward returns,
#   4. run prepare_results (market-cap percentile universe + quantile buckets),
# then write one digest parquet per region.
#
# NOTE(review): the paths below ("results", "data/lookup/...") are relative to a
# different working directory than the earlier cell that reads
# "../data/lookup/us_lookup.parquet" — confirm the intended cwd before running.

# ---------------------------------------------------------------- configuration
models = ["catboost","xgb","logreg","rf"]
regions = ["US", "Europe", "Japan"]

no_conviction = False

if no_conviction:
    top_quantile_name = "top_quantile"
    bottom_quantile_name = "bottom_quantile"
    
else:
    top_quantile_name = "conviction_quantile"
    bottom_quantile_name = "conviction_quantile"

n_quantiles = 40

mean_features = ["trr_5_fwd_ar", "volume_usd_5", "volatility_5"]

# Bounds on trr_5_fwd, which prepare_results converts with exp(x) - 1.
# Set a bound to None to disable it.
max_trr_5_fwd = float(0.3)   # arithmetic = exp(0.3) - 1, about +35%
min_trr_5_fwd = float(-0.3)  # arithmetic = exp(-0.3) - 1, about -26%

only_first_5_test_years = True

keep_uuids = False

years = [
    #1980, 1985, 1990, 1995, 
    2000, 2005, 2010, 2015, 2020]

result_dirs = os.listdir("results")


def _load_lookup(path):
    """Read a lookup parquet file and index it by (date, gvkey)."""
    lookup = pd.read_parquet(path, engine="pyarrow")
    lookup["date"] = pd.to_datetime(lookup["date"])
    return lookup.set_index(["date", "gvkey"])


us_lookup = _load_lookup("data/lookup/us_lookup.parquet")
eu_lookup = _load_lookup("data/lookup/eu_lookup.parquet")
jp_lookup = _load_lookup("data/lookup/jp_lookup.parquet")

region_lookups = {"US": us_lookup, "Europe": eu_lookup, "Japan": jp_lookup}

result_cols = ['date', 'gvkey', 'exchange_code',
       'trr_5_fwd', 'trr_5_fwd_class', 'pred_0', 'pred_1', 'pred_2',
        'train_file', 'split_year', 'gsector',
       'ggroup', 'gind', 'gsubind', 'market_cap_usd', 'trr_5', 'volume_usd_5', 'volatility_5']

train_files = ['return_filter_bear_m_short_2_3', 'bull_dates_sp500', 'markov_rec',
           'flat_dates_sp500', 'nber_recession_dates',
           'return_filter_bull_m_short_2_3', 'non_bear_dates_sp500',
           'nber_expansion_dates', 'bear_dates_sp500',
           'return_filter_bull_m_long_3_6_', 'all_dates',
           'return_filter_bear_m_long_3_6_', 'markov_exp', 
            'EPU_exp_2yr','EPU_rec_2yr']

mean_columns = ["pred_0", "pred_1", "pred_2"]

# ------------------------------------------------------------------- main loop
# Frames are collected in lists that are reset at the start of each loop level
# and concatenated once. This avoids quadratic concat-in-a-loop and, more
# importantly, prevents results of a previous train file / year / region from
# leaking into the current one when some inputs are missing.
for region in regions:
    region_year_frames = []

    for year in years:
        # Europe and Japan are only processed from the 2005 split onwards.
        if year < 2005 and region in ("Europe", "Japan"):
            continue
        print("year:", year)

        split_start = pd.Timestamp(f"{year}-01-01")
        split_end = split_start + pd.DateOffset(years=5)
        min_date = f"{year}-01-01"

        train_file_frames = []

        for train_file in train_files:
            sys.stdout.write(f"Processing {region} for {train_file} in {year}\n")
            models_found = 0
            model_frames = []

            for model_name in models:
                for directory in result_dirs:
                    if model_name + "_" + region in directory and "regime_feature" not in directory and f"test_split_{year}" in directory and train_file in directory:
                        # "bear_dates_sp500" is a substring of "non_bear_dates_sp500";
                        # skip those directories when the plain bear regime is wanted.
                        if train_file == "bear_dates_sp500" and "non_bear_dates_sp500" in directory:
                            continue
                        sys.stdout.write(f"Found {model_name} for {region} for {train_file} in {year}\n")
                        models_found += 1
                        sys.stdout.write("Models found: " + str(models_found) + "\n")
                        current_single_model_results = pd.read_parquet(f"results/{directory}/results.parquet", engine="pyarrow")
                        current_single_model_results["date"] = pd.to_datetime(current_single_model_results["date"])
                        current_single_model_results = current_single_model_results[current_single_model_results["date"] <= split_end]
                        model_frames.append(current_single_model_results)

            if models_found != len(models):
                sys.stdout.write(f"Not enough models found for {region} for {train_file} in {year}\n")
                continue

            # Ensemble = plain mean of the model outputs per (date, gvkey).
            ensemble_results = pd.concat(model_frames)
            current_results = ensemble_results.groupby(["date", "gvkey"])[mean_columns + ["trr_5_fwd", "trr_5_fwd_class"]].mean().reset_index()

            current_results["train_file"] = train_file
            current_results["split_year"] = year

            if only_first_5_test_years:
                current_results = current_results[current_results["date"] < split_end]

            current_results = current_results[current_results["date"] >= split_start]

            # Inner join with the region lookup; on a column-name clash the
            # lookup's column keeps the plain name and the ensemble's gets "_x".
            region_lookup = region_lookups.get(region)
            if region_lookup is not None:
                current_results = current_results.set_index(["date", "gvkey"]).merge(region_lookup, left_index=True, right_index=True, suffixes=("_x", "")).reset_index()

            # `is not None` so that a bound of 0.0 is applied rather than skipped.
            if max_trr_5_fwd is not None:
                current_results = current_results[current_results["trr_5_fwd"] <= max_trr_5_fwd]

            if min_trr_5_fwd is not None:
                current_results = current_results[current_results["trr_5_fwd"] >= min_trr_5_fwd]

            current_results = current_results[result_cols]

            current_results = prepare_results(current_results, quantiles=n_quantiles, min_date = min_date, 
                                                use_percentile_cap=True, min_market_cap_percentile = 0.78, 
                                                max_market_cap_percentile=1.00)

            current_results["train_file"] = train_file
            train_file_frames.append(current_results)

        if not train_file_frames:
            # No train file had a complete set of models for this split year.
            sys.stdout.write(f"No complete ensemble found for {region} in {year}, skipping year\n")
            continue

        all_train_results = pd.concat(train_file_frames)
        all_train_results["split_year"] = year
        region_year_frames.append(all_train_results)

    if not region_year_frames:
        sys.stdout.write(f"No results for {region}, nothing written\n")
        continue

    all_year_results = pd.concat(region_year_frames)

    # NOTE(review): `models` is a list, so its repr (brackets, quotes, spaces)
    # ends up in the file name, and no separator precedes the threshold
    # suffixes. Kept unchanged because other notebooks may read these paths.
    model_name_and_region = f"{models}_ensemble_{region}"

    if max_trr_5_fwd is not None:
        model_name_and_region += f"max_trr_5_fwd_ar_{max_trr_5_fwd}"

    if min_trr_5_fwd is not None:
        model_name_and_region += f"min_trr_5_fwd_ar_{min_trr_5_fwd}"

    all_year_results["model"] = model_name_and_region
    all_year_results.to_parquet(f"results/digests/{model_name_and_region}_all_preds_078_after_2003.parquet")