### Data Loading

In [2]:
import pandas as pd
import portfolio_utils as pfu
import itertools
import json
import time
import os

In [3]:
# --- Extractions -------------------------------------------------------------
# Raw LLM extraction results, one row per video.
extractions_df = pd.read_csv("../data/matched/VIDEOS_inf_llama3_ft_v4_q8_0_llamacpp_guided.csv", sep=";")

# Video metadata supplies the upload date and the channel each video belongs to;
# the uploader id is exposed under the name "channel_id".
metadata_df = (
    pd.read_csv("../data/yt_metadata/video_metadata.csv", sep=";")
    [["video_id", "upload_date", "uploader_id"]]
    .rename(columns={"uploader_id": "channel_id"})
)

# Attach the metadata, keep only the columns handed to the portfolio builder and
# expose the selected trade-info variant under the generic name "trade_info".
trade_info_col = "trade_info_no_neutrals"
extractions_df = (
    extractions_df
    .merge(metadata_df, on="video_id")
    [["video_id", "upload_date", "channel_id", trade_info_col]]
    .rename(columns={trade_info_col: "trade_info"})
)

# --- Returns -----------------------------------------------------------------
# One returns table per asset type.
returns_path = "../data/asset_data/returns"
returns_dfs = {
    "stock": pd.read_csv(f"{returns_path}/stocks_returns.csv", sep=";"),
    "etf": pd.read_csv(f"{returns_path}/etfs_returns.csv", sep=";"),
    "crypto": pd.read_csv(f"{returns_path}/cryptos_returns.csv", sep=";"),
    "commodity": pd.read_csv(f"{returns_path}/commodities_returns.csv", sep=";"),
    "benchmark": pd.read_csv(f"{returns_path}/benchmarks_returns.csv", sep=";"),
}

# Index every table by date and prefix each column with its asset type, so that
# column names stay distinguishable once the tables are placed side by side.
colname_sep = "+"  # separator between asset type and ticker in column names
for asset_type in list(returns_dfs):
    returns_dfs[asset_type] = (
        returns_dfs[asset_type]
        .set_index("date")
        .rename(columns=lambda col: f"{asset_type}{colname_sep}{col}")
    )
    print(f"{asset_type} returns df shape: {returns_dfs[asset_type].shape}")

# Place all asset types side by side in a single wide frame.
returns_df = pd.concat(list(returns_dfs.values()), axis=1)
print(f"returns_df shape after joining: {returns_df.shape}")
# The joined index must be exactly the stock table's date index.
assert returns_df.index.equals(returns_dfs["stock"].index)
del returns_dfs  # the per-asset tables are no longer needed

# --- Portfolio builder ---------------------------------------------------------
# settings=None: start with the builder's default settings for now.
pb = pfu.PortfolioBuilder(
    extractions_df=extractions_df,
    returns_df=returns_df,
    settings=None,
    ticker_sep=colname_sep,
)

stock returns df shape: (2012, 10947)
etf returns df shape: (2012, 3182)
crypto returns df shape: (2012, 4576)
commodity returns df shape: (2012, 26)
benchmark returns df shape: (2012, 6)
returns_df shape after joining: (2012, 18737)
PortfolioBuilder: Initialized with 45967 videos from 231 unique channels.
PortfolioBuilder: Only considering videos with non-empty trade_info: 16510 videos from 213 unique channels.


### Portfolio Computation

We compute and save portfolios for each channel.
We run the computation several times to obtain multiple portfolio versions, each with a different combination of settings (e.g. max holding period, neutral asset, max number of positions).

In [20]:
# Root directory for all portfolio runs; each parameter combination gets its own
# sub-directory below it.
save_dir = "../data/portfolios/raw"
# Create the root up front so the bookkeeping files below can be written on a
# fresh checkout.
os.makedirs(save_dir, exist_ok=True)

# Parameter grid: every combination of the values below is computed as one run.
pf_types = ["equal_weight"]
neutral_assets = ["cash", "3m_tbills", "SPY"]  # cash, 3m-tbills (-> risk-free asset), S&P 500 ETF
max_holding_periods = [21, 126, 252, 99999]  # 1 month, 6 months, 1 year, no limit
after_upload_waits = [1]  # 1 day
max_positions = [5, 99999]  # max 5, no limit

# save parameter choices to save dir (json)
parameter_choices = {
    "pf_types": pf_types,
    "neutral_assets": neutral_assets,
    "max_holding_periods": max_holding_periods,
    "after_upload_waits": after_upload_waits,
    "max_positions": max_positions,
}
with open(f"{save_dir}/parameter_choices.json", "w") as f:
    json.dump(parameter_choices, f)

# The channel list does not change between runs, so compute it once.
channel_ids = pb.ext["channel_id"].unique()

# save unique channel_id list to save dir (might be useful for reloading portfolio data later)
pd.Series(channel_ids).rename("channel_ids").to_csv(f"{save_dir}/channel_ids.csv", sep=";", index=False)

# iterate over portfolio parameter combinations
# NOTE: the loop variable is deliberately named `after_upload_wait` (singular).
# Reusing the list's name as the loop target would rebind `after_upload_waits`
# to an int and break any later use of the list.
for pf_type, neutral_asset, max_hp, after_upload_wait, max_pos in itertools.product(
    pf_types, neutral_assets, max_holding_periods, after_upload_waits, max_positions
):
    start_time = time.time()

    # create output dir for the run; an existing dir means the run is skipped
    # NOTE(review): a run that was interrupted half-way also leaves its directory
    # behind and will be skipped here - delete the directory to recompute it.
    run_name = f"{pf_type}_{neutral_asset}_hp{max_hp}_wait{after_upload_wait}_pos{max_pos}"
    run_save_dir = f"{save_dir}/{run_name}"
    if os.path.exists(run_save_dir):
        print(f"Run dir {run_save_dir} already exists. Skipping.")
        continue
    os.makedirs(run_save_dir)

    # define settings for portfolio builder
    # note: we compute all portfolios for the full observation range. We can later
    # still choose to only consider the active trading period for each portfolio
    # for computing performance measures.
    # "pf_start_date" / "pf_end_date" are intentionally not set here; the original
    # cell lists "2016-01-04" and "2023-12-29" as the default observation period.
    portfolio_settings = {
        "pf_initial_value": 1,
        "portfolio_type": pf_type,
        "max_positions": max_pos,  # max number of non-cash positions in a portfolio (sell oldest positions if exceeded)
        "max_holding_period": max_hp,  # max number of trading days to hold a position (sell if exceeded)
        "neutral_asset": neutral_asset,  # asset to hold in place of cash
        # minimum number of days to wait after upload before trading; taken from
        # the grid so it always matches the "wait" part of the run name
        "min_days_wait_after_upload": after_upload_wait,
    }
    print(f"{30*'-'}\nStarting run: {run_name}")
    # update settings for portfolio builder (without having to reload returns and extraction data)
    pb.update_settings(new_settings=portfolio_settings)

    # save run settings to run dir
    with open(f"{run_save_dir}/settings.json", "w") as f:
        json.dump(portfolio_settings, f)

    # build portfolios for all channel ids and save results
    for i, channel_id in enumerate(channel_ids):
        pos_df_bt, pos_df_at, trade_logs_df = pb.compute_portfolio(channel_id=channel_id, debug=False)
        # save results
        pos_df_bt.to_csv(f"{run_save_dir}/{channel_id}_pos_bt.csv", sep=";", index=True)
        pos_df_at.to_csv(f"{run_save_dir}/{channel_id}_pos_at.csv", sep=";", index=True)
        trade_logs_df.to_csv(f"{run_save_dir}/{channel_id}_trade_logs.csv", sep=";", index=False)
        # progress report every 20 channels
        if (i+1) % 20 == 0:
            print(f"  - Processed {i+1}/{len(channel_ids)} channel ids in {time.time()-start_time:.2f}s")
    print(f"Run {run_name} completed in {time.time()-start_time:.2f}s")
                


Run dir ../data/portfolios/raw/equal_weight_cash_hp21_wait1_pos5 already exists. Skipping.
------------------------------
Starting run: equal_weight_cash_hp21_wait1_pos99999
PortfolioBuilder: Updated settings.
  - Processed 20/213 channel ids in 43.30s
  - Processed 40/213 channel ids in 93.46s
  - Processed 60/213 channel ids in 137.45s
  - Processed 80/213 channel ids in 174.78s
  - Processed 100/213 channel ids in 211.88s
  - Processed 120/213 channel ids in 243.44s
  - Processed 140/213 channel ids in 274.08s
  - Processed 160/213 channel ids in 304.75s
  - Processed 180/213 channel ids in 334.46s
  - Processed 200/213 channel ids in 363.95s
Run equal_weight_cash_hp21_wait1_pos99999 completed in 382.63s
------------------------------
Starting run: equal_weight_cash_hp126_wait1_pos5
PortfolioBuilder: Updated settings.
  - Processed 20/213 channel ids in 48.38s
  - Processed 40/213 channel ids in 99.94s
  - Processed 60/213 channel ids in 142.39s
  - Processed 80/213 channel ids in 1