In [43]:
import pandas as pd
import json
import time

In [44]:
# Returns data: one semicolon-separated CSV per asset class, plus the benchmarks
returns_path = "../data/asset_data/returns"
returns_dfs = {
    "stock": pd.read_csv(f"{returns_path}/stocks_returns.csv", sep=";"),
    "etf": pd.read_csv(f"{returns_path}/etfs_returns.csv", sep=";"),
    "crypto": pd.read_csv(f"{returns_path}/cryptos_returns.csv", sep=";"),
    "commodity": pd.read_csv(f"{returns_path}/commodities_returns.csv", sep=";"),
    "benchmark": pd.read_csv(f"{returns_path}/benchmarks_returns.csv", sep=";"),
}

# Every column is renamed to "<asset type><separator><original column name>"
colname_sep = "+" # separator for asset type and ticker in column names
for asset_type in returns_dfs:
    # index the frame by its date column, then tag the remaining columns with the asset type
    returns_dfs[asset_type] = (
        returns_dfs[asset_type]
        .set_index("date")
        .add_prefix(f"{asset_type}{colname_sep}")
    )
    print(f"{asset_type} returns df shape: {returns_dfs[asset_type].shape}")

# Column-wise concatenation aligns all frames on their date index
returns_df = pd.concat(returns_dfs.values(), axis=1)
print(f"returns_df shape after joining: {returns_df.shape}")
# the joined index must be exactly the index of the stock frame
assert returns_df.index.equals(returns_dfs["stock"].index)
# the per-asset-type frames are no longer needed once joined
del returns_dfs

stock returns df shape: (2012, 10947)
etf returns df shape: (2012, 3182)
crypto returns df shape: (2012, 4576)
commodity returns df shape: (2012, 26)
benchmark returns df shape: (2012, 6)
returns_df shape after joining: (2012, 18737)


In [45]:
# load extractions data
# extraction results: the recommendations of each video are stored as a JSON list string
edf = pd.read_csv("../data/matched/VIDEOS_inf_llama3_ft_v4_q8_0_llamacpp_guided.csv", sep=";")
# load metadata df to get upload dates and channel ids
metadata_df = (
    pd.read_csv("../data/yt_metadata/video_metadata.csv", sep=";")[["video_id", "upload_date", "uploader_id"]]
    .rename(columns={"uploader_id": "channel_id"})
)
# join metadata to extractions (inner join: videos without metadata are dropped)
edf = edf.merge(metadata_df, on="video_id")
extractions_col = "trade_info_incl_neutrals"
# keep only the needed columns and the rows with non-empty extractions;
# .copy() gives an independent frame so the assignments below never write into a slice
edf = edf.loc[
    edf[extractions_col] != "[]",
    ["video_id", "upload_date", "channel_id", extractions_col],
].copy()
# parse the JSON strings into Python lists of recommendation dicts
edf[extractions_col] = edf[extractions_col].apply(json.loads)
edf = edf.explode(extractions_col).reset_index(drop=True) # explode rec lists (1 row per rec)
# pull the individual attributes out of each recommendation dict
for attr in ["asset_type", "ticker", "sentiment"]:
    edf[attr] = edf[extractions_col].apply(lambda x: pd.NA if pd.isna(x) else x[attr])
# build the ticker key with the same separator that was used for the returns_df column names
edf["ticker"] = edf["asset_type"] + colname_sep + edf["ticker"]
# flag recommendations whose ticker has a column in returns_df
edf["has_returns"] = edf["ticker"].isin(returns_df.columns)
edf.head()

Unnamed: 0,video_id,upload_date,channel_id,trade_info_incl_neutrals,asset_type,ticker,sentiment,has_returns
0,--D3TtCMo6A,2021-12-30,@MoneyZG,"{'asset_type': 'crypto', 'ticker': 'BTC', 'sen...",crypto,crypto+BTC,buy,True
1,--D3TtCMo6A,2021-12-30,@MoneyZG,"{'asset_type': 'crypto', 'ticker': 'ETH', 'sen...",crypto,crypto+ETH,neutral,True
2,--O0hf8aXcw,2020-12-10,@AltcoinDaily,"{'asset_type': 'crypto', 'ticker': 'BTC', 'sen...",crypto,crypto+BTC,buy,True
3,--O0hf8aXcw,2020-12-10,@AltcoinDaily,"{'asset_type': 'crypto', 'ticker': 'ETH', 'sen...",crypto,crypto+ETH,buy,True
4,-0a-kCtTydo,2021-01-27,@Jungernaut,"{'asset_type': 'stock', 'ticker': 'GME', 'sent...",stock,stock+GME,buy,True


In [46]:
# get preceding and following returns for each recommendation
# NaN if no data available (but separate each timeframe and following/preceding case)

# benchmark returns
benchmark_returns = returns_df["benchmark+SPY"]
n_days_range = [1, 5, 21, 63, 252] # 1d, 1w, 1m, 3m, 1y
# float NaN (instead of pd.NA) keeps the result columns float64 rather than object dtype
missing = float("nan")


def _total_and_excess_return(window, n_days):
    """Compound a window of periodic returns and compare it with the benchmark.

    Parameters
    ----------
    window : pd.Series
        Returns of one asset; its index labels are looked up in ``benchmark_returns``.
    n_days : int
        Number of observations the window must contain.

    Returns
    -------
    tuple
        ``(total_return, total_return - benchmark_total_return)``, or
        ``(NaN, NaN)`` when the window has fewer than ``n_days`` rows or
        contains a missing value.
    """
    if window.shape[0] < n_days or window.isna().any():
        return missing, missing
    total_return = (window + 1).prod() - 1
    benchmark_total = (benchmark_returns.loc[window.index] + 1).prod() - 1
    return total_return, total_return - benchmark_total


data_list = []
start_time = time.time()
for i, (upload_date, ticker, has_returns) in enumerate(zip(edf["upload_date"], edf["ticker"], edf["has_returns"])):
    row_values = {}
    if has_returns:
        # get returns for current ticker
        r = returns_df[ticker]
        # neither slice depends on the horizon, so build each once per recommendation
        before_upload = r[r.index < upload_date]
        # for the following returns, we need to wait one extra (normal, not trading) day to ensure no lookahead bias
        day_after_upload_date = (pd.to_datetime(upload_date) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
        after_wait = r[r.index > day_after_upload_date]

    for n_days in n_days_range:
        if has_returns:
            pre, pre_vs_bench = _total_and_excess_return(before_upload.tail(n_days), n_days)
            fol, fol_vs_bench = _total_and_excess_return(after_wait.head(n_days), n_days)
        else:
            # ticker has no returns column: every value of this row stays missing
            pre = pre_vs_bench = fol = fol_vs_bench = missing
        row_values[f"pre_{n_days}d"] = pre
        row_values[f"pre_vs_bench_{n_days}d"] = pre_vs_bench
        row_values[f"fol_{n_days}d"] = fol
        row_values[f"fol_vs_bench_{n_days}d"] = fol_vs_bench

    data_list.append(row_values)

    # progress is reported for every row, including rows without returns
    if (i+1) % 2500 == 0:
        print(f"Processed {i+1} rows in {time.time() - start_time:.2f} seconds")
# add returns data to edf
# make df and concat with edf
df_return_analysis = pd.DataFrame(data_list)
# drop result columns left over from an earlier run of this cell so a re-run does not duplicate them
edf = pd.concat([edf.drop(columns=df_return_analysis.columns, errors="ignore"), df_return_analysis], axis=1)

# save to csv
edf.to_csv("../data/rec_analysis/recs_with_pre_post_returns.csv", sep=";", index=False)

Processed 2500 rows in 13.67 seconds
Processed 5000 rows in 27.29 seconds
Processed 7500 rows in 41.05 seconds
Processed 10000 rows in 55.91 seconds
Processed 12500 rows in 70.09 seconds
Processed 15000 rows in 83.26 seconds
Processed 17500 rows in 96.47 seconds
Processed 20000 rows in 109.76 seconds
Processed 22500 rows in 123.36 seconds
Processed 25000 rows in 136.71 seconds
Processed 27500 rows in 150.08 seconds
Processed 30000 rows in 163.28 seconds
Processed 32500 rows in 176.55 seconds
Processed 35000 rows in 189.81 seconds
Processed 37500 rows in 203.05 seconds
Processed 40000 rows in 216.11 seconds
Processed 42500 rows in 229.37 seconds
Processed 45000 rows in 242.59 seconds
Processed 47500 rows in 255.84 seconds
Processed 50000 rows in 269.26 seconds
Processed 52500 rows in 282.64 seconds
Processed 55000 rows in 295.87 seconds


In [77]:
# reload the saved recommendations (with their pre/post returns) from disk
edf = pd.read_csv(
    filepath_or_buffer="../data/rec_analysis/recs_with_pre_post_returns.csv",
    sep=";",
)

In [50]:
# show the column names of edf
edf.columns

Index(['video_id', 'upload_date', 'channel_id', 'trade_info_incl_neutrals',
       'asset_type', 'ticker', 'sentiment', 'has_returns', 'pre_1d',
       'pre_vs_bench_1d', 'fol_1d', 'fol_vs_bench_1d', 'pre_5d',
       'pre_vs_bench_5d', 'fol_5d', 'fol_vs_bench_5d', 'pre_21d',
       'pre_vs_bench_21d', 'fol_21d', 'fol_vs_bench_21d', 'pre_63d',
       'pre_vs_bench_63d', 'fol_63d', 'fol_vs_bench_63d', 'pre_252d',
       'pre_vs_bench_252d', 'fol_252d', 'fol_vs_bench_252d'],
      dtype='object')

In [74]:
from scipy.stats import wilcoxon
import numpy as np

### by sentiment (buy/sell)
# For every (pre/post, sentiment, horizon) combination: sample size, selected quantiles and
# Wilcoxon signed-rank p-values of the returns, written to one csv.
sentiment_list = ["buy", "sell"]
time_periods = [(5, "1w"), (21, "1m"), (252, "1y")]
pre_fol_list = [("pre", "pre"), ("fol", "post")] # preceding or following
return_type = "excess" # "excess" or "abs"
# validate the setting once, before any work is done
if return_type not in ("excess", "abs"):
    raise ValueError("return_type must be 'excess' or 'abs'")
quantile_levels = [("q10", 0.1), ("q33", 0.33), ("q50", 0.5), ("q66", 0.66), ("q90", 0.9)]
wilcoxon_alternatives = [("p_two_sided", "two-sided"), ("p_greater", "greater"), ("p_less", "less")]
rows = []
for (pre_fol, pre_fol_name) in pre_fol_list:
    for sent in sentiment_list:
        for (n_days, timeframe_name) in time_periods:
            # column holding the (excess or absolute) return for this direction and horizon
            return_col = f"{pre_fol}_{'vs_bench_' if return_type=='excess' else ''}{n_days}d"
            # astype(float) guards against object-dtype columns (e.g. missing values stored as pd.NA)
            sample = edf.loc[edf["sentiment"] == sent, return_col].dropna().astype(float)
            row = {
                "pre_or_post": pre_fol_name,
                "sentiment": sent,
                "time_period": timeframe_name,
                "n": sample.shape[0],
            }
            for q_name, q_level in quantile_levels:
                row[q_name] = sample.quantile(q_level)
            # wilcoxon test: if excess returns -> sample, if absolute returns -> log sample
            test_sample = sample if return_type == "excess" else np.log(sample + 1)
            # the test is undefined for an empty or all-zero sample (some scipy versions raise);
            # report NaN p-values instead of aborting the whole loop
            can_test = test_sample.shape[0] > 0 and bool((test_sample != 0).any())
            for p_name, alternative in wilcoxon_alternatives:
                row[p_name] = wilcoxon(test_sample, alternative=alternative).pvalue if can_test else np.nan

            rows.append(row)
df = pd.DataFrame(rows)
# save to csv
df.to_csv(f"../data/rec_analysis/buy_sell_{return_type}_prefol_analysis_results.csv", sep=";", index=False)


In [76]:
### by asset class
from scipy.stats import wilcoxon
import numpy as np

# buy recommendations only, split by asset class; return type is chosen via return_type below
# For every (pre/post, sentiment, asset class, horizon) combination: sample size, selected
# quantiles and Wilcoxon signed-rank p-values of the returns, written to one csv.
sentiment_list = ["buy"]
time_periods = [(5, "1w"), (21, "1m"), (252, "1y")]
asset_class_list = ["stock", "etf", "crypto", "commodity"]
pre_fol_list = [("pre", "pre"), ("fol", "post")] # preceding or following
return_type = "excess" # "excess" or "abs"
# validate the setting once, before any work is done
if return_type not in ("excess", "abs"):
    raise ValueError("return_type must be 'excess' or 'abs'")
quantile_levels = [("q10", 0.1), ("q33", 0.33), ("q50", 0.5), ("q66", 0.66), ("q90", 0.9)]
wilcoxon_alternatives = [("p_two_sided", "two-sided"), ("p_greater", "greater"), ("p_less", "less")]
rows = []
for (pre_fol, pre_fol_name) in pre_fol_list:
    for sent in sentiment_list:
        for asset_class in asset_class_list:
            for (n_days, timeframe_name) in time_periods:
                # column holding the (excess or absolute) return for this direction and horizon
                return_col = f"{pre_fol}_{'vs_bench_' if return_type=='excess' else ''}{n_days}d"
                in_group = (edf["sentiment"] == sent) & (edf["asset_type"] == asset_class)
                # astype(float) guards against object-dtype columns (e.g. missing values stored as pd.NA)
                sample = edf.loc[in_group, return_col].dropna().astype(float)
                row = {
                    "pre_or_post": pre_fol_name,
                    "sentiment": sent,
                    "asset_class": asset_class,
                    "time_period": timeframe_name,
                    "n": sample.shape[0],
                }
                for q_name, q_level in quantile_levels:
                    row[q_name] = sample.quantile(q_level)
                # wilcoxon test: if excess returns -> sample, if absolute returns -> log sample
                test_sample = sample if return_type == "excess" else np.log(sample + 1)
                # the test is undefined for an empty or all-zero sample (some scipy versions raise);
                # report NaN p-values instead of aborting the whole loop
                can_test = test_sample.shape[0] > 0 and bool((test_sample != 0).any())
                for p_name, alternative in wilcoxon_alternatives:
                    row[p_name] = wilcoxon(test_sample, alternative=alternative).pvalue if can_test else np.nan

                rows.append(row)
df = pd.DataFrame(rows)
# save to csv
df.to_csv(f"../data/rec_analysis/buy_asset_class_{return_type}_prefol_analysis_results.csv", sep=";", index=False)

In [7]:
### robustness check: drop all recommendations which do not have 252d preceding and following returns

import pandas as pd
import json
import time
# reload the saved recommendations and keep only the rows where both 252d
# benchmark-relative returns (preceding and following) are present
edf = (
    pd.read_csv("../data/rec_analysis/recs_with_pre_post_returns.csv", sep=";")
    .dropna(subset=["pre_vs_bench_252d", "fol_vs_bench_252d"])
)



In [10]:
from scipy.stats import wilcoxon
import numpy as np

# robustness variant of the buy/sell analysis; return type is chosen via return_type below
# For every (pre/post, sentiment, horizon) combination: sample size, selected quantiles and
# Wilcoxon signed-rank p-values of the returns, written to one csv.
sentiment_list = ["buy", "sell"]
time_periods = [(5, "1w"), (21, "1m"), (252, "1y")]
pre_fol_list = [("pre", "pre"), ("fol", "post")] # preceding or following
return_type = "abs" # "excess" or "abs"
# validate the setting once, before any work is done
if return_type not in ("excess", "abs"):
    raise ValueError("return_type must be 'excess' or 'abs'")
quantile_levels = [("q10", 0.1), ("q33", 0.33), ("q50", 0.5), ("q66", 0.66), ("q90", 0.9)]
wilcoxon_alternatives = [("p_two_sided", "two-sided"), ("p_greater", "greater"), ("p_less", "less")]
rows = []
for (pre_fol, pre_fol_name) in pre_fol_list:
    for sent in sentiment_list:
        for (n_days, timeframe_name) in time_periods:
            # column holding the (excess or absolute) return for this direction and horizon
            return_col = f"{pre_fol}_{'vs_bench_' if return_type=='excess' else ''}{n_days}d"
            # astype(float) guards against object-dtype columns (e.g. missing values stored as pd.NA)
            sample = edf.loc[edf["sentiment"] == sent, return_col].dropna().astype(float)
            row = {
                "pre_or_post": pre_fol_name,
                "sentiment": sent,
                "time_period": timeframe_name,
                "n": sample.shape[0],
            }
            for q_name, q_level in quantile_levels:
                row[q_name] = sample.quantile(q_level)
            # wilcoxon test: if excess returns -> sample, if absolute returns -> log sample
            test_sample = sample if return_type == "excess" else np.log(sample + 1)
            # the test is undefined for an empty or all-zero sample (some scipy versions raise);
            # report NaN p-values instead of aborting the whole loop
            can_test = test_sample.shape[0] > 0 and bool((test_sample != 0).any())
            for p_name, alternative in wilcoxon_alternatives:
                row[p_name] = wilcoxon(test_sample, alternative=alternative).pvalue if can_test else np.nan

            rows.append(row)
df = pd.DataFrame(rows)
# save to csv
df.to_csv(f"../data/rec_analysis/robust_buy_sell_{return_type}_prefol_analysis_results.csv", sep=";", index=False)

In [12]:
### by asset class
from scipy.stats import wilcoxon
import numpy as np

# robustness variant of the asset-class analysis (buy recommendations only)
# For every (pre/post, sentiment, asset class, horizon) combination: sample size, selected
# quantiles and Wilcoxon signed-rank p-values of the returns, written to one csv.
sentiment_list = ["buy"]
time_periods = [(5, "1w"), (21, "1m"), (252, "1y")]
asset_class_list = ["stock", "etf", "crypto", "commodity"]
pre_fol_list = [("pre", "pre"), ("fol", "post")] # preceding or following
return_type = "abs" # "excess" or "abs"
# validate the setting once, before any work is done
if return_type not in ("excess", "abs"):
    raise ValueError("return_type must be 'excess' or 'abs'")
quantile_levels = [("q10", 0.1), ("q33", 0.33), ("q50", 0.5), ("q66", 0.66), ("q90", 0.9)]
wilcoxon_alternatives = [("p_two_sided", "two-sided"), ("p_greater", "greater"), ("p_less", "less")]
rows = []
for (pre_fol, pre_fol_name) in pre_fol_list:
    for sent in sentiment_list:
        for asset_class in asset_class_list:
            for (n_days, timeframe_name) in time_periods:
                # column holding the (excess or absolute) return for this direction and horizon
                return_col = f"{pre_fol}_{'vs_bench_' if return_type=='excess' else ''}{n_days}d"
                in_group = (edf["sentiment"] == sent) & (edf["asset_type"] == asset_class)
                # astype(float) guards against object-dtype columns (e.g. missing values stored as pd.NA)
                sample = edf.loc[in_group, return_col].dropna().astype(float)
                row = {
                    "pre_or_post": pre_fol_name,
                    "sentiment": sent,
                    "asset_class": asset_class,
                    "time_period": timeframe_name,
                    "n": sample.shape[0],
                }
                for q_name, q_level in quantile_levels:
                    row[q_name] = sample.quantile(q_level)
                # wilcoxon test: if excess returns -> sample, if absolute returns -> log sample
                test_sample = sample if return_type == "excess" else np.log(sample + 1)
                # the test is undefined for an empty or all-zero sample (some scipy versions raise);
                # report NaN p-values instead of aborting the whole loop
                can_test = test_sample.shape[0] > 0 and bool((test_sample != 0).any())
                for p_name, alternative in wilcoxon_alternatives:
                    row[p_name] = wilcoxon(test_sample, alternative=alternative).pvalue if can_test else np.nan

                rows.append(row)
df = pd.DataFrame(rows)
# save to csv
df.to_csv(f"../data/rec_analysis/robust_buy_asset_class_{return_type}_prefol_analysis_results.csv", sep=";", index=False)

In [16]:
# check dfs: reload the saved robust asset-class results (excess returns)
df = pd.read_csv(
    "../data/rec_analysis/robust_buy_asset_class_excess_prefol_analysis_results.csv",
    sep=";",
)

In [17]:
# display the results table held in df
df

Unnamed: 0,pre_or_post,sentiment,asset_class,time_period,n,q10,q33,q50,q66,q90,p_two_sided,p_greater,p_less
0,pre,buy,stock,1w,21543,-0.075244,-0.016129,0.002678,0.022938,0.117999,3.835623e-40,1.917811e-40,1.0
1,pre,buy,stock,1m,21543,-0.151697,-0.038679,0.00492,0.050814,0.290335,2.02645e-46,1.013225e-46,1.0
2,pre,buy,stock,1y,21543,-0.423586,-0.091958,0.10697,0.407384,4.506863,0.0,0.0,1.0
3,pre,buy,etf,1w,1457,-0.029449,-0.004434,0.0,0.003789,0.044499,0.1591015,0.07955077,0.9204492
4,pre,buy,etf,1m,1457,-0.06145,-0.008371,0.0,0.008841,0.115463,0.0507705,0.02538525,0.9746147
5,pre,buy,etf,1y,1457,-0.268316,-0.05034,0.0,0.034859,0.559469,0.1226995,0.06134973,0.9386503
6,pre,buy,crypto,1w,9116,-0.137058,-0.026555,0.025164,0.085279,0.290518,5.06811e-123,2.534055e-123,1.0
7,pre,buy,crypto,1m,9116,-0.231054,-0.02431,0.127296,0.301265,1.001089,0.0,0.0,1.0
8,pre,buy,crypto,1y,9116,-0.144857,2.386489,5.025566,8.724441,46.035609,0.0,0.0,1.0
9,pre,buy,commodity,1w,840,-0.050995,-0.015976,-0.001677,0.012647,0.06195,0.6998344,0.6500828,0.3499172


In [18]:
# reload the saved asset-class results (excess returns) for comparison
df = pd.read_csv(
    "../data/rec_analysis/buy_asset_class_excess_prefol_analysis_results.csv",
    sep=";",
)

In [19]:
# display the results table held in df
df

Unnamed: 0,pre_or_post,sentiment,asset_class,time_period,n,q10,q33,q50,q66,q90,p_two_sided,p_greater,p_less
0,pre,buy,stock,1w,24157,-0.080334,-0.016995,0.002877,0.024605,0.1326,3.288143e-48,1.644071e-48,1.0
1,pre,buy,stock,1m,24049,-0.161746,-0.040498,0.00542,0.054336,0.321939,1.083531e-57,5.417657e-58,1.0
2,pre,buy,stock,1y,21956,-0.425623,-0.092507,0.105441,0.404608,4.422551,0.0,0.0,1.0
3,pre,buy,etf,1w,1501,-0.02954,-0.004506,0.0,0.003978,0.044555,0.1856265,0.09281324,0.9071868
4,pre,buy,etf,1m,1497,-0.061607,-0.008951,0.0,0.008867,0.114418,0.074502,0.037251,0.962749
5,pre,buy,etf,1y,1461,-0.268216,-0.050047,0.0,0.035136,0.560868,0.09711796,0.04855898,0.951441
6,pre,buy,crypto,1w,11171,-0.151366,-0.027314,0.029644,0.09837,0.367333,4.3232040000000004e-165,2.1616020000000002e-165,1.0
7,pre,buy,crypto,1m,11075,-0.247974,-0.021426,0.148703,0.349863,1.339116,0.0,0.0,1.0
8,pre,buy,crypto,1y,9166,-0.151303,2.381121,4.981951,8.72006,45.970674,0.0,0.0,1.0
9,pre,buy,commodity,1w,858,-0.052041,-0.016028,-0.001677,0.012911,0.062467,0.7177763,0.6411118,0.3588882
