In [1]:
import pandas as pd
import random 
import json


from functions import join_stocks_crypto, generate_rand_portfolios
from functions_post_clustering import simulate_evaluate_portfolio_subset

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
# Load the raw price tables; both CSVs are indexed by their 'Date' column.
df_all_stocks = pd.read_csv('stocks_data_filled.csv', index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')

# mode: either left-join on cryptos and fill NA for stocks, or left-join on
# stocks and leave out some crypto dates. 'stocks_left' selects the latter.
joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode='stocks_left')
joined_df.index = pd.to_datetime(joined_df.index)

# Period-over-period percentage change of every joined column.
returns_all = joined_df.pct_change()

# Only the stock columns are offered to the random portfolio generator.
tickers = list(df_all_stocks.columns)

# Seed the stdlib RNG right before sampling so the draw is repeatable.
random.seed(42)
random_portfolios = generate_rand_portfolios(
    n_reps=1000,
    n_stocks=15,
    tickers=tickers,
)

# Reassemble the min-variance portfolios from their per-portfolio JSON files.
# The range end is exclusive, so my_dict1.json .. my_dict999.json are read.
# NOTE(review): 1000 random portfolios are generated above -- confirm that
# 999 min-variance files is the intended count.
min_var_portfolios = {}
for i in range(1, 1000):
    with open(f'min_variance_portfolio_jsons/my_dict{i}.json') as portfolio_file:
        min_var_portfolios.update(json.load(portfolio_file))

# Persist the merged dictionary as a single file.
with open(f"full_optimized_min_variance.json", "w") as merged_file:
    json.dump(min_var_portfolios, merged_file, indent=4)

SIMULATE AND EVALUATE

In [3]:
# Both portfolio sets are simulated with exactly the same settings.
sim_settings = {'n_sims': 100, 't': 100, 'distribution_model': 'multivar_norm'}

# The third returned value is not used in this notebook and is discarded.
simulations_results_dict_rand, subset_statistics_df_rand, _ = simulate_evaluate_portfolio_subset(
    random_portfolios, returns_all, **sim_settings
)

simulations_results_dict_minvar, subset_statistics_df_minvar, _ = simulate_evaluate_portfolio_subset(
    min_var_portfolios, returns_all, **sim_settings
)

Normality Test results: 

                           statistic        p_value  normal
mean_cumulative_return    609.692142  4.046071e-133   False
mean_daily_return         609.692142  4.046071e-133   False
std_cumulative_return     772.668780  1.648566e-168   False
std_daily_return          755.124060  1.063883e-164   False
sharpe_daily                5.730103   5.698021e-02    True
sharpe_cumulative          10.670590   4.818489e-03   False
sharpe_annual               5.730103   5.698021e-02    True
sharpe_cumulative_annual   10.670590   4.818489e-03   False
VaR                       560.697513  1.762309e-122   False
CVaR                      450.878254   1.238885e-98   False
Normality Test results: 

                           statistic        p_value  normal
mean_cumulative_return    426.311676   2.676676e-93   False
mean_daily_return         426.311676   2.676676e-93   False
std_cumulative_return     561.805442  1.012741e-122   False
std_daily_return          554.443754  4.018542e-

In [48]:
from scipy.stats import f_oneway
from scipy.stats import kruskal


def kruskal_anova_test(subset_stats_dfs:dict, metrics='all', test='anova'):
    """Run one omnibus test per metric across all groups of portfolio statistics.

    Parameters
    ----------
    subset_stats_dfs : dict
        Maps a group label to a DataFrame with one column per metric. For each
        metric, the matching column of every DataFrame is one sample.
    metrics : 'all', str, or iterable of str
        'all' uses the columns of the first DataFrame in ``subset_stats_dfs``.
        A single column name is also accepted and treated as a one-item list.
    test : {'anova', 'kruskal'}
        'anova' uses ``scipy.stats.f_oneway``; 'kruskal' uses
        ``scipy.stats.kruskal``.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns ``test_stat`` and ``test_p``, both
        rounded to 4 decimals (so very small p-values are shown as 0.0).

    Raises
    ------
    ValueError
        If ``test`` is neither 'anova' nor 'kruskal'.
    """
    available_tests = {'anova': f_oneway, 'kruskal': kruskal}
    # Fail early with a clear message; previously an unknown name fell through
    # both branches and crashed on the unassigned test_stat / test_p.
    if test not in available_tests:
        raise ValueError(
            f"Unknown test {test!r}; expected one of {sorted(available_tests)}"
        )
    run_test = available_tests[test]

    if isinstance(metrics, str):
        if metrics == 'all':
            subset_stats_dfs_list = list(subset_stats_dfs.values())
            metrics = list(subset_stats_dfs_list[0].columns)
        else:
            # A bare column name would otherwise be iterated character by character.
            metrics = [metrics]

    tests_results = dict()
    for metric in metrics:
        groups = [subset_df[metric] for subset_df in subset_stats_dfs.values()]
        test_stat, test_p = run_test(*groups)
        tests_results[metric] = {'test_stat': round(float(test_stat), 4), 'test_p': round(float(test_p), 4)}

    # Transpose so metrics become rows and the two result fields become columns.
    return pd.DataFrame(tests_results).T

In [50]:
# Label each statistics table; the labels are the dict keys handed to the test.
subset_stats_dfs = {
    'Random Portfolios Stats': subset_statistics_df_rand,
    'MinVar Portfolios Stats': subset_statistics_df_minvar,
}

# Kruskal-Wallis on every metric column; the bare expression displays the table.
kruskal_anova_test(subset_stats_dfs, test='kruskal', metrics='all')

Unnamed: 0,test_stat,test_p
mean_cumulative_return,201.2143,0.0
mean_daily_return,201.2143,0.0
std_cumulative_return,1207.7553,0.0
std_daily_return,1211.7177,0.0
sharpe_daily,221.9385,0.0
sharpe_cumulative,237.4066,0.0
sharpe_annual,221.9385,0.0
sharpe_cumulative_annual,237.4066,0.0
VaR,1352.4818,0.0
CVaR,1322.0393,0.0


In [6]:
#Dunn-Bonferroni Test
import scikit_posthocs as sp


# One-off Dunn test on the annual Sharpe ratio of the two portfolio sets.
group1 = subset_statistics_df_rand['sharpe_annual']
group2 = subset_statistics_df_minvar['sharpe_annual']

# Long format: every value stacked in one Series, with a parallel label list.
data = pd.concat([group1, group2], ignore_index=True)
groups = [
    label
    for label, size in (('Group 1', len(group1)), ('Group 2', len(group2)))
    for _ in range(size)
]

df = pd.DataFrame({'value': data, 'group': groups})

# Pairwise p-values with Bonferroni adjustment; displayed as the cell output.
sp.posthoc_dunn(df, val_col='value', group_col='group', p_adjust='bonferroni')

Unnamed: 0,Group 1,Group 2
Group 1,1.0,3.4163259999999995e-50
Group 2,3.4163259999999995e-50,1.0


In [66]:
def dunn_bonferroni(subset_stats_dfs:dict, metrics='all'):
    """Build one Bonferroni-adjusted Dunn p-value table per metric.

    Parameters
    ----------
    subset_stats_dfs : dict
        Maps a group label to a DataFrame with one column per metric. The
        substring ' Stats' is removed from each label before it is used as the
        group name in the result tables.
    metrics : 'all' or iterable of str
        'all' uses the columns of the first DataFrame in ``subset_stats_dfs``.

    Returns
    -------
    dict
        Maps each metric to the table returned by ``sp.posthoc_dunn``, cast to
        float and rounded to 4 decimals.
    """
    if metrics == 'all':
        first_stats_df = list(subset_stats_dfs.values())[0]
        metrics = list(first_stats_df.columns)

    dunn_tables_results = {}
    for metric in metrics:
        samples, labels = [], []
        for subset_name, stats_df in subset_stats_dfs.items():
            sample = stats_df[metric]
            samples.append(sample)
            # One label per observation so values and labels stay aligned.
            labels += [subset_name.replace(' Stats', '')] * len(sample)

        long_df = pd.DataFrame({
            'value': pd.concat(samples, ignore_index=True),
            'group': labels,
        })
        p_values = sp.posthoc_dunn(long_df, val_col='value', group_col='group', p_adjust='bonferroni')
        dunn_tables_results[metric] = p_values.astype(float).round(4)

    return dunn_tables_results

In [67]:
# Same two labelled statistics tables as used for the omnibus test.
subset_stats_dfs = {
    'Random Portfolios Stats': subset_statistics_df_rand,
    'MinVar Portfolios Stats': subset_statistics_df_minvar,
}

# x maps each metric name to its pairwise Dunn p-value table.
x = dunn_bonferroni(metrics='all', subset_stats_dfs=subset_stats_dfs)

# Display the table for a single metric.
x['mean_cumulative_return']

Unnamed: 0,MinVar Portfolios,Random Portfolios
MinVar Portfolios,1.0,0.0
Random Portfolios,0.0,1.0
