In [1]:
import pandas as pd
import random 
import itertools
import json
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, test_for_silhouette_score, run_clustering_model

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
# Load both price tables, keyed by their 'Date' column.
# 'stocks_data_filled.csv' is the stock table after a one-off back-fill
# (df_all_stocks.bfill() followed by to_csv), so that step is not repeated here.
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')
df_all_stocks = pd.read_csv('stocks_data_filled.csv', index_col='Date')

# mode='stocks_left': left-join on the stock dates, which leaves out some crypto dates.
# The alternative is a left join on the crypto dates with NA filling for the stocks.
joined_df = join_stocks_crypto(
    cryptos_df,
    df_all_stocks,
    mode='stocks_left',
)
joined_df.index = pd.to_datetime(joined_df.index)
joined_df = joined_df.bfill()

# Two coarser views of the joined prices, each keeping the last observation per bin:
#   'W'  - weekly aggregation
#   '3D' - roughly twice per week, aiming for a series length of about 250 (1 year)
joined_df_weekly, joined_df_3days = (
    joined_df.resample(rule).last() for rule in ('W', '3D')
)

In [3]:
# Generate the random candidate portfolios from the stock tickers.
# The seed is fixed before the call; presumably generate_rand_portfolios draws
# from the `random` module, which would make the portfolios reproducible.
random.seed(42)

tickers = [*df_all_stocks.columns]
random_portfolios = generate_rand_portfolios(
    n_reps=1000,
    n_stocks=15,
    tickers=tickers,
)

In [4]:
# Pick the top five holdings of each random portfolio, ranked by Sharpe ratio.
RISK_FREE_RATE_ANNUAL = 0.02  # annual risk-free rate handed to the Sharpe calculation

sharpe_ratio = sharpe_ratio_calculation(
    df_all_stocks,
    rf_rate_annual=RISK_FREE_RATE_ANNUAL,
)
top_five_dict = select_top_five(
    random_portfolios,
    metric=sharpe_ratio,
)

In [5]:
#Optimize Traditional Portfolios
# for i in range(999, 1001):

#     print('Doing', i)

#     top_five_sets = dict(itertools.islice(top_five_dict.items(), i, i+1))
#     results = run_min_variance(df_all_stocks, top_five_sets, min_weight_for_top_five=0.05)  #TRY DIFFERENT WEIGHTS FOR top_five
#     with open(f"min_variance_portfolio_jsons/my_dict{i}.json", "w") as f:
#         json.dump(results, f, indent=4)

In [6]:
#Reassemble the results of the optimization - jsons
# min_var_portfolios = dict()
# for i in range(1,1000):
#     with open(f'min_variance_portfolio_jsons/my_dict{i}.json') as f:
#         port = json.load(f)
#         min_var_portfolios.update(port)

# with open(f"full_optimized_min_variance.json", "w") as f:
#     json.dump(min_var_portfolios, f, indent=4)

In [7]:
# n_clusters_list = [4,5,6,7]
# linkage_list=['single', 'average', 'complete']
# #window_sizes = [3,7,10,14,21,30,60]
# window_sizes = [21,30,60]


# def run_clustering_evaluation(df, window_sizes, method, moving_average=True, return_mode='arithmetic', df_input_name='DFWASNOTSPECIFIED'):

#     for w_size in window_sizes:

#         #return_mode = 'arithmetic'
#         #n_init = 3
#         #center = True
#         if moving_average:
#             df = df.rolling(window=w_size, center=True).mean()

#             smoothing = 'moving_average'
#         else:
#             smoothing = 'no_smoothing'

#         if len(df) < 150 and w_size > 30:
#             continue
#         silhouette_df = test_for_silhouette_score(df, n_clusters_list, method=method, return_mode=return_mode, n_init=3, linkage_list=linkage_list)

#         silhouette_df['return_mode'] = return_mode
#         silhouette_df['n_init'] = 3
#         silhouette_df['smoothing'] = smoothing
#         silhouette_df['window_size/span'] = w_size

#         silhouette_df.to_csv(f'silhouette_dfs/{method}_{smoothing}_{return_mode}_windowsize-{w_size}_{df_input_name}.csv')




# df_input_name = input('Put in the name of the df mode you are running for: ')
# for return_mode in ['arithmetic', 'geometric']:
#     run_clustering_evaluation(joined_df_3days, window_sizes, method='kmeans', moving_average=True, return_mode=return_mode, df_input_name=df_input_name)

In [None]:
import warnings



def label_balance(df_dict:dict, window, method, return_mode, n_clus, linkage):
    """Summarise how evenly one clustering run spreads tickers over its clusters.

    The single frame held in ``df_dict`` is smoothed with a centred rolling mean,
    handed to ``run_clustering_model``, and the share of the frame's columns that
    ends up in the smallest and in the largest cluster is reported.

    Parameters
    ----------
    df_dict : dict
        Mapping ``{name: DataFrame}``. Only the first entry is used; its key is
        echoed in the ``df_mode`` column of the result.
    window : int
        Length of the centred rolling-mean window applied before clustering.
    method : str
        Forwarded to ``run_clustering_model`` as ``model_name``. For any value
        other than ``'ahc'`` the linkage is reported as ``'not_applicable'``.
    return_mode : str
        Forwarded unchanged to ``run_clustering_model``.
    n_clus : int
        Number of clusters, forwarded to ``run_clustering_model``.
    linkage : str
        Linkage forwarded to ``run_clustering_model``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``return_mode``, ``window_size``, ``method``,
        ``linkage``, ``df_mode``, ``min_per_cluster``, ``max_per_cluster`` and
        ``min_max_delta`` (shares are fractions rounded to 4 decimals).
    """
    df_name = list(df_dict)[0]
    frame = df_dict[df_name]

    # Silence warnings only for the smoothing/clustering step instead of
    # switching them off for the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_smooth = frame.rolling(window=window, center=True).mean()
        _, tickers_with_labels, _, _ = run_clustering_model(
            df_smooth,
            n_clus=n_clus,
            model_name=method,
            linkage=linkage,
            return_mode=return_mode,
            n_init=3,
        )

    res = pd.DataFrame(list(tickers_with_labels.items()), columns=['ticker', 'label'])
    tickers_per_cluster = res.groupby('label')['ticker'].count()

    # Shares are relative to the columns of the frame that was actually clustered,
    # not to a notebook-level global.
    share_per_cluster = tickers_per_cluster / frame.shape[1]
    max_percentage_per_cluster = share_per_cluster.max()
    min_percentage_per_cluster = share_per_cluster.min()
    min_max_delta = round(max_percentage_per_cluster - min_percentage_per_cluster, 4)

    # Linkage is only reported for agglomerative clustering ('ahc').
    reported_linkage = linkage if method == 'ahc' else 'not_applicable'

    output = {
        'return_mode': [return_mode],
        'window_size': [window],
        'method': [method],
        'linkage': [reported_linkage],
        'df_mode': [df_name],
        'min_per_cluster': [round(min_percentage_per_cluster, 4)],
        'max_per_cluster': [round(max_percentage_per_cluster, 4)],
        'min_max_delta': [min_max_delta],
    }

    return pd.DataFrame(output)


In [93]:
# Label-balance sweep: one clustering run per (frame, linkage, window) combination,
# covering the weekly, 3-day and full-resolution frames.

method_loop = 'ahc'
return_mode_loop = 'geometric'
n_clus_loop = 4

balance_frames = [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]
balance_linkages = ['single', 'complete', 'average']
balance_windows = [3, 7, 10, 14, 30]

# Collect the one-row results first and concatenate once, instead of growing the
# frame with pd.concat on every iteration. itertools.product keeps the same
# frame -> linkage -> window ordering as the nested loops it replaces.
balance_rows = [
    label_balance(frame_by_mode, window, method_loop, return_mode_loop, n_clus=n_clus_loop, linkage=linkage_name)
    for frame_by_mode, linkage_name, window in itertools.product(balance_frames, balance_linkages, balance_windows)
]

# ignore_index=True gives a clean 0..n-1 index directly; resetting the index
# afterwards would add an 'index' column that is 0 on every row.
df_with_label_balance = pd.concat(balance_rows, ignore_index=True)

df_with_label_balance.to_csv(f'balance_test_results_{method_loop}_{return_mode_loop}_{n_clus_loop}clusters.csv')

In [83]:
# NOTE(review): the output saved below comes from an earlier execution (In [83]) than
# the sweep cell above (In [93]); it lists method 'kshape' and only 'weekly' rows,
# so it does not reflect the 'ahc' sweep over all three frames. Re-run to refresh.
df_with_label_balance

Unnamed: 0,index,return_mode,window_size,method,linkage,df_mode,min_per_cluster,max_per_cluster,min_max_delta
0,0,geometric,3,kshape,not_applicable,weekly,0.0766,0.3919,0.3153
1,0,geometric,7,kshape,not_applicable,weekly,0.1126,0.2658,0.1532
2,0,geometric,10,kshape,not_applicable,weekly,0.1261,0.2252,0.0991
3,0,geometric,14,kshape,not_applicable,weekly,0.1577,0.2252,0.0676
4,0,geometric,30,kshape,not_applicable,weekly,0.0946,0.3649,0.2703
5,0,geometric,3,kshape,not_applicable,weekly,0.1351,0.2838,0.1486
6,0,geometric,7,kshape,not_applicable,weekly,0.1171,0.2793,0.1622
7,0,geometric,10,kshape,not_applicable,weekly,0.0811,0.2568,0.1757
8,0,geometric,14,kshape,not_applicable,weekly,0.1486,0.2342,0.0856
9,0,geometric,30,kshape,not_applicable,weekly,0.0586,0.2838,0.2252
