In [1]:
import pandas as pd
import random 
import itertools
import json
import matplotlib.pyplot as plt
import os

# Limit DataFrame displays to 50 rows; longer frames are shown truncated.
pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, test_for_silhouette_score, run_clustering_model, label_balance

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
# Load the stock and crypto tables from CSV, using each file's 'Date' column as the index.
df_all_stocks = pd.read_csv('stocks_data_filled.csv',index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')

# Combine both tables into a single frame.
joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode = 'stocks_left') # mode: either left-join on cryptos and fill NA for stocks, or left-join on stocks ('stocks_left', used here) and leave out some crypto dates
joined_df.index = pd.to_datetime(joined_df.index) # convert the index to datetimes so that resample() below can be used
joined_df = joined_df.bfill() # fill each missing value with the next available value further down the same column

joined_df_weekly = joined_df.resample('W').last() # weekly aggregation: keep the last row of every week
joined_df_3days = joined_df.resample('3D').last() # 3-day aggregation (roughly twice per week), intended to land near the ~250-point (1 year) series length

In [3]:
# Random portfolio generation
# The candidate universe is every column of df_all_stocks.
tickers = list(df_all_stocks.columns)

# Seed Python's `random` module so the draw can be repeated.
# NOTE(review): this only makes the result reproducible if generate_rand_portfolios
# draws from the `random` module — confirm in functions.py.
random.seed(42)
# Presumably builds 1000 portfolios of 15 tickers each — verify against the helper.
random_portfolios = generate_rand_portfolios(n_reps=1000, n_stocks=15, tickers=tickers)

In [4]:
# Preview only the first few generated portfolios: displaying the complete dict
# floods the cell output and bloats the saved notebook.
dict(itertools.islice(random_portfolios.items(), 3))

{'portfolio_0': {'0386.HK': 0.06282997455226873,
  'PKX': 0.07994374964088999,
  'COST': 0.06286953539988874,
  'SU.PA': 0.09611890748764781,
  'DG.PA': 0.054778980095644185,
  'VERX': 0.08730043455816668,
  'TLK': 0.022908538003450555,
  'KT': 0.07695659706273594,
  '1658.HK': 0.03099991483144627,
  '601857.SS': 0.04387006656719966,
  'HDB': 0.10201439428992247,
  '7203.T': 0.056142971826213146,
  'ADYEN.AS': 0.0881616197354022,
  'CSCO': 0.07484773468893512,
  'CRESY': 0.06025658126018868},
 'portfolio_1': {'HMC': 0.031031934626452002,
  'SMMT': 0.14033546699200483,
  'AIR.PA': 0.0803397405851176,
  '600941.SS': 0.09389780571559775,
  '8001.T': 0.12356369075762515,
  'COST': 0.0018682556605987952,
  '2914.T': 0.04986548047663692,
  'PRCT': 0.042647660570856075,
  '0883.HK': 0.03154414902308569,
  '601857.SS': 0.09468596101462544,
  'AD.AS': 0.014341642228770914,
  'TTD': 0.06454439537881784,
  'INGA.AS': 0.14574610816782002,
  '6861.T': 0.04733631531025828,
  'TTE.PA': 0.038251393491

In [5]:
# Compute a Sharpe-ratio metric from the stock data (annual risk-free rate of 2%),
# then use it as the ranking metric for select_top_five on the random portfolios.
# NOTE(review): presumably this keeps the five best-ranked tickers of each portfolio;
# the exact inputs/outputs of both helpers live in functions.py — confirm there.
sharpe_ratio = sharpe_ratio_calculation(df_all_stocks, rf_rate_annual = 0.02)
top_five_dict = select_top_five(random_portfolios, metric=sharpe_ratio)

In [6]:
#Optimize Traditional Portfolios
# for i in range(999, 1001):

#     print('Doing', i)

#     top_five_sets = dict(itertools.islice(top_five_dict.items(), i, i+1))
#     results = run_min_variance(df_all_stocks, top_five_sets, min_weight_for_top_five=0.05)  #TRY DIFFERENT WEIGHTS FOR top_five
#     with open(f"min_variance_portfolio_jsons/my_dict{i}.json", "w") as f:
#         json.dump(results, f, indent=4)




#Reassemble the results of the optimization - jsons
# min_var_portfolios = dict()
# for i in range(1,1000):
#     with open(f'min_variance_portfolio_jsons/my_dict{i}.json') as f:
#         port = json.load(f)
#         min_var_portfolios.update(port)

# with open(f"full_optimized_min_variance.json", "w") as f:
#     json.dump(min_var_portfolios, f, indent=4)

In [7]:
#RUN THE CLUSTERING WITH DIFFERENT SET UPS TO GET THE SILHOUETTE SCORES FOR COMPARISON

# n_clusters_list = [4,5,6,7]
# linkage_list=['single', 'average', 'complete']
# #window_sizes = [3,7,10,14,21,30,60]
# window_sizes = [21,30,60]


# def run_clustering_evaluation(df, window_sizes, method, moving_average=True, return_mode='arithmetic', df_input_name='DFWASNOTSPECIFIED'):

#     for w_size in window_sizes:

#         #return_mode = 'arithmetic'
#         #n_init = 3
#         #center = True
#         if moving_average:
#             df = df.rolling(window=w_size, center=True).mean()

#             smoothing = 'moving_average'
#         else:
#             smoothing = 'no_smoothing'

#         if len(df) < 150 and w_size > 30:
#             continue
#         silhouette_df = test_for_silhouette_score(df, n_clusters_list, method=method, return_mode=return_mode, n_init=3, linkage_list=linkage_list)

#         silhouette_df['return_mode'] = return_mode
#         silhouette_df['n_init'] = 3
#         silhouette_df['smoothing'] = smoothing
#         silhouette_df['window_size/span'] = w_size

#         silhouette_df.to_csv(f'silhouette_dfs/{method}_{smoothing}_{return_mode}_windowsize-{w_size}_{df_input_name}.csv')




# df_input_name = input('Put in the name of the df mode you are running for: ')
# for return_mode in ['arithmetic', 'geometric']:
#     run_clustering_evaluation(joined_df_3days, window_sizes, method='kmeans', moving_average=True, return_mode=return_mode, df_input_name=df_input_name)

In [None]:
#RUN FOR LABEL BALANCE
# Runs label_balance for every (dataset, linkage, window) combination of one
# clustering configuration and saves the combined table as a single CSV.

method_loop = 'kmeans'
return_mode_loop = 'arithmetic'
n_clus_loop = 7

# NOTE(review): when method_loop is not a hierarchical method, `linkage` is presumably
# ignored by label_balance, in which case the linkage loop repeats the same
# configuration three times and the saved table carries repeated rows — confirm,
# and restrict the linkage loop to hierarchical methods if so.

# Collect the per-run frames in a list and concatenate once at the end; concatenating
# inside the loop re-copies the whole accumulated table on every iteration.
balance_frames = []
for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
    for linkage in ['single', 'complete', 'average']:
        for w in [3,7,10,14,30]:
            output = label_balance(df_dict, w, method_loop, return_mode_loop, n_clus=n_clus_loop, linkage=linkage)
            balance_frames.append(output)

# reset_index() keeps the previous per-run index as an 'index' column, exactly as before.
df_with_label_balance = pd.concat(balance_frames).reset_index()

# Make sure the target folder exists so the (potentially long) sweep is not lost
# to a missing-directory error at the very last step.
os.makedirs('balance_datasets', exist_ok=True)
df_with_label_balance.to_csv(f'balance_datasets/balance_test_results_{method_loop}_{return_mode_loop}_{n_clus_loop}clusters.csv')

In [None]:
##############TEST###################
# from functions import test_clustering_metrics
# n_clusters_list = [4,5,6,7]
# df_with_label_balance = pd.DataFrame()
# for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
#     output = test_clustering_metrics(df_dict, n_clusters_list, method='kshape', linkage_list=None, 
#                            return_mode='arithmetic', window=7, n_init=3)
#     df_with_label_balance = pd.concat([df_with_label_balance, output])

# df_with_label_balance

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,min_per_cluster,max_per_cluster,min_max_delta
0,4,0.106343,kshape,not_applicable,arithmetic,7,weekly,0.1667,0.3018,0.1351
1,5,0.117279,kshape,not_applicable,arithmetic,7,weekly,0.1216,0.2477,0.1261
2,6,0.102482,kshape,not_applicable,arithmetic,7,weekly,0.0721,0.2658,0.1937
3,7,0.137601,kshape,not_applicable,arithmetic,7,weekly,0.0405,0.2387,0.1982
0,4,0.118409,kshape,not_applicable,arithmetic,7,3day,0.1757,0.3649,0.1892
1,5,0.131066,kshape,not_applicable,arithmetic,7,3day,0.1351,0.3063,0.1712
2,6,0.1502,kshape,not_applicable,arithmetic,7,3day,0.1036,0.3243,0.2207
3,7,0.15802,kshape,not_applicable,arithmetic,7,3day,0.0721,0.2928,0.2207
0,4,0.143082,kshape,not_applicable,arithmetic,7,full,0.1351,0.3964,0.2613
1,5,0.167169,kshape,not_applicable,arithmetic,7,full,0.1126,0.3829,0.2703


In [None]:
def rejoin_dfs(folder_path, extract_cols_from_filename=False):
    '''
    Concatenate every CSV file located directly in ``folder_path`` into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Folder to scan. Sub-directories and files without a ``.csv`` extension
        are skipped.
    extract_cols_from_filename : bool, default False
        Legacy option for the silhouette score tables, where part of the
        information was stored in the filename. When True:
          * a ``filename`` column records the source file of every row;
          * ``df_mode`` is set to the text after the first underscore that follows
            the first hyphen in the filename, with ``.csv`` removed
            (``..._windowsize-10_3day.csv`` -> ``3day``);
          * ``return_mode`` is set to the fourth underscore-separated token of the
            filename (``ahc_moving_average_arithmetic_...`` -> ``arithmetic``);
          * missing ``linkage`` values are replaced with ``'not_applicable'``.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with the leftover index columns ``index`` and
        ``Unnamed: 0`` removed when present. Each file keeps its own row index,
        so index values repeat across files. An empty DataFrame is returned when
        the folder contains no CSV files.
    '''
    frames = []
    # Sorted so the row order does not depend on the OS-specific listing order.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints) and stray non-CSV files
        # instead of failing inside read_csv.
        if not os.path.isfile(file_path) or not filename.lower().endswith('.csv'):
            continue
        df = pd.read_csv(file_path, index_col=False)
        if extract_cols_from_filename:
            df['filename'] = filename
        frames.append(df)

    if not frames:
        # Nothing to combine; pd.concat would raise on an empty list.
        return pd.DataFrame()

    # One concat at the end instead of re-copying the accumulated frame per file.
    dfs = pd.concat(frames)
    # Remove index columns written by earlier to_csv/reset_index calls, if any.
    dfs = dfs.drop(columns=['index', 'Unnamed: 0'], errors='ignore')

    if extract_cols_from_filename:
        dfs['df_mode'] = dfs['filename'].apply(lambda x: x.split('-')[1].split('_')[1].replace('.csv', ''))
        dfs['return_mode'] = dfs['filename'].apply(lambda x: x.split('_')[3])
        dfs['linkage'] = dfs['linkage'].fillna('not_applicable')

    return dfs

In [None]:
# Folders holding the per-run result CSVs to be combined.
folder_path_sil = 'silhouette_dfs/'
folder_path_balance = 'balance_datasets/'

# Silhouette tables use the legacy option: part of their metadata (df_mode, return_mode)
# is read from the filename. Balance tables are concatenated as they are.
silhouette_results = rejoin_dfs(folder_path_sil, extract_cols_from_filename=True)
balance_results = rejoin_dfs(folder_path_balance)


In [None]:
# Inspect the combined label-balance table.
balance_results

Unnamed: 0,return_mode,window_size,method,linkage,df_mode,min_per_cluster,max_per_cluster,min_max_delta,filename,clusters
0,arithmetic,3,ahc,single,weekly,0.0045,0.9685,0.9640,balance_test_results_ahc_arithmetic_5clusters.csv,5
1,arithmetic,7,ahc,single,weekly,0.0045,0.9640,0.9595,balance_test_results_ahc_arithmetic_5clusters.csv,5
2,arithmetic,10,ahc,single,weekly,0.0045,0.9550,0.9505,balance_test_results_ahc_arithmetic_5clusters.csv,5
3,arithmetic,14,ahc,single,weekly,0.0045,0.9550,0.9505,balance_test_results_ahc_arithmetic_5clusters.csv,5
4,arithmetic,30,ahc,single,weekly,0.0045,0.9550,0.9505,balance_test_results_ahc_arithmetic_5clusters.csv,5
...,...,...,...,...,...,...,...,...,...,...
40,geometric,3,kshape,not_applicable,full,0.0586,0.2387,0.1802,,7
41,geometric,7,kshape,not_applicable,full,0.0631,0.3153,0.2523,,7
42,geometric,10,kshape,not_applicable,full,0.0721,0.3288,0.2568,,7
43,geometric,14,kshape,not_applicable,full,0.0541,0.2477,0.1937,,7


In [None]:
# Inspect the combined silhouette-score table.
silhouette_results

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,n_init,smoothing,window_size/span,filename,inertia,df_mode
0,4,0.450849,ahc,single,arithmetic,3,moving_average,10,ahc_moving_average_arithmetic_windowsize-10_3d...,,3day
1,5,0.366116,ahc,single,arithmetic,3,moving_average,10,ahc_moving_average_arithmetic_windowsize-10_3d...,,3day
2,6,0.217847,ahc,single,arithmetic,3,moving_average,10,ahc_moving_average_arithmetic_windowsize-10_3d...,,3day
3,7,0.186290,ahc,single,arithmetic,3,moving_average,10,ahc_moving_average_arithmetic_windowsize-10_3d...,,3day
4,4,0.436554,ahc,average,arithmetic,3,moving_average,10,ahc_moving_average_arithmetic_windowsize-10_3d...,,3day
...,...,...,...,...,...,...,...,...,...,...,...
3,7,0.175246,kshape,not_applicable,geometric,3,moving_average,7,kshape_moving_average_geometric_windowsize-7_f...,0.150242,full
0,4,0.122094,kshape,not_applicable,geometric,3,moving_average,7,kshape_moving_average_geometric_windowsize-7_w...,0.114208,weekly
1,5,0.109347,kshape,not_applicable,geometric,3,moving_average,7,kshape_moving_average_geometric_windowsize-7_w...,0.103348,weekly
2,6,0.106075,kshape,not_applicable,geometric,3,moving_average,7,kshape_moving_average_geometric_windowsize-7_w...,0.101805,weekly


In [None]:
# Attach the label-balance statistics to every silhouette row. The left join keeps
# all silhouette rows; rows without a matching balance record get NaN in the balance
# columns. The window column is named 'window_size/span' on the silhouette side and
# 'window_size' on the balance side, hence the separate key lists.
df_all_results = silhouette_results.merge(
    balance_results,
    how='left',
    left_on=['clusters', 'linkage', 'window_size/span', 'df_mode', 'method', 'return_mode'],
    right_on=['clusters', 'linkage', 'window_size', 'df_mode', 'method', 'return_mode'],
)

In [None]:
# Drop the bookkeeping filename columns. After the merge they are named
# 'filename_x' / 'filename_y' only when both inputs carried a 'filename' column;
# if the balance table has none, a single 'filename' column remains instead.
# errors='ignore' handles both layouts without raising a KeyError.
x = df_all_results.drop(columns=['filename', 'filename_x', 'filename_y'], errors='ignore')
# Silhouette rows for which no matching label-balance record was found.
x[x['min_per_cluster'].isna()]

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,n_init,smoothing,window_size/span,inertia,df_mode,window_size,min_per_cluster,max_per_cluster,min_max_delta
0,4,0.450849,ahc,single,arithmetic,3,moving_average,10,,3day,,,,
4,4,0.436554,ahc,average,arithmetic,3,moving_average,10,,3day,,,,
8,4,0.158583,ahc,complete,arithmetic,3,moving_average,10,,3day,,,,
12,4,0.365574,ahc,single,arithmetic,3,moving_average,10,,full,,,,
16,4,0.365574,ahc,average,arithmetic,3,moving_average,10,,full,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098,6,0.087180,kshape,not_applicable,geometric,3,moving_average,60,0.051515,full,,,,
1099,7,0.082481,kshape,not_applicable,geometric,3,moving_average,60,0.050901,full,,,,
1100,4,0.118613,kshape,not_applicable,geometric,3,moving_average,7,0.165385,3day,,,,
1110,4,0.129811,kshape,not_applicable,geometric,3,moving_average,7,0.196862,full,,,,


In [None]:
# Save the merged silhouette + balance table; index=False keeps the row index out of the file.
x.to_csv('full_table_silhouette_balance.csv', index=False)