In [1]:
import pandas as pd
import random 
import itertools
import json
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, test_for_silhouette_score, run_clustering_model, label_balance

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [None]:
# Load the raw price tables; both use their 'Date' column as the index.
df_all_stocks = pd.read_csv('stocks_data_filled.csv', index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')

# mode: either left-join on cryptos (filling NA for stocks) or left-join on
# stocks (leaving out some crypto dates). Stocks are the left side here.
joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode='stocks_left')
joined_df.index = pd.to_datetime(joined_df.index)  # resample() below requires a datetime-like index

# Down-sampled variants that keep the last observation of each bin:
#   'W'  -> weekly aggregation
#   '3D' -> one point every three days; per the original author's note this
#           targets a series length of roughly 250 points.
joined_df_weekly, joined_df_3days = (
    joined_df.resample(rule).last() for rule in ('W', '3D')
)

In [None]:
# Random portfolio generation: candidate tickers are the stock columns.
tickers = df_all_stocks.columns.tolist()

# Seed the stdlib RNG so the draw is repeatable.
# NOTE(review): presumably generate_rand_portfolios samples via `random` -- confirm in functions.py.
random.seed(42)
random_portfolios = generate_rand_portfolios(
    n_reps=1000,
    n_stocks=15,
    tickers=tickers,
)

# Compute the Sharpe-ratio metric (2% annual risk-free rate) and pass it to
# select_top_five as the selection metric for the random portfolios.
sharpe_ratio = sharpe_ratio_calculation(df_all_stocks, rf_rate_annual=0.02)
top_five_dict = select_top_five(random_portfolios, metric=sharpe_ratio)

In [6]:
#Optimize Traditional Portfolios
# for i in range(999, 1001):

#     print('Doing', i)

#     top_five_sets = dict(itertools.islice(top_five_dict.items(), i, i+1))
#     results = run_min_variance(df_all_stocks, top_five_sets, min_weight_for_top_five=0.05)  #TRY DIFFERENT WEIGHTS FOR top_five
#     with open(f"min_variance_portfolio_jsons/my_dict{i}.json", "w") as f:
#         json.dump(results, f, indent=4)




#Reassemble the results of the optimization - jsons
# min_var_portfolios = dict()
# for i in range(1,1000):
#     with open(f'min_variance_portfolio_jsons/my_dict{i}.json') as f:
#         port = json.load(f)
#         min_var_portfolios.update(port)

# with open(f"full_optimized_min_variance.json", "w") as f:
#     json.dump(min_var_portfolios, f, indent=4)

In [7]:
#RUN THE CLUSTERING WITH DIFFERENT SET UPS TO GET THE SILHOUETTE SCORES FOR COMPARISON

# n_clusters_list = [4,5,6,7]
# linkage_list=['single', 'average', 'complete']
# #window_sizes = [3,7,10,14,21,30,60]
# window_sizes = [21,30,60]


# def run_clustering_evaluation(df, window_sizes, method, moving_average=True, return_mode='arithmetic', df_input_name='DFWASNOTSPECIFIED'):

#     for w_size in window_sizes:

#         #return_mode = 'arithmetic'
#         #n_init = 3
#         #center = True
#         if moving_average:
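#             # NOTE(review): `df` is reassigned inside the window loop, so every later window size smooths the already-smoothed frame from the previous iteration; assign to a separate variable if each window should start from the raw data.
#             df = df.rolling(window=w_size, center=True).mean()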

#             smoothing = 'moving_average'
#         else:
#             smoothing = 'no_smoothing'

#         if len(df) < 150 and w_size > 30:
#             continue
#         silhouette_df = test_for_silhouette_score(df, n_clusters_list, method=method, return_mode=return_mode, n_init=3, linkage_list=linkage_list)

#         silhouette_df['return_mode'] = return_mode
#         silhouette_df['n_init'] = 3
#         silhouette_df['smoothing'] = smoothing
#         silhouette_df['window_size/span'] = w_size

#         silhouette_df.to_csv(f'silhouette_dfs/{method}_{smoothing}_{return_mode}_windowsize-{w_size}_{df_input_name}.csv')




# df_input_name = input('Put in the name of the df mode you are running for: ')
# for return_mode in ['arithmetic', 'geometric']:
#     run_clustering_evaluation(joined_df_3days, window_sizes, method='kmeans', moving_average=True, return_mode=return_mode, df_input_name=df_input_name)

In [None]:
#RUN FOR LABEL BALANCE

# method_loop = 'ahc'
# return_mode_loop = 'geometric'
# n_clus_loop = 7

# df_with_label_balance = pd.DataFrame()
# for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
#     for linkage in ['single', 'complete', 'average']:
#         for w in [3,7,10,14,21,30]:
#             output = label_balance(df_dict, w, method_loop, return_mode_loop, n_clus=n_clus_loop, linkage=linkage)
#             df_with_label_balance = pd.concat([df_with_label_balance, output])

# df_with_label_balance.reset_index(inplace=True)

#df_with_label_balance.to_csv(f'balance_datasets/balance_test_results_{method_loop}_{return_mode_loop}_{n_clus_loop}clusters.csv')

In [129]:
##############TEST###################
# from functions import test_clustering_metrics
# n_clusters_list = [4,5,6,7]
# df_with_label_balance = pd.DataFrame()
# for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
#     output = test_clustering_metrics(df_dict, n_clusters_list, method='kshape', linkage_list=None, 
#                            return_mode='arithmetic', window=7, n_init=3)
#     df_with_label_balance = pd.concat([df_with_label_balance, output])

# df_with_label_balance

In [None]:
def rejoin_dfs(folder_path, extract_cols_from_filename=False):
    '''
    Concatenate every CSV file found directly in ``folder_path`` into one
    de-duplicated DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory holding the per-run result CSVs. Sub-directories and files
        without a ``.csv`` extension are skipped.
    extract_cols_from_filename : bool, default False
        Legacy handling for the silhouette score tables, where part of the
        run information was stored only in the filename. When True:
        a ``filename`` column is added, ``df_mode`` and ``return_mode`` are
        parsed from it, and missing ``linkage`` values become
        ``'not_applicable'``.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with the helper columns ``index`` and
        ``Unnamed: 0`` removed (when present) and exact duplicate rows
        dropped. The per-file row index is kept as read. An empty DataFrame
        is returned when the folder contains no CSV files.
    '''
    # Sorted so that row/column order does not depend on os.listdir's
    # arbitrary ordering; non-CSV entries (e.g. checkpoint folders) would
    # otherwise make read_csv fail.
    csv_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    # Collect the frames and concatenate once instead of growing a frame
    # inside the loop.
    frames = []
    for filename in csv_names:
        df = pd.read_csv(os.path.join(folder_path, filename), index_col=False)
        if extract_cols_from_filename:
            df['filename'] = filename
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    dfs = pd.concat(frames)

    # Leftover index columns from earlier to_csv / reset_index calls; they
    # are optional, so missing ones are ignored explicitly.
    dfs = dfs.drop(columns=['index', 'Unnamed: 0'], errors='ignore')

    if extract_cols_from_filename:
        # Filenames look like
        # '<method>_<smoothing>_<return_mode>_windowsize-<w>_<df_mode>.csv'
        # where <smoothing> itself contains one underscore.
        dfs['df_mode'] = dfs['filename'].apply(
            lambda x: x.split('-')[1].split('_')[1].replace('.csv', '')
        )
        dfs['return_mode'] = dfs['filename'].apply(lambda x: x.split('_')[3])
        if 'linkage' in dfs.columns:
            dfs['linkage'] = dfs['linkage'].fillna('not_applicable')
        else:
            # No file carried a linkage column at all.
            dfs['linkage'] = 'not_applicable'

    return dfs.drop_duplicates()

In [155]:
# Folders holding the CSVs written by the evaluation runs.
folder_path_sil = 'silhouette_dfs/'
folder_path_balance = 'balance_datasets/'

# The silhouette tables keep part of their configuration in the filename, so
# those columns are extracted on load; the balance tables are read as-is.
silhouette_results = rejoin_dfs(folder_path_sil, extract_cols_from_filename=True)
balance_results = rejoin_dfs(folder_path_balance)

# Keep only the configurations with a silhouette score of at least 0.15.
passes_threshold = silhouette_results['silhouette_score'].ge(0.15)
silhouette_results_over_15percent = silhouette_results[passes_threshold]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Attach the label-balance statistics to every silhouette result that passed
# the threshold. A left join keeps configurations without balance data; their
# balance columns are NaN.
df_all_results = pd.merge(
    silhouette_results_over_15percent,
    balance_results,
    how='left',
    left_on=['clusters', 'linkage', 'window_size/span', 'df_mode', 'method', 'return_mode'],
    right_on=['clusters', 'linkage', 'window_size', 'df_mode', 'method', 'return_mode'],
)

# Min-max scale both criteria. A higher silhouette score is better, while a
# smaller min/max cluster-size delta is better, hence the `1 - ...`.
# fit_transform returns an (n, 1) array; ravel() makes the column 1-D.
df_all_results['silhouette_norm'] = (
    MinMaxScaler().fit_transform(df_all_results[['silhouette_score']]).ravel()
)
df_all_results['delta_norm'] = (
    1 - MinMaxScaler().fit_transform(df_all_results[['min_max_delta']]).ravel()
)
df_all_results['total_score'] = (
    df_all_results['silhouette_norm'] + df_all_results['delta_norm']
) / 2

# Best configuration per clustering method by the combined score.
# The column is 'total_score' (there is no 'score' column). Rows without
# balance data have a NaN total_score and are excluded so idxmax always
# returns a valid row label.
scored_results = df_all_results.dropna(subset=['total_score'])
best_configs = scored_results.loc[
    scored_results.groupby('method')['total_score'].idxmax()
]

In [157]:
# Drop the bookkeeping filename column(s). After the merge they appear as
# 'filename_x'/'filename_y' only when both inputs carried a 'filename'
# column, and as plain 'filename' otherwise, so missing names are ignored
# instead of raising KeyError.
x = df_all_results.drop(
    columns=['filename', 'filename_x', 'filename_y'], errors='ignore'
)

# Configurations that have a silhouette score but no label-balance statistics yet.
x[x['min_per_cluster'].isna()]

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,n_init,smoothing,window_size/span,inertia,df_mode,window_size,min_per_cluster,max_per_cluster,min_max_delta
42,4,0.446806,ahc,single,arithmetic,3,moving_average,21,,3day,,,,
43,5,0.430937,ahc,single,arithmetic,3,moving_average,21,,3day,,,,
44,6,0.240101,ahc,single,arithmetic,3,moving_average,21,,3day,,,,
45,4,0.233561,ahc,average,arithmetic,3,moving_average,21,,3day,,,,
46,4,0.175851,ahc,complete,arithmetic,3,moving_average,21,,3day,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,4,0.378125,kmeans,not_applicable,geometric,3,moving_average,60,6.845577,3day,,,,
361,5,0.387660,kmeans,not_applicable,geometric,3,moving_average,60,5.339188,3day,,,,
362,6,0.292936,kmeans,not_applicable,geometric,3,moving_average,60,5.608180,3day,,,,
363,7,0.344045,kmeans,not_applicable,geometric,3,moving_average,60,4.067230,3day,,,,


In [None]:
# Back-fill label-balance statistics for the configurations that do not have
# them yet (the rows shown by the previous cell). Iterating over all of `x`
# would recompute configurations that already have balance data.
rest_path = 'balance_datasets/rest.csv'

# df_mode label -> single-entry dict, the form passed as label_balance's df_dict.
frames_by_mode = {
    'weekly': {'weekly': joined_df_weekly},
    '3day': {'3day': joined_df_3days},
    'full': {'full': joined_df},
}

missing_balance = x[x['min_per_cluster'].isna()]

# Collect the outputs and concatenate once instead of growing a frame in the loop.
outputs = [
    label_balance(
        df_dict=frames_by_mode[r['df_mode']],
        window=r['window_size/span'],
        method=r['method'],
        return_mode=r['return_mode'],
        n_clus=r['clusters'],
        linkage=r['linkage'],
    )
    for _, r in missing_balance.iterrows()
]

if outputs:
    df_rest = pd.concat(outputs)
    # Keep results from earlier back-fill runs: once they are loaded into
    # balance_results those configurations are no longer "missing", so a plain
    # overwrite would lose them.
    if os.path.exists(rest_path):
        df_rest = pd.concat([pd.read_csv(rest_path, index_col=0), df_rest])
    df_rest.to_csv(rest_path)
else:
    # Nothing to compute; leave any existing rest.csv untouched.
    df_rest = pd.DataFrame()

In [None]:
# Which configurations have the lowest silhouette scores?
#silhouette_results[silhouette_results['silhouette_score'] <= 0.1]#.sort_values(by='silhouette_score')

# Average silhouette score for each clustering method.
silhouette_results.groupby('method')['silhouette_score'].agg('mean')

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,n_init,smoothing,window_size/span,filename,inertia,df_mode
3,7,0.093055,ahc,single,arithmetic,3,moving_average,10,ahc_moving_average_arithmetic_windowsize-10_fu...,,full
8,4,0.080447,ahc,complete,arithmetic,3,moving_average,14,ahc_moving_average_arithmetic_windowsize-14_fu...,,full
9,5,0.079474,ahc,complete,arithmetic,3,moving_average,14,ahc_moving_average_arithmetic_windowsize-14_fu...,,full
10,6,0.077971,ahc,complete,arithmetic,3,moving_average,14,ahc_moving_average_arithmetic_windowsize-14_fu...,,full
11,7,0.082014,ahc,complete,arithmetic,3,moving_average,14,ahc_moving_average_arithmetic_windowsize-14_fu...,,full
...,...,...,...,...,...,...,...,...,...,...,...
3,7,-0.092348,kshape,not_applicable,geometric,3,moving_average,60,kshape_moving_average_geometric_windowsize-60_...,0.018487,3day
0,4,0.030101,kshape,not_applicable,geometric,3,moving_average,60,kshape_moving_average_geometric_windowsize-60_...,0.073983,full
1,5,0.040678,kshape,not_applicable,geometric,3,moving_average,60,kshape_moving_average_geometric_windowsize-60_...,0.063123,full
2,6,0.087180,kshape,not_applicable,geometric,3,moving_average,60,kshape_moving_average_geometric_windowsize-60_...,0.051515,full


In [None]:
###CRYPTOS DISTRIBUTION IN CLUSTERS:
# from collections import Counter
# _, tickers_with_labels, _, _ = run_clustering_model(joined_df, n_clus=3, model_name='kmeans', linkage='single', return_mode='arithmetic', n_init=3)

# cryptos_list = list(cryptos_df.columns) 
# crypto_clusters = {ticker: tickers_with_labels[ticker] for ticker in cryptos_list if ticker in tickers_with_labels}

# # Count how many cryptos are in each cluster
# distribution = Counter(crypto_clusters.values())