In [2]:
import pandas as pd
import random 
import os

pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, run_clustering_model

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [3]:
df_all_stocks = pd.read_csv('stocks_data_filtered_volatility.csv',index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')

joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode = 'stocks_left') #mode - either do left with crypto and fill NA for stocks or do left on stocks and leave out some dates for cryptos
joined_df.index = pd.to_datetime(joined_df.index)

joined_df_weekly = joined_df.resample('W').last() #try aggregating on a weekly level
joined_df_3days = joined_df.resample('3D').last()# aggregating on a twice per week basis to arrive at the sweet spot of that 250 (1 year) timeseries length

In [4]:
#Random Portfolios generation
tickers = list(df_all_stocks.columns)

random.seed(42)
random_portfolios = generate_rand_portfolios(n_reps=1000, n_stocks=15, tickers=tickers)

#Select top five sharpe ratio portfolios from a portfolio
sharpe_ratio = sharpe_ratio_calculation(df_all_stocks, rf_rate_annual = 0.02)
top_five_dict = select_top_five(random_portfolios, metric=sharpe_ratio)

In [11]:
##############TEST###################
from functions import test_clustering_metrics
n_clusters_list = [4,5,6,7,9,12]
df_with_label_balance = pd.DataFrame()
method = 'ahc'
for window in [3,7,10,14,30,60]:
    for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
        output = test_clustering_metrics(df_dict, n_clusters_list, method=method, linkage_list=['average', 'complete', 'single'], 
                               return_mode='geometric', window=window, n_init=3)
        df_with_label_balance = pd.concat([df_with_label_balance, output])

df_with_label_balance.to_csv(f'new_balance_silhouette/{method}.csv', index=False)

In [6]:
kmeans_results = pd.read_csv('new_balance_silhouette/kmeans.csv')
kshape_results = pd.read_csv('new_balance_silhouette/kshape.csv')
ahc_results = pd.read_csv('new_balance_silhouette/ahc.csv')

results = pd.concat([kmeans_results, kshape_results, ahc_results])#.drop(columns=['Unnamed: 0'])

In [12]:
from sklearn.preprocessing import MinMaxScaler
results[['silhouette_norm']] = MinMaxScaler().fit_transform(results[['silhouette_score']])
results['delta_norm'] = 1 - MinMaxScaler().fit_transform(results[['min_max_delta']])
results['total_score'] = (results['silhouette_norm'] + results['delta_norm']) / 2


results_filtered = results[results['silhouette_norm'] > 0.5]

best_configs = results_filtered.loc[results_filtered.groupby('method')['total_score'].idxmax()].sort_values(by='total_score', ascending=False)

In [13]:
results.sort_values(by='total_score', ascending=False)

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,min_per_cluster,max_per_cluster,min_max_delta,silhouette_norm,delta_norm,total_score
15,7,0.184157,kshape,not_applicable,geometric,3,full,0.0892,0.1831,0.0939,0.548840,0.954497,0.751668
13,5,0.175363,kshape,not_applicable,geometric,3,full,0.1455,0.2441,0.0986,0.536345,0.949441,0.742893
54,4,0.110924,kmeans,not_applicable,geometric,14,weekly,0.2207,0.2817,0.0610,0.444783,0.989888,0.717336
72,4,0.189226,kmeans,not_applicable,geometric,30,weekly,0.1408,0.3099,0.1690,0.556043,0.873709,0.714876
34,9,0.176470,kshape,not_applicable,geometric,7,full,0.0376,0.1925,0.1549,0.537918,0.888877,0.713398
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,12,-0.051209,ahc,single,geometric,60,3day,0.0047,0.9437,0.9390,0.214405,0.045396,0.129900
323,12,-0.061074,ahc,single,geometric,60,full,0.0047,0.9343,0.9296,0.200388,0.055508,0.127948
125,12,-0.070507,ahc,single,geometric,10,weekly,0.0047,0.9296,0.9249,0.186984,0.060564,0.123774
287,12,-0.067134,ahc,single,geometric,60,weekly,0.0047,0.9343,0.9296,0.191776,0.055508,0.123642


In [14]:
best_configs

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,min_per_cluster,max_per_cluster,min_max_delta,silhouette_norm,delta_norm,total_score
15,7,0.184157,kshape,not_applicable,geometric,3,full,0.0892,0.1831,0.0939,0.54884,0.954497,0.751668
72,4,0.189226,kmeans,not_applicable,geometric,30,weekly,0.1408,0.3099,0.169,0.556043,0.873709,0.714876
299,12,0.155118,ahc,complete,geometric,60,3day,0.0047,0.2535,0.2488,0.507578,0.787866,0.647722
72,4,0.394223,ahc,average,geometric,7,3day,0.0047,0.9718,0.9671,0.847327,0.015168,0.431247


In [15]:
results.groupby('method')['total_score'].max()

method
ahc       0.653741
kmeans    0.717336
kshape    0.751668
Name: total_score, dtype: float64

In [16]:
results.groupby('method')['silhouette_score'].max()

method
ahc       0.501670
kmeans    0.226091
kshape    0.202010
Name: silhouette_score, dtype: float64

In [19]:
###CRYPTOS DISTRIBUTION IN CLUSTERS:
# from collections import Counter
# _, tickers_with_labels, _, _ = run_clustering_model(joined_df, n_clus=3, model_name='kmeans', linkage='single', return_mode='arithmetic', n_init=3)

# cryptos_list = list(cryptos_df.columns) 
# crypto_clusters = {ticker: tickers_with_labels[ticker] for ticker in cryptos_list if ticker in tickers_with_labels}

# # Count how many cryptos are in each cluster
# distribution = Counter(crypto_clusters.values())