In [4]:
import pandas as pd
import random 
import itertools
import json
import matplotlib.pyplot as plt
import os

# Cap pandas' DataFrame display at 50 rows so cell outputs stay readable.
pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, distance_matrix_calc, run_min_variance, run_clustering_model, test_for_silhouette_score

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [5]:
# Load the raw stock and crypto tables, using each file's 'Date' column as the index.
df_all_stocks = pd.read_csv('stocks_data.csv',index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')


# Combine both tables. Per the original author's note, `mode` selects the join side:
# either a left join on cryptos (filling NA for stocks) or a left join on stocks
# (dropping some crypto dates). 'stocks_left' is the latter.
joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode = 'stocks_left')
# The index is read from CSV as plain labels; convert it to datetimes so that
# the resample() calls below work.
joined_df.index = pd.to_datetime(joined_df.index)


# Weekly aggregation: keep the last observation of each week.
joined_df_weekly = joined_df.resample('W').last()
# 3-day aggregation (roughly twice per week): keep the last observation of each bin.
# Author's intent: arrive at a series length of about 250 points (1 year).
joined_df_3days = joined_df.resample('3D').last()

In [6]:
# Generate random portfolios from the stock universe.
# The candidate tickers are the column labels of the stocks table.
tickers = df_all_stocks.columns.tolist()

# Seed Python's `random` module before generating, so repeated runs start from the same state.
random.seed(42)
random_portfolios = generate_rand_portfolios(
    tickers=tickers,
    n_stocks=15,
    n_reps=1000,
)

In [7]:
# Compute the Sharpe-ratio metric from the stock data (annual risk-free rate of 2%),
# then use it to pick the top five entries for the random portfolios.
# NOTE(review): presumably select_top_five returns a dict keyed per portfolio
# (it is later sliced with .items()) -- confirm against functions.py.
sharpe_ratio = sharpe_ratio_calculation(df_all_stocks, rf_rate_annual = 0.02)
top_five_dict = select_top_five(random_portfolios, metric=sharpe_ratio)

In [None]:
#Optimize Traditional Portfolios
# DISABLED CELL: the code below is commented out and does not run.
# When enabled, each iteration takes one entry of top_five_dict (via islice),
# passes it to run_min_variance and dumps the result to
# min_variance_portfolio_jsons/my_dict{i}.json.
# NOTE(review): range(999, 1001) only covers i = 999 and i = 1000; presumably the
# remaining indices were processed in earlier runs with other ranges -- confirm.
# for i in range(999, 1001):

#     print('Doing', i)

#     top_five_sets = dict(itertools.islice(top_five_dict.items(), i, i+1))
#     results = run_min_variance(df_all_stocks, top_five_sets, min_weight_for_top_five=0.05)  #TRY DIFFERENT WEIGHTS FOR top_five
#     with open(f"min_variance_portfolio_jsons/my_dict{i}.json", "w") as f:
#         json.dump(results, f, indent=4)

In [None]:
#Reassemble the results of the optimization - jsons
# DISABLED CELL: the code below is commented out and does not run.
# When enabled, it loads min_variance_portfolio_jsons/my_dict{i}.json for each i,
# merges them into one dict with dict.update (later keys overwrite earlier ones),
# and writes the merged dict to full_optimized_min_variance.json.
# NOTE(review): range(1,1000) reads i = 1..999 only, so my_dict0.json and
# my_dict1000.json (if they exist) are skipped -- confirm this is intended.
# min_var_portfolios = dict()
# for i in range(1,1000):
#     with open(f'min_variance_portfolio_jsons/my_dict{i}.json') as f:
#         port = json.load(f)
#         min_var_portfolios.update(port)

# with open(f"full_optimized_min_variance.json", "w") as f:
#     json.dump(min_var_portfolios, f, indent=4)

In [None]:
# Grid of settings for the clustering evaluation below.
# Cluster counts to try; passed to test_for_silhouette_score.
n_clusters_list = [4,5,6]
# Linkage options; passed to test_for_silhouette_score as linkage_list.
linkage_list=['single', 'average', 'complete']
# Rolling-window lengths (in rows of the input frame) used for moving-average smoothing.
window_sizes = [3,7,10,14,21,30,60]


def run_clustering_evaluation(df, window_sizes, method, moving_average=True, df_input_name=None):
    """Run the silhouette evaluation once per window size and save each result as CSV.

    For every entry of ``window_sizes`` the (optionally smoothed) input frame is
    passed to ``test_for_silhouette_score`` together with the notebook-level
    ``n_clusters_list`` and ``linkage_list``. The returned frame is annotated with
    the run settings and written to
    ``silhouette_dfs/{method}_{smoothing}_windowsize-{w_size}_{df_input_name}.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to evaluate. It is never modified or rebound; smoothing is applied
        to a fresh copy for each window size.
    window_sizes : iterable of int
        Rolling-window lengths. One CSV is written per entry.
    method : str
        Forwarded to ``test_for_silhouette_score`` and embedded in the filename.
    moving_average : bool, default True
        If True, evaluate a centered rolling mean of ``df``; otherwise evaluate
        ``df`` as-is (the window size then only affects the labels/filename).
    df_input_name : str, optional
        Label embedded in the output filenames. When omitted, the user is
        prompted for it interactively, as before.

    Returns
    -------
    None
    """
    if df_input_name is None:
        df_input_name = input('Put in the name of the df mode you are running for: ')

    # Settings forwarded to the scorer and recorded in the output columns.
    return_mode = 'arithmetic'
    n_init = 3

    # Make sure the target folder exists so to_csv does not fail on a fresh checkout.
    output_dir = 'silhouette_dfs'
    os.makedirs(output_dir, exist_ok=True)

    for w_size in window_sizes:
        if moving_average:
            # Smooth the frame supplied by the caller. The previous code smoothed the
            # global `joined_df`, silently ignoring the `df` argument. The result is
            # kept in its own variable so smoothing does not compound across windows.
            df_eval = df.rolling(window=w_size, center=True).mean()
            smoothing = 'moving_average'
        else:
            df_eval = df
            smoothing = 'no_smoothing'

        silhouette_df = test_for_silhouette_score(
            df_eval,
            n_clusters_list,
            method=method,
            return_mode=return_mode,
            n_init=n_init,
            linkage_list=linkage_list,
        )

        # Record the run configuration next to the scores.
        silhouette_df['return_mode'] = return_mode
        silhouette_df['n_init'] = n_init
        silhouette_df['smoothing'] = smoothing
        silhouette_df['window_size/span'] = w_size

        silhouette_df.to_csv(f'{output_dir}/{method}_{smoothing}_windowsize-{w_size}_{df_input_name}.csv')






# Evaluate k-means with moving-average smoothing for every entry of window_sizes.
# The call prompts (via input) for a label that is embedded in the output CSV filenames.
run_clustering_evaluation(joined_df_weekly, window_sizes, method='kmeans', moving_average=True)

In [11]:
# Gather the silhouette-score CSVs from `silhouette_dfs/` into two frames:
#   * dfs_456_sil - files whose name contains '456'
#   * dfs_all_sil - every other file, tagged with its source filename
folder_path = 'silhouette_dfs/'

frames_456 = []
frames_all = []

# os.listdir returns entries in arbitrary order; sorting keeps the row and column
# order of the combined frames reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    # Only CSV files are silhouette results; skip any other directory entries
    # (hidden/system files, sub-folders) that pd.read_csv could not parse.
    if not filename.lower().endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, filename)
    sil_df = pd.read_csv(file_path)

    if '456' in filename:
        frames_456.append(sil_df)
    else:
        sil_df['filename'] = filename
        frames_all.append(sil_df)

# Concatenate once per target instead of growing a DataFrame inside the loop.
# pd.concat raises on an empty list, so fall back to an empty frame.
dfs_456_sil = pd.concat(frames_456, axis=0) if frames_456 else pd.DataFrame()
dfs_all_sil = pd.concat(frames_all, axis=0) if frames_all else pd.DataFrame()

# 'Unnamed: 0' is presumably the index column written by to_csv; errors='ignore'
# avoids a KeyError when no files were found or the column is absent.
dfs_all_sil = dfs_all_sil.drop(columns=['Unnamed: 0'], errors='ignore')

In [12]:
# Show the combined silhouette results; long frames are truncated per display.max_rows.
dfs_all_sil

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,n_init,smoothing,window_size/span,filename,inertia_score,inertia
0,4,0.498719,ahc,single,arithmetic,3.0,moving_average,60.0,ahc_moving_average_windowsize-60_3day_df.csv,,
1,5,0.323820,ahc,single,arithmetic,3.0,moving_average,60.0,ahc_moving_average_windowsize-60_3day_df.csv,,
2,6,0.323800,ahc,single,arithmetic,3.0,moving_average,60.0,ahc_moving_average_windowsize-60_3day_df.csv,,
3,4,0.498719,ahc,average,arithmetic,3.0,moving_average,60.0,ahc_moving_average_windowsize-60_3day_df.csv,,
4,5,0.323820,ahc,average,arithmetic,3.0,moving_average,60.0,ahc_moving_average_windowsize-60_3day_df.csv,,
...,...,...,...,...,...,...,...,...,...,...,...
4,5,0.330563,ahc,average,arithmetic,3.0,moving_average,7.0,ahc_moving_average_windowsize-7_3day_df.csv,,
5,6,0.333160,ahc,average,arithmetic,3.0,moving_average,7.0,ahc_moving_average_windowsize-7_3day_df.csv,,
6,4,0.347203,ahc,complete,arithmetic,3.0,moving_average,7.0,ahc_moving_average_windowsize-7_3day_df.csv,,
7,5,0.070858,ahc,complete,arithmetic,3.0,moving_average,7.0,ahc_moving_average_windowsize-7_3day_df.csv,,
