In [4]:
import pandas as pd
import random 
import os

pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, run_clustering_model

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [5]:
df_all_stocks = pd.read_csv('stocks_data_filtered_volatility.csv',index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')

joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode = 'stocks_left') #mode - either do left with crypto and fill NA for stocks or do left on stocks and leave out some dates for cryptos
joined_df.index = pd.to_datetime(joined_df.index)

joined_df_weekly = joined_df.resample('W').last() #try aggregating on a weekly level
joined_df_3days = joined_df.resample('3D').last()# aggregating on a twice per week basis to arrive at the sweet spot of that 250 (1 year) timeseries length

In [6]:
#Random Portfolios generation
tickers = list(df_all_stocks.columns)

random.seed(42)
random_portfolios = generate_rand_portfolios(n_reps=1000, n_stocks=15, tickers=tickers)

#Select top five sharpe ratio portfolios from a portfolio
sharpe_ratio = sharpe_ratio_calculation(df_all_stocks, rf_rate_annual = 0.02)
top_five_dict = select_top_five(random_portfolios, metric=sharpe_ratio)

In [52]:
##############TEST###################
from functions import test_clustering_metrics
random.seed(42)


n_clusters_list = [7,9,10,12]
df_with_label_balance = pd.DataFrame()
method = 'kmeans'
for window in [1,7,10,14,30,60]:
    for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
        output = test_clustering_metrics(df_dict, n_clusters_list, method=method, linkage_list=['average', 'complete', 'single'], 
                               return_mode='geometric', window=window, n_init=3)
        df_with_label_balance = pd.concat([df_with_label_balance, output])

df_with_label_balance.to_csv(f'NEW_CLUSTERING_TUNING/{method}.csv', index=False)

In [74]:
kmeans_results = pd.read_csv('NEW_CLUSTERING_TUNING/kmeans.csv')
kshape_results = pd.read_csv('NEW_CLUSTERING_TUNING/kshape.csv')
ahc_results = pd.read_csv('NEW_CLUSTERING_TUNING/ahc.csv')

# results = pd.concat([kmeans_results, kshape_results, ahc_results])#.drop(columns=['Unnamed: 0'])
results = pd.concat([ahc_results, kshape_results, kmeans_results]).drop(columns='normalized_entropy')

In [75]:
results.sort_values(['silhouette_score', 'entropy'], ascending=False)

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy
200,7,0.272056,ahc,single,geometric,60,3day,0.2914
192,7,0.257570,ahc,average,geometric,60,3day,0.8815
180,7,0.253655,ahc,average,geometric,60,weekly,2.0052
144,7,0.252875,ahc,average,geometric,30,weekly,1.1690
148,7,0.228788,ahc,complete,geometric,30,weekly,1.8635
...,...,...,...,...,...,...,...,...
186,10,-0.145125,kshape,complete,geometric,60,weekly,3.0991
190,10,-0.145125,kshape,single,geometric,60,weekly,3.0991
183,12,-0.173409,kshape,average,geometric,60,weekly,3.3921
187,12,-0.173409,kshape,complete,geometric,60,weekly,3.3921


In [78]:
# from sklearn.preprocessing import MinMaxScaler
# results[['silhouette_norm']] = MinMaxScaler().fit_transform(results[['silhouette_score']])
# results['delta_norm'] = 1 - MinMaxScaler().fit_transform(results[['min_max_delta']])
# results['total_score'] = (results['silhouette_norm'] + results['delta_norm']) / 2


# results_filtered = results[results['silhouette_norm'] > 0.5]

# best_configs = results_filtered.loc[results_filtered.groupby('method')['total_score'].idxmax()].sort_values(by='total_score', ascending=False)


from sklearn.preprocessing import MinMaxScaler

# Normalize silhouette scores (higher is better)
results[['silhouette_norm']] = MinMaxScaler().fit_transform(results[['silhouette_score']])

#Normalize entropy
results[['entropy_normalized']] = MinMaxScaler().fit_transform(results[['entropy']])

# Compute total score: average of silhouette and normalized entropy
results['total_score'] = (results['silhouette_norm'] + results['entropy_normalized']) / 2

# Filter: Keep only good silhouette values
results_filtered = results[results['silhouette_norm'] > 0.5]

# Get best config per method
best_configs = results_filtered.loc[
    results_filtered.groupby('method')['total_score'].idxmax()
].sort_values(by='total_score', ascending=False)

In [84]:
results_filtered.sort_values('total_score', ascending=False)

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy,silhouette_norm,entropy_normalized,total_score
11,12,0.194370,kshape,single,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
3,12,0.194370,kshape,average,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
7,12,0.194370,kshape,complete,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
67,12,0.177092,kshape,complete,geometric,7,full,3.4032,0.786821,0.977986,0.882403
71,12,0.177092,kshape,single,geometric,7,full,3.4032,0.786821,0.977986,0.882403
...,...,...,...,...,...,...,...,...,...,...,...
21,9,0.054326,ahc,single,geometric,1,3day,0.4403,0.511229,0.056715,0.283972
202,10,0.056698,ahc,single,geometric,60,3day,0.4199,0.516555,0.050372,0.283463
0,7,0.060060,ahc,average,geometric,1,weekly,0.3583,0.524100,0.031218,0.277659
33,9,0.052609,ahc,single,geometric,1,full,0.4105,0.507374,0.047449,0.277412


In [None]:
best_configs.sort_values(by='total_score', ascending=False).head(20)

In [85]:
best_configs

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy,silhouette_norm,entropy_normalized,total_score
3,12,0.19437,kshape,average,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
182,10,0.150511,kmeans,average,geometric,60,weekly,3.0737,0.72715,0.875532,0.801341
199,12,0.155118,ahc,complete,geometric,60,3day,3.0352,0.737491,0.863561,0.800526
199,12,0.108384,kmeans,complete,geometric,60,3day,3.3407,0.632582,0.958552,0.795567
182,10,0.220285,ahc,average,geometric,60,weekly,2.22,0.883781,0.610087,0.746934


In [89]:
df_smooth = joined_df_3days.rolling(window=60, center=False).mean().dropna()
df_smooth

Unnamed: 0_level_0,AAPL,ADBE,AMD,AMZN,ASML,AVGO,COST,CRESY,CSCO,ERIC,...,SAND-USD,SHIB-USD,SNX-USD,SOL-USD,STX-USD,UNI-USD,XLM-USD,XRP-USD,XTZ-USD,ZEC-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-29,156.690325,440.655832,105.904833,139.104733,593.882804,53.794311,493.220458,5.470497,47.228757,8.337974,...,2.723409,0.000021,4.270422,85.645906,1.126467,0.000120,0.182905,0.631317,2.912212,120.457002
2022-07-02,156.107054,438.223333,104.863500,138.191450,589.154644,53.549011,492.221149,5.475895,46.966715,8.287039,...,2.659791,0.000020,4.210465,83.673994,1.095627,0.000118,0.180303,0.623891,2.862356,119.132246
2022-07-05,155.690702,436.226499,103.985167,137.421050,584.291604,53.373387,491.588153,5.477556,46.707172,8.235203,...,2.600376,0.000020,4.165088,82.042586,1.067621,0.000115,0.177920,0.616876,2.817597,117.804587
2022-07-08,155.238018,433.885666,103.019167,136.590683,579.881113,53.193496,491.286398,5.476449,46.431546,8.178709,...,2.542001,0.000020,4.122508,80.344300,1.039487,0.000113,0.175430,0.609727,2.773659,116.442270
2022-07-11,154.792798,431.408000,102.029833,135.728383,575.125460,53.026751,491.139261,5.469943,46.160577,8.120562,...,2.479775,0.000019,4.077184,78.490166,1.010902,0.000110,0.172747,0.602218,2.727543,114.982139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-15,182.381024,543.192165,111.222666,135.730834,653.502769,86.948040,545.382890,7.251438,49.963278,4.670444,...,0.376239,0.000008,2.537411,32.054489,0.003531,0.000338,0.120824,0.575236,0.769926,28.229957
2023-12-18,182.524281,545.047832,111.646999,136.110667,654.082299,87.434047,547.624445,7.274856,49.966574,4.688679,...,0.377584,0.000008,2.566342,33.138517,0.003553,0.000340,0.121361,0.577248,0.771372,28.236150
2023-12-21,182.670458,547.035165,112.181833,136.545500,655.002120,87.939917,550.053993,7.298502,49.976371,4.708966,...,0.379540,0.000008,2.602056,34.500745,0.003577,0.000339,0.121937,0.579683,0.774510,28.237399
2023-12-24,182.736143,548.943332,112.718000,136.970667,655.709485,88.395581,552.399075,7.332726,49.973502,4.725123,...,0.383211,0.000008,2.629241,36.079183,0.003600,0.000337,0.122331,0.582143,0.779121,28.249722


In [7]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import pandas as pd

def evaluate_clustering_stability(df, 
                                   method='kmeans', 
                                   return_mode='geometric',
                                   n_clusters=5, 
                                   window_size=252, 
                                   step_size=63,
                                   linkage='average',
                                   n_init=3,
                                   verbose=False):
    """
    Evaluates clustering stability over time using rolling windows.

    Parameters:
    -----------
    df: DataFrame
        Time series data (rows = time, columns = tickers)
    method: str
        'kmeans', 'kshape', or 'ahc'
    return_mode: str
        'arithmetic' or 'geometric'
    n_clusters: int
        Number of clusters
    window_size: int
        Size of the rolling window (e.g., 252 for 1 year)
    step_size: int
        Step size for rolling window (e.g., 63 for quarterly)
    linkage: str
        Linkage for AHC if method='ahc'
    n_init: int
        Number of k-means/k-shape initializations
    verbose: bool
        Whether to print progress
    
    Returns:
    --------
    DataFrame with ARI and NMI scores between consecutive windows
    """
    
    label_dicts = []
    time_indices = []

    for start in range(0, len(df) - window_size + 1, step_size):
        df_window = df.iloc[start:start + window_size]
        time_indices.append(df.index[start])
        
        try:
            labels, ticker_label_map, _, _ = run_clustering_model(
                df_window,
                n_clus=n_clusters,
                model_name=method,
                linkage=linkage,
                return_mode=return_mode,
                n_init=n_init
            )
            label_dicts.append(ticker_label_map)
        except Exception as e:
            print(f"Clustering failed at window starting {df.index[start]}: {e}")
            label_dicts.append(None)
    
    # Compute stability scores between consecutive windows
    stability_scores = []
    
    for i in range(len(label_dicts) - 1):
        d1, d2 = label_dicts[i], label_dicts[i + 1]
        if d1 is None or d2 is None:
            continue
        
        common_tickers = list(set(d1) & set(d2))
        if len(common_tickers) < 5:
            continue
        
        labels1 = [d1[t] for t in common_tickers]
        labels2 = [d2[t] for t in common_tickers]
        
        ari = adjusted_rand_score(labels1, labels2)
        nmi = normalized_mutual_info_score(labels1, labels2)
        
        stability_scores.append({
            'window_pair': f"{time_indices[i].date()} → {time_indices[i+1].date()}",
            'ari': ari,
            'nmi': nmi,
            'common_tickers': len(common_tickers)
        })

    return pd.DataFrame(stability_scores)


In [None]:
for row in best_configs.iterrows():
    results_stability = evaluate_clustering_stability(joined_df, method='kmeans', return_mode='geometric',
                                   n_clusters=7, 
                                   window_size=252, 
                                   step_size=63,
                                   linkage='average',
                                   n_init=1,
                                   verbose=False)

(3, clusters                     12
silhouette_score        0.19437
method                   kshape
linkage                 average
return_mode           geometric
window_size                   1
df_mode                  weekly
entropy                  3.4136
silhouette_norm        0.825606
entropy_normalized     0.981219
total_score            0.903413
Name: 3, dtype: object)
(182, clusters                     10
silhouette_score       0.150511
method                   kmeans
linkage                 average
return_mode           geometric
window_size                  60
df_mode                  weekly
entropy                  3.0737
silhouette_norm         0.72715
entropy_normalized     0.875532
total_score            0.801341
Name: 182, dtype: object)
(199, clusters                     12
silhouette_score       0.155118
method                      ahc
linkage                complete
return_mode           geometric
window_size                  60
df_mode                    3day
entrop

In [14]:
print(results_stability['ari'].mean())
print(results_stability['nmi'].mean())

0.3188104010320274
0.3961919026651622
