In [1]:
import pandas as pd
import random 
import os

pd.set_option('display.max_rows', 50)


from functions import sharpe_ratio_calculation, generate_rand_portfolios, select_top_five, join_stocks_crypto, run_clustering_model

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
df_all_stocks = pd.read_csv('stocks_data_filtered_volatility.csv',index_col='Date')
cryptos_df = pd.read_csv('cryptos_data.csv', index_col='Date')

joined_df = join_stocks_crypto(cryptos_df, df_all_stocks, mode = 'stocks_left') #mode - either do left with crypto and fill NA for stocks or do left on stocks and leave out some dates for cryptos
joined_df.index = pd.to_datetime(joined_df.index)

joined_df_weekly = joined_df.resample('W').last() #try aggregating on a weekly level
joined_df_3days = joined_df.resample('3D').last()# aggregating on a twice per week basis to arrive at the sweet spot of that 250 (1 year) timeseries length

In [3]:
#Random Portfolios generation
tickers = list(df_all_stocks.columns)

random.seed(42)
random_portfolios = generate_rand_portfolios(n_reps=1000, n_stocks=15, tickers=tickers)

#Select top five sharpe ratio portfolios from a portfolio
sharpe_ratio = sharpe_ratio_calculation(df_all_stocks, rf_rate_annual = 0.02)
top_five_dict = select_top_five(random_portfolios, metric=sharpe_ratio)

In [4]:
##############TEST###################
from functions import test_clustering_metrics
random.seed(42)


n_clusters_list = [7,9,10,12]
df_with_label_balance = pd.DataFrame()
method = 'kmeans'
for window in [1,7,10,14,30,60]:
    for df_dict in [{'weekly': joined_df_weekly}, {'3day': joined_df_3days}, {'full': joined_df}]:
        output = test_clustering_metrics(df_dict, n_clusters_list, method=method, linkage_list=['average', 'complete', 'single'], 
                               return_mode='geometric', window=window, n_init=3)
        df_with_label_balance = pd.concat([df_with_label_balance, output])

df_with_label_balance.to_csv(f'NEW_CLUSTERING_TUNING/{method}.csv', index=False)

KeyboardInterrupt: 

In [5]:
kmeans_results = pd.read_csv('NEW_CLUSTERING_TUNING/kmeans.csv')
kshape_results = pd.read_csv('NEW_CLUSTERING_TUNING/kshape.csv')
ahc_results = pd.read_csv('NEW_CLUSTERING_TUNING/ahc.csv')

# results = pd.concat([kmeans_results, kshape_results, ahc_results])#.drop(columns=['Unnamed: 0'])
results = pd.concat([ahc_results, kshape_results, kmeans_results]).drop(columns='normalized_entropy')

In [6]:
results.sort_values(['silhouette_score', 'entropy'], ascending=False)

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy
200,7,0.272056,ahc,single,geometric,60,3day,0.2914
192,7,0.257570,ahc,average,geometric,60,3day,0.8815
180,7,0.253655,ahc,average,geometric,60,weekly,2.0052
144,7,0.252875,ahc,average,geometric,30,weekly,1.1690
148,7,0.228788,ahc,complete,geometric,30,weekly,1.8635
...,...,...,...,...,...,...,...,...
186,10,-0.145125,kshape,complete,geometric,60,weekly,3.0991
190,10,-0.145125,kshape,single,geometric,60,weekly,3.0991
183,12,-0.173409,kshape,average,geometric,60,weekly,3.3921
187,12,-0.173409,kshape,complete,geometric,60,weekly,3.3921


In [7]:
# from sklearn.preprocessing import MinMaxScaler
# results[['silhouette_norm']] = MinMaxScaler().fit_transform(results[['silhouette_score']])
# results['delta_norm'] = 1 - MinMaxScaler().fit_transform(results[['min_max_delta']])
# results['total_score'] = (results['silhouette_norm'] + results['delta_norm']) / 2


# results_filtered = results[results['silhouette_norm'] > 0.5]

# best_configs = results_filtered.loc[results_filtered.groupby('method')['total_score'].idxmax()].sort_values(by='total_score', ascending=False)


from sklearn.preprocessing import MinMaxScaler

# Normalize silhouette scores (higher is better)
results[['silhouette_norm']] = MinMaxScaler().fit_transform(results[['silhouette_score']])

#Normalize entropy
results[['entropy_normalized']] = MinMaxScaler().fit_transform(results[['entropy']])

# Compute total score: average of silhouette and normalized entropy
results['total_score'] = (results['silhouette_norm'] + results['entropy_normalized']) / 2

# Filter: Keep only good silhouette values
results_filtered = results[results['silhouette_norm'] > 0.5]

# Get best config per method
best_configs = results_filtered.loc[
    results_filtered.groupby('method')['total_score'].idxmax()
].sort_values(by='total_score', ascending=False)

In [8]:
results_filtered.sort_values('total_score', ascending=False)

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy,silhouette_norm,entropy_normalized,total_score
11,12,0.194370,kshape,single,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
3,12,0.194370,kshape,average,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
7,12,0.194370,kshape,complete,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
67,12,0.177092,kshape,complete,geometric,7,full,3.4032,0.786821,0.977986,0.882403
71,12,0.177092,kshape,single,geometric,7,full,3.4032,0.786821,0.977986,0.882403
...,...,...,...,...,...,...,...,...,...,...,...
21,9,0.054326,ahc,single,geometric,1,3day,0.4403,0.511229,0.056715,0.283972
202,10,0.056698,ahc,single,geometric,60,3day,0.4199,0.516555,0.050372,0.283463
0,7,0.060060,ahc,average,geometric,1,weekly,0.3583,0.524100,0.031218,0.277659
33,9,0.052609,ahc,single,geometric,1,full,0.4105,0.507374,0.047449,0.277412


In [9]:
best_configs.sort_values(by='total_score', ascending=False).head(20)

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy,silhouette_norm,entropy_normalized,total_score
3,12,0.19437,kshape,average,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
182,10,0.150511,kmeans,average,geometric,60,weekly,3.0737,0.72715,0.875532,0.801341
199,12,0.155118,ahc,complete,geometric,60,3day,3.0352,0.737491,0.863561,0.800526
199,12,0.108384,kmeans,complete,geometric,60,3day,3.3407,0.632582,0.958552,0.795567
182,10,0.220285,ahc,average,geometric,60,weekly,2.22,0.883781,0.610087,0.746934


In [10]:
best_configs

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy,silhouette_norm,entropy_normalized,total_score
3,12,0.19437,kshape,average,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413
182,10,0.150511,kmeans,average,geometric,60,weekly,3.0737,0.72715,0.875532,0.801341
199,12,0.155118,ahc,complete,geometric,60,3day,3.0352,0.737491,0.863561,0.800526
199,12,0.108384,kmeans,complete,geometric,60,3day,3.3407,0.632582,0.958552,0.795567
182,10,0.220285,ahc,average,geometric,60,weekly,2.22,0.883781,0.610087,0.746934


In [11]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, jaccard_score
import pandas as pd
import numpy as np

def evaluate_clustering_stability(df, 
                                   method='kmeans', 
                                   return_mode='geometric',
                                   n_clusters=5, 
                                   window_size=252,     # e.g. 1 year of daily data
                                   step_size=63,        # e.g. ~1 quarter
                                   linkage='average',
                                   n_init=3,
                                   agg_level=1,          # 1 = daily, 3 = every 3 days, 5 = weekly
                                   smoothing_window=None,  # e.g. 3-day moving average
                                   verbose=False):
    """
    Evaluates clustering stability over time using rolling windows,
    applying aggregation and optional smoothing within each window.

    Parameters:
    -----------
    df : DataFrame
        Time series data (rows = time, columns = tickers)
    agg_level : int
        Aggregation step size (1 = daily, 3 = every 3rd day, 5 = weekly, etc.)
    smoothing_window : int or None
        If set, applies a rolling mean of this window size (in aggregated points)
    """
    
    label_dicts = []
    time_indices = []

    # Adjust effective window and step sizes after aggregation
    effective_window = window_size
    effective_step = step_size

    for start in range(0, len(df) - effective_window + 1, effective_step):
        df_window = df.iloc[start:start + effective_window]
        time_indices.append(df.index[start])
        
        # Apply aggregation
        # df_agg = df_window.iloc[::agg_level].copy()
        if agg_level == 3:
            df_agg = df_window.resample('3D').last() #try aggregating on a weekly level
        elif agg_level == 5:
            df_agg = df_window.resample('W').last()
        elif agg_level == 1:
            df_agg = df_window.copy()
        else:
            raise ValueError("This aggregation level is not available")

        # Apply smoothing if specified
        if smoothing_window is not None:
            df_agg = df_agg.rolling(window=smoothing_window, min_periods=1).mean()

        try:
            labels, ticker_label_map, _, _ = run_clustering_model(
                df_agg,
                n_clus=n_clusters,
                model_name=method,
                linkage=linkage,
                return_mode=return_mode,
                n_init=n_init
            )
            label_dicts.append(ticker_label_map)
        except Exception as e:
            print(f"Clustering failed at window starting {df.index[start]}: {e}")
            label_dicts.append(None)
    
    # Compute stability scores between consecutive windows
    stability_scores = []
    
    for i in range(len(label_dicts) - 1):
        d1, d2 = label_dicts[i], label_dicts[i + 1]
        if d1 is None or d2 is None:
            continue
        
        common_tickers = list(set(d1) & set(d2))
        if len(common_tickers) < 5:
            continue
        
        labels1 = [d1[t] for t in common_tickers]
        labels2 = [d2[t] for t in common_tickers]
        
        ari = adjusted_rand_score(labels1, labels2)
        nmi = normalized_mutual_info_score(labels1, labels2)
        
        stability_scores.append({
            'window_pair': f"{time_indices[i].date()} → {time_indices[i+1].date()}",
            'ari': ari,
            'nmi': nmi,
            'common_tickers': len(common_tickers)
        })

    return pd.DataFrame(stability_scores)


In [None]:
#results_stability = pd.DataFrame(columns=)
for row in best_configs.iterrows():
    print(row[1])
    method = row[1]['method']
    linkage = row[1]['linkage']
    window_size = row[1]['window_size']
    if row[1]['df_mode'] == '3day':
        agg_level = 3
    elif row[1]['df_mode'] == 'weekly':
        agg_level = 5
    else:
        agg_level = 1

    results = evaluate_clustering_stability(joined_df, window_size=300, step_size=30, agg_level=agg_level, smoothing_window=window_size, method=method, linkage=linkage)
    ari_mean = results['ari'].mean()
    nmi_mean = results['nmi'].mean()

    best_configs.loc[row[0], 'ari_mean'] = ari_mean
    best_configs.loc[row[0], 'nmi_mean'] = nmi_mean


clusters                     12
silhouette_score        0.19437
method                   kshape
linkage                 average
return_mode           geometric
window_size                   1
df_mode                  weekly
entropy                  3.4136
silhouette_norm        0.825606
entropy_normalized     0.981219
total_score            0.903413
Name: 3, dtype: object
clusters                     10
silhouette_score       0.150511
method                   kmeans
linkage                 average
return_mode           geometric
window_size                  60
df_mode                  weekly
entropy                  3.0737
silhouette_norm         0.72715
entropy_normalized     0.875532
total_score            0.801341
Name: 182, dtype: object
clusters                     12
silhouette_score       0.155118
method                      ahc
linkage                complete
return_mode           geometric
window_size                  60
df_mode                    3day
entropy                 

In [13]:
best_configs

Unnamed: 0,clusters,silhouette_score,method,linkage,return_mode,window_size,df_mode,entropy,silhouette_norm,entropy_normalized,total_score,ari,nmi
3,12,0.19437,kshape,average,geometric,1,weekly,3.4136,0.825606,0.981219,0.903413,0.530371,0.536711
182,10,0.150511,kmeans,average,geometric,60,weekly,3.0737,0.72715,0.875532,0.801341,0.053126,0.094624
199,12,0.155118,ahc,complete,geometric,60,3day,3.0352,0.737491,0.863561,0.800526,0.033367,0.067828
199,12,0.108384,kmeans,complete,geometric,60,3day,3.3407,0.632582,0.958552,0.795567,0.033367,0.067828
182,10,0.220285,ahc,average,geometric,60,weekly,2.22,0.883781,0.610087,0.746934,0.053126,0.094624
