In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
import hdbscan
from sklearn.metrics import silhouette_score
from kneed import KneeLocator
import umap
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

## Load Config and Datasets

In [538]:
data = json.load(open('../../datasets/annotated_ht_ef_datasets/afro_asian_bulletin/mdp_39015061285097/wavelet_analysis/DWT_results/raw_results/mdp_39015061285097_ranking_config.json'))
ranking_config = pd.json_normalize(data)
reconstruction_configs_metrics = pd.json_normalize(ranking_config['reconstruction_config.metrics'].explode()).add_prefix('reconstruction_configs_metrics_')
metrics = pd.json_normalize(ranking_config['metrics'].explode()).add_prefix('metrics_')
subset_ranking_config = ranking_config.drop(columns=['reconstruction_config.metrics', 'metrics'])
reconstruction_config_cols = [col for col in subset_ranking_config.columns.tolist() if "reconstruction" in col]

repeated_subset_ranking_config = pd.concat([subset_ranking_config[reconstruction_config_cols]] * len(reconstruction_configs_metrics), ignore_index=True)
reconstruction_config_df = pd.concat([repeated_subset_ranking_config, reconstruction_configs_metrics], axis=1)
reconstruction_config_df = reconstruction_config_df.rename(columns={'reconstruction_configs_metrics_metric': 'reconstruction_configs_metric'})
non_reconstruction_cols = [col for col in subset_ranking_config.columns.tolist() if "reconstruction" not in col]
repeated_subset_ranking_config = pd.concat([subset_ranking_config[non_reconstruction_cols]] * len(metrics), ignore_index=True)
metrics_config_df = pd.concat([repeated_subset_ranking_config, metrics], axis=1)
metrics_config_df = metrics_config_df.rename(columns={'metrics_metric': 'metric'})

In [539]:
full_raw_df = pd.read_csv("../../datasets/annotated_ht_ef_datasets/afro_asian_bulletin/mdp_39015061285097/wavelet_analysis/DWT_results/raw_results/mdp_39015061285097_full_ranked_results.csv")
subset_raw_df = pd.read_csv("../../datasets/annotated_ht_ef_datasets/afro_asian_bulletin/mdp_39015061285097/wavelet_analysis/DWT_results/raw_results/mdp_39015061285097_subset_ranked_results.csv")
raw_processed_df = pd.read_csv("../../scripts/wavelet_scripts/rawprocessed_results_df.csv")

full_smoothed_df = pd.read_csv("../../datasets/annotated_ht_ef_datasets/afro_asian_bulletin/mdp_39015061285097/wavelet_analysis/DWT_results/smoothed_results/mdp_39015061285097_full_ranked_results.csv")
subset_smoothed_df = pd.read_csv("../../datasets/annotated_ht_ef_datasets/afro_asian_bulletin/mdp_39015061285097/wavelet_analysis/DWT_results/smoothed_results/mdp_39015061285097_subset_ranked_results.csv")
smoothed_processed_df = pd.read_csv("../../scripts/wavelet_scripts/smoothedprocessed_results_df.csv")

In [540]:
len(raw_processed_df), len(full_raw_df), len(subset_raw_df), len(smoothed_processed_df), len(full_smoothed_df), len(subset_smoothed_df)

(585, 585, 104, 585, 585, 110)

In [541]:
merged_subset_raw_df = pd.merge(subset_raw_df, raw_processed_df, on=["wavelet", "wavelet_mode", "wavelet_level", "reconstruction_score_weighted", "reconstruction_score_sum"], how="outer")
merged_subset_raw_df['summed_scores'] = merged_subset_raw_df['reconstruction_score_sum'] + merged_subset_raw_df['wavelet_summed_norm_score']
merged_full_raw_df = pd.merge(full_raw_df, raw_processed_df, on=["wavelet", "wavelet_mode", "wavelet_level", "reconstruction_score_weighted", "reconstruction_score_sum"], how="outer")
merged_full_raw_df['summed_scores'] = merged_full_raw_df['reconstruction_score_sum'] + merged_full_raw_df['wavelet_summed_norm_score']

merged_subset_smoothed_df = pd.merge(subset_smoothed_df, smoothed_processed_df, on=["wavelet", "wavelet_mode", "wavelet_level", "reconstruction_score_weighted", "reconstruction_score_sum"], how="outer")
merged_subset_smoothed_df['summed_scores'] = merged_subset_smoothed_df['reconstruction_score_sum'] + merged_subset_smoothed_df['wavelet_summed_norm_score']
merged_full_smoothed_df = pd.merge(full_smoothed_df, smoothed_processed_df, on=["wavelet", "wavelet_mode", "wavelet_level", "reconstruction_score_weighted", "reconstruction_score_sum"], how="outer")
merged_full_smoothed_df['summed_scores'] = merged_full_smoothed_df['reconstruction_score_sum'] + merged_full_smoothed_df['wavelet_summed_norm_score']

cols = merged_full_raw_df.columns.tolist()

subset_cols = [col for col in cols if ('zscore' not in col) and ('norm' not in col)]
subset_cols = subset_cols + ['wavelet_summed_norm_score']
norm_cols = [col for col in cols if ('zscore' not in col) and ('norm' in col)]
score_cols = ['reconstruction_score_sum', 'reconstruction_score_weighted', 'final_score', 'summed_scores']
norm_cols = norm_cols + score_cols
rank_cols = [col for col in cols if ('rank' in col)]
core_cols = ['wavelet', 'wavelet_mode', 'wavelet_level', 'signal_type', 'wavelet_type']

reconstruction_metrics = reconstruction_config_df.reconstruction_configs_metric.tolist()

reconstruction_metrics_cols = []


for col in merged_full_raw_df.columns.tolist():
	for metric in reconstruction_metrics:
		if metric in col:
			reconstruction_metrics_cols.append(col)
norm_reconstruction_cols = [col for col in reconstruction_metrics_cols if ('norm' in col)]

signal_metrics = metrics_config_df.metric.tolist()

signal_metrics_cols = []


for col in merged_full_raw_df.columns.tolist():
	for metric in signal_metrics:
		if metric in col:
			signal_metrics_cols.append(col)
norm_signal_cols = [col for col in signal_metrics_cols if ('norm' in col)]

In [543]:
ranking_config.to_dict()

{'signal_type': {0: 'raw'},
 'is_combined': {0: False},
 'metrics': {0: [{'metric': 'wavelet_mse',
    'original_weight': 0.25,
    'final_weight': 0.034482758620689655,
    'normalized_weight': 0.034482758620689655,
    'ignore_metric': False,
    'removal_reason': 'Low variance (std: 0.000000, mean: 0.000000)',
    'was_inverted': True,
    'was_shared': True,
    'was_specific': False,
    'variance': 1.3330267265552304e-38,
    'presence': 1.0,
    'was_zscored': False,
    'mean': 1.7180416389929614e-20,
    'std': 1.1545677661164937e-19,
    'max': 1.1332263261644102e-18,
    'min': 2.488414928686321e-27,
    'final_score_correlation': -0.10581486066543677,
    'summed_score_correlation': -0.4524331190281691},
   {'metric': 'wavelet_psnr',
    'original_weight': 0.25,
    'final_weight': 0.27586206896551724,
    'normalized_weight': 0.27586206896551724,
    'ignore_metric': False,
    'removal_reason': None,
    'was_inverted': False,
    'was_shared': True,
    'was_specific': F

## Explore Correlations

In [6]:
subset_raw_df[['reconstruction_score_weighted',
 'reconstruction_score_sum', 'final_score',
 'wavelet_summed_norm_score',]].corr()

Unnamed: 0,reconstruction_score_weighted,reconstruction_score_sum,final_score,wavelet_summed_norm_score
reconstruction_score_weighted,1.0,0.764019,-0.135277,0.040181
reconstruction_score_sum,0.764019,1.0,0.237468,0.475529
final_score,-0.135277,0.237468,1.0,0.841348
wavelet_summed_norm_score,0.040181,0.475529,0.841348,1.0


In [7]:
full_raw_df[['reconstruction_score_weighted',
			  'reconstruction_score_sum', 'final_score',
 'wavelet_summed_norm_score',]].corr()

Unnamed: 0,reconstruction_score_weighted,reconstruction_score_sum,final_score,wavelet_summed_norm_score
reconstruction_score_weighted,1.0,0.668381,-0.086706,-0.012912
reconstruction_score_sum,0.668381,1.0,0.1073,0.520711
final_score,-0.086706,0.1073,1.0,0.459282
wavelet_summed_norm_score,-0.012912,0.520711,0.459282,1.0


In [8]:
subset_smoothed_df[['reconstruction_score_weighted',
					 'reconstruction_score_sum', 'final_score',
 'wavelet_summed_norm_score',]].corr()

Unnamed: 0,reconstruction_score_weighted,reconstruction_score_sum,final_score,wavelet_summed_norm_score
reconstruction_score_weighted,1.0,0.93501,-0.211951,-0.227701
reconstruction_score_sum,0.93501,1.0,-0.02942,-0.009985
final_score,-0.211951,-0.02942,1.0,0.727917
wavelet_summed_norm_score,-0.227701,-0.009985,0.727917,1.0


In [9]:
full_smoothed_df[['reconstruction_score_weighted',
					 'reconstruction_score_sum', 'final_score',
 'wavelet_summed_norm_score',]].corr()

Unnamed: 0,reconstruction_score_weighted,reconstruction_score_sum,final_score,wavelet_summed_norm_score
reconstruction_score_weighted,1.0,0.886664,-0.031904,-0.145981
reconstruction_score_sum,0.886664,1.0,0.016531,0.132582
final_score,-0.031904,0.016531,1.0,0.286711
wavelet_summed_norm_score,-0.145981,0.132582,0.286711,1.0


## Cluster Features

### Functions

In [46]:
def evaluate_clusters(pivot_df, max_k=10, n_runs=5):
	"""
	Evaluates cluster stability across multiple runs.

	Parameters:
	-----------
	pivot_df : DataFrame
		The data to cluster.
	max_k : int
		Maximum number of clusters to try.
	n_runs : int
		Number of times to repeat clustering with different seeds.

	Returns:
	--------
	dict
		Dictionary containing silhouette scores, elbow values, and the most stable number of clusters.
	"""

	all_silhouette_scores = []
	all_elbow_dfs = []
	optimal_k_values = []

	for _ in range(n_runs):
		silhouette_scores = []
		elbow_df = pd.DataFrame(columns=['k', 'inertia'])

		for k in range(2, max_k + 1):
			if k >= len(pivot_df):
				continue  # Skip if the number of clusters is greater than or equal to the number of samples
			kmeans = KMeans(n_clusters=k, random_state=np.random.randint(1000))
			cluster_labels = kmeans.fit_predict(pivot_df)

			# Compute silhouette score
			if len(set(cluster_labels)) > 1:  # Ensure there is more than one cluster
				

				silhouette_avg = silhouette_score(pivot_df, cluster_labels)
				silhouette_scores.append(silhouette_avg)
			else:
				silhouette_scores.append(np.nan)
			# Compute inertia for elbow method
			new_row = pd.DataFrame({'k': [k], 'inertia': [kmeans.inertia_]})
			elbow_df = pd.concat([elbow_df, new_row], ignore_index=True)

		elbow_df['k'] = elbow_df['k'].astype(int)
		elbow_df['inertia'] = elbow_df['inertia'].astype(int)

		# Detect elbow point
		kn = KneeLocator(elbow_df['k'], elbow_df['inertia'], curve='convex', direction='decreasing')
		optimal_k = kn.knee if kn.knee else max_k

		optimal_k_values.append(optimal_k)
		all_silhouette_scores.append(silhouette_scores)
		all_elbow_dfs.append(elbow_df)

	# Find the most frequently occurring optimal cluster number
	most_common_k = Counter(optimal_k_values).most_common(1)[0][0]

	return {
		'silhouette_scores': all_silhouette_scores,
		'elbow_dfs': all_elbow_dfs,
		'stable_optimal_k': most_common_k
	}


def plot_silhouette_scores(silhouette_scores_list, max_k):
	"""
	Plot the mean silhouette scores across multiple runs.
	
	:param silhouette_scores_list: List of silhouette score lists from multiple runs.
	:param max_k: Maximum number of clusters.
	"""
	avg_silhouette_scores = np.mean(silhouette_scores_list, axis=0)
	std_silhouette_scores = np.std(silhouette_scores_list, axis=0)

	plt.figure(figsize=(8, 5))
	plt.plot(range(2, max_k + 1), avg_silhouette_scores, marker='o', label="Mean Silhouette Score")
	plt.fill_between(
		range(2, max_k + 1),
		avg_silhouette_scores - std_silhouette_scores,
		avg_silhouette_scores + std_silhouette_scores,
		color='gray', alpha=0.2, label="Std Dev"
	)
	
	plt.xlabel('Number of clusters')
	plt.ylabel('Silhouette score')
	plt.title('Mean Silhouette Scores Across Runs')
	plt.grid(True)
	plt.xticks(range(2, max_k + 1))
	plt.legend()
	plt.show()

def plot_elbow_curve(elbow_dfs):
	"""
	Plot elbow curves from multiple runs to visualize variability.
	
	:param elbow_dfs: List of DataFrames containing 'k' and 'inertia' values for different runs.
	"""
	plt.figure(figsize=(8, 5))

	# Plot all runs
	for i, df in enumerate(elbow_dfs):
		plt.plot(df['k'], df['inertia'], marker='o', alpha=0.3, label=f"Run {i+1}")

	# Compute mean elbow curve
	avg_inertia = np.mean([df['inertia'].values for df in elbow_dfs], axis=0)
	plt.plot(elbow_dfs[0]['k'], avg_inertia, marker='o', color='black', linewidth=2, label="Mean Inertia")

	plt.xlabel('Number of clusters')
	plt.ylabel('Inertia')
	plt.title('Elbow Method Across Runs')
	plt.legend()
	plt.grid(True)
	plt.show()

def create_clusters(
	pivot_df, comparison_title, type_clustering, clustering_method="KMeans", max_k=10, n_runs=5, hdbscan_params=None, show_plots=False
):
	"""
	Create and visualize clusters using KMeans, DBSCAN, or HDBSCAN and determine the most stable number of clusters.

	Parameters:
	-----------
	pivot_df : DataFrame
		The data to cluster.
	comparison_title : str
		Title for comparison.
	type_clustering : str
		Clustering type ('UMAP' or 'PCA').
	clustering_method : str
		Clustering method to use ('KMeans', 'DBSCAN', or 'HDBSCAN').
	max_k : int
		Maximum number of clusters to try (only for KMeans).
	n_runs : int
		Number of times to repeat clustering with different seeds (only for KMeans).
	hdbscan_params : dict
		Parameters for HDBSCAN clustering (e.g., {'min_cluster_size': 5, 'min_samples': None}).
	show_plots : bool
		Whether to show plots.

	Returns:
	--------
	DataFrame with cluster assignments.
	"""
	if clustering_method == "KMeans":
		# Evaluate clusters and determine the most stable number of clusters
		cluster_results = evaluate_clusters(pivot_df, max_k=max_k, n_runs=n_runs)
		stable_k = cluster_results['stable_optimal_k']
		print(f"Most stable number of clusters: {stable_k}")

		if show_plots:
			# Plot silhouette scores and elbow curve
			plot_silhouette_scores(cluster_results['silhouette_scores'], max_k)
			plot_elbow_curve(cluster_results['elbow_dfs'])

		# Perform final clustering
		kmeans = KMeans(n_clusters=stable_k, random_state=42)
		clusters = kmeans.fit_predict(pivot_df)

	elif clustering_method == "DBSCAN":
		# Perform DBSCAN clustering
		dbscan_params = dbscan_params or {'eps': 0.5, 'min_samples': 5}
		print(f"Using DBSCAN with params: {dbscan_params}")
		dbscan = DBSCAN(**dbscan_params)
		clusters = dbscan.fit_predict(pivot_df)

	elif clustering_method == "HDBSCAN":
		# Perform HDBSCAN clustering
		hdbscan_params = hdbscan_params or {'min_cluster_size': 2, 'min_samples': None}
		print(f"Using HDBSCAN with params: {hdbscan_params}")
		hdbscan_clusterer = hdbscan.HDBSCAN(**hdbscan_params)
		clusters = hdbscan_clusterer.fit_predict(pivot_df)

	else:
		raise ValueError("Invalid clustering method. Choose 'KMeans', 'DBSCAN', or 'HDBSCAN'.")

	# Add cluster assignments to the pivot dataframe
	col_name = f'{clustering_method.lower()}_{type_clustering.lower()}_cluster'
	pivot_df[col_name] = clusters
	# Use UMAP to reduce dimensionality for visualization
	if type_clustering == "UMAP" and show_plots:
		reducer = umap.UMAP(random_state=42)
		if col_name in pivot_df.columns:
			cluster_pivot_df = pivot_df.drop(col_name, axis=1)
		else:
			cluster_pivot_df = pivot_df
		embedding = reducer.fit_transform(cluster_pivot_df)

		# Create a DataFrame for the embedding
		embedding_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
		embedding_df['cluster'] = clusters

		# Visualize the clusters
		sns.scatterplot(data=embedding_df, x='UMAP1', y='UMAP2', hue='cluster', palette='viridis')


	elif type_clustering == "PCA" and show_plots:
		# Reduce the dimensionality for visualization
		# Determine the number of components for 95% variance
		pca = PCA()
		if col_name in pivot_df.columns:
			cluster_pivot_df = pivot_df.drop(col_name, axis=1)
		else:
			cluster_pivot_df = pivot_df
		pca.fit(cluster_pivot_df)
		cumsum = np.cumsum(pca.explained_variance_ratio_)
		d = np.argmax(cumsum >= 0.95) + 1
		if d < 2:
			d = 2  # Ensure at least two components are retained
		if d > 3:
			d = 3
		pca = PCA(n_components=d)
		reduced_data = pca.fit_transform(cluster_pivot_df)

		# Create a DataFrame for the reduced data
		reduced_df = pd.DataFrame(reduced_data, columns=['PCA' + str(i) for i in range(1, d+1)])
		reduced_df['cluster'] = clusters

		# Debugging: Print the columns of reduced_df
		print("Columns in reduced_df:", reduced_df.columns)

		# Visualize the clusters
		if d == 3:
			fig = plt.figure()
			ax = fig.add_subplot(111, projection='3d')

			ax.scatter(reduced_df['PCA1'], reduced_df['PCA2'], reduced_df['PCA3'], c=reduced_df['cluster'])

			ax.set_xlabel('PCA1')
			ax.set_ylabel('PCA2')
			ax.set_zlabel('PCA3')
		else:
			sns.scatterplot(data=reduced_df, x='PCA1', y='PCA2', hue='cluster', palette='viridis')
	if show_plots:
		plt.title(f'Clusters of {comparison_title} with {type_clustering}')
		plt.show()

	return pivot_df

In [154]:
def compute_correlation(df, norm_cols, use_thresholding=False, fill_na=True, threshold=0.1, drop_na=False):
	"""
	Computes the correlation matrix for selected numeric columns and 
	drops columns/rows if more than (1 - threshold)% of values are NaN.

	Parameters:
	-----------
	df (pd.DataFrame): Input DataFrame.
	norm_cols (list): List of column names to include in the correlation matrix.
	use_thresholding (bool): Whether to drop columns/rows based on NaN thresholding.
	threshold (float): Minimum proportion of non-NaN values required to keep a column/row.
	fill_na (bool): If True, fills remaining NaNs with 0.
	drop_na (bool): If True, drops all remaining NaNs.

	Returns:
	--------
	pd.DataFrame: Cleaned correlation matrix.
	"""

	# Step 1: Compute correlation matrix
	numeric_cols = df[norm_cols].select_dtypes(include="number").columns.tolist()
	correlation_df = df[numeric_cols].corr()

	if use_thresholding:
		# Step 2: Identify columns/rows exceeding NaN threshold
		dropped_cols = correlation_df.columns[correlation_df.isna().sum() > len(correlation_df) * (1 - threshold)].tolist()
		dropped_rows = correlation_df.index[correlation_df.isna().sum(axis=1) > len(correlation_df.columns) * (1 - threshold)].tolist()

		if dropped_cols:
			print(f"🛑 Dropping {len(dropped_cols)} columns due to NaNs before transposing: {dropped_cols}")
		if dropped_rows:
			print(f"🛑 Dropping {len(dropped_rows)} rows due to NaNs before transposing: {dropped_rows}")

		correlation_df.drop(columns=dropped_cols, errors='ignore', inplace=True)
		correlation_df.drop(index=dropped_rows, errors='ignore', inplace=True)

	# Step 3: Transpose matrix
	correlation_df = correlation_df.T

	# Step 4: Post-Transpose NaN Handling
	remaining_na_cols = correlation_df.columns[correlation_df.isna().sum() > 0].tolist()
	remaining_na_rows = correlation_df.index[correlation_df.isna().sum(axis=1) > 0].tolist()

	if remaining_na_cols or remaining_na_rows:
		print(f"⚠️ Warning: Some NaNs still present after transposing!")
		if remaining_na_cols:
			print(f"⚠️ Columns with NaNs after transposing: {remaining_na_cols}")
		if remaining_na_rows:
			print(f"⚠️ Rows with NaNs after transposing: {remaining_na_rows}")

		# Drop remaining NaNs if either `drop_na=True` OR `use_thresholding=True`
		if drop_na or use_thresholding:
			correlation_df.drop(columns=remaining_na_cols, errors='ignore', inplace=True)
			correlation_df.drop(index=remaining_na_rows, errors='ignore', inplace=True)
			print("✅ Remaining NaNs dropped after transposing!")

	# Step 5: Fill NaNs if requested
	if fill_na:
		correlation_df.fillna(0, inplace=True)  # Fill remaining NaNs with 0 if requested

	return correlation_df

In [155]:
def run_correlations_clustering(df, norm_reconstruction_cols, data_type, use_thresholding, fill_na):
	norm_corr_df = compute_correlation(df, norm_reconstruction_cols, use_thresholding=use_thresholding, fill_na=fill_na)
	norm_pivoted_df = create_clusters(norm_corr_df, f"Reconstruction Configurations {data_type}", "PCA")
	norm_pivoted_df = create_clusters(norm_pivoted_df, f"Reconstruction Configurations {data_type}", "UMAP", max_k=15)
	norm_pivoted_df = create_clusters(norm_pivoted_df, f"Reconstruction Configurations {data_type}", "PCA", "HDBSCAN", max_k=15)
	norm_pivoted_df = create_clusters(norm_pivoted_df, f"Reconstruction Configurations {data_type}", "UMAP", "HDBSCAN", max_k=15)
	norm_pivoted_df['metric'] = norm_pivoted_df.index
	norm_pivoted_df = norm_pivoted_df.reset_index(drop=True)
	return norm_pivoted_df


In [156]:
def combined_across_clusters(df, cluster_cols):
	finalized_df = []
	initial_cluster = cluster_cols[0]
	clusters = df[initial_cluster].unique()

	for idx, cluster in enumerate(clusters):
		final_clustered_cols = []
		initial_metrics = df[(df[initial_cluster] == cluster)][[initial_cluster, 'metric']]
		initial_metrics['cluster_type'] = initial_cluster
		initial_metrics = initial_metrics.rename(columns={initial_cluster: 'original_cluster'})
		final_clustered_cols.append(initial_metrics)
		for col in cluster_cols[1:]:
			comparison_clusters = df[(df[col] == cluster)][col].unique().tolist()
			if len(comparison_clusters) > 2:
				print(f"Cluster {cluster} has more than 2 clusters in {col}. Should be checked manually")
			comparison_metrics = df[(df['kmeans_pca_cluster'].isin(comparison_clusters))][['metric', col]]
			comparison_metrics['cluster_type'] = col
			comparison_metrics = comparison_metrics.rename(columns={col: 'original_cluster'})
			final_clustered_cols.append(comparison_metrics)
		# concat dataframe and drop duplicates
		final_clustered_cols = pd.concat(final_clustered_cols)
		final_clustered_cols = final_clustered_cols.drop_duplicates()
		final_clustered_cols['new_cluster'] = idx
		finalized_df.append(final_clustered_cols)
	return pd.concat(finalized_df)
	

In [219]:
def compute_correlation_scores(df, norm_cols, data_type, signal_type):
	numeric_cols = df[norm_cols].select_dtypes(include="number").columns.tolist()
	correlation_df = df[numeric_cols].corr()
	subset_correlation_df = correlation_df.loc[['reconstruction_score_sum', 'wavelet_summed_norm_score', "reconstruction_score_weighted", 'final_score', 'summed_scores']]
	# subset_correlation_df = subset_correlation_df.dropna(axis=1, how='all')

	subset_correlation_df = subset_correlation_df.T.sort_values(by='summed_scores', ascending=False)
	subset_correlation_df['metric'] = subset_correlation_df.index
	subset_correlation_df = subset_correlation_df.reset_index(drop=True)
	subset_correlation_df['data_type'] = data_type
	subset_correlation_df['signal_type'] = signal_type
	subset_correlation_df = subset_correlation_df[['metric', 'reconstruction_score_sum', 'wavelet_summed_norm_score',
       "reconstruction_score_weighted", 'final_score', 'summed_scores', 'data_type', 'signal_type']]
	return subset_correlation_df

In [185]:
def test_stability_clusters(grouped_full_raw_df, grouped_subset_raw_df, grouped_full_smoothed_df, grouped_subset_smoothed_df):
	
	# Dictionary to store results
	stable_groups = []
	unstable_groups = []

	# Create a copy of unique_metrics to modify it in-place
	unique_metrics = grouped_full_raw_df.metric.unique().tolist()

	# Loop through each metric and compare its groupings
	for metric in unique_metrics[:]:  # Iterate over a copy to allow removal
		# Get the cluster number for this metric in each data type
		initial_full_raw_cluster = grouped_full_raw_df[grouped_full_raw_df.metric == metric].new_cluster.unique()[0]
		initial_subset_raw_cluster = grouped_subset_raw_df[grouped_subset_raw_df.metric == metric].new_cluster.unique()[0]
		initial_full_smoothed_cluster = grouped_full_smoothed_df[grouped_full_smoothed_df.metric == metric].new_cluster.unique()[0]
		initial_subset_smoothed_cluster = grouped_subset_smoothed_df[grouped_subset_smoothed_df.metric == metric].new_cluster.unique()[0]

		# Get the set of metrics in each cluster
		full_raw_metrics = set(grouped_full_raw_df[grouped_full_raw_df.new_cluster == initial_full_raw_cluster].metric.tolist())
		subset_raw_metrics = set(grouped_subset_raw_df[grouped_subset_raw_df.new_cluster == initial_subset_raw_cluster].metric.tolist())
		full_smoothed_metrics = set(grouped_full_smoothed_df[grouped_full_smoothed_df.new_cluster == initial_full_smoothed_cluster].metric.tolist())
		subset_smoothed_metrics = set(grouped_subset_smoothed_df[grouped_subset_smoothed_df.new_cluster == initial_subset_smoothed_cluster].metric.tolist())

		# Compare all sets to check if they are identical
		all_sets = [full_raw_metrics, subset_raw_metrics, full_smoothed_metrics, subset_smoothed_metrics]

		# Check if all sets are identical by comparing them pairwise
		sets_are_equal = all(s == all_sets[0] for s in all_sets)

		# Handle stable vs unstable cases
		if sets_are_equal:
			unique_metrics.remove(metric)  # Remove from future checks
			stable_groups.append({
				'cluster_raw': list(full_raw_metrics), 
				'cluster_smoothed': list(full_smoothed_metrics),
				'metric': metric, 
				'cluster_number_raw': initial_full_raw_cluster,
				'cluster_number_smoothed': initial_full_smoothed_cluster
			})
		else:
			unstable_groups.append({
				'metric': metric,
				'full_raw_vs_subset_raw': list(full_raw_metrics.symmetric_difference(subset_raw_metrics)),
				'full_smoothed_vs_subset_smoothed': list(full_smoothed_metrics.symmetric_difference(subset_smoothed_metrics)),
				'full_raw_vs_full_smoothed': list(full_raw_metrics.symmetric_difference(full_smoothed_metrics)),
				'subset_raw_vs_subset_smoothed': list(subset_raw_metrics.symmetric_difference(subset_smoothed_metrics)),
				'cluster_raw': list(full_raw_metrics),
				'cluster_smoothed': list(full_smoothed_metrics),
				'cluster_number_raw': initial_full_raw_cluster,
				'cluster_number_smoothed': initial_full_smoothed_cluster
			})

	# Convert unstable_groups to DataFrame for easier analysis
	unstable_df = pd.DataFrame(unstable_groups) if len(unstable_groups) > 0 else pd.DataFrame()
	stable_df = pd.DataFrame(stable_groups) if len(stable_groups) > 0 else pd.DataFrame()
	return unstable_df, stable_df

### Reconstruction Metrics

In [159]:
norm_reconstruction_fillna_full_raw_df = run_correlations_clustering(merged_full_raw_df, norm_reconstruction_cols, "Full Raw", False, True)
norm_reconstruction_thresholding_full_raw_df = run_correlations_clustering(merged_subset_raw_df, norm_reconstruction_cols, "Subset Raw", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [160]:
norm_reconstruction_fillna_subset_raw_df = run_correlations_clustering(merged_subset_raw_df, norm_reconstruction_cols, "Subset Raw", False, True)
norm_reconstruction_thresholding_subset_raw_df = run_correlations_clustering(merged_subset_raw_df, norm_reconstruction_cols, "Subset Raw", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [161]:
norm_reconstruction_fillna_full_smoothed_df = run_correlations_clustering(merged_full_smoothed_df, norm_reconstruction_cols, "Full Smoothed", False, True)
norm_reconstruction_thresholding_full_smoothed_df = run_correlations_clustering(merged_full_smoothed_df, norm_reconstruction_cols, "Full Smoothed", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [162]:
norm_reconstruction_fillna_subset_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, norm_reconstruction_cols, "Subset Smoothed", False, True)
norm_reconstruction_thresholding_subset_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, norm_reconstruction_cols, "Subset Smoothed", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [163]:
cluster_cols = [col for col in norm_reconstruction_fillna_full_raw_df.columns.tolist() if 'cluster' in col]
cluster_cols

['kmeans_pca_cluster',
 'kmeans_umap_cluster',
 'hdbscan_pca_cluster',
 'hdbscan_umap_cluster']

In [164]:
final_reconstruction_fillna_full_raw_df = combined_across_clusters(norm_reconstruction_fillna_full_raw_df, cluster_cols)
grouped_reconstruction_fillna_full_raw_df = final_reconstruction_fillna_full_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_fillna_full_raw_df['new_cluster'] = grouped_reconstruction_fillna_full_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_fillna_full_raw_df['data_type'] = 'Full Raw'
grouped_reconstruction_fillna_full_raw_df.sort_values(by='new_cluster', inplace=True)
final_reconstruction_fillna_subset_raw_df = combined_across_clusters(norm_reconstruction_fillna_subset_raw_df, cluster_cols)
grouped_reconstruction_fillna_subset_raw_df = final_reconstruction_fillna_subset_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_fillna_subset_raw_df['new_cluster'] = grouped_reconstruction_fillna_subset_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_fillna_subset_raw_df['data_type'] = 'Subset Raw'
grouped_reconstruction_fillna_subset_raw_df.sort_values(by='new_cluster', inplace=True)
final_reconstruction_fillna_full_smoothed_df = combined_across_clusters(norm_reconstruction_fillna_full_smoothed_df, cluster_cols)
grouped_reconstruction_fillna_full_smoothed_df = final_reconstruction_fillna_full_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_fillna_full_smoothed_df['new_cluster'] = grouped_reconstruction_fillna_full_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_fillna_full_smoothed_df['data_type'] = 'Full Smoothed'
grouped_reconstruction_fillna_full_smoothed_df.sort_values(by='new_cluster', inplace=True)
final_reconstruction_fillna_subset_smoothed_df = combined_across_clusters(norm_reconstruction_fillna_subset_smoothed_df, cluster_cols)
grouped_reconstruction_fillna_subset_smoothed_df = final_reconstruction_fillna_subset_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_fillna_subset_smoothed_df['new_cluster'] = grouped_reconstruction_fillna_subset_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_fillna_subset_smoothed_df['data_type'] = 'Subset Smoothed'
grouped_reconstruction_fillna_subset_smoothed_df.sort_values(by='new_cluster', inplace=True)

In [165]:
final_reconstruction_thresholding_full_raw_df = combined_across_clusters(norm_reconstruction_thresholding_full_raw_df, cluster_cols)
grouped_reconstruction_thresholding_full_raw_df = final_reconstruction_thresholding_full_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_thresholding_full_raw_df['new_cluster'] = grouped_reconstruction_thresholding_full_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_thresholding_full_raw_df['data_type'] = 'Full Raw'
grouped_reconstruction_thresholding_full_raw_df.sort_values(by='new_cluster', inplace=True)
final_reconstruction_thresholding_subset_raw_df = combined_across_clusters(norm_reconstruction_thresholding_subset_raw_df, cluster_cols)
grouped_reconstruction_thresholding_subset_raw_df = final_reconstruction_thresholding_subset_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_thresholding_subset_raw_df['new_cluster'] = grouped_reconstruction_thresholding_subset_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_thresholding_subset_raw_df['data_type'] = 'Subset Raw'
grouped_reconstruction_thresholding_subset_raw_df.sort_values(by='new_cluster', inplace=True)
final_reconstruction_thresholding_full_smoothed_df = combined_across_clusters(norm_reconstruction_thresholding_full_smoothed_df, cluster_cols)
grouped_reconstruction_thresholding_full_smoothed_df = final_reconstruction_thresholding_full_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_thresholding_full_smoothed_df['new_cluster'] = grouped_reconstruction_thresholding_full_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_thresholding_full_smoothed_df['data_type'] = 'Full Smoothed'
grouped_reconstruction_thresholding_full_smoothed_df.sort_values(by='new_cluster', inplace=True)
final_reconstruction_thresholding_subset_smoothed_df = combined_across_clusters(norm_reconstruction_thresholding_subset_smoothed_df, cluster_cols)
grouped_reconstruction_thresholding_subset_smoothed_df = final_reconstruction_thresholding_subset_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_reconstruction_thresholding_subset_smoothed_df['new_cluster'] = grouped_reconstruction_thresholding_subset_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_reconstruction_thresholding_subset_smoothed_df['data_type'] = 'Subset Smoothed'
grouped_reconstruction_thresholding_subset_smoothed_df.sort_values(by='new_cluster', inplace=True)

In [189]:
reconstruction_fillna_unstable_df, reconstruction_fillna_stable_df = test_stability_clusters(grouped_reconstruction_fillna_full_raw_df, grouped_reconstruction_fillna_subset_raw_df, grouped_reconstruction_fillna_full_smoothed_df, grouped_reconstruction_fillna_subset_smoothed_df)

reconstruction_fillna_raw_clusters_dfs = []
reconstruction_fillna_smoothed_clusters_dfs = []
if len(reconstruction_fillna_stable_df) > 0:
	reconstruction_fillna_raw_stable_clusters = reconstruction_fillna_stable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	reconstruction_fillna_raw_clusters_dfs.append(reconstruction_fillna_raw_stable_clusters)
	reconstruction_fillna_smoothed_stable_clusters = reconstruction_fillna_stable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	reconstruction_fillna_smoothed_clusters_dfs.append(reconstruction_fillna_smoothed_stable_clusters)

if len(reconstruction_fillna_unstable_df) > 0:
	reconstruction_fillna_raw_unstable_clusters = reconstruction_fillna_unstable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	reconstruction_fillna_raw_clusters_dfs.append(reconstruction_fillna_raw_unstable_clusters)
	reconstruction_fillna_smoothed_unstable_clusters = reconstruction_fillna_unstable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	reconstruction_fillna_smoothed_clusters_dfs.append(reconstruction_fillna_smoothed_unstable_clusters)

reconstruction_fillna_raw_clusters = pd.concat(reconstruction_fillna_raw_clusters_dfs)
reconstruction_fillna_smoothed_clusters = pd.concat(reconstruction_fillna_smoothed_clusters_dfs)

reconstruction_thresholding_unstable_df, reconstruction_thresholding_stable_df = test_stability_clusters(grouped_reconstruction_thresholding_full_raw_df, grouped_reconstruction_thresholding_subset_raw_df, grouped_reconstruction_thresholding_full_smoothed_df, grouped_reconstruction_thresholding_subset_smoothed_df)

reconstruction_thresholding_raw_clusters_dfs = []
reconstruction_thresholding_smoothed_clusters_dfs = []

if len(reconstruction_thresholding_stable_df) > 0:
	reconstruction_thresholding_raw_stable_clusters = reconstruction_thresholding_stable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	reconstruction_thresholding_raw_clusters_dfs.append(reconstruction_thresholding_raw_stable_clusters)
	reconstruction_thresholding_smoothed_stable_clusters = reconstruction_thresholding_stable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	reconstruction_thresholding_smoothed_clusters_dfs.append(reconstruction_thresholding_smoothed_stable_clusters)

if len(reconstruction_thresholding_unstable_df) > 0:
	reconstruction_thresholding_raw_unstable_clusters = reconstruction_thresholding_unstable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	reconstruction_thresholding_raw_clusters_dfs.append(reconstruction_thresholding_raw_unstable_clusters)
	reconstruction_thresholding_smoothed_unstable_clusters = reconstruction_thresholding_unstable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	reconstruction_thresholding_smoothed_clusters_dfs.append(reconstruction_thresholding_smoothed_unstable_clusters)


reconstruction_thresholding_raw_clusters = pd.concat(reconstruction_thresholding_raw_clusters_dfs)
reconstruction_thresholding_smoothed_clusters = pd.concat(reconstruction_thresholding_smoothed_clusters_dfs)

### Signal Metrics

In [167]:
norm_signal_fillna_full_raw_df = run_correlations_clustering(merged_full_raw_df, norm_signal_cols, "Full Raw", False, True)
norm_signal_thresholding_full_raw_df = run_correlations_clustering(merged_subset_raw_df, norm_signal_cols, "Subset Raw", True, False)

⚠️ Columns with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
⚠️ Rows with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
Most stable number of clusters: 5
Most stable number of clusters: 5
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
🛑 Dropping 1 columns due to NaNs before transposing: ['wavelet_mse_norm']
🛑 Dropping 1 rows due to NaNs before transposing: ['wavelet_mse_norm']
Most stable numb

In [168]:
norm_signal_fillna_subset_raw_df = run_correlations_clustering(merged_subset_raw_df, norm_signal_cols, "Subset Raw", False, True)
norm_signal_thresholding_subset_raw_df = run_correlations_clustering(merged_subset_raw_df, norm_signal_cols, "Subset Raw", True, False)

⚠️ Columns with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
⚠️ Rows with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
Most stable number of clusters: 5
Most stable number of clusters: 5
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
🛑 Dropping 1 columns due to NaNs before transposing: ['wavelet_mse_norm']
🛑 Dropping 1 rows due to NaNs before transposing: ['wavelet_mse_norm']
Most stable numb

In [169]:
norm_signal_fillna_full_smoothed_df = run_correlations_clustering(merged_full_smoothed_df, norm_signal_cols, "Full Smoothed", False, True)
norm_signal_thresholding_full_smoothed_df = run_correlations_clustering(merged_full_smoothed_df, norm_signal_cols, "Full Smoothed", True, False)

⚠️ Columns with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
⚠️ Rows with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
Most stable number of clusters: 6
Most stable number of clusters: 5
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
🛑 Dropping 1 columns due to NaNs before transposing: ['wavelet_mse_norm']
🛑 Dropping 1 rows due to NaNs before transposing: ['wavelet_mse_norm']
Most stable numb

In [170]:
norm_signal_fillna_subset_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, norm_signal_cols, "Subset Smoothed", False, True)
norm_signal_thresholding_subset_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, norm_signal_cols, "Subset Smoothed", True, False)

⚠️ Columns with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
⚠️ Rows with NaNs after transposing: ['wavelet_mse_norm', 'wavelet_psnr_norm', 'emd_value_norm', 'kl_divergence_norm', 'wavelet_energy_entropy_norm', 'wavelet_sparsity_norm', 'wavelet_entropy_norm', 'smoothness_norm', 'correlation_norm', 'avg_variance_across_levels_norm', 'max_autocorrelation_diff_normalized']
Most stable number of clusters: 6
Most stable number of clusters: 5
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
Using HDBSCAN with params: {'min_cluster_size': 2, 'min_samples': None}
🛑 Dropping 1 columns due to NaNs before transposing: ['wavelet_mse_norm']
🛑 Dropping 1 rows due to NaNs before transposing: ['wavelet_mse_norm']
Most stable numb

In [171]:
cluster_cols = [col for col in norm_signal_fillna_full_raw_df.columns.tolist() if 'cluster' in col]
cluster_cols

['kmeans_pca_cluster',
 'kmeans_umap_cluster',
 'hdbscan_pca_cluster',
 'hdbscan_umap_cluster']

In [172]:
final_signal_fillna_full_raw_df = combined_across_clusters(norm_signal_fillna_full_raw_df, cluster_cols)
grouped_signal_fillna_full_raw_df = final_signal_fillna_full_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_fillna_full_raw_df['new_cluster'] = grouped_signal_fillna_full_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_fillna_full_raw_df['data_type'] = 'Full Raw'
grouped_signal_fillna_full_raw_df.sort_values(by='new_cluster', inplace=True)
final_signal_fillna_subset_raw_df = combined_across_clusters(norm_signal_fillna_subset_raw_df, cluster_cols)
grouped_signal_fillna_subset_raw_df = final_signal_fillna_subset_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_fillna_subset_raw_df['new_cluster'] = grouped_signal_fillna_subset_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_fillna_subset_raw_df['data_type'] = 'Subset Raw'
grouped_signal_fillna_subset_raw_df.sort_values(by='new_cluster', inplace=True)
final_signal_fillna_full_smoothed_df = combined_across_clusters(norm_signal_fillna_full_smoothed_df, cluster_cols)
grouped_signal_fillna_full_smoothed_df = final_signal_fillna_full_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_fillna_full_smoothed_df['new_cluster'] = grouped_signal_fillna_full_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_fillna_full_smoothed_df['data_type'] = 'Full Smoothed'
grouped_signal_fillna_full_smoothed_df.sort_values(by='new_cluster', inplace=True)
final_signal_fillna_subset_smoothed_df = combined_across_clusters(norm_signal_fillna_subset_smoothed_df, cluster_cols)
grouped_signal_fillna_subset_smoothed_df = final_signal_fillna_subset_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_fillna_subset_smoothed_df['new_cluster'] = grouped_signal_fillna_subset_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_fillna_subset_smoothed_df['data_type'] = 'Subset Smoothed'
grouped_signal_fillna_subset_smoothed_df.sort_values(by='new_cluster', inplace=True)

In [173]:
final_signal_thresholding_full_raw_df = combined_across_clusters(norm_signal_thresholding_full_raw_df, cluster_cols)
grouped_signal_thresholding_full_raw_df = final_signal_thresholding_full_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_thresholding_full_raw_df['new_cluster'] = grouped_signal_thresholding_full_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_thresholding_full_raw_df['data_type'] = 'Full Raw'
grouped_signal_thresholding_full_raw_df.sort_values(by='new_cluster', inplace=True)
final_signal_thresholding_subset_raw_df = combined_across_clusters(norm_signal_thresholding_subset_raw_df, cluster_cols)
grouped_signal_thresholding_subset_raw_df = final_signal_thresholding_subset_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_thresholding_subset_raw_df['new_cluster'] = grouped_signal_thresholding_subset_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_thresholding_subset_raw_df['data_type'] = 'Subset Raw'
grouped_signal_thresholding_subset_raw_df.sort_values(by='new_cluster', inplace=True)
final_signal_thresholding_full_smoothed_df = combined_across_clusters(norm_signal_thresholding_full_smoothed_df, cluster_cols)
grouped_signal_thresholding_full_smoothed_df = final_signal_thresholding_full_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_thresholding_full_smoothed_df['new_cluster'] = grouped_signal_thresholding_full_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_thresholding_full_smoothed_df['data_type'] = 'Full Smoothed'
grouped_signal_thresholding_full_smoothed_df.sort_values(by='new_cluster', inplace=True)
final_signal_thresholding_subset_smoothed_df = combined_across_clusters(norm_signal_thresholding_subset_smoothed_df, cluster_cols)
grouped_signal_thresholding_subset_smoothed_df = final_signal_thresholding_subset_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_signal_thresholding_subset_smoothed_df['new_cluster'] = grouped_signal_thresholding_subset_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_signal_thresholding_subset_smoothed_df['data_type'] = 'Subset Smoothed'
grouped_signal_thresholding_subset_smoothed_df.sort_values(by='new_cluster', inplace=True)

In [190]:
signal_fillna_unstable_df, signal_fillna_stable_df = test_stability_clusters(grouped_signal_fillna_full_raw_df, grouped_signal_fillna_subset_raw_df, grouped_signal_fillna_full_smoothed_df, grouped_signal_fillna_subset_smoothed_df)

signal_fillna_raw_clusters_dfs = []
signal_fillna_smoothed_clusters_dfs = []
if len(signal_fillna_stable_df) > 0:
	signal_fillna_raw_stable_clusters = signal_fillna_stable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	signal_fillna_raw_clusters_dfs.append(signal_fillna_raw_stable_clusters)
	signal_fillna_smoothed_stable_clusters = signal_fillna_stable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	signal_fillna_smoothed_clusters_dfs.append(signal_fillna_smoothed_stable_clusters)

if len(signal_fillna_unstable_df) > 0:
	signal_fillna_raw_unstable_clusters = signal_fillna_unstable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	signal_fillna_raw_clusters_dfs.append(signal_fillna_raw_unstable_clusters)
	signal_fillna_smoothed_unstable_clusters = signal_fillna_unstable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	signal_fillna_smoothed_clusters_dfs.append(signal_fillna_smoothed_unstable_clusters)

signal_fillna_raw_clusters = pd.concat(signal_fillna_raw_clusters_dfs)
signal_fillna_smoothed_clusters = pd.concat(signal_fillna_smoothed_clusters_dfs)

signal_thresholding_unstable_df, signal_thresholding_stable_df = test_stability_clusters(grouped_signal_thresholding_full_raw_df, grouped_signal_thresholding_subset_raw_df, grouped_signal_thresholding_full_smoothed_df, grouped_signal_thresholding_subset_smoothed_df)

signal_thresholding_raw_clusters_dfs = []
signal_thresholding_smoothed_clusters_dfs = []

if len(signal_thresholding_stable_df) > 0:
	signal_thresholding_raw_stable_clusters = signal_thresholding_stable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	signal_thresholding_raw_clusters_dfs.append(signal_thresholding_raw_stable_clusters)
	signal_thresholding_smoothed_stable_clusters = signal_thresholding_stable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	signal_thresholding_smoothed_clusters_dfs.append(signal_thresholding_smoothed_stable_clusters)

if len(signal_thresholding_unstable_df) > 0:
	signal_thresholding_raw_unstable_clusters = signal_thresholding_unstable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	signal_thresholding_raw_clusters_dfs.append(signal_thresholding_raw_unstable_clusters)
	signal_thresholding_smoothed_unstable_clusters = signal_thresholding_unstable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	signal_thresholding_smoothed_clusters_dfs.append(signal_thresholding_smoothed_unstable_clusters)


signal_thresholding_raw_clusters = pd.concat(signal_thresholding_raw_clusters_dfs)
signal_thresholding_smoothed_clusters = pd.concat(signal_thresholding_smoothed_clusters_dfs)

### Across Reconstruction & Signal

In [246]:
all_cols = norm_reconstruction_cols + norm_signal_cols
norm_fillna_full_raw_df = run_correlations_clustering(merged_full_raw_df, all_cols, "Full Raw", False, True)
norm_thresholding_full_raw_df = run_correlations_clustering(merged_subset_raw_df, all_cols, "Subset Raw", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [247]:
norm_fillna_subset_raw_df = run_correlations_clustering(merged_subset_raw_df, all_cols, "Subset Raw", False, True)
norm_thresholding_subset_raw_df = run_correlations_clustering(merged_subset_raw_df, all_cols, "Subset Raw", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [248]:
norm_fillna_full_smoothed_df = run_correlations_clustering(merged_full_smoothed_df, all_cols, "Full Smoothed", False, True)
norm_thresholding_full_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, all_cols, "Subset Smoothed", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [249]:
norm_fillna_subset_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, all_cols, "Subset Smoothed", False, True)
norm_thresholding_subset_smoothed_df = run_correlations_clustering(merged_subset_smoothed_df, all_cols, "Subset Smoothed", True, False)

⚠️ Columns with NaNs after transposing: ['frequency_max_diff_normalized', 'avg_prominence_diff_normalized', 'spectral_magnitude_diff_normalized', 'prominence_max_diff_normalized', 'spectral_centroid_diff_normalized', 'relative_left_bases_matcher_alignment_score_normalized', 'relative_left_bases_global_alignment_score_normalized', 'spectral_bandwidth_diff_normalized', 'upper_envelope_diff_normalized', 'positive_frequencies_dtw_normalized', 'positive_frequencies_euclidean_normalized', 'positive_frequencies_wasserstein_normalized', 'num_fft_peaks_diff_normalized', 'relative_num_peaks_diff_normalized', 'dominant_frequency_diff_normalized', 'lower_envelope_diff_normalized', 'relative_prominences_avg_diff_normalized', 'relative_prominences_total_diff_normalized', 'relative_prominences_wasserstein_normalized', 'relative_right_bases_matcher_alignment_score_normalized', 'relative_right_bases_global_alignment_score_normalized', 'positive_amplitudes_dtw_normalized', 'positive_amplitudes_euclidean

In [250]:
final_fillna_full_raw_df = combined_across_clusters(norm_fillna_full_raw_df, cluster_cols)
grouped_fillna_full_raw_df = final_fillna_full_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_fillna_full_raw_df['new_cluster'] = grouped_fillna_full_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_fillna_full_raw_df['data_type'] = 'Full Raw'
grouped_fillna_full_raw_df.sort_values(by='new_cluster', inplace=True)
final_fillna_subset_raw_df = combined_across_clusters(norm_fillna_subset_raw_df, cluster_cols)
grouped_fillna_subset_raw_df = final_fillna_subset_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_fillna_subset_raw_df['new_cluster'] = grouped_fillna_subset_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_fillna_subset_raw_df['data_type'] = 'Subset Raw'
grouped_fillna_subset_raw_df.sort_values(by='new_cluster', inplace=True)
final_fillna_full_smoothed_df = combined_across_clusters(norm_fillna_full_smoothed_df, cluster_cols)
grouped_fillna_full_smoothed_df = final_fillna_full_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_fillna_full_smoothed_df['new_cluster'] = grouped_fillna_full_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_fillna_full_smoothed_df['data_type'] = 'Full Smoothed'
grouped_fillna_full_smoothed_df.sort_values(by='new_cluster', inplace=True)
final_fillna_subset_smoothed_df = combined_across_clusters(norm_fillna_subset_smoothed_df, cluster_cols)
grouped_fillna_subset_smoothed_df = final_fillna_subset_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_fillna_subset_smoothed_df['new_cluster'] = grouped_fillna_subset_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_fillna_subset_smoothed_df['data_type'] = 'Subset Smoothed'
grouped_fillna_subset_smoothed_df.sort_values(by='new_cluster', inplace=True)

In [251]:
final_thresholding_full_raw_df = combined_across_clusters(norm_thresholding_full_raw_df, cluster_cols)
grouped_thresholding_full_raw_df = final_thresholding_full_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_thresholding_full_raw_df['new_cluster'] = grouped_thresholding_full_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_thresholding_full_raw_df['data_type'] = 'Full Raw'
grouped_thresholding_full_raw_df.sort_values(by='new_cluster', inplace=True)
final_thresholding_subset_raw_df = combined_across_clusters(norm_thresholding_subset_raw_df, cluster_cols)
grouped_thresholding_subset_raw_df = final_thresholding_subset_raw_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_thresholding_subset_raw_df['new_cluster'] = grouped_thresholding_subset_raw_df['new_cluster'].apply(lambda x: x[0])
grouped_thresholding_subset_raw_df['data_type'] = 'Subset Raw'
grouped_thresholding_subset_raw_df.sort_values(by='new_cluster', inplace=True)
final_thresholding_full_smoothed_df = combined_across_clusters(norm_thresholding_full_smoothed_df, cluster_cols)
grouped_thresholding_full_smoothed_df = final_thresholding_full_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_thresholding_full_smoothed_df['new_cluster'] = grouped_thresholding_full_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_thresholding_full_smoothed_df['data_type'] = 'Full Smoothed'
grouped_thresholding_full_smoothed_df.sort_values(by='new_cluster', inplace=True)
final_thresholding_subset_smoothed_df = combined_across_clusters(norm_thresholding_subset_smoothed_df, cluster_cols)
grouped_thresholding_subset_smoothed_df = final_thresholding_subset_smoothed_df.groupby(['metric']).agg({'original_cluster': lambda x: list(x), 'cluster_type': lambda x: list(x), 'new_cluster': 'unique'}).reset_index()
grouped_thresholding_subset_smoothed_df['new_cluster'] = grouped_thresholding_subset_smoothed_df['new_cluster'].apply(lambda x: x[0])
grouped_thresholding_subset_smoothed_df['data_type'] = 'Subset Smoothed'
grouped_thresholding_subset_smoothed_df.sort_values(by='new_cluster', inplace=True)

In [252]:
fillna_unstable_df, fillna_stable_df = test_stability_clusters(grouped_fillna_full_raw_df, grouped_fillna_subset_raw_df, grouped_fillna_full_smoothed_df, grouped_fillna_subset_smoothed_df)

fillna_raw_clusters_dfs = []
fillna_smoothed_clusters_dfs = []
if len(fillna_stable_df) > 0:
	fillna_raw_stable_clusters = fillna_stable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	fillna_raw_clusters_dfs.append(fillna_raw_stable_clusters)
	fillna_smoothed_stable_clusters = fillna_stable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	fillna_smoothed_clusters_dfs.append(fillna_smoothed_stable_clusters)

if len(fillna_unstable_df) > 0:
	fillna_raw_unstable_clusters = fillna_unstable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	fillna_raw_clusters_dfs.append(fillna_raw_unstable_clusters)
	fillna_smoothed_unstable_clusters = fillna_unstable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	fillna_smoothed_clusters_dfs.append(fillna_smoothed_unstable_clusters)

fillna_raw_clusters = pd.concat(fillna_raw_clusters_dfs)
fillna_smoothed_clusters = pd.concat(fillna_smoothed_clusters_dfs)

thresholding_unstable_df, thresholding_stable_df = test_stability_clusters(grouped_thresholding_full_raw_df, grouped_thresholding_subset_raw_df, grouped_thresholding_full_smoothed_df, grouped_thresholding_subset_smoothed_df)

thresholding_raw_clusters_dfs = []
thresholding_smoothed_clusters_dfs = []

if len(thresholding_stable_df) > 0:
	thresholding_raw_stable_clusters = thresholding_stable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	thresholding_raw_clusters_dfs.append(thresholding_raw_stable_clusters)
	thresholding_smoothed_stable_clusters = thresholding_stable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	thresholding_smoothed_clusters_dfs.append(thresholding_smoothed_stable_clusters)

if len(thresholding_unstable_df) > 0:
	thresholding_raw_unstable_clusters = thresholding_unstable_df[['cluster_number_raw', 'cluster_raw']].drop_duplicates(subset=['cluster_number_raw'])
	thresholding_raw_clusters_dfs.append(thresholding_raw_unstable_clusters)
	thresholding_smoothed_unstable_clusters = thresholding_unstable_df[['cluster_number_smoothed', 'cluster_smoothed']].drop_duplicates(subset=['cluster_number_smoothed'])
	thresholding_smoothed_clusters_dfs.append(thresholding_smoothed_unstable_clusters)


thresholding_raw_clusters = pd.concat(thresholding_raw_clusters_dfs)
thresholding_smoothed_clusters = pd.concat(thresholding_smoothed_clusters_dfs)

### Assess Corelations Across Clusters & Metrics

In [261]:
print("Full Raw Correlation")
full_raw_corr = compute_correlation_scores(merged_full_raw_df, norm_cols, "full", "raw")

print("Full Smoothed Correlation")
full_smoothed_corr = compute_correlation_scores(merged_full_smoothed_df, norm_cols, "full", "smoothed")

Full Raw Correlation
Full Smoothed Correlation


In [262]:
full_raw_corr['reconstruction_fillna_metric_cluster'] = None
full_raw_corr['signal_fillna_metric_cluster'] = None
full_raw_corr['all_fillna_metric_cluster'] = None
full_raw_corr['reconstruction_thresholding_metric_cluster'] = None
full_raw_corr['signal_thresholding_metric_cluster'] = None
full_raw_corr['all_thresholding_metric_cluster'] = None

print("Reconstruction Raw Correlation Fill NaNs")

for idx, row in reconstruction_fillna_raw_clusters.iterrows():
	full_raw_corr.loc[full_raw_corr.metric.isin(row.cluster_raw), 'reconstruction_fillna_metric_cluster'] = row.cluster_number_raw

print("Signal Raw Correlation Fill NaNs")
for idx, row in signal_fillna_raw_clusters.iterrows():
	full_raw_corr.loc[full_raw_corr.metric.isin(row.cluster_raw), 'signal_fillna_metric_cluster'] = row.cluster_number_raw

print("All Raw Correlation Fill NaNs")
for idx, row in fillna_raw_clusters.iterrows():
	full_raw_corr.loc[full_raw_corr.metric.isin(row.cluster_raw), 'all_fillna_metric_cluster'] = row.cluster_number_raw


print("Reconstruction Raw Correlation Thresholding")

for idx, row in reconstruction_thresholding_raw_clusters.iterrows():
	full_raw_corr.loc[full_raw_corr.metric.isin(row.cluster_raw), 'reconstruction_thresholding_metric_cluster'] = row.cluster_number_raw

print("Signal Raw Correlation Thresholding")
for idx, row in signal_thresholding_raw_clusters.iterrows():

	full_raw_corr.loc[full_raw_corr.metric.isin(row.cluster_raw), 'signal_thresholding_metric_cluster'] = row.cluster_number_raw

print("All Raw Correlation Thresholding")
for idx, row in thresholding_raw_clusters.iterrows():
	full_raw_corr.loc[full_raw_corr.metric.isin(row.cluster_raw), 'all_thresholding_metric_cluster'] = row.cluster_number_raw

full_raw_corr[full_raw_corr.metric.isin(norm_reconstruction_cols + norm_signal_cols)].sort_values(by='all_fillna_metric_cluster')

Reconstruction Raw Correlation Fill NaNs
Signal Raw Correlation Fill NaNs
All Raw Correlation Fill NaNs
Reconstruction Raw Correlation Thresholding
Signal Raw Correlation Thresholding
All Raw Correlation Thresholding


Unnamed: 0,metric,reconstruction_score_sum,wavelet_summed_norm_score,reconstruction_score_weighted,final_score,summed_scores,data_type,signal_type,reconstruction_fillna_metric_cluster,signal_fillna_metric_cluster,all_fillna_metric_cluster,reconstruction_thresholding_metric_cluster,signal_thresholding_metric_cluster,all_thresholding_metric_cluster
2,relative_right_bases_global_alignment_score_no...,0.804091,0.125058,0.976296,-0.044909,0.784112,full,raw,0.0,,0,3.0,,2.0
44,dominant_frequency_diff_normalized,,,,,,full,raw,0.0,,0,,,
43,relative_num_peaks_diff_normalized,,,,,,full,raw,0.0,,0,,,
42,num_fft_peaks_diff_normalized,,,,,,full,raw,0.0,,0,,,
41,positive_frequencies_wasserstein_normalized,,,,,,full,raw,0.0,,0,,,
40,positive_frequencies_euclidean_normalized,,,,,,full,raw,0.0,,0,,,
39,positive_frequencies_dtw_normalized,,,,,,full,raw,0.0,,0,,,
38,relative_left_bases_global_alignment_score_nor...,,,,,,full,raw,0.0,,0,,,
37,relative_left_bases_matcher_alignment_score_no...,,,,,,full,raw,0.0,,0,,,
36,frequency_max_diff_normalized,,,,,,full,raw,0.0,,0,,,


In [263]:
full_smoothed_corr['reconstruction_fillna_metric_cluster'] = None
full_smoothed_corr['signal_fillna_metric_cluster'] = None
full_smoothed_corr['all_fillna_metric_cluster'] = None
full_smoothed_corr['reconstruction_thresholding_metric_cluster'] = None
full_smoothed_corr['signal_thresholding_metric_cluster'] = None
full_smoothed_corr['all_thresholding_metric_cluster'] = None

print("Reconstruction smoothed Correlation Fill NaNs")

for idx, row in reconstruction_fillna_smoothed_clusters.iterrows():
	full_smoothed_corr.loc[full_smoothed_corr.metric.isin(row.cluster_smoothed), 'reconstruction_fillna_metric_cluster'] = row.cluster_number_smoothed

print("Signal smoothed Correlation Fill NaNs")
for idx, row in signal_fillna_smoothed_clusters.iterrows():
	full_smoothed_corr.loc[full_smoothed_corr.metric.isin(row.cluster_smoothed), 'signal_fillna_metric_cluster'] = row.cluster_number_smoothed

print("All smoothed Correlation Fill NaNs")
for idx, row in fillna_smoothed_clusters.iterrows():
	full_smoothed_corr.loc[full_smoothed_corr.metric.isin(row.cluster_smoothed), 'all_fillna_metric_cluster'] = row.cluster_number_smoothed


print("Reconstruction smoothed Correlation Thresholding")

for idx, row in reconstruction_thresholding_smoothed_clusters.iterrows():
	full_smoothed_corr.loc[full_smoothed_corr.metric.isin(row.cluster_smoothed), 'reconstruction_thresholding_metric_cluster'] = row.cluster_number_smoothed

print("Signal smoothed Correlation Thresholding")
for idx, row in signal_thresholding_smoothed_clusters.iterrows():

	full_smoothed_corr.loc[full_smoothed_corr.metric.isin(row.cluster_smoothed), 'signal_thresholding_metric_cluster'] = row.cluster_number_smoothed

print("All smoothed Correlation Thresholding")
for idx, row in thresholding_smoothed_clusters.iterrows():
	full_smoothed_corr.loc[full_smoothed_corr.metric.isin(row.cluster_smoothed), 'all_thresholding_metric_cluster'] = row.cluster_number_smoothed

full_smoothed_corr[full_smoothed_corr.metric.isin(norm_reconstruction_cols + norm_signal_cols)].sort_values(by='all_fillna_metric_cluster')

Reconstruction smoothed Correlation Fill NaNs
Signal smoothed Correlation Fill NaNs
All smoothed Correlation Fill NaNs
Reconstruction smoothed Correlation Thresholding
Signal smoothed Correlation Thresholding
All smoothed Correlation Thresholding


Unnamed: 0,metric,reconstruction_score_sum,wavelet_summed_norm_score,reconstruction_score_weighted,final_score,summed_scores,data_type,signal_type,reconstruction_fillna_metric_cluster,signal_fillna_metric_cluster,all_fillna_metric_cluster,reconstruction_thresholding_metric_cluster,signal_thresholding_metric_cluster,all_thresholding_metric_cluster
3,relative_left_bases_global_alignment_score_nor...,0.821174,-0.019131,0.844381,0.003083,0.811191,full,smoothed,0.0,,0,2.0,,2.0
44,dominant_frequency_diff_normalized,,,,,,full,smoothed,0.0,,0,,,
43,relative_num_peaks_diff_normalized,,,,,,full,smoothed,0.0,,0,,,
42,num_fft_peaks_diff_normalized,,,,,,full,smoothed,0.0,,0,,,
41,positive_frequencies_wasserstein_normalized,,,,,,full,smoothed,0.0,,0,,,
40,positive_frequencies_euclidean_normalized,,,,,,full,smoothed,0.0,,0,,,
39,positive_frequencies_dtw_normalized,,,,,,full,smoothed,0.0,,0,,,
38,frequency_max_diff_normalized,,,,,,full,smoothed,0.0,,0,,,
37,wavelet_mse_norm,,,,,,full,smoothed,,0.0,0,,,
29,wavelet_sparsity_norm,-0.088997,0.321134,-0.074502,-0.042954,-0.067252,full,smoothed,,4.0,0,,3.0,2.0


In [278]:
subset_full_smoothed_corr = full_smoothed_corr[full_smoothed_corr.metric.isin(norm_reconstruction_cols + norm_signal_cols)][['metric', 'all_fillna_metric_cluster']]
subset_full_smoothed_corr = subset_full_smoothed_corr.rename(columns={'all_fillna_metric_cluster': 'smoothed_all_fillna_metric_cluster'})

subset_full_raw_corr = full_raw_corr[full_raw_corr.metric.isin(norm_reconstruction_cols + norm_signal_cols)][['metric', 'all_fillna_metric_cluster']]
subset_full_raw_corr = subset_full_raw_corr.rename(columns={'all_fillna_metric_cluster': 'raw_all_fillna_metric_cluster'})

subset_full_corr = pd.merge(subset_full_raw_corr, subset_full_smoothed_corr, on='metric', how='outer')
subset_full_corr.sort_values(by='raw_all_fillna_metric_cluster')

Unnamed: 0,metric,raw_all_fillna_metric_cluster,smoothed_all_fillna_metric_cluster
39,wavelet_sparsity_norm,0,0
17,positive_frequencies_wasserstein_normalized,0,0
16,positive_frequencies_euclidean_normalized,0,0
15,positive_frequencies_dtw_normalized,0,0
22,relative_num_peaks_diff_normalized,0,0
23,relative_peaks_global_alignment_score_normalized,0,0
24,relative_peaks_matcher_alignment_score_normalized,0,0
11,num_fft_peaks_diff_normalized,0,0
21,relative_left_bases_matcher_alignment_score_no...,0,0
28,relative_right_bases_global_alignment_score_no...,0,0


| **Cluster** | **Metric Themes** | **Interpretation** | **Suggested Weighting** |
|------------|------------------|---------------------|-------------------------|
| **0 (Amplitude & Prominence Variability)** | `prominence_min_diff_normalized`, `positive_amplitudes_dtw_normalized`, `avg_prominence_diff_normalized`, `relative_prominences_avg_diff_normalized`, `positive_amplitudes_wasserstein_normalized`, `positive_amplitudes_euclidean_normalized`, `relative_prominences_total_diff_normalized` | These metrics capture **changes in peak prominence and amplitude variability**. They likely **track how well the fundamental peaks and variations in amplitude are preserved during reconstruction**, rather than distortions introduced. This cluster is particularly relevant for analyzing **signal integrity at the level of local variations**. | **High (0.4 - 0.6)** - These metrics directly impact perceptual quality and core structure of the signal. |
| **1 (Spectral & Structure-Based Distortions)** | `amplitude_max_diff_normalized`, `dynamic_cutoff_diff_normalized`, `spectral_centroid_diff_normalized`, `spectral_magnitude_diff_normalized`, `spectral_bandwidth_diff_normalized`, `max_autocorrelation_diff_normalized`, `prominence_max_diff_normalized`, `wavelet_psnr_norm`, `emd_value_norm` | This cluster captures **spectral characteristics and periodicity distortions**. It is likely linked to **how frequencies shift during reconstruction**, affecting **signal clarity, periodic features, and information loss in the spectral domain**. These metrics may be crucial for assessing whether **the reconstruction maintains the original frequency composition or introduces unwanted distortions**. | **Very High (0.5 - 0.7)** - These spectral distortions are crucial for identifying reconstruction failure. |
| **2 (Envelope & Structural Integrity Metrics)** | `upper_envelope_diff_normalized`, `lower_envelope_diff_normalized`, `relative_left_bases_global_alignment_score_normalized`, `relative_left_bases_matcher_alignment_score_normalized`, `relative_right_bases_global_alignment_score_normalized`, `relative_right_bases_matcher_alignment_score_normalized` | This cluster tracks **global structural alignment and waveform shape distortions**. It likely relates to **how well the reconstructed signal maintains its envelope, positioning, and symmetry**. The alignment scores suggest potential **misalignments in reconstruction**, affecting overall waveform integrity. | **Moderate (0.2 - 0.4)** - Important for signal alignment but not always tied to perceptual fidelity. |
| **3 (Wavelet-Based & Entropy Metrics)** | `wavelet_energy_entropy_norm`, `wavelet_sparsity_norm`, `smoothness_norm`, `wavelet_entropy_norm`, `kl_divergence_norm`, `wavelet_mse_norm`, `correlation_norm`, `avg_variance_across_levels_norm` | This cluster focuses on **wavelet decomposition properties, entropy, and sparsity**, which relate to **how efficiently the signal is represented in the transformed space**. These metrics likely indicate **whether the reconstruction preserves key features of the compressed or transformed representation**. Higher entropy or MSE may indicate **greater information loss or unnecessary complexity in the reconstructed signal**. | **Lower (0.1 - 0.3)** - Useful for assessing efficiency and compression but less critical for perceptual accuracy. |

In [535]:
reconstruction_metric_weights = {
    # **Cluster 1: Prominence & Amplitude-Based Metrics (Moderate Importance)**
    "prominence_min_diff_normalized": 0.3,
    "positive_amplitudes_dtw_normalized": 0.35,
    "positive_amplitudes_euclidean_normalized": 0.35,
    "positive_amplitudes_wasserstein_normalized": 0.35,
    "avg_prominence_diff_normalized": 0.3,
    "relative_prominences_avg_diff_normalized": 0.3,
    "relative_prominences_wasserstein_normalized": 0.3,
    "positive_frequencies_dtw_normalized": 0.35,
    "positive_frequencies_euclidean_normalized": 0.35,
    "positive_frequencies_wasserstein_normalized": 0.35,

    # **Cluster 2: Spectral & Structural Fidelity (Highest Importance)**
    "spectral_centroid_diff_normalized": 0.6,
    "spectral_magnitude_diff_normalized": 0.6,
    "dynamic_cutoff_diff_normalized": 0.5,
    "relative_prominences_total_diff_normalized": 0.5,
    "prominence_max_diff_normalized": 0.5,
    "amplitude_max_diff_normalized": 0.5,
    "frequency_max_diff_normalized": 0.5,
    "spectral_bandwidth_diff_normalized": 0.5,
    "num_fft_peaks_diff_normalized": 0.4,
    "relative_num_peaks_diff_normalized": 0.4,  # Fixed typo
    "dominant_frequency_diff_normalized": 0.5,
    "max_autocorrelation_diff_normalized": 0.5,  # Ensured it's here

    # **Cluster 3: Alignment-Based Metrics (Lower Importance)**
    "relative_right_bases_global_alignment_score_normalized": 0.1,
    "relative_right_bases_matcher_alignment_score_normalized": 0.1,
    "relative_left_bases_global_alignment_score_normalized": 0.1,
    "relative_left_bases_matcher_alignment_score_normalized": 0.1,
    "upper_envelope_diff_normalized": 0.2,
    "lower_envelope_diff_normalized": 0.2,
    "relative_peaks_matcher_alignment_score_normalized": 0.2,
    "relative_peaks_global_alignment_score_normalized": 0.2,
}

signal_metric_weights = {
    # **Cluster 1: Prominence & Amplitude-Based Metrics (Moderate Priority)**
    "emd_value_norm": 0.3,
    "smoothness_norm": 0.3,

    # **Cluster 2: Spectral & Structural Fidelity (Highest Priority)**
    "wavelet_mse_norm": 0.6,
    "wavelet_psnr_norm": 0.6,
    "wavelet_entropy_norm": 0.5,
    "wavelet_energy_entropy_norm": 0.5,
    "wavelet_sparsity_norm": 0.4,
    "correlation_norm": 0.4,
    "avg_variance_across_levels_norm": 0.4,

    # **Cluster 3: Structural Fidelity (Moderate Priority)**
    "kl_divergence_norm": 0.4,  # Ensured consistency with reconstruction
}

In [536]:
base_reconstruction_cols = list(reconstruction_metric_weights.keys())
base_signal_cols = list(signal_metric_weights.keys())


In [537]:
missing_reconstruction_cols = [col for col in norm_reconstruction_cols if col not in base_reconstruction_cols]
missing_signal_cols = [col for col in norm_signal_cols if col not in base_signal_cols]
print("Missing Reconstruction Columns")
print(missing_reconstruction_cols)
print("Missing Signal Columns")
print(missing_signal_cols)

Missing Reconstruction Columns
[]
Missing Signal Columns
['max_autocorrelation_diff_normalized']


In [520]:
# Compute reconstruction scores
def weight_scores(normalized_df, metric_weights, metric_type):
	total_scores_weighted = []
	for _, row in normalized_df.iterrows():
		weighted_scores = []
		for metric, weight in metric_weights.items():
			metric_columns = [col for col in normalized_df.columns if col == metric]
			if metric_columns:
				metric_score = row[metric_columns].mean()  # Average across related normalized metrics
				weighted_scores.append(weight * metric_score)

		total_scores_weighted.append(sum(weighted_scores))

	normalized_df[f"redo_{metric_type}_score_weighted"] = total_scores_weighted

	# Compute **simple summation reconstruction score**
	normalized_columns = [col for col in normalized_df.columns if col.endswith("_normalized") or col.endswith("_norm")]
	normalized_df[f"redo_{metric_type}_score_sum"] = normalized_df[normalized_columns].sum(axis=1)
	ranked_comparison_df = normalized_df.sort_values(by=f"redo_{metric_type}_score_weighted", ascending=False).reset_index(drop=True)
	ranked_comparison_df[f"redo_{metric_type}_rank_weighted"] = ranked_comparison_df.index + 1  # Rank by weighted method
	return ranked_comparison_df

normalized_df = merged_full_raw_df.copy()
normalized_df = weight_scores(normalized_df, reconstruction_metric_weights, "reconstruction")
normalized_df = weight_scores(normalized_df, signal_metric_weights, "signal")


In [521]:
def generate_ranking_config(ranking_config_dict, metric_weights):

	# Populate ranking_config dynamically
	for metric, original_weight in metric_weights.items():
		ranking_config_dict["metrics"].append({
			"metric": metric.split('_norm')[0],
			"original_weight": original_weight,
			"final_weight": None,  # Will be updated later
			"normalized_weight": None,  # Will be updated later
			"ignore_metric": False,
			"removal_reason": None,
			"was_inverted": False,
			"was_shared": False,
			"was_specific": False,
			"variance": None,
			"presence": None,
			"was_zscored": False,
		})
	return ranking_config_dict

ranking_config_dict = {
		"metrics": [],
	}
ranking_config_dict = generate_ranking_config(ranking_config_dict, reconstruction_metric_weights)
ranking_config_dict = generate_ranking_config(ranking_config_dict, signal_metric_weights)


In [522]:
def get_normalized_weights(results_df, existing_metrics, metric_weights, ranking_config_dict, metric_signal_type):
	norm_type = "_norm" if metric_signal_type == "signal" else "_normalized"
	final_metrics = [metric for metric in existing_metrics if metric in results_df.columns]
	subset_final_metrics = [metric.split(norm_type)[0] for metric in final_metrics]

	threshold = 0.9
	shared_weight_factor = 0.7
	specific_weight_factor = 0.3
	min_weight = 0.05
	max_weight = 0.5

	# Step 1: Compute Variance and Presence
	metric_variances = results_df[final_metrics].var()
	metric_presence = results_df[final_metrics].notna().mean()

	# Step 2: Log Variance and Presence in ranking_config
	for metric in subset_final_metrics:
		for metric_config in ranking_config_dict["metrics"]:
			if metric_config["metric"] == metric:
				metric = metric + norm_type
				metric_config["variance"] = metric_variances.get(metric, None)
				metric_config["presence"] = metric_presence.get(metric, None)

	# Step 3: Adjust Initial Weights Dynamically
	dynamic_adjustments = {
		metric: min(metric_weights.get(metric, min_weight) * max(metric_variances[metric] * metric_presence[metric], min_weight * 10), max_weight)
		for metric in final_metrics
	}
	print(f"Dynamic Adjustments: {dynamic_adjustments}")

	# Step 4: Normalize Dynamic Adjustments
	total_adjustment = sum(dynamic_adjustments.values())
	if total_adjustment == 0:
		print("[bright_red]All dynamic adjustments are zero. Assigning minimum weights.[/bright_red]")
		normalized_weights = {metric: min_weight for metric in final_metrics}
	else:
		normalized_weights = {
			metric: min(adjustment / total_adjustment, max_weight)  # Ensure max_weight cap
			for metric, adjustment in dynamic_adjustments.items()
		}

	print(f"Normalized Weights: {normalized_weights}")

	# Step 5: Split Metrics Based on Shared & Specific Categories
	shared_metrics = [m for m in final_metrics if metric_presence[m] >= threshold]
	specific_metrics = [m for m in final_metrics if m not in shared_metrics]

	# If no valid metrics exist, raise an error
	if not shared_metrics and not specific_metrics:
		raise ValueError("No valid metrics to normalize. Please check metric definitions.")

	# Step 6: Apply Shared/Specific Weight Factors
	shared_total_weight = sum(dynamic_adjustments[m] for m in shared_metrics)
	specific_total_weight = sum(dynamic_adjustments[m] for m in specific_metrics)

	for metric in shared_metrics:
		normalized_weights[metric] = min(
			(dynamic_adjustments[metric] / shared_total_weight) * shared_weight_factor,
			max_weight
		)
		# Log "shared" flag in ranking_config
		for metric_config in ranking_config_dict["metrics"]:
			metric = metric.split(norm_type)[0]
			if metric_config["metric"] == metric:
				metric_config["was_shared"] = True

	for metric in specific_metrics:
		normalized_weights[metric] = min(
			(dynamic_adjustments[metric] / specific_total_weight) * specific_weight_factor,
			max_weight
		)
		# Log "specific" flag in ranking_config
		for metric_config in ranking_config_dict["metrics"]:
			metric = metric.split(norm_type)[0]
			if metric_config["metric"] == metric:
				metric_config["was_specific"] = True

	# Step 7: Ensure Weights Sum to 1
	total_weight = sum(normalized_weights.values())
	if total_weight == 0:
		print("[bright_red]All weights removed! Check your metric definitions.[/bright_red]")
		normalized_weights = {metric: min_weight for metric in final_metrics}  # Fallback to min_weight
		print("[yellow]Fallback: Minimum weights assigned to all metrics.[/yellow]")
	else:
		normalized_weights = {metric: weight / total_weight for metric, weight in normalized_weights.items()}

	print(f"Final Weights: {normalized_weights}")
	# Step 8: Validate Weights and Ensure No Zeros
	for metric in final_metrics:
		if normalized_weights.get(metric, 0) == 0:
			raise ValueError(
				f"Critical Error: Metric '{metric}' has a weight of zero. "
				"This suggests an unexpected omission or issue in the weighting logic."
			)

	# Step 9: Log Final Weights in ranking_config
	for metric, final_weight in normalized_weights.items():
		for metric_config in ranking_config_dict["metrics"]:
			metric = metric.split(norm_type)[0]
			if metric_config["metric"] == metric:
				metric_config["final_weight"] = final_weight
				metric_config["normalized_weight"] = final_weight / sum(normalized_weights.values())  # Normalize to sum to 1
	return normalized_weights, subset_final_metrics, final_metrics, ranking_config_dict

reconstruction_normalized_weights, reconstruction_subset_final_metrics, reconstruction_final_metrics, ranking_config_dict = get_normalized_weights(normalized_df, base_reconstruction_cols, reconstruction_metric_weights, ranking_config_dict, "reconstruction")
signal_normalized_weights, signal_subset_final_metrics, signal_final_metrics, ranking_config_dict = get_normalized_weights(normalized_df, base_signal_cols, signal_metric_weights, ranking_config_dict, "signal")

Dynamic Adjustments: {'prominence_min_diff_normalized': 0.15, 'positive_amplitudes_dtw_normalized': 0.175, 'positive_amplitudes_euclidean_normalized': 0.175, 'positive_amplitudes_wasserstein_normalized': 0.175, 'avg_prominence_diff_normalized': 0.15, 'relative_prominences_avg_diff_normalized': 0.15, 'relative_prominences_wasserstein_normalized': 0.15, 'positive_frequencies_dtw_normalized': 0.175, 'positive_frequencies_euclidean_normalized': 0.175, 'positive_frequencies_wasserstein_normalized': 0.175, 'spectral_centroid_diff_normalized': 0.3, 'spectral_magnitude_diff_normalized': 0.3, 'dynamic_cutoff_diff_normalized': 0.25, 'relative_prominences_total_diff_normalized': 0.25, 'prominence_max_diff_normalized': 0.25, 'amplitude_max_diff_normalized': 0.25, 'frequency_max_diff_normalized': 0.25, 'spectral_bandwidth_diff_normalized': 0.25, 'num_fft_peaks_diff_normalized': 0.2, 'relative_num_peaks_diff_normalized': 0.2, 'dominant_frequency_diff_normalized': 0.25, 'max_autocorrelation_diff_norm

In [524]:
def get_dynamic_rank(normalized_df, subset_final_metrics, final_metrics, normalized_weights, ranking_config_dict, metric_signal_type):
	norm_type = "_norm" if metric_signal_type == "signal" else "_normalized"
	normalized_df[f'redo_missing_metrics_count'] = normalized_df.apply(
		lambda row: sum(
			normalized_weights.get(metric, 0) * (1 if pd.isna(row[f"{metric}{norm_type}"]) else 0.5)
			for metric in subset_final_metrics
			if f"{metric}{norm_type}" in normalized_df.columns
			and (pd.isna(row[f"{metric}{norm_type}"]) or ranking_config_dict["metrics"][subset_final_metrics.index(metric)]["ignore_metric"])
		),
		axis=1
	)
	for metric in subset_final_metrics:
		if any(
			metric_config["metric"] == metric and metric_config["ignore_metric"]
			for metric_config in ranking_config_dict["metrics"]
		):
			continue  # Skip ignored metrics
		# Proceed with z-score calculation
		metric_norm_col = f"{metric}{norm_type}"
		metric_zscore_col = f"{metric}_zscore"
		
		std_dev = normalized_df[metric_norm_col].std()  # Calculate standard deviation
		if std_dev == 0 or pd.isna(std_dev):  # Handle cases with zero or NaN standard deviation
			print(f"[yellow]Standard deviation for {metric_norm_col} is zero or NaN. Skipping z-score calculation.[/yellow]")
			normalized_df[metric_zscore_col] = np.nan
		else:
			normalized_df[metric_zscore_col] = (
				normalized_df[metric_norm_col] - normalized_df[metric_norm_col].mean()
			) / std_dev
			for metric_config in ranking_config_dict["metrics"]:
				if metric_config["metric"] == metric:
					metric_config["was_zscored"] = True

	# Compute weighted scores for norm and z-score
	penalty_weight = 0.05
	epsilon_threshold = 1e-6

	ranking_config["penalty_weight"] = penalty_weight
	normalized_df[f"redo_{metric_signal_type}_wavelet_norm_weighted_score"] = normalized_df.apply(
		lambda row: (
			sum(
				normalized_weights[metric + norm_type] * row[f"{metric}{norm_type}"]
				for metric in subset_final_metrics
				if pd.notna(row[f"{metric}{norm_type}"])
			) / max(sum(normalized_weights[metric + norm_type] for metric in subset_final_metrics if pd.notna(row[f"{metric}{norm_type}"])), epsilon_threshold)
			- penalty_weight * row[f"redo_missing_metrics_count"]  # Use precomputed penalty
		),
		axis=1
	)

	normalized_df[f"redo_{metric_signal_type}_wavelet_zscore_weighted_score"] = normalized_df.apply(
		lambda row: (
			sum(
				normalized_weights[metric + norm_type] * row[f"{metric}_zscore"]
				for metric in subset_final_metrics
				if pd.notna(row[f"{metric}_zscore"])
			) / max(sum(normalized_weights[metric + norm_type] for metric in subset_final_metrics if pd.notna(row[f"{metric}_zscore"])), epsilon_threshold)
		),
		axis=1
	)


	# Compute summed normalized score
	normalized_df[f"redo_{metric_signal_type}_wavelet_summed_norm_score"] = normalized_df.apply(
		lambda row: sum(
			row[f"{metric}{norm_type}"]
			for metric in subset_final_metrics
			if pd.notna(row[f"{metric}{norm_type}"])
		),
		axis=1
	)

	# Normalize summed score so that it has a comparable range
	max_summed_score = normalized_df[f"redo_{metric_signal_type}_wavelet_summed_norm_score"].max()
	if max_summed_score > 0:
		normalized_df[f"redo_{metric_signal_type}_wavelet_summed_norm_score"] /= max_summed_score
	else:
		print("[yellow]Max summed normalized score is zero! Assigning equal scores.[/yellow]")
		normalized_df[f"redo_{metric_signal_type}_wavelet_summed_norm_score"] = 1 / len(final_metrics)  # Assign equal importance


	normalized_df[f"redo_{metric_signal_type}_normalized_diff"] = (
			(normalized_df[f"redo_{metric_signal_type}_wavelet_norm_weighted_score"] - normalized_df[f"redo_{metric_signal_type}_wavelet_zscore_weighted_score"]).abs()
			/ (normalized_df[f"redo_{metric_signal_type}_wavelet_norm_weighted_score"] + normalized_df[f"redo_{metric_signal_type}_wavelet_zscore_weighted_score"]).abs()
		).abs()


	# Final stability-adjusted score
	normalized_df[f"redo_{metric_signal_type}_final_score"] = (
		normalized_df[f"redo_{metric_signal_type}_wavelet_norm_weighted_score"]
		- penalty_weight * normalized_df[f"redo_{metric_signal_type}_normalized_diff"]
	)
	# Rank results by final_score (Weighted method)
	ranked_results = normalized_df.sort_values(
		by=f"redo_{metric_signal_type}_final_score", ascending=False
	).reset_index(drop=True)
	ranked_results[f"redo_{metric_signal_type}_wavelet_rank"] = ranked_results.index + 1
	return ranked_results, ranking_config_dict

ranked_results, ranking_config_dict = get_dynamic_rank(normalized_df, reconstruction_subset_final_metrics, reconstruction_final_metrics, reconstruction_normalized_weights, ranking_config_dict, "reconstruction")
ranked_results, ranking_config_dict = get_dynamic_rank(ranked_results, signal_subset_final_metrics, signal_final_metrics, signal_normalized_weights, ranking_config_dict, "signal")

[yellow]Standard deviation for positive_frequencies_dtw_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for positive_frequencies_euclidean_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for positive_frequencies_wasserstein_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for frequency_max_diff_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for num_fft_peaks_diff_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for relative_num_peaks_diff_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for dominant_frequency_diff_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yellow]Standard deviation for relative_left_bases_global_alignment_score_normalized is zero or NaN. Skipping z-score calculation.[/yellow]
[yello

In [526]:
def calculate_rank_stability(df: pd.DataFrame, rank_columns: list, comparison : str) -> pd.DataFrame:
	"""
	Calculate a stability metric for wavelet rankings based on multiple ranking columns. The rank columns list should be ordered from the least to most important rank.
	
	Parameters:
	-----------
	df : pd.DataFrame
		DataFrame containing rank columns to evaluate.
	rank_columns : list of str
		Columns representing ranks to compare for stability.
		
	Returns:
	--------
	pd.DataFrame
		DataFrame with an added 'rank_stability' column.
	"""
	# Compute absolute differences between ranks
	for i, col_a in enumerate(rank_columns):
		for col_b in rank_columns[i+1:]:
			diff_col_name = f"{col_a}_vs_{col_b}_abs_diff"
			df[diff_col_name] = (df[col_a] - df[col_b]).abs()
	
	# Calculate the standard deviation of ranks across rank columns
	df[f'{comparison}_rank_std_dev'] = df[rank_columns].std(axis=1)
	
	# Normalize by the maximum possible rank
	max_rank = df[rank_columns].max().max()
	df[f'{comparison}_rank_stability'] = 1 - (df[f'{comparison}_rank_std_dev'] / max_rank)
	
	return df

In [528]:
ranked_results[["redo_reconstruction_rank_weighted", "redo_signal_rank_weighted", "redo_reconstruction_wavelet_rank", "redo_signal_wavelet_rank"]].corr()

Unnamed: 0,redo_reconstruction_rank_weighted,redo_signal_rank_weighted,redo_reconstruction_wavelet_rank,redo_signal_wavelet_rank
redo_reconstruction_rank_weighted,1.0,0.560942,0.981713,0.543886
redo_signal_rank_weighted,0.560942,1.0,0.565626,0.973951
redo_reconstruction_wavelet_rank,0.981713,0.565626,1.0,0.549508
redo_signal_wavelet_rank,0.543886,0.973951,0.549508,1.0


In [529]:
ranked_results = calculate_rank_stability(ranked_results, ["redo_reconstruction_rank_weighted", "redo_signal_rank_weighted", "redo_reconstruction_wavelet_rank", "redo_signal_wavelet_rank"], "redo_reconstruction_signal")

In [532]:

ranked_results['combined_redo_rank'] = ranked_results['redo_reconstruction_rank_weighted'] + ranked_results['redo_signal_rank_weighted'] + ranked_results['redo_reconstruction_wavelet_rank'] + ranked_results['redo_signal_wavelet_rank']

In [534]:
ranked_results[['wavelet', 'combined_redo_rank', "redo_reconstruction_rank_weighted", "redo_signal_rank_weighted", "redo_reconstruction_wavelet_rank", "redo_signal_wavelet_rank", "redo_reconstruction_signal_rank_stability"]].sort_values(by=['combined_redo_rank'], ascending=[True])

Unnamed: 0,wavelet,combined_redo_rank,redo_reconstruction_rank_weighted,redo_signal_rank_weighted,redo_reconstruction_wavelet_rank,redo_signal_wavelet_rank,redo_reconstruction_signal_rank_stability
0,bior3.9,19,3,1,14,1,0.989336
3,bior3.7,22,10,4,4,4,0.994872
2,bior3.7,23,12,3,5,3,0.992697
1,bior3.9,39,14,2,21,2,0.983942
18,bior3.9,75,16,18,22,19,0.995726
...,...,...,...,...,...,...,...
567,sym7,2218,527,563,560,568,0.968157
568,sym6,2221,550,522,580,569,0.956589
576,sym6,2223,547,516,583,577,0.947306
557,sym6,2224,545,536,585,558,0.963524


## Older Code

In [74]:
combined_df = pd.concat([full_raw_corr, full_smoothed_corr])
melted_df = pd.melt(combined_df, id_vars=['metric', 'data_type', 'signal_type'], value_vars=['reconstruction_score_sum', 'wavelet_summed_norm_score', "reconstruction_score_weighted", 'final_score',  'summed_scores'])

In [75]:
melted_df[0:1]

Unnamed: 0,metric,data_type,signal_type,variable,value
0,summed_scores,full,raw,reconstruction_score_sum,0.998508


In [76]:
# Compute correlations of ranking systems with reconstruction score
corr_summed = melted_df[melted_df["variable"] == "reconstruction_score_sum"].groupby(["data_type", "signal_type"])["value"].corr(melted_df["value"])
corr_weighted = melted_df[melted_df["variable"] == "reconstruction_score_weighted"].groupby(["data_type", "signal_type"])["value"].corr(melted_df["value"])

# Check stability (Standard Deviation)
std_summed = melted_df[melted_df["variable"] == "reconstruction_score_sum"].groupby(["data_type", "signal_type"])["value"].std()
std_weighted = melted_df[melted_df["variable"] == "reconstruction_score_weighted"].groupby(["data_type", "signal_type"])["value"].std()

print("Correlation with Reconstruction Score:")
print("Summed:", corr_summed)
print("Weighted:", corr_weighted)

print("\nStability (Lower is better):")
print("Summed Std Dev:", std_summed)
print("Weighted Std Dev:", std_weighted)

Correlation with Reconstruction Score:
Summed: data_type  signal_type
full       raw            1.0
           smoothed       1.0
Name: value, dtype: float64
Weighted: data_type  signal_type
full       raw            1.0
           smoothed       1.0
Name: value, dtype: float64

Stability (Lower is better):
Summed Std Dev: data_type  signal_type
full       raw            0.461400
           smoothed       0.398616
Name: value, dtype: float64
Weighted Std Dev: data_type  signal_type
full       raw            0.338030
           smoothed       0.386786
Name: value, dtype: float64


In [77]:
from scipy.stats import spearmanr

# Compare ranking stability across conditions
df_pivot = melted_df.pivot(index=["metric", "variable"], columns=["data_type", "signal_type"], values="value")

# Compute pairwise Spearman rank correlation between conditions
rank_corr = df_pivot.rank(axis=1).corr(method="spearman")

print("Rank Stability (Spearman Correlation Matrix):")
print(rank_corr)

Rank Stability (Spearman Correlation Matrix):
data_type             full         
signal_type            raw smoothed
data_type signal_type              
full      raw          1.0     -1.0
          smoothed    -1.0      1.0


In [78]:
melted_df.pivot(index=["metric", "variable"], columns=["data_type", "signal_type"], values="value").to_clipboard()

In [79]:
melted_df['combined_type'] = melted_df['data_type'] + " / " + melted_df['signal_type']
melted_df['combined_metric'] = melted_df['metric'] + " / " + melted_df['variable']

In [80]:
melted_df.variable.unique()

array(['reconstruction_score_sum', 'wavelet_summed_norm_score',
       'reconstruction_score_weighted', 'final_score', 'summed_scores'],
      dtype=object)

In [81]:
import altair as alt
import warnings
warnings.filterwarnings("ignore")

selection = alt.selection_multi(fields=['combined_metric'], bind='legend')
alt.Chart(melted_df).mark_line(point=True).encode(
	x=alt.X('combined_type:N', axis=alt.Axis(title='Data Type and Signal Type'), sort=['subset_raw', 'full_raw', 'subset_smoothed', 'full_smoothed']),
	# x=alt.X('variable:N', axis=alt.Axis(title='Score Type'), sort=['reconstruction_score_sum', 'wavelet_summed_norm_score',
    #    'summed_scores']),
	y='value:Q',
	color=alt.Color('combined_metric:N', legend=alt.Legend(title='Metric and Variable', symbolLimit=0, columns=3)),
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
	shape='variable:N',
	tooltip=['combined_metric', 'value', 'combined_type']
).add_params(selection).properties(width=800, height=800)

In [82]:
melted_df.variable.unique()

array(['reconstruction_score_sum', 'wavelet_summed_norm_score',
       'reconstruction_score_weighted', 'final_score', 'summed_scores'],
      dtype=object)

In [83]:


selection = alt.selection_multi(fields=['metric'], bind='legend')
alt.Chart(melted_df[(melted_df.value < 1) & (melted_df.variable.isin(['reconstruction_score_sum', 'wavelet_summed_norm_score',  'reconstruction_score_weighted', 'final_score']))]).mark_line(point=True).encode(
	x=alt.X('variable:N', axis=alt.Axis(title='Data Type and Signal Type'), sort=['reconstruction_score_sum', 'wavelet_summed_norm_score',
       'reconstruction_score_weighted', 'final_score']),
	# x=alt.X('variable:N', axis=alt.Axis(title='Score Type'), sort=['reconstruction_score_sum', 'wavelet_summed_norm_score',
    #    'summed_scores']),
	y='value:Q',
	color=alt.Color('metric:N', legend=alt.Legend(title='Metric and Variable', symbolLimit=0, columns=3)),
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
	shape='variable:N',
	row='data_type:N',
	column='signal_type:N',
	tooltip=['metric', 'variable', 'value', 'data_type', 'signal_type']
).add_params(selection).properties(width=400, height=200)

In [84]:
def calculate_rank_stability_from_melted(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate stability metric for rankings from a melted dataframe setup.

    Parameters:
    -----------
    df : pd.DataFrame
        Melted dataframe with columns ['metric', 'data_type', 'signal_type', 'variable', 'value'].

    Returns:
    --------
    pd.DataFrame
        DataFrame with rank stability metrics for each metric and variable.
    """
    # Step 1: Pivot the melted dataframe
    pivoted_df = df.pivot(
        index=["metric", "variable"],  # Metrics and variables as indices
        columns="combined_type",  # Create columns based on the data_type/signal_type combo
        values="value"
    ).reset_index()

    # Step 2: Calculate absolute differences and rank stability across columns
    columns_to_compare = pivoted_df.columns.difference(["metric", "variable"])  # Dynamic column selection

    # Compute absolute differences between columns
    for col_a in columns_to_compare:
        for col_b in columns_to_compare:
            if col_a != col_b:
                diff_col_name = f"{col_a}_vs_{col_b}_abs_diff"
                pivoted_df[diff_col_name] = (pivoted_df[col_a] - pivoted_df[col_b]).abs()

    # Calculate standard deviation of values across columns
    pivoted_df['rank_std_dev'] = pivoted_df[columns_to_compare].std(axis=1)

    # Normalize the standard deviation by the maximum possible rank (assuming ranks are between 0 and 1)
    pivoted_df['rank_stability'] = 1 - pivoted_df['rank_std_dev']  # Stability as inverse of variability

    return pivoted_df

pivoted_df = calculate_rank_stability_from_melted(melted_df)

In [85]:
pivoted_df[0:1].to_dict()

{'metric': {0: 'amplitude_max_diff_normalized'},
 'variable': {0: 'final_score'},
 'full / raw': {0: 0.14665122504388498},
 'full / smoothed': {0: 0.08913634117191484},
 'full / raw_vs_full / smoothed_abs_diff': {0: 0.05751488387197014},
 'full / smoothed_vs_full / raw_abs_diff': {0: 0.05751488387197014},
 'rank_std_dev': {0: 0.04066916440502688},
 'rank_stability': {0: 0.9593308355949731}}

In [86]:
pivoted_df

combined_type,metric,variable,full / raw,full / smoothed,full / raw_vs_full / smoothed_abs_diff,full / smoothed_vs_full / raw_abs_diff,rank_std_dev,rank_stability
0,amplitude_max_diff_normalized,final_score,0.146651,0.089136,0.057515,0.057515,0.040669,0.959331
1,amplitude_max_diff_normalized,reconstruction_score_sum,0.569809,0.339185,0.230624,0.230624,0.163076,0.836924
2,amplitude_max_diff_normalized,reconstruction_score_weighted,-0.123590,-0.129996,0.006406,0.006406,0.004529,0.995471
3,amplitude_max_diff_normalized,summed_scores,0.584137,0.370541,0.213596,0.213596,0.151035,0.848965
4,amplitude_max_diff_normalized,wavelet_summed_norm_score,0.534026,0.539957,0.005931,0.005931,0.004194,0.995806
...,...,...,...,...,...,...,...,...
180,wavelet_summed_norm_score,final_score,0.459282,0.286711,0.172571,0.172571,0.122026,0.877974
181,wavelet_summed_norm_score,reconstruction_score_sum,0.520711,0.132582,0.388129,0.388129,0.274449,0.725551
182,wavelet_summed_norm_score,reconstruction_score_weighted,-0.012912,-0.145981,0.133070,0.133070,0.094094,0.905906
183,wavelet_summed_norm_score,summed_scores,0.566550,0.195930,0.370620,0.370620,0.262068,0.737932


In [87]:
melted_df

Unnamed: 0,metric,data_type,signal_type,variable,value,combined_type,combined_metric
0,summed_scores,full,raw,reconstruction_score_sum,0.998508,full / raw,summed_scores / reconstruction_score_sum
1,reconstruction_score_sum,full,raw,reconstruction_score_sum,1.000000,full / raw,reconstruction_score_sum / reconstruction_scor...
2,relative_right_bases_global_alignment_score_no...,full,raw,reconstruction_score_sum,0.804091,full / raw,relative_right_bases_global_alignment_score_no...
3,relative_right_bases_matcher_alignment_score_n...,full,raw,reconstruction_score_sum,0.804091,full / raw,relative_right_bases_matcher_alignment_score_n...
4,spectral_centroid_diff_normalized,full,raw,reconstruction_score_sum,0.676341,full / raw,spectral_centroid_diff_normalized / reconstruc...
...,...,...,...,...,...,...,...
355,positive_amplitudes_euclidean_normalized,full,smoothed,summed_scores,-0.378013,full / smoothed,positive_amplitudes_euclidean_normalized / sum...
356,positive_amplitudes_dtw_normalized,full,smoothed,summed_scores,-0.383116,full / smoothed,positive_amplitudes_dtw_normalized / summed_sc...
357,positive_amplitudes_wasserstein_normalized,full,smoothed,summed_scores,-0.383116,full / smoothed,positive_amplitudes_wasserstein_normalized / s...
358,prominence_min_diff_normalized,full,smoothed,summed_scores,-0.420702,full / smoothed,prominence_min_diff_normalized / summed_scores


In [88]:
pivoted_df.columns

Index(['metric', 'variable', 'full / raw', 'full / smoothed',
       'full / raw_vs_full / smoothed_abs_diff',
       'full / smoothed_vs_full / raw_abs_diff', 'rank_std_dev',
       'rank_stability'],
      dtype='object', name='combined_type')

In [89]:


alt.Chart(pivoted_df).mark_bar().encode(
	y=alt.Y('metric:N', title='Metric'),
    x=alt.X('rank_stability:Q', title='Rank Stability', sort='-y', ),
    color=alt.Color('variable:N', legend=alt.Legend(title="Variable")),
    tooltip=['metric', 'variable', 'rank_stability']
).properties(width=800, height=400, title="Rank Stability Across Metrics and Variables")

In [90]:


alt.Chart(pivoted_df).mark_line(point=True).encode(
	x=alt.X('variable:N', axis=alt.Axis(title='Variable'), sort=['reconstruction_score_sum', 'wavelet_summed_norm_score',
	   'reconstruction_score_weighted', 'final_score']),
    y=alt.Y('rank_stability:Q', title='Rank Stability', sort='-y', ),
    color=alt.Color('metric:N', legend=alt.Legend(title="Metric", symbolLimit=0, columns=3)),
    tooltip=['metric', 'variable', 'rank_stability']
).properties(width=800, height=400, title="Rank Stability Across Metrics and Variables")