# Exploring Stability and Ranks of Wavelet Processed Volumes

## Load Libraries

In [1]:
import os
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()

import warnings
warnings.filterwarnings('ignore')
from rich.console import Console

import numpy as np

import matplotlib.pyplot as plt
console = Console()
import sys

sys.path.append("../..")
from scripts.utils import read_csv_file, get_data_directory_path

data_directory_path = get_data_directory_path()
preidentified_periodicals_df = read_csv_file(os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", "periodical_metadata", "classified_preidentified_periodicals_with_full_metadata.csv"))
console.print(f"Processed {len(preidentified_periodicals_df)} preidentified periodicals.", style="bright_green")

all_frequencies_df = pd.read_csv(os.path.join("..", "..", "datasets", "all_volume_features_and_frequencies.csv"))
console.print(f"Processed {len(all_frequencies_df)} volume features and frequencies.", style="bright_green")

missing_volumes = preidentified_periodicals_df[~preidentified_periodicals_df.htid.isin(all_frequencies_df.htid)]
console.print(f"Missing Volumes: {len(missing_volumes)}", style="bright_red")
missing_titles = missing_volumes.lowercase_periodical_name.unique().tolist()
console.print(f"Missing Periodical Titles: {missing_titles}", style="bright_red")

FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/all_volume_features_and_frequencies.csv'

## Shared Functions

In [7]:
def calculate_rank_stability(df, rank_columns):
    """
    Calculate a stability metric for wavelet rankings based on multiple ranking columns.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing rank columns to evaluate.
    rank_columns : list of str
        Columns representing ranks to compare for stability.
        
    Returns:
    --------
    pd.DataFrame
        DataFrame with an added 'rank_stability' column.
    """
    # Compute absolute differences between ranks
    for i, col_a in enumerate(rank_columns):
        for col_b in rank_columns[i+1:]:
            diff_col_name = f"{col_a}_vs_{col_b}_abs_diff"
            df[diff_col_name] = (df[col_a] - df[col_b]).abs()
    
    # Calculate the standard deviation of ranks across rank columns
    df['rank_std_dev'] = df[rank_columns].std(axis=1)
    
    # Normalize by the maximum possible rank
    max_rank = df[rank_columns].max().max()
    df['rank_stability'] = 1 - (df['rank_std_dev'] / max_rank)
    
    return df

## Individual Volume Wavelet EDA

In [8]:
subset_preidentified_periodicals_df = preidentified_periodicals_df[(preidentified_periodicals_df['lowercase_periodical_name'].isin(['arab_observer_and_the_scribe'])) & (preidentified_periodicals_df.volume_directory.notna())]

individual_htid = subset_preidentified_periodicals_df[subset_preidentified_periodicals_df.htid.isin(all_frequencies_df.htid)].htid.unique()[10]
individual_publication_directory = subset_preidentified_periodicals_df[subset_preidentified_periodicals_df.htid == individual_htid].publication_directory.values[0]
individual_volume_directory = subset_preidentified_periodicals_df[subset_preidentified_periodicals_df.htid == individual_htid].volume_directory.values[0]
console.print(f"Individual HTID: {individual_htid}", style="bright_green")
console.print(f"Individual Publication Directory: {individual_publication_directory}", style="bright_green")
console.print(f"Individual Volume Directory: {individual_volume_directory}", style="bright_green")
subset_frequencies_df = all_frequencies_df[all_frequencies_df.htid == individual_htid]
console.print(f"Processed {len(subset_frequencies_df)} frequencies for {individual_htid}.", style="bright_green")

In [9]:
full_combined_results_path = os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", individual_publication_directory, "volumes", individual_volume_directory, "wavelet_analysis", individual_volume_directory + "_combined_results.csv")
if os.path.exists(full_combined_results_path):
	full_combined_results_df = pd.read_csv(full_combined_results_path)
	full_combined_results_df['htid'] = individual_htid
	console.print(f"Loaded {len(full_combined_results_df)} combined results from {full_combined_results_path}.", style="bright_green")
else:
	console.print(f"Could not find {full_combined_results_path}.", style="bright_red")

subset_combined_results_path = os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", individual_publication_directory, "volumes", individual_volume_directory, "wavelet_analysis", individual_volume_directory + "_subset_combined_results.csv")
if os.path.exists(subset_combined_results_path):
	original_subset_combined_results_df = pd.read_csv(subset_combined_results_path)
	original_subset_combined_results_df['htid'] = individual_htid
	console.print(f"Loaded {len(original_subset_combined_results_df)} subset combined results from {subset_combined_results_path}.", style="bright_green")
else:
	console.print(f"Could not find {subset_combined_results_path}.", style="bright_red")

wavelet_volume_data_path = os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", individual_publication_directory, "volumes", individual_volume_directory, "wavelet_analysis", individual_volume_directory + "_wavelet_volume_results.csv")
if os.path.exists(wavelet_volume_data_path):
	wavelet_volume_data_df = pd.read_csv(wavelet_volume_data_path)
	console.print(f"Loaded {len(wavelet_volume_data_df)} wavelet volume data from {wavelet_volume_data_path}.", style="bright_green")
else:
	console.print(f"Could not find {wavelet_volume_data_path}.", style="bright_red")

In [10]:
shared_cols = set(original_subset_combined_results_df.columns).intersection(set(wavelet_volume_data_df.columns))
avoid_cols = [col for col in wavelet_volume_data_df.columns if not col in shared_cols]
final_cols = avoid_cols + ['htid']
subset_combined_results_df = original_subset_combined_results_df.merge(wavelet_volume_data_df[final_cols], on='htid', how='left')
subset_combined_results_df['wavelet_family'] = subset_combined_results_df['wavelet'].str.extract(r'([a-zA-Z]+)')

subset_combined_results_df['wavelet_family'].value_counts()

wavelet_family
db      38
rbio    33
sym     19
bior    17
coif    17
gaus     8
haar     1
dmey     1
morl     1
mexh     1
Name: count, dtype: int64

In [11]:
melted_subset_combined_results_df = pd.melt(subset_combined_results_df, id_vars=['htid',  'wavelet', 'wavelet_type', 'signal_type', 'wavelet_mode', 'wavelet_level'], value_vars=['wavelet_rank', 'final_wavelet_rank', 'combined_wavelet_rank', 'combined_final_wavelet_rank'], var_name='rank_type', value_name='rank_value')

selection = alt.selection_multi(fields=['wavelet'], bind='legend')
sort_order = ['wavelet_rank', 'final_wavelet_rank', 'combined_wavelet_rank', 'combined_final_wavelet_rank']
alt.Chart(melted_subset_combined_results_df).mark_line(point=True).encode(
	x=alt.X('rank_type', sort=sort_order),
	y=alt.Y('rank_value', scale=alt.Scale(reverse=True)),  # Invert the y-axis
	color=alt.Color('wavelet', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(symbolLimit=0, columns=8)),
	column='signal_type',
	row='wavelet_type',
	tooltip=['wavelet', 'rank_value', 'rank_type', 'signal_type', 'wavelet_type', 'htid', 'wavelet_mode', 'wavelet_level'],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
	width=400,
	height=200
)

In [12]:
rank_columns = ['wavelet_rank', 'final_wavelet_rank', 'combined_wavelet_rank', 'combined_final_wavelet_rank']
subset_combined_results_df = calculate_rank_stability(subset_combined_results_df, rank_columns)

# Sort by rank stability
subset_combined_results_df.sort_values(by=['combined_final_wavelet_rank', 'rank_stability'], ascending=[True, False])[[ 'htid',  'wavelet', 'wavelet_type', 'signal_type', 'rank_stability', 'combined_final_wavelet_rank', 'wavelet_mode', 'wavelet_level']].head(10)

Unnamed: 0,htid,wavelet,wavelet_type,signal_type,rank_stability,combined_final_wavelet_rank,wavelet_mode,wavelet_level
0,uc1.l0073177743,rbio3.9,DWT,raw,0.977101,1,antireflect,1.0
1,uc1.l0073177743,rbio3.7,DWT,raw,0.96611,2,smooth,1.0
2,uc1.l0073177743,rbio3.7,DWT,raw,0.970079,3,zero,1.0
3,uc1.l0073177743,rbio3.7,DWT,raw,0.970079,4,constant,1.0
4,uc1.l0073177743,rbio3.5,DWT,raw,0.965194,5,periodic,1.0
5,uc1.l0073177743,rbio3.9,DWT,raw,0.976796,6,reflect,1.0
6,uc1.l0073177743,rbio3.7,DWT,raw,0.972827,7,antireflect,1.0
7,uc1.l0073177743,rbio3.5,DWT,raw,0.966721,8,antireflect,1.0
8,uc1.l0073177743,rbio3.7,DWT,raw,0.972522,9,antisymmetric,1.0
9,uc1.l0073177743,rbio3.5,DWT,raw,0.967026,10,reflect,1.0


In [13]:
subset_cols = ['avg_tokens', 'avg_digits', 'total_pages', 'total_tokens', 'total_digits']
limited_subset_combined_results_df = subset_combined_results_df[['htid', 'avg_tokens', 'avg_digits', 'total_pages', 'total_tokens', 'total_digits']].drop_duplicates()

limited_subset_combined_results_df[subset_cols]

Unnamed: 0,avg_tokens,avg_digits,total_pages,total_tokens,total_digits
0,645.138859,6.691066,929,599334.0,6216


In [14]:
selection = alt.selection_point(fields=['wavelet'], bind='legend')
alt.Chart(subset_combined_results_df[['htid', 'wavelet', 'combined_final_wavelet_rank', 'wavelet_type', 'signal_type', 'wavelet_mode', 'wavelet_level', 'combined_final_score']]).mark_bar().encode(
	x=alt.X('combined_final_wavelet_rank:O', title='Wavelet Rank'),
	y=alt.Y('count()', title='Count'),
	color=alt.Color('wavelet:N', title='Wavelet Type', legend=alt.Legend(symbolLimit=0, columns=8)),
	row='wavelet_type:N',
	column='signal_type:N',
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
	tooltip=['htid', 'wavelet', 'signal_type', 'wavelet_type', 'combined_final_wavelet_rank', 'wavelet_mode', 'wavelet_level', 'combined_final_score']
).add_params(selection).properties(
	title='Wavelet Rank Distribution by Wavelength Type',
	width=600,
	height=200
).configure_legend(
	orient='bottom'
)

In [15]:
# Normalize rank and rank stability
subset_combined_results_df['normalized_rank'] = subset_combined_results_df['combined_final_wavelet_rank'] / subset_combined_results_df['combined_final_wavelet_rank'].max()
subset_combined_results_df['normalized_stability'] = 1 - subset_combined_results_df['rank_stability']  # Penalize instability

# Define weights for rank and stability
alpha = 0.5  # Weight for rank
beta = 0.5   # Weight for stability

# Compute composite score
subset_combined_results_df['composite_score'] = (
    alpha * subset_combined_results_df['normalized_rank'] + 
    beta * subset_combined_results_df['normalized_stability']
)

# Sort by composite score (ascending)
subset_combined_results_df = subset_combined_results_df.sort_values(by='composite_score', ascending=True)

# Add rank bins to the data
subset_combined_results_df['rank_bin'] = pd.cut(
    subset_combined_results_df['combined_final_wavelet_rank'],
    bins=[0, 10, 20, 50, 100, subset_combined_results_df['combined_final_wavelet_rank'].max()],
    labels=['Top 10', 'Top 20', 'Top 50', 'Top 100', 'Beyond 100']
)

## Individual Volume With Signal Type, Mode, and Level

In [15]:
mode_subset_combined_results_df = subset_combined_results_df.copy()

In [16]:
# Add unique htid count and stability metrics to the summary
rank_bin_summary = mode_subset_combined_results_df.groupby(['signal_type', 'wavelet_family', 'wavelet_mode', 'wavelet_level', 'rank_bin']).agg(
    binned_count=('combined_final_wavelet_rank', 'count'),
    binned_unique_htid=('htid', 'nunique'),  # Count of unique volumes (htid)
    binned_mean_rank_stability=('rank_stability', 'mean'),  # Mean rank stability
    binned_std_rank_stability=('rank_stability', 'std')  # Standard deviation of rank stability
).reset_index()

# Calculate proportions
rank_bin_summary['global_proportion'] = rank_bin_summary['binned_count'] / rank_bin_summary['binned_count'].sum()
rank_bin_summary['htid_proportion'] = rank_bin_summary['binned_unique_htid'] / rank_bin_summary['binned_unique_htid'].sum()

# Sort order for consistent visualization
sort_order = ['Top 10', 'Top 20', 'Top 50', 'Top 100', 'Beyond 100']

selection = alt.selection_multi(fields=['wavelet_family'], bind='legend')
# Create a bar chart to include rank stability metrics
global_chart = alt.Chart(rank_bin_summary).mark_bar().encode(
    x=alt.X('rank_bin:N', title='Rank Bin', sort=sort_order),
    y=alt.Y('global_proportion:Q', title='Proportion of All Volumes (htid)', stack='normalize'),
    color=alt.Color('wavelet_family:N', title='Wavelet Family', scale=alt.Scale(scheme='tableau10')),
    tooltip=[
		'signal_type',
        'wavelet_family',
		'wavelet_mode',
		'wavelet_level',
        'rank_bin',
        'binned_count',
        'global_proportion',
        'binned_unique_htid',
        'htid_proportion',
        'binned_mean_rank_stability',
        'binned_std_rank_stability'
    ],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin (All Volumes)',
    width=300,
    height=300
)

htid_chart = alt.Chart(rank_bin_summary).mark_bar().encode(
    x=alt.X('rank_bin:N', title='Rank Bin', sort=sort_order),
    y=alt.Y('htid_proportion:Q', title='Proportion of Unique Volumes (htid)', stack='normalize'),
    color=alt.Color('wavelet_family:N', title='Wavelet Family', scale=alt.Scale(scheme='tableau10')),
    tooltip=[
        'signal_type',
        'wavelet_family',
		'wavelet_mode',
		'wavelet_level',
        'rank_bin',
        'binned_count',
        'global_proportion',
        'binned_unique_htid',
        'htid_proportion',
        'binned_mean_rank_stability',
        'binned_std_rank_stability'
    ],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin (Unique Volumes)',
    width=300,
    height=300
)

# Scatter plot for rank stability metrics
htid_global_chart = alt.Chart(rank_bin_summary).mark_point(filled=True).encode(
    x='htid_proportion:Q',
    y='global_proportion:Q',
    color='wavelet_family:N',
    tooltip=[
        'signal_type',
        'wavelet_family',
		'wavelet_mode',
		'wavelet_level',
        'rank_bin', 
        'binned_count', 
        'global_proportion', 
        'binned_unique_htid', 
        'htid_proportion', 
        'binned_mean_rank_stability', 
        'binned_std_rank_stability'
    ],
    shape=alt.Shape('rank_bin:N', sort=sort_order),
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin with Stability Metrics',
    width=300,
    height=300
)

stability_chart = alt.Chart(rank_bin_summary).mark_point(filled=True).encode(
	y='binned_mean_rank_stability:Q',
	x='binned_std_rank_stability:Q',
	color='wavelet_family:N',
	tooltip=[
		'signal_type',
        'wavelet_family',
		'wavelet_mode',
		'wavelet_level',
		'rank_bin',
		'binned_count',
		'global_proportion',
		'binned_unique_htid',
		'htid_proportion',
		'binned_mean_rank_stability',
		'binned_std_rank_stability'
	],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
	shape=alt.Shape('rank_bin:N', sort=sort_order)
).add_params(selection).properties(
	title='Rank Stability Metrics by Wavelet Family',
	width=300,
	height=300
)

# Combine charts
alt.vconcat(alt.hconcat(global_chart, htid_chart), alt.hconcat(htid_global_chart, stability_chart))

In [17]:
wavelet_summary = mode_subset_combined_results_df.groupby(['signal_type', 'wavelet_family', 'wavelet_mode', 'wavelet_level']).agg({
    'combined_final_score': ['mean', 'std', 'min', 'max'],
	'combined_final_wavelet_rank': ['mean', 'std', 'min', 'max', 'sum'],
    'rank_stability': ['mean', 'std'],
	'htid': ['count', 'nunique'],
	'composite_score': ['mean', 'std'],
}).reset_index()

wavelet_summary.columns = ['signal_type', 'wavelet_family', 'wavelet_mode', 'wavelet_level', 'mean_combined_final_score', 'std_combined_final_score', 'min_combined_final_score', 'max_combined_final_score', 'mean_combined_final_wavelet_rank', 'std_combined_final_wavelet_rank', 'min_combined_final_wavelet_rank', 'max_combined_final_wavelet_rank', 'sum_combined_final_wavelet_rank', 'mean_rank_stability', 'std_rank_stability', 'count', 'unique_htid', 'mean_composite_score', 'std_composite_score']
wavelet_summary.sort_values(by=['unique_htid', 'sum_combined_final_wavelet_rank', 'mean_combined_final_wavelet_rank'], ascending=[False, True, True])[['signal_type', 'wavelet_family', 'wavelet_mode', 'wavelet_level', 'mean_combined_final_wavelet_rank', 'sum_combined_final_wavelet_rank', 'count', 'mean_rank_stability', 'unique_htid', 'mean_composite_score', 'std_composite_score']].head(10)

Unnamed: 0,signal_type,wavelet_family,wavelet_mode,wavelet_level,mean_combined_final_wavelet_rank,sum_combined_final_wavelet_rank,count,mean_rank_stability,unique_htid,mean_composite_score,std_composite_score
21,raw,rbio,zero,1.0,12.0,24,2,0.969469,1,0.059383,0.047226
18,raw,rbio,reflect,1.0,9.666667,29,3,0.972623,1,0.049227,0.013995
2,raw,bior,periodic,1.0,29.0,29,1,0.991469,1,0.110883,
4,raw,bior,smooth,1.0,33.0,33,1,0.990027,1,0.12631,
14,raw,rbio,antisymmetric,1.0,14.0,42,3,0.971911,1,0.065515,0.016741
16,raw,rbio,periodic,1.0,14.0,42,3,0.9713,1,0.06582,0.036811
20,raw,rbio,symmetric,1.0,17.333333,52,3,0.972216,1,0.077617,0.017677
1,raw,bior,antisymmetric,1.0,26.0,52,2,0.995268,1,0.097954,0.006171
12,raw,haar,periodization,1.0,89.0,89,1,0.964013,1,0.345199,
10,raw,db,periodization,1.0,92.0,92,1,0.965193,1,0.355639,


In [18]:
shared_cols = wavelet_summary.columns.intersection(rank_bin_summary.columns).to_list()
console.print(f"Shared columns: {shared_cols}", style="bright_green")
top10_ranked = rank_bin_summary[rank_bin_summary.rank_bin == 'Top 10'].sort_values(by=['global_proportion', 'htid_proportion', 'binned_mean_rank_stability', 'binned_std_rank_stability', 'binned_count', 'binned_unique_htid'], ascending=[False, False, False, True, False, False])
merged_rank_summary = top10_ranked.merge(wavelet_summary, on=shared_cols, how='left')
# Find NAs columns

na_cols = merged_rank_summary.columns[merged_rank_summary.isna().any()].tolist()
merged_rank_summary[na_cols] = merged_rank_summary[na_cols].fillna(0)
merged_rank_summary.sort_values(by=['global_proportion', 'htid_proportion', 'binned_mean_rank_stability', 'binned_std_rank_stability', 'binned_count', 'binned_unique_htid'], ascending=[False, False, False, True, False, False])[['signal_type', 'wavelet_family', 'wavelet_mode', 'wavelet_level', 'binned_count', 'binned_unique_htid', 'global_proportion', 'htid_proportion', 'binned_mean_rank_stability', 'binned_std_rank_stability', 'mean_combined_final_wavelet_rank', 'sum_combined_final_wavelet_rank']].head(10)


Unnamed: 0,signal_type,wavelet_family,wavelet_mode,wavelet_level,binned_count,binned_unique_htid,global_proportion,htid_proportion,binned_mean_rank_stability,binned_std_rank_stability,mean_combined_final_wavelet_rank,sum_combined_final_wavelet_rank
0,raw,rbio,antireflect,1.0,3,1,0.02381,0.020833,0.972216,0.005217,31.0,217.0
1,raw,rbio,reflect,1.0,2,1,0.015873,0.020833,0.971911,0.006908,9.666667,29.0
2,raw,rbio,antisymmetric,1.0,1,1,0.007937,0.020833,0.972522,0.0,14.0,42.0
3,raw,rbio,constant,1.0,1,1,0.007937,0.020833,0.970079,0.0,35.333333,106.0
4,raw,rbio,zero,1.0,1,1,0.007937,0.020833,0.970079,0.0,12.0,24.0
5,raw,rbio,smooth,1.0,1,1,0.007937,0.020833,0.96611,0.0,25.25,101.0
6,raw,rbio,periodic,1.0,1,1,0.007937,0.020833,0.965194,0.0,14.0,42.0
7,raw,bior,antireflect,1.0,0,0,0.0,0.0,0.0,0.0,50.0,350.0
8,raw,bior,antisymmetric,1.0,0,0,0.0,0.0,0.0,0.0,26.0,52.0
9,raw,bior,constant,1.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
final_shared_cols = set(mode_subset_combined_results_df.columns).intersection(set(merged_rank_summary.columns))
console.print(f"Final shared columns: {final_shared_cols}", style="bright_green")

final_merged_mode_subset_combined_results_df = mode_subset_combined_results_df.merge(merged_rank_summary, on=list(final_shared_cols), how='left')
final_merged_mode_subset_combined_results_df.sort_values(by=['combined_final_wavelet_rank', 'rank_stability', 'binned_mean_rank_stability', 'binned_std_rank_stability'], ascending=[True, False, False, True])[[ 'htid',  'wavelet', 'wavelet_type', 'signal_type', 'rank_stability', 'combined_final_wavelet_rank', 'wavelet_mode', 'wavelet_level']].head(10)

Unnamed: 0,htid,wavelet,wavelet_type,signal_type,rank_stability,combined_final_wavelet_rank,wavelet_mode,wavelet_level
0,uc1.l0073177743,rbio3.9,DWT,raw,0.977101,1,antireflect,1.0
1,uc1.l0073177743,rbio3.7,DWT,raw,0.96611,2,smooth,1.0
2,uc1.l0073177743,rbio3.7,DWT,raw,0.970079,3,zero,1.0
3,uc1.l0073177743,rbio3.7,DWT,raw,0.970079,4,constant,1.0
5,uc1.l0073177743,rbio3.5,DWT,raw,0.965194,5,periodic,1.0
4,uc1.l0073177743,rbio3.9,DWT,raw,0.976796,6,reflect,1.0
6,uc1.l0073177743,rbio3.7,DWT,raw,0.972827,7,antireflect,1.0
7,uc1.l0073177743,rbio3.5,DWT,raw,0.966721,8,antireflect,1.0
8,uc1.l0073177743,rbio3.7,DWT,raw,0.972522,9,antisymmetric,1.0
9,uc1.l0073177743,rbio3.5,DWT,raw,0.967026,10,reflect,1.0


In [20]:
len(subset_combined_results_df), len(final_merged_mode_subset_combined_results_df)

(136, 136)

In [23]:
final_merged_mode_subset_combined_results_df.rank_bin.value_counts()

rank_bin
Top 100       50
Beyond 100    36
Top 50        30
Top 10        10
Top 20        10
Name: count, dtype: int64

## Individual Volume With Just Wavelet Family

In [23]:
family_subset_combined_results_df = subset_combined_results_df.copy()

# Add unique htid count and stability metrics to the summary
rank_bin_summary = family_subset_combined_results_df.groupby(['wavelet_family', 'rank_bin']).agg(
    binned_count=('combined_final_wavelet_rank', 'count'),
    binned_unique_htid=('htid', 'nunique'),  # Count of unique volumes (htid)
    binned_mean_rank_stability=('rank_stability', 'mean'),  # Mean rank stability
    binned_std_rank_stability=('rank_stability', 'std')  # Standard deviation of rank stability
).reset_index()

# Calculate proportions
rank_bin_summary['global_proportion'] = rank_bin_summary['binned_count'] / rank_bin_summary['binned_count'].sum()
rank_bin_summary['htid_proportion'] = rank_bin_summary['binned_unique_htid'] / rank_bin_summary['binned_unique_htid'].sum()

# Sort order for consistent visualization
sort_order = ['Top 10', 'Top 20', 'Top 50', 'Top 100', 'Beyond 100']

In [24]:


selection = alt.selection_multi(fields=['wavelet_family'], bind='legend')
# Create a bar chart to include rank stability metrics
global_chart = alt.Chart(rank_bin_summary).mark_bar().encode(
    x=alt.X('rank_bin:N', title='Rank Bin', sort=sort_order),
    y=alt.Y('global_proportion:Q', title='Proportion of All Volumes (htid)', stack='normalize'),
    color=alt.Color('wavelet_family:N', title='Wavelet Family', scale=alt.Scale(scheme='tableau10')),
    tooltip=[
        'wavelet_family',
        'rank_bin',
        'binned_count',
        'global_proportion',
        'binned_unique_htid',
        'htid_proportion',
        'binned_mean_rank_stability',
        'binned_std_rank_stability'
    ],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin (All Volumes)',
    width=300,
    height=300
)
global_chart

htid_chart = alt.Chart(rank_bin_summary).mark_bar().encode(
    x=alt.X('rank_bin:N', title='Rank Bin', sort=sort_order),
    y=alt.Y('htid_proportion:Q', title='Proportion of Unique Volumes (htid)', stack='normalize'),
    color=alt.Color('wavelet_family:N', title='Wavelet Family', scale=alt.Scale(scheme='tableau10')),
    tooltip=[
        'wavelet_family',
        'rank_bin',
        'binned_count',
        'global_proportion',
        'binned_unique_htid',
        'htid_proportion',
        'binned_mean_rank_stability',
        'binned_std_rank_stability'
    ],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin (Unique Volumes)',
    width=300,
    height=300
)

# Scatter plot for rank stability metrics
htid_global_chart = alt.Chart(rank_bin_summary).mark_point(filled=True).encode(
    x='htid_proportion:Q',
    y='global_proportion:Q',
    color='wavelet_family:N',
    tooltip=[
        'wavelet_family',
        'rank_bin',
        'binned_count',
        'global_proportion',
        'binned_unique_htid',
        'htid_proportion',
        'binned_mean_rank_stability',
        'binned_std_rank_stability'
    ],
    shape=alt.Shape('rank_bin:N', sort=sort_order),
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin with Stability Metrics',
    width=300,
    height=300
)

stability_chart = alt.Chart(rank_bin_summary).mark_point(filled=True).encode(
	y='binned_mean_rank_stability:Q',
	x='binned_std_rank_stability:Q',
	color='wavelet_family:N',
	tooltip=[
		'wavelet_family',
        'rank_bin',
        'binned_count',
        'global_proportion',
        'binned_unique_htid',
        'htid_proportion',
        'binned_mean_rank_stability',
        'binned_std_rank_stability'
	],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
	shape=alt.Shape('rank_bin:N', sort=sort_order)
).add_params(selection).properties(
	title='Rank Stability Metrics by Wavelet Family',
	width=300,
	height=300
)

# Combine charts
alt.vconcat(alt.hconcat(global_chart, htid_chart), alt.hconcat(htid_global_chart, stability_chart))

In [25]:
wavelet_summary = family_subset_combined_results_df.groupby(['wavelet_family']).agg({
    'combined_final_score': ['mean', 'std', 'min', 'max'],
	'combined_final_wavelet_rank': ['mean', 'std', 'min', 'max', 'sum'],
    'rank_stability': ['mean', 'std'],
	'htid': ['count', 'nunique'],
	'composite_score': ['mean', 'std'],
}).reset_index()

wavelet_summary.columns = ['wavelet_family', 'mean_combined_final_score', 'std_combined_final_score', 'min_combined_final_score', 'max_combined_final_score', 'mean_combined_final_wavelet_rank', 'std_combined_final_wavelet_rank', 'min_combined_final_wavelet_rank', 'max_combined_final_wavelet_rank', 'sum_combined_final_wavelet_rank', 'mean_rank_stability', 'std_rank_stability', 'count', 'unique_htid', 'mean_composite_score', 'std_composite_score']
wavelet_summary.sort_values(by=['unique_htid', 'sum_combined_final_wavelet_rank', 'mean_combined_final_wavelet_rank'], ascending=[False, True, True])[['wavelet_family', 'mean_combined_final_wavelet_rank', 'sum_combined_final_wavelet_rank', 'count', 'mean_rank_stability', 'unique_htid', 'mean_composite_score', 'std_composite_score']].head(10)

Unnamed: 0,wavelet_family,mean_combined_final_wavelet_rank,sum_combined_final_wavelet_rank,count,mean_rank_stability,unique_htid,mean_composite_score,std_composite_score
5,haar,89.0,89,1,0.964013,1,0.345199,
3,dmey,126.0,126,1,0.556306,1,0.685082,
7,morl,127.0,127,1,0.935599,1,0.499112,
6,mexh,135.0,135,1,0.934905,1,0.528871,
1,coif,45.647059,776,17,0.96451,1,0.185565,0.075046
0,bior,61.411765,1044,17,0.915721,1,0.267918,0.199167
4,gaus,131.625,1053,8,0.933758,1,0.517036,0.010573
8,rbio,34.090909,1125,33,0.934758,1,0.157955,0.174017
9,sym,112.421053,2136,19,0.624421,1,0.601102,0.045815
2,db,71.184211,2705,38,0.944119,1,0.289647,0.072864


In [26]:
shared_cols = wavelet_summary.columns.intersection(rank_bin_summary.columns).to_list()
console.print(f"Shared columns: {shared_cols}", style="bright_green")
top10_ranked = rank_bin_summary[rank_bin_summary.rank_bin == 'Top 10'].sort_values(by=['global_proportion', 'htid_proportion', 'binned_mean_rank_stability', 'binned_std_rank_stability', 'binned_count', 'binned_unique_htid'], ascending=[False, False, False, True, False, False])
merged_rank_summary = top10_ranked.merge(wavelet_summary, on=shared_cols, how='left')
# Find NAs columns

na_cols = merged_rank_summary.columns[merged_rank_summary.isna().any()].tolist()
merged_rank_summary[na_cols] = merged_rank_summary[na_cols].fillna(0)
merged_rank_summary.sort_values(by=['global_proportion', 'htid_proportion', 'binned_mean_rank_stability', 'binned_std_rank_stability', 'binned_count', 'binned_unique_htid'], ascending=[False, False, False, True, False, False])[['wavelet_family', 'binned_count', 'binned_unique_htid', 'global_proportion', 'htid_proportion', 'binned_mean_rank_stability', 'binned_std_rank_stability', 'mean_combined_final_wavelet_rank', 'sum_combined_final_wavelet_rank']].head(10)


Unnamed: 0,wavelet_family,binned_count,binned_unique_htid,global_proportion,htid_proportion,binned_mean_rank_stability,binned_std_rank_stability,mean_combined_final_wavelet_rank,sum_combined_final_wavelet_rank
0,rbio,10,1,0.073529,0.05,0.970446,0.004305,34.090909,1125
1,bior,0,0,0.0,0.0,0.0,0.0,61.411765,1044
2,coif,0,0,0.0,0.0,0.0,0.0,45.647059,776
3,db,0,0,0.0,0.0,0.0,0.0,71.184211,2705
4,dmey,0,0,0.0,0.0,0.0,0.0,126.0,126
5,gaus,0,0,0.0,0.0,0.0,0.0,131.625,1053
6,haar,0,0,0.0,0.0,0.0,0.0,89.0,89
7,mexh,0,0,0.0,0.0,0.0,0.0,135.0,135
8,morl,0,0,0.0,0.0,0.0,0.0,127.0,127
9,sym,0,0,0.0,0.0,0.0,0.0,112.421053,2136


In [27]:
final_shared_cols = set(family_subset_combined_results_df.columns).intersection(set(merged_rank_summary.columns))
console.print(f"Final shared columns: {final_shared_cols}", style="bright_green")

final_merged_family_subset_combined_results_df = family_subset_combined_results_df.merge(merged_rank_summary, on=list(final_shared_cols), how='left')
final_merged_family_subset_combined_results_df.sort_values(by=['combined_final_wavelet_rank', 'rank_stability', 'binned_mean_rank_stability', 'binned_std_rank_stability'], ascending=[True, False, False, True])[[ 'htid',  'wavelet', 'wavelet_type', 'signal_type', 'rank_stability', 'combined_final_wavelet_rank', 'wavelet_mode', 'wavelet_level']].head(10)

Unnamed: 0,htid,wavelet,wavelet_type,signal_type,rank_stability,combined_final_wavelet_rank,wavelet_mode,wavelet_level
0,uc1.l0073177743,rbio3.9,DWT,raw,0.977101,1,antireflect,1.0
1,uc1.l0073177743,rbio3.7,DWT,raw,0.96611,2,smooth,1.0
2,uc1.l0073177743,rbio3.7,DWT,raw,0.970079,3,zero,1.0
3,uc1.l0073177743,rbio3.7,DWT,raw,0.970079,4,constant,1.0
5,uc1.l0073177743,rbio3.5,DWT,raw,0.965194,5,periodic,1.0
4,uc1.l0073177743,rbio3.9,DWT,raw,0.976796,6,reflect,1.0
6,uc1.l0073177743,rbio3.7,DWT,raw,0.972827,7,antireflect,1.0
7,uc1.l0073177743,rbio3.5,DWT,raw,0.966721,8,antireflect,1.0
8,uc1.l0073177743,rbio3.7,DWT,raw,0.972522,9,antisymmetric,1.0
9,uc1.l0073177743,rbio3.5,DWT,raw,0.967026,10,reflect,1.0


In [53]:
subset_preidentified_periodicals_df = preidentified_periodicals_df[(preidentified_periodicals_df['lowercase_periodical_name'].isin(['arab_observer_and_the_scribe'])) & (preidentified_periodicals_df.volume_directory.notna())]

volume_dfs = []
for index, row in subset_preidentified_periodicals_df.iterrows():
	individual_htid = row.htid
	individual_publication_directory = row.publication_directory
	individual_volume_directory = row.volume_directory
	# console.print(f"Individual HTID: {individual_htid}", style="bright_green")
	# console.print(f"Individual Publication Directory: {individual_publication_directory}", style="bright_green")
	# console.print(f"Individual Volume Directory: {individual_volume_directory}", style="bright_green")
	subset_frequencies_df = all_frequencies_df[all_frequencies_df.htid == individual_htid]
	# console.print(f"Processed {len(subset_frequencies_df)} frequencies for {individual_htid}.", style="bright_green")

	

	subset_combined_results_path = os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", individual_publication_directory, "volumes", individual_volume_directory, "wavelet_analysis", individual_volume_directory + "_subset_combined_results.csv")
	if os.path.exists(subset_combined_results_path):
		subset_combined_results_df = pd.read_csv(subset_combined_results_path)
		subset_combined_results_df['htid'] = individual_htid
		# console.print(f"Loaded {len(subset_combined_results_df)} subset combined results from {subset_combined_results_path}.", style="bright_green")
	# else:
	# 	console.print(f"Could not find {subset_combined_results_path}.", style="bright_red")

	wavelet_volume_data_path = os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", individual_publication_directory, "volumes", individual_volume_directory, "wavelet_analysis", individual_volume_directory + "_wavelet_volume_results.csv")
	if os.path.exists(wavelet_volume_data_path):
		wavelet_volume_data_df = pd.read_csv(wavelet_volume_data_path)
	# 	console.print(f"Loaded {len(wavelet_volume_data_df)} wavelet volume data from {wavelet_volume_data_path}.", style="bright_green")
	# else:
	# 	console.print(f"Could not find {wavelet_volume_data_path}.", style="bright_red")
	
	if not wavelet_volume_data_df.empty and not subset_combined_results_df.empty:
		shared_cols = set(subset_combined_results_df.columns).intersection(set(wavelet_volume_data_df.columns))
		avoid_cols = [col for col in wavelet_volume_data_df.columns if not col in shared_cols]
		final_cols = avoid_cols + ['htid']
		subset_combined_results_df = subset_combined_results_df.merge(wavelet_volume_data_df[final_cols], on='htid', how='left')
		subset_combined_results_df['wavelet_family'] = subset_combined_results_df['wavelet'].str.extract(r'([a-zA-Z]+)')

		subset_combined_results_df = calculate_rank_stability(subset_combined_results_df, rank_columns)

		# Normalize rank and rank stability
		subset_combined_results_df['normalized_rank'] = subset_combined_results_df['combined_final_wavelet_rank'] / subset_combined_results_df['combined_final_wavelet_rank'].max()
		subset_combined_results_df['normalized_stability'] = 1 - subset_combined_results_df['rank_stability']  # Penalize instability

		# Define weights for rank and stability
		alpha = 0.5  # Weight for rank
		beta = 0.5   # Weight for stability

		# Compute composite score
		subset_combined_results_df['composite_score'] = (
			alpha * subset_combined_results_df['normalized_rank'] + 
			beta * subset_combined_results_df['normalized_stability']
		)

		# Add rank bins to the data
		subset_combined_results_df['rank_bin'] = pd.cut(
			subset_combined_results_df['combined_final_wavelet_rank'],
			bins=[0, 10, 20, 50, 100, subset_combined_results_df['combined_final_wavelet_rank'].max()],
			labels=['Top 10', 'Top 20', 'Top 50', 'Top 100', 'Beyond 100']
		)

		# Add unique htid count and stability metrics to the summary
		rank_bin_summary = subset_combined_results_df.groupby(['wavelet_family', 'rank_bin']).agg(
			count=('combined_final_wavelet_rank', 'count'),
			unique_htid=('htid', 'nunique'),  # Count of unique volumes (htid)
			mean_rank_stability=('rank_stability', 'mean'),  # Mean rank stability
			std_rank_stability=('rank_stability', 'std')  # Standard deviation of rank stability
		).reset_index()

		# Add proportions
		rank_bin_summary[f'global_proportion'] = rank_bin_summary['count'] / rank_bin_summary.groupby(f'rank_bin')['count'].transform('sum')
		rank_bin_summary[f'htid_proportion'] = rank_bin_summary['unique_htid'] / rank_bin_summary.groupby(f'rank_bin')['unique_htid'].transform('sum')
		rank_bin_summary = rank_bin_summary.sort_values(by=['global_proportion', 'htid_proportion', 'mean_rank_stability', 'std_rank_stability', 'count', 'unique_htid'], ascending=[False, False, False, True, False, False])
		top_wavelet_family = rank_bin_summary.iloc[0].wavelet_family
		finalized_subset_combined_results_df = subset_combined_results_df.merge(rank_bin_summary, on=['wavelet_family', 'rank_bin'], how='left')
		volume_dfs.append(finalized_subset_combined_results_df)
# Combine all volume data for the title into one DataFrame
combined_volume_df = pd.concat(volume_dfs, ignore_index=True)
console.print(f"Combined data for {individual_htid} with {len(combined_volume_df)} rows.", style="bright_green")

In [54]:

# Normalize rank and stability across all volumes
combined_volume_df['all_volumes_normalized_rank'] = combined_volume_df['combined_final_wavelet_rank'] / combined_volume_df['combined_final_wavelet_rank'].max()
combined_volume_df['all_volumes_normalized_stability'] = 1 - combined_volume_df['rank_stability']

# Compute composite score across all volumes
alpha = 0.5  # Weight for rank
beta = 0.5   # Weight for stability
combined_volume_df['all_volumes_composite_score'] = (
    alpha * combined_volume_df['all_volumes_normalized_rank'] +
    beta * combined_volume_df['all_volumes_normalized_stability']
)

In [67]:
# Aggregate metrics for wavelet families across all volumes
wavelet_summary = combined_volume_df.groupby('wavelet_family').agg(
    mean_composite_score=('all_volumes_composite_score', 'mean'),
    mean_rank_stability=('rank_stability', 'mean'),
    std_rank_stability=('rank_stability', 'std'),
    mean_rank=('combined_final_wavelet_rank', 'mean'),
    total_count=('htid', 'count')  # Total number of volumes where this wavelet appears
).reset_index()

# Sort by composite score and rank stability
wavelet_summary = wavelet_summary.sort_values(
    by=['mean_composite_score', 'mean_rank_stability', 'mean_rank'],
    ascending=[True, False, True]
)

In [68]:
# Step 1: Compute normalized metrics
wavelet_summary['all_volumes_normalized_mean_composite_score'] = wavelet_summary['mean_composite_score'] / wavelet_summary['mean_composite_score'].max()
wavelet_summary['all_volumes_normalized_mean_rank_stability'] = wavelet_summary['mean_rank_stability'] / wavelet_summary['mean_rank_stability'].max()
wavelet_summary['all_volumes_normalized_mean_rank'] = 1 - (wavelet_summary['mean_rank'] / wavelet_summary['mean_rank'].max())
wavelet_summary['all_volumes_normalized_total_count'] = wavelet_summary['total_count'] / wavelet_summary['total_count'].max()

# # Step 2: Define weights
# alpha = 0.4  # Weight for mean composite score
# beta = 0.3   # Weight for rank stability
# gamma = 0.2  # Weight for mean rank
# delta = 0.1  # Weight for total count

# # Step 3: Compute final composite score
# wavelet_summary['final_wavelet_composite_score'] = (
#     alpha * wavelet_summary['normalized_mean_composite_score'] +
#     beta * wavelet_summary['normalized_mean_rank_stability'] +
#     gamma * wavelet_summary['normalized_mean_rank'] +
#     delta * wavelet_summary['normalized_total_count']
# )

# # Step 4: Sort wavelets by the new composite score
# wavelet_summary = wavelet_summary.sort_values(
#     by='final_wavelet_composite_score', ascending=False
# )

# # Step 5: Select the best wavelet family
# top_wavelet_family = wavelet_summary.iloc[0]
# print(f"Best wavelet family for the title: {top_wavelet_family.wavelet_family}")
# wavelet_summary

In [58]:
# Add rank bins to the data
combined_volume_df['all_volumes_rank_bin'] = pd.cut(
    combined_volume_df ['combined_final_wavelet_rank'],
    bins=[0, 10, 20, 50, 100, combined_volume_df ['combined_final_wavelet_rank'].max()],
    labels=['Top 10', 'Top 20', 'Top 50', 'Top 100', 'Beyond 100']
)

# Add unique htid count and stability metrics to the summary
final_rank_bin_summary = combined_volume_df .groupby(['wavelet_family', 'all_volumes_rank_bin']).agg(
    count=('combined_final_wavelet_rank', 'count'),
    unique_htid=('htid', 'nunique'),  # Count of unique volumes (htid)
    mean_rank_stability=('rank_stability', 'mean'),  # Mean rank stability
    std_rank_stability=('rank_stability', 'std')  # Standard deviation of rank stability
).reset_index()

final_rank_bin_summary[f'all_volumes_global_proportion'] = final_rank_bin_summary['count'] / final_rank_bin_summary.groupby(f'all_volumes_rank_bin')['count'].transform('sum')
final_rank_bin_summary[f'all_volumes_htid_proportion'] = final_rank_bin_summary['unique_htid'] / final_rank_bin_summary.groupby(f'all_volumes_rank_bin')['unique_htid'].transform('sum')

# Sort order for consistent visualization
sort_order = ['Top 10', 'Top 20', 'Top 50', 'Top 100', 'Beyond 100']

selection = alt.selection_multi(fields=['wavelet_family'], bind='legend')
# Create a bar chart to include rank stability metrics
global_chart = alt.Chart(final_rank_bin_summary).mark_bar().encode(
    x=alt.X('all_volumes_rank_bin:N', title='Rank Bin', sort=sort_order),
    y=alt.Y('all_volumes_global_proportion:Q', title='Proportion of All Volumes (htid)', stack='normalize'),
    color=alt.Color('wavelet_family:N', title='Wavelet Family', scale=alt.Scale(scheme='tableau10')),
    tooltip=[
        'wavelet_family',
        'all_volumes_rank_bin',
        'count',
        'all_volumes_global_proportion',
        'unique_htid',
        'all_volumes_htid_proportion',
        'mean_rank_stability',
        'std_rank_stability'
    ],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin (All Volumes)',
    width=300,
    height=300
)

htid_chart = alt.Chart(final_rank_bin_summary).mark_bar().encode(
    x=alt.X('all_volumes_rank_bin:N', title='Rank Bin', sort=sort_order),
    y=alt.Y('all_volumes_htid_proportion:Q', title='Proportion of Unique Volumes (htid)', stack='normalize'),
    color=alt.Color('wavelet_family:N', title='Wavelet Family', scale=alt.Scale(scheme='tableau10')),
    tooltip=[
        'wavelet_family',
        'all_volumes_rank_bin',
        'count',
        'all_volumes_global_proportion',
        'unique_htid',
        'all_volumes_htid_proportion',
        'mean_rank_stability',
        'std_rank_stability'
    ],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin (Unique Volumes)',
    width=300,
    height=300
)

# Scatter plot for rank stability metrics
htid_global_chart = alt.Chart(final_rank_bin_summary).mark_point(filled=True).encode(
    x='all_volumes_htid_proportion:Q',
    y='all_volumes_global_proportion:Q',
    color='wavelet_family:N',
    tooltip=[
        'wavelet_family', 
        'all_volumes_rank_bin', 
        'count', 
        'all_volumes_global_proportion', 
        'unique_htid', 
        'all_volumes_htid_proportion', 
        'mean_rank_stability', 
        'std_rank_stability'
    ],
    shape=alt.Shape('all_volumes_rank_bin:N', sort=sort_order),
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Proportion of Wavelets by Rank Bin with Stability Metrics',
    width=300,
    height=300
)

stability_chart = alt.Chart(final_rank_bin_summary).mark_point(filled=True).encode(
	y='mean_rank_stability:Q',
	x='std_rank_stability:Q',
	color='wavelet_family:N',
	tooltip=[
		'wavelet_family', 
		'all_volumes_rank_bin',
		'count',
		'all_volumes_global_proportion',
		'unique_htid',
		'all_volumes_htid_proportion',
		'mean_rank_stability',
		'std_rank_stability'
	],
	opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
	shape=alt.Shape('all_volumes_rank_bin:N', sort=sort_order)
).add_params(selection).properties(
	title='Rank Stability Metrics by Wavelet Family',
	width=300,
	height=300
)

# Combine charts
alt.vconcat(alt.hconcat(global_chart, htid_chart), alt.hconcat(htid_global_chart, stability_chart))

In [69]:
final_rank_bin_summary = final_rank_bin_summary.rename(columns={'std_rank_stability': 'binned_std_rank_stability', 'mean_rank_stability': 'binned_mean_rank_stability'})

In [65]:
final_rank_bin_summary[(final_rank_bin_summary.wavelet_family == 'bior') & (final_rank_bin_summary.all_volumes_rank_bin == 'Top 10')]

Unnamed: 0,wavelet_family,all_volumes_rank_bin,count,unique_htid,mean_rank_stability,std_rank_stability,all_volumes_global_proportion,all_volumes_htid_proportion
0,bior,Top 10,39,6,0.998716,0.001512,0.108333,0.133333


In [84]:
merged_df = wavelet_summary.merge(
    final_rank_bin_summary, on='wavelet_family', how='left'
)

# find any NA columns
na_cols = merged_df.columns[merged_df.isna().any()].tolist()
merged_df[na_cols] = merged_df[na_cols].fillna(0)

# Step 2: Normalize all relevant metrics
merged_df['normalized_mean_composite_score'] = merged_df['mean_composite_score'] / merged_df['mean_composite_score'].max()
merged_df['normalized_mean_rank_stability'] = merged_df['mean_rank_stability'] / merged_df['mean_rank_stability'].max()
merged_df['normalized_mean_rank'] = 1 - (merged_df['mean_rank'] / merged_df['mean_rank'].max())
merged_df['normalized_total_count'] = merged_df['total_count'] / merged_df['total_count'].max()
merged_df['normalized_global_proportion'] = merged_df['all_volumes_global_proportion'] / merged_df['all_volumes_global_proportion'].max()
merged_df['normalized_htid_proportion'] = merged_df['all_volumes_htid_proportion'] / merged_df['all_volumes_htid_proportion'].max()

# Step 3: Define weights for all metrics
alpha = 0.3  # Weight for mean composite score
beta = 0.25  # Weight for rank stability
gamma = 0.15  # Weight for rank
delta = 0.1   # Weight for total count
epsilon = 0.1  # Weight for global proportion
zeta = 0.1    # Weight for HTID proportion

# Step 4: Compute the final composite score
merged_df['final_wavelet_composite_score'] = (
    alpha * merged_df['normalized_mean_composite_score'] +
    beta * merged_df['normalized_mean_rank_stability'] +
    gamma * merged_df['normalized_mean_rank'] +
    delta * merged_df['normalized_total_count'] +
    epsilon * merged_df['normalized_global_proportion'] +
    zeta * merged_df['normalized_htid_proportion']
)

# Step 5: Sort wavelets by the final composite score
merged_df = merged_df.sort_values(
    by='final_wavelet_composite_score', ascending=False
)

# Step 6: Select the best wavelet family
top_wavelet_family = merged_df.iloc[0]
print(f"Best wavelet family for the title: {top_wavelet_family.wavelet_family}")
merged_df

Best wavelet family for the title: db


Unnamed: 0,wavelet_family,mean_composite_score,mean_rank_stability,std_rank_stability,mean_rank,total_count,all_volumes_normalized_mean_composite_score,all_volumes_normalized_mean_rank_stability,all_volumes_normalized_mean_rank,all_volumes_normalized_total_count,...,binned_std_rank_stability,all_volumes_global_proportion,all_volumes_htid_proportion,normalized_mean_composite_score,normalized_mean_rank_stability,normalized_mean_rank,normalized_total_count,normalized_global_proportion,normalized_htid_proportion,final_wavelet_composite_score
23,db,0.257046,0.948131,0.023043,111.857828,2293,0.590753,0.984615,0.412991,1.0,...,0.023671,0.582778,0.2,0.590753,0.984615,0.412991,1.0,1.0,0.9,0.775328
22,db,0.257046,0.948131,0.023043,111.857828,2293,0.590753,0.984615,0.412991,1.0,...,0.017511,0.22963,0.209677,0.590753,0.984615,0.412991,1.0,0.394026,0.943548,0.719086
21,db,0.257046,0.948131,0.023043,111.857828,2293,0.590753,0.984615,0.412991,1.0,...,0.022861,0.205556,0.205882,0.590753,0.984615,0.412991,1.0,0.352717,0.926471,0.713247
20,db,0.257046,0.948131,0.023043,111.857828,2293,0.590753,0.984615,0.412991,1.0,...,0.00861,0.119444,0.2,0.590753,0.984615,0.412991,1.0,0.204957,0.9,0.695824
5,rbio,0.19575,0.923997,0.099928,76.350215,1165,0.44988,0.959553,0.599328,0.508068,...,0.01213,0.327778,0.222222,0.44988,0.959553,0.599328,0.508068,0.56244,1.0,0.671802
39,sym,0.39982,0.74292,0.154795,131.299456,1102,0.918881,0.771508,0.310965,0.480593,...,0.155203,0.287079,0.118182,0.918881,0.771508,0.310965,0.480593,0.492604,0.531818,0.665688
7,rbio,0.19575,0.923997,0.099928,76.350215,1165,0.44988,0.959553,0.599328,0.508068,...,0.032913,0.319444,0.209677,0.44988,0.959553,0.599328,0.508068,0.548141,0.943548,0.664727
24,db,0.257046,0.948131,0.023043,111.857828,2293,0.590753,0.984615,0.412991,1.0,...,0.012589,0.24691,0.081818,0.590753,0.984615,0.412991,1.0,0.423678,0.368182,0.664514
6,rbio,0.19575,0.923997,0.099928,76.350215,1165,0.44988,0.959553,0.599328,0.508068,...,0.012229,0.4,0.176471,0.44988,0.959553,0.599328,0.508068,0.686368,0.794118,0.663607
12,bior,0.216783,0.927746,0.107518,87.437439,1031,0.498219,0.963446,0.541145,0.449629,...,0.037044,0.241667,0.209677,0.498219,0.963446,0.541145,0.449629,0.414681,0.943548,0.652285


In [23]:
# Step 1: Merge rank bin summary into wavelet summary
rank_bin_summary_top10 = final_rank_bin_summary[final_rank_bin_summary.rank_bin == 'Top 10'][[
    'wavelet_family', 'all_volumes_global_proportion', 'all_volumes_htid_proportion'
]]

# Merge with the wavelet summary DataFrame
wavelet_summary = wavelet_summary.merge(
    rank_bin_summary_top10, on='wavelet_family', how='left'
).fillna(0)  # Fill NaNs with 0 for wavelets that don't appear in the Top 10 bin

# Step 2: Normalize all relevant metrics
wavelet_summary['normalized_mean_composite_score'] = wavelet_summary['mean_composite_score'] / wavelet_summary['mean_composite_score'].max()
wavelet_summary['normalized_mean_rank_stability'] = wavelet_summary['mean_rank_stability'] / wavelet_summary['mean_rank_stability'].max()
wavelet_summary['normalized_mean_rank'] = 1 - (wavelet_summary['mean_rank'] / wavelet_summary['mean_rank'].max())
wavelet_summary['normalized_total_count'] = wavelet_summary['total_count'] / wavelet_summary['total_count'].max()
wavelet_summary['normalized_global_proportion'] = wavelet_summary['global_proportion'] / wavelet_summary['global_proportion'].max()
wavelet_summary['normalized_htid_proportion'] = wavelet_summary['htid_proportion'] / wavelet_summary['htid_proportion'].max()

# Step 3: Define weights for all metrics
alpha = 0.3  # Weight for mean composite score
beta = 0.25  # Weight for rank stability
gamma = 0.15  # Weight for rank
delta = 0.1   # Weight for total count
epsilon = 0.1  # Weight for global proportion
zeta = 0.1    # Weight for HTID proportion

# Step 4: Compute the final composite score
wavelet_summary['final_wavelet_composite_score'] = (
    alpha * wavelet_summary['normalized_mean_composite_score'] +
    beta * wavelet_summary['normalized_mean_rank_stability'] +
    gamma * wavelet_summary['normalized_mean_rank'] +
    delta * wavelet_summary['normalized_total_count'] +
    epsilon * wavelet_summary['normalized_global_proportion'] +
    zeta * wavelet_summary['normalized_htid_proportion']
)

# Step 5: Sort wavelets by the final composite score
wavelet_summary = wavelet_summary.sort_values(
    by='final_wavelet_composite_score', ascending=False
)

# Step 6: Select the best wavelet family
top_wavelet_family = wavelet_summary.iloc[0]
print(f"Best wavelet family for the title: {top_wavelet_family.wavelet_family}")
wavelet_summary

Best wavelet family for the title: rbio


Unnamed: 0,wavelet_family,mean_composite_score,mean_rank_stability,std_rank_stability,mean_rank,total_count,normalized_mean_composite_score,normalized_mean_rank_stability,normalized_mean_rank,normalized_total_count,final_wavelet_composite_score,global_proportion,htid_proportion,normalized_global_proportion,normalized_htid_proportion
7,rbio,0.153778,0.929066,0.101865,57.262338,1155,0.363616,0.959655,0.643965,0.623314,0.707925,0.605556,0.222222,1.0,1.0
1,db,0.229959,0.94521,0.024161,98.041015,1853,0.543752,0.976331,0.390419,1.0,0.670909,0.091667,0.2,0.151376,0.9
0,sym,0.404022,0.702377,0.139301,123.52193,912,0.955335,0.725502,0.231988,0.492175,0.610249,0.05,0.111111,0.082569,0.5
8,bior,0.193316,0.926036,0.118524,75.665505,861,0.457107,0.956525,0.529541,0.464652,0.58005,0.108333,0.133333,0.178899,0.6
6,coif,0.201151,0.95355,0.024651,86.116061,853,0.475633,0.984945,0.464563,0.460335,0.578864,0.086111,0.133333,0.142202,0.6
3,gaus,0.343315,0.93109,0.02313,149.488136,295,0.811788,0.961746,0.07054,0.159201,0.554603,0.025,0.088889,0.041284,0.4
2,dmey,0.422912,0.647892,0.169296,119.479167,48,1.0,0.669224,0.257124,0.025904,0.519382,0.005556,0.022222,0.009174,0.1
5,mexh,0.342361,0.933047,0.023536,149.5,36,0.809532,0.963768,0.070466,0.019428,0.506773,0.002778,0.022222,0.004587,0.1
4,morl,0.369537,0.925527,0.014763,160.833333,36,0.873792,0.956,0.0,0.019428,0.50308,0.0,0.0,0.0,0.0
9,haar,0.138888,0.968125,0.014527,59.508197,61,0.32841,1.0,0.630001,0.03292,0.480444,0.025,0.066667,0.041284,0.3


In [49]:
final_combined_volume_df = combined_volume_df.merge(wavelet_summary[['wavelet_family', 'final_wavelet_composite_score']], on='wavelet_family', how='left').sort_values(by=['final_wavelet_composite_score', 'combined_final_wavelet_rank', 'rank_stability'], ascending=[True, True, False])

# final_combined_volume_df.merge(final_rank_bin_summary[['wavelet_family', 'rank_bin', 'global_proportion', 'htid_proportion']], on=['wavelet_family', 'rank_bin'], how='left')

In [50]:
final_rank_bin_summary.columns.intersection(final_combined_volume_df.columns)

Index(['wavelet_family', 'rank_bin', 'count', 'unique_htid',
       'mean_rank_stability', 'std_rank_stability', 'total_bin_count',
       'total_bin_unique_htid', 'global_proportion', 'htid_proportion'],
      dtype='object')