In [None]:
# Cell 1: Import Required Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cell 2: Extract Core Lengths and Load Segment Pool
from pyCoreRelator import load_segment_pool

# Root directory of the project data. Every data path in this notebook is
# built from it. Set the NWP_TURBIDITES_DIR environment variable to run the
# notebook on another machine; when it is unset (or empty) the original
# location is used.
mother_dir = os.environ.get('NWP_TURBIDITES_DIR') or '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites'

# Function to extract core length from depth data
def get_core_length(core_name, depth_column='SB_DEPTH_cm'):
    """Return the maximum value of a core's depth column, or None on failure.

    The depth is read from the core's ``*_hiresMS_MLfilled.csv`` file located
    under ``mother_dir``.

    Parameters
    ----------
    core_name : str
        Core identifier used in both the directory and the file name.
    depth_column : str, default 'SB_DEPTH_cm'
        Name of the CSV column holding the depth values.

    Returns
    -------
    The maximum of ``depth_column``, or None when the file cannot be read,
    the column is missing, or the column holds no valid values. A warning is
    printed in each of those cases.
    """
    # Try hiresMS file first (most common)
    depth_file = f'{mother_dir}/Cascadia_core_data/OSU_dataset/_compiled_logs/{core_name}/ML_filled/{core_name}_hiresMS_MLfilled.csv'
    try:
        max_depth = pd.read_csv(depth_file)[depth_column].max()
    except (OSError, KeyError, ValueError) as err:
        # OSError: missing/unreadable file; KeyError: column absent;
        # ValueError: covers pandas' EmptyDataError/ParserError and decode errors.
        # Anything else (e.g. KeyboardInterrupt, NameError) is allowed to propagate.
        print(f"Warning: Could not read depth from {depth_file} ({type(err).__name__}: {err})")
        return None

    # An empty or all-NaN column yields NaN from max(); report it as "no length"
    if pd.isna(max_depth):
        print(f"Warning: No valid '{depth_column}' values in {depth_file}")
        return None
    return max_depth

# Core pair under study and the depth column used when reading their logs
CORE_A, CORE_B = "M9907-23PC", "M9907-25PC"
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Maximum logged depth of each core (get_core_length returns None on failure)
core_a_length, core_b_length = (
    get_core_length(core, DEPTH_COLUMN) for core in (CORE_A, CORE_B)
)

print(f"Core A ({CORE_A}) length: {core_a_length} cm")
print(f"Core B ({CORE_B}) length: {core_b_length} cm")

#####

# Log column(s) used to build the segment pool. Later cells compare this list
# against ['hiresMS'] and ['hiresMS','CT', 'Lumin'] to name their output files.
LOG_COLUMNS = ['hiresMS']  # Choose one log column for segment pool
# LOG_COLUMNS = ['hiresMS', 'CT', 'Lumin']


# Define all cores for segment pool
SEGMENT_POOL_CORES = ["M9907-22PC", "M9907-23PC", "M9907-25PC", "M9907-11PC", "M9907-12PC", "M9907-14TC", "M9907-30PC", "M9907-31PC","RR0207-56PC"]

# Tag that appears in the ML-filled CSV file name for each log type.
# Lumin, R, G and B are all read from the same "RGB" file.
_LOG_FILE_TAGS = {
    'hiresMS': 'hiresMS',
    'CT': 'CT',
    'Lumin': 'RGB',
    'R': 'RGB',
    'G': 'RGB',
    'B': 'RGB',
    'Den_gm/cc': 'MST',
}

# Define paths for multiple log types: {core name: {log name: CSV path}}.
# One path template is shared by every entry instead of being repeated per log.
CORE_LOG_PATHS = {
    core_name: {
        log_name: f'{mother_dir}/Cascadia_core_data/OSU_dataset/_compiled_logs/{core_name}/ML_filled/{core_name}_{file_tag}_MLfilled.csv'
        for log_name, file_tag in _LOG_FILE_TAGS.items()
    }
    for core_name in SEGMENT_POOL_CORES
}

# Passed to load_segment_pool as `column_alternatives`.
# NOTE(review): presumably fallback column names tried when the primary log
# column is absent from a file - confirm against the pyCoreRelator docs.
COLUMN_ALTERNATIVES = {
    'hiresMS': ['MS'],
    'CT': ['CT_value'],
    'R': ['R', 'red', 'Red'],
    'G': ['G', 'green', 'Green'],
    'B': ['B', 'blue', 'Blue'],
    'Lumin': ['luminance', 'Luminance'],
    'Den_gm/cc': ['Density', 'density']
}

# Picked-depth CSV for every core in the pool
PICKED_DEPTH_PATHS = {
    core_name: f'{mother_dir}/pyCoreRelator/pickeddepth/{core_name}_pickeddepth.csv'
    for core_name in SEGMENT_POOL_CORES
}

# Collect the loader arguments in one place, then load the segment pool
# with the function imported from pyCoreRelator
segment_pool_kwargs = {
    'core_names': SEGMENT_POOL_CORES,
    'core_log_paths': CORE_LOG_PATHS,
    'picked_depth_paths': PICKED_DEPTH_PATHS,
    'log_columns': LOG_COLUMNS,
    'depth_column': DEPTH_COLUMN,
    'column_alternatives': COLUMN_ALTERNATIVES,
    'boundary_category': 1,
}
segment_pool_cores_data, turb_logs, depth_logs, target_dimensions = load_segment_pool(**segment_pool_kwargs)

# Echo the core lengths extracted earlier in this cell
print(f"Core A target length: {core_a_length} cm")
print(f"Core B target length: {core_b_length} cm")

In [None]:
# Cell 3: Plot All Turbidite Segments from Pool
from pyCoreRelator import plot_segment_pool, print_segment_pool_summary

# Layout and output options for the segment-pool figure (not saved to file)
segment_plot_options = {
    'n_cols': 12,
    'figsize_per_row': 4,
    'plot_segments': True,
    'save_plot': False,
    'plot_filename': None,
}

# Draw the pooled segments with the imported plotting function
fig, axes = plot_segment_pool(
    segment_logs=turb_logs,
    segment_depths=depth_logs,
    log_column_names=LOG_COLUMNS,
    **segment_plot_options
)

# Text summary of the same pool
print_segment_pool_summary(turb_logs, depth_logs, target_dimensions)

In [None]:
# Override the lengths read from the depth files with a fixed target length
# (cm) for both synthetic cores.
# Alternative: scale the measured lengths instead, e.g.
#   core_a_length = core_a_length *0.5
#   core_b_length = core_b_length *0.5
core_a_length = core_b_length = 600

In [None]:
# Cell 4: Create and Plot Synthetic Core Pair
from pyCoreRelator import create_and_plot_synthetic_core_pair

# Build and plot a synthetic core pair from the segment pool.
# repetition=True: allow reselecting turbidite segments;
# repetition=False: each segment can only be selected once.
synthetic_pair = create_and_plot_synthetic_core_pair(
    core_a_length,
    core_b_length,
    turb_logs,
    depth_logs,
    LOG_COLUMNS,
    repetition=True,
    plot_results=True,
    save_plot=False,
    plot_filename='synthetic_core_pair.png',
)

# Eight outputs: four for core A followed by the matching four for core B
(synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a,
 synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b) = synthetic_pair

In [None]:
# Cell 5: DTW Analysis on Synthetic Pair
from pyCoreRelator import run_comprehensive_dtw_analysis, find_complete_core_paths

# DTW analysis of the synthetic pair; the 5th and 6th outputs are not used here
dtw_outputs = run_comprehensive_dtw_analysis(
    synthetic_log_a,
    synthetic_log_b,
    synthetic_md_a,
    synthetic_md_b,
    picked_depths_a=synthetic_picked_a,
    picked_depths_b=synthetic_picked_b,
    independent_dtw=False,
    top_bottom=False,
    mute_mode=False,
)
dtw_results, valid_dtw_pairs, segments_a, segments_b, _, _, dtw_distance_matrix_full = dtw_outputs

# Settings for the complete-path search; metrics go to the named CSV
path_search_options = {
    'output_csv': "synthetic_core_pair_metrics.csv",
    'output_metric_only': True,
    'shortest_path_search': True,
    'shortest_path_level': 2,
    'max_search_path': 100000,
    'mute_mode': False,
}

# Find complete core paths and extract r-values (return value is discarded)
_ = find_complete_core_paths(
    valid_dtw_pairs,
    segments_a,
    segments_b,
    synthetic_log_a,
    synthetic_log_b,
    synthetic_picked_a,
    synthetic_picked_b,
    dtw_results,
    dtw_distance_matrix_full,
    **path_search_options
)

In [None]:
# Cell 6: Plot R-Values Distribution from Synthetic Pair
from pyCoreRelator import plot_correlation_distribution

# Define quality index and parameters (norm_dtw, dtw_ratio, perc_diag, corr_coef, dtw_warp_eff, perc_age_overlap)
targeted_quality_index = 'corr_coef' 
# targeted_quality_index = 'norm_dtw'
# targeted_quality_index = 'dtw_ratio'
# targeted_quality_index = 'dtw_warp_eff' 
# targeted_quality_index = 'perc_diag'

# Metrics CSV to read.
# NOTE(review): Cell 5 passes output_csv="synthetic_core_pair_metrics.csv";
# presumably pyCoreRelator writes it under outputs/ - confirm.
csv_filename = 'outputs/synthetic_core_pair_metrics.csv'

# Plot correlation distribution; only the fit parameters are kept
_, _, fit_params = plot_correlation_distribution(
    csv_file=csv_filename,
    quality_index=targeted_quality_index,
    no_bins=30,
    save_png=False,
    pdf_method='normal',  # 'KDE', 'skew-normal', 'normal'
    kde_bandwidth=0.05,
    mute_mode=False
)

# Remove the temporary metrics CSV once the distribution has been fitted.
# Uses csv_filename so the file read and the file removed cannot drift apart.
if os.path.exists(csv_filename):
    os.remove(csv_filename)

In [None]:
# Cell 7: Re-plot Distribution Using Fit Parameters
if 'fit_params' not in locals() or fit_params is None:
    # Nothing to draw until Cell 6 has produced fit_params
    print("No fit_params available. Please run Cell 6 first to generate the distribution.")
else:
    print("Re-plotting fitted curve only from 'fit_params'...")

    fig, ax = plt.subplots(figsize=(6, 4))

    # Fitted curve: drawn only when both coordinate arrays are stored and the
    # fitting method is one of the three recognised below
    if 'x_range' in fit_params and 'y_values' in fit_params:
        curve_x = fit_params['x_range']
        curve_y = fit_params['y_values']
        method = fit_params.get('method', 'unknown')

        # Build the legend text for the stored method (None -> no curve drawn)
        curve_label = None
        if method == 'normal':
            mean_val = fit_params['mean']
            std_val = fit_params['std']
            n_points = fit_params['n_points']
            curve_label = f'Normal Fit\n(mean = {mean_val:.3f})\n(σ = {std_val:.3f})\nn = {n_points:,}'
        elif method == 'skew-normal':
            shape = fit_params['shape']
            location = fit_params['location']
            scale = fit_params['scale']
            n_points = fit_params['n_points']
            curve_label = f'Skew-Normal Fit\n(α = {shape:.3f})\n(μ = {location:.3f})\n(σ = {scale:.3f})\nn = {n_points:,}'
        elif method == 'KDE':
            bandwidth = fit_params['bandwidth']
            n_points = fit_params['n_points']
            curve_label = f'KDE\n(bandwidth = {bandwidth})\nn = {n_points:,}'

        if curve_label is not None:
            ax.plot(curve_x, curve_y, 'r-', linewidth=2, alpha=0.8, label=curve_label)

    # Median marker, when a median was stored
    if 'median' in fit_params:
        median_val = fit_params['median']
        ax.axvline(median_val, color='green', linestyle='dashed', linewidth=2,
                  label=f'Median: {median_val:.3f}')

    # Axis labels, title, legend and grid
    ax.set_xlabel(f'{targeted_quality_index}')
    ax.set_ylabel('Density (%)')
    ax.set_title(f'Fitted Distribution Curve\nSynthetic Cores {CORE_A} vs {CORE_B}')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Correlation coefficients are shown on a fixed 0-1 axis
    if targeted_quality_index == 'corr_coef':
        ax.set_xlim(0, 1.0)

    plt.tight_layout()
    plt.show()

In [None]:
# Cell 8: Run 100 Iterations for Synthetic Logs R-Value Findings
import gc

from pyCoreRelator import run_comprehensive_dtw_analysis, find_complete_core_paths, plot_correlation_distribution, create_synthetic_log_with_depths

# Define quality indices to iterate through (norm_dtw, dtw_ratio, variance_deviation, perc_diag, corr_coef, match_min, match_mean, perc_age_overlap)
quality_indices = ['corr_coef', 'norm_dtw', 'perc_diag', 'dtw_warp_eff']
# quality_indices = ['norm_dtw']

# Define number of iterations
number_of_iterations = 100

# Create output directories
os.makedirs('outputs', exist_ok=True)

# Tag in the output file names identifying the log combination in use
if LOG_COLUMNS == ['hiresMS']:
    pdf_file_tag = 'MSonly'
elif LOG_COLUMNS == ['hiresMS','CT', 'Lumin']:
    pdf_file_tag = 'MSCTLumin'
else:
    pdf_file_tag = 'unspecified'

# One output CSV per quality index, resolved once instead of on every pass
output_csv_filenames = {
    quality_index: f'outputs/synthetic_PDFs_{pdf_file_tag}_{quality_index}.csv'
    for quality_index in quality_indices
}

# Per-iteration metrics file: the name handed to find_complete_core_paths and
# the path it is read back from (same literal values as before)
temp_metrics_name = "temp_synthetic_core_pair_metrics.csv"
temp_metrics_path = "outputs/temp_synthetic_core_pair_metrics.csv"

# Output files already (re)created during this run. The first write to each
# file truncates it and writes the header; every later write appends. Keying
# this on the file rather than on `iteration == 0` keeps the header correct
# even when the first iteration yields no fit_params for an index, and stops
# rows being appended to a stale file from an earlier run.
initialized_outputs = set()

# Run iterations with progress bar
for iteration in tqdm(range(number_of_iterations), desc="Running synthetic analysis"):
    
    # Generate synthetic core pair (from cell 4)
    synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a_tuples = create_synthetic_log_with_depths(
        core_a_length, turb_logs, depth_logs, exclude_inds=None, 
        repetition=True # True: allow reselecting turbidite segments; False: each segment can only be selected once
    )
    synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b_tuples = create_synthetic_log_with_depths(
        core_b_length, turb_logs, depth_logs, exclude_inds=None, 
        repetition=True # True: allow reselecting turbidite segments; False: each segment can only be selected once
    )
    
    # Extract depths from (depth, category) tuples
    synthetic_picked_a = [depth for depth, category in synthetic_picked_a_tuples]
    synthetic_picked_b = [depth for depth, category in synthetic_picked_b_tuples]
    
    # Run DTW analysis (from cell 5)
    dtw_results, valid_dtw_pairs, segments_a, segments_b, _, _, dtw_distance_matrix_full = run_comprehensive_dtw_analysis(
        synthetic_log_a, synthetic_log_b, synthetic_md_a, synthetic_md_b,
        picked_depths_a=synthetic_picked_a,
        picked_depths_b=synthetic_picked_b,
        independent_dtw=False,
        top_bottom=False,
        mute_mode=True
    )
    
    # Find complete core paths
    _ = find_complete_core_paths(
        valid_dtw_pairs,
        segments_a, 
        segments_b, 
        synthetic_log_a, 
        synthetic_log_b,
        synthetic_picked_a, 
        synthetic_picked_b,
        dtw_results,
        dtw_distance_matrix_full,
        output_csv=temp_metrics_name,
        output_metric_only=True,
        shortest_path_search=True,
        shortest_path_level=2,
        max_search_path=100000,
        mute_mode=True
    )
    
    # Iterate through each quality index to extract fit_params
    for targeted_quality_index in quality_indices:
        output_csv_filename = output_csv_filenames[targeted_quality_index]
        
        # Plot correlation distribution to get fit_params only
        _, _, fit_params = plot_correlation_distribution(
            csv_file=temp_metrics_path,
            quality_index=targeted_quality_index,
            no_bins=30,
            save_png=False,
            pdf_method='normal',
            kde_bandwidth=0.05,
            mute_mode=True
        )
        
        # Store fit_params with iteration number and incrementally save to CSV,
        # so completed iterations survive an interrupted run
        if fit_params is not None:
            fit_params_copy = fit_params.copy()
            fit_params_copy['iteration'] = iteration
            
            df_single = pd.DataFrame([fit_params_copy])
            is_first_write = output_csv_filename not in initialized_outputs
            df_single.to_csv(
                output_csv_filename,
                mode='w' if is_first_write else 'a',
                index=False,
                header=is_first_write
            )
            initialized_outputs.add(output_csv_filename)
            
            del df_single, fit_params_copy
        
        del fit_params
    
    # Clear memory after each iteration (including the full distance matrix)
    del synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a_tuples
    del synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b_tuples
    del synthetic_picked_a, synthetic_picked_b
    del dtw_results, valid_dtw_pairs, segments_a, segments_b, dtw_distance_matrix_full
    
    # Force garbage collection
    gc.collect()

# Remove temporary CSV file after all iterations are complete
if os.path.exists(temp_metrics_path):
    os.remove(temp_metrics_path)

print(f"\nCompleted {number_of_iterations} iterations for all quality indices: {quality_indices}")
for targeted_quality_index in quality_indices:
    output_csv_filename = output_csv_filenames[targeted_quality_index]
    if output_csv_filename in initialized_outputs:
        print(f"Distribution curves parameters for {targeted_quality_index} saved to: {output_csv_filename}")
    else:
        # No iteration produced fit_params for this index, so nothing was written
        print(f"Warning: No distribution parameters were produced for {targeted_quality_index}; {output_csv_filename} was not written.")

In [None]:
# Cell 9: Plot all distribution curves for each quality index

# Define quality indices to plot
quality_indices = ['corr_coef', 'norm_dtw', 'perc_diag']

for targeted_quality_index in quality_indices:
    print(f"\nPlotting distribution curves for {targeted_quality_index}...")
    
    # Define input filename based on log columns and quality index
    if LOG_COLUMNS == ['hiresMS']:
        input_csv_filename = f'outputs/synthetic_PDFs_MSonly_{targeted_quality_index}.csv'
    elif LOG_COLUMNS == ['hiresMS','CT', 'Lumin']:
        input_csv_filename = f'outputs/synthetic_PDFs_MSCTLumin_{targeted_quality_index}.csv'
    else:
        input_csv_filename = f'outputs/synthetic_PDFs_unspecified_{targeted_quality_index}.csv'
    
    # Check if file exists
    if not os.path.exists(input_csv_filename):
        print(f"Error: File {input_csv_filename} does not exist. Skipping {targeted_quality_index}.")
        continue
    
    # Load fit params from CSV (one row per iteration)
    df_fit_params = pd.read_csv(input_csv_filename)

    # Rebuild the curve arrays from their '[v1 v2 ...]' text form.
    # The per-row dict is named `curve` rather than `fit_params` so that this
    # cell does not overwrite the `fit_params` produced by Cell 6, which
    # Cell 7 reads.
    all_fit_params = []
    mean_values = []
    for _, row in df_fit_params.iterrows():
        curve = {}
        for key in ('x_range', 'y_values'):
            cell = row[key] if key in row else None
            curve[key] = np.fromstring(cell.strip('[]'), sep=' ') if pd.notna(cell) else None
        all_fit_params.append(curve)
        
        # Extract mean value if available
        if 'mean' in row and pd.notna(row['mean']):
            mean_values.append(row['mean'])

    # Plot all distribution curves
    fig, ax = plt.subplots(figsize=(6, 4))

    # Plot histogram of mean values in gray bars
    if mean_values:
        ax.hist(mean_values, bins=20, alpha=0.5, color='gray', density=True, label='Mean Values')

    # Plot all curves as transparent red lines
    for curve in all_fit_params:
        x = curve['x_range']
        y = curve['y_values']
        if x is not None and y is not None:
            ax.plot(x, y, 'r-', linewidth=.7, alpha=0.3)

    # Formatting based on quality index
    if targeted_quality_index == 'corr_coef':
        ax.set_xlabel("Pearson's r\n(Correlation Coefficient)")
        ax.set_xlim(0, 1.0)
    elif targeted_quality_index == 'norm_dtw':
        ax.set_xlabel("Normalized DTW Distance")
    elif targeted_quality_index == 'dtw_ratio':
        ax.set_xlabel("DTW Ratio")
    elif targeted_quality_index == 'perc_diag':
        ax.set_xlabel("Percentage Diagonal (%)")
    
    ax.set_ylabel('Probability Density (%)')
    # Title bracket balanced (the original ended with a stray ')' before ']')
    ax.set_title(f'Synthetic Core {targeted_quality_index.replace("_", " ").title()}: {len(all_fit_params)} Iterations\n[Optimal (shortest path) search; no age consideration]')
    ax.grid(True, alpha=0.3)
    if mean_values:
        ax.legend()

    plt.tight_layout()
    plt.savefig(f'outputs/synthetic_iterations_{targeted_quality_index}.png', dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop passes
    plt.close(fig)
    
    print(f"Saved plot for {targeted_quality_index} to outputs/synthetic_iterations_{targeted_quality_index}.png")

In [None]:
# Cell 10: Combine all binned data and recalculate distribution for each quality index
from scipy import stats

# Quality indices to combine (same list as Cell 9; Cell 8 additionally writes 'dtw_warp_eff')
quality_indices = ['corr_coef', 'norm_dtw', 'perc_diag']

# Seed for the histogram resampling below, so re-running this cell reproduces
# the same combined data, statistics and figures
RESAMPLING_SEED = 42

# Number of bins for the combined histogram (you can adjust this)
n_bins = 30


def _parse_array_cell(cell):
    """Parse a '[v1 v2 ...]' string from the fit-params CSV into a float array.

    Returns None when the cell is missing (NaN).
    """
    if pd.isna(cell):
        return None
    return np.fromstring(cell.strip('[]'), sep=' ')


def _resample_histogram(bin_edges, hist_percentages, n_points, rng):
    """Draw uniform samples inside each bin to approximate the binned data.

    Each bin receives round(percentage * n_points / 100) samples, spread
    uniformly between the bin's two edges. Bins whose count is not positive
    (including NaN) receive none. Expects len(bin_edges) == len(hist_percentages) + 1.
    """
    raw_counts = (hist_percentages * n_points) / 100
    counts = np.where(raw_counts > 0, np.rint(raw_counts), 0).astype(int)
    lows = np.repeat(bin_edges[:-1], counts)
    highs = np.repeat(bin_edges[1:], counts)
    return rng.uniform(lows, highs)


# Iterate through each quality index
for targeted_quality_index in quality_indices:
    print(f"\nProcessing combined distribution for {targeted_quality_index}...")
    
    # Define input filename based on log columns and quality index
    if LOG_COLUMNS == ['hiresMS']:
        input_csv_filename = f'outputs/synthetic_PDFs_MSonly_{targeted_quality_index}.csv'
    elif LOG_COLUMNS == ['hiresMS','CT', 'Lumin']:
        input_csv_filename = f'outputs/synthetic_PDFs_MSCTLumin_{targeted_quality_index}.csv'
    else:
        input_csv_filename = f'outputs/synthetic_PDFs_unspecified_{targeted_quality_index}.csv'
    
    # Check if the file exists
    if not os.path.exists(input_csv_filename):
        print(f"Error: File {input_csv_filename} does not exist. Skipping {targeted_quality_index}.")
        continue
    
    # Load fit params from CSV (one row per iteration)
    df_fit_params = pd.read_csv(input_csv_filename)

    # Fresh generator per quality index: results do not depend on list order
    rng = np.random.default_rng(RESAMPLING_SEED)

    # Reconstruct approximate raw data from each iteration's binned data
    sample_batches = []
    for row_label, row in df_fit_params.iterrows():
        bins = _parse_array_cell(row['bins']) if 'bins' in row else None
        hist_percentages = _parse_array_cell(row['hist']) if 'hist' in row else None
        n_points = row['n_points'] if 'n_points' in row and pd.notna(row['n_points']) else None
        
        if bins is None or hist_percentages is None or n_points is None:
            continue
        if len(bins) != len(hist_percentages) + 1:
            # Edges and counts disagree: the row cannot be resampled
            print(f"Warning: Row {row_label} has {len(bins)} bin edges for {len(hist_percentages)} bins. Skipping row.")
            continue
        
        sample_batches.append(_resample_histogram(bins, hist_percentages, n_points, rng))

    combined_data = np.concatenate(sample_batches) if sample_batches else np.empty(0)

    print(f"Combined {len(combined_data)} data points from {len(df_fit_params)} iterations")

    # Nothing to summarise or fit without data (min/max and the fit would fail)
    if combined_data.size == 0:
        print(f"Warning: No data could be reconstructed for {targeted_quality_index}. Skipping.")
        continue

    # Calculate combined statistics
    combined_mean = np.mean(combined_data)
    combined_std = np.std(combined_data)
    combined_median = np.median(combined_data)

    # Fit normal distribution to combined data
    fitted_mean, fitted_std = stats.norm.fit(combined_data)

    # Generate fitted curve over the data range
    x_fitted = np.linspace(combined_data.min(), combined_data.max(), 1000)
    y_fitted = stats.norm.pdf(x_fitted, fitted_mean, fitted_std)

    # Create the plot (same style as Cell 9)
    fig, ax = plt.subplots(figsize=(6, 4))

    # Plot combined histogram in gray bars (same as Cell 9)
    ax.hist(combined_data, bins=n_bins, alpha=0.5, color='gray', density=True, label='Combined Data')

    # Plot fitted normal curve as red line (same as Cell 9)
    ax.plot(x_fitted, y_fitted, 'r-', linewidth=2, alpha=0.8,
            label=f'Normal Fit (μ={fitted_mean:.3f}, σ={fitted_std:.3f})')

    # Formatting based on quality index
    if targeted_quality_index == 'corr_coef':
        ax.set_xlabel("Pearson's r\n(Correlation Coefficient)")
        ax.set_xlim(0, 1.0)
    elif targeted_quality_index == 'norm_dtw':
        ax.set_xlabel("Normalized DTW Distance")
    elif targeted_quality_index == 'dtw_ratio':
        ax.set_xlabel("DTW Ratio")
    elif targeted_quality_index == 'perc_diag':
        ax.set_xlabel("Percentage Diagonal (%)")
    
    # Both the density=True histogram and the normal pdf are densities, not percentages
    ax.set_ylabel('Probability Density')
    ax.set_title(f'Combined {targeted_quality_index.replace("_", " ").title()} Distribution from All {len(df_fit_params)} Iterations\n[Synthetic Core Analysis - Null Hypothesis]')
    ax.grid(True, alpha=0.3)
    ax.legend()

    plt.tight_layout()
    plt.savefig(f'outputs/combined_synthetic_distribution_{targeted_quality_index}.png', dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop passes
    plt.close(fig)

    # Print comprehensive summary
    print(f"\nCombined Distribution Summary for {targeted_quality_index}:")
    print(f"{'='*50}")
    print(f"Total data points: {len(combined_data):,}")
    print(f"Number of iterations combined: {len(df_fit_params)}")
    print(f"Combined Mean: {combined_mean:.4f}")
    print(f"Combined Median: {combined_median:.4f}")
    print(f"Combined Std Dev: {combined_std:.4f}")
    print(f"Data Range: {combined_data.min():.4f} to {combined_data.max():.4f}")
    print(f"\nFitted Normal Distribution:")
    print(f"Fitted Mean (μ): {fitted_mean:.4f}")
    print(f"Fitted Std Dev (σ): {fitted_std:.4f}")

    # Calculate some percentiles for reference
    percentiles = [5, 25, 50, 75, 95]
    pct_values = np.percentile(combined_data, percentiles)
    print(f"\nPercentiles:")
    for pct, val in zip(percentiles, pct_values):
        print(f"{pct}th percentile: {val:.4f}")