In [None]:
# =============================================================================
# NULL HYPOTHESIS TESTING FOR pyCoreRelator
# =============================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tempfile
from IPython.display import Image as IPImage, display

# Import pyCoreRelator null hypothesis functions
from pyCoreRelator import (
    create_segment_pool_from_available_cores,
    generate_synthetic_core_pair,
    compute_pycorerelator_null_hypothesis,
    plot_correlation_distribution
)

# Silence every warning category so the cell outputs below stay readable.
warnings.filterwarnings('ignore')

# Banner: what the notebook does and which variables it expects to find.
print(
    "=== NULL HYPOTHESIS TESTING FOR pyCoreRelator ===",
    "This notebook implements null hypothesis testing for DTW correlation significance",
    "Required inputs from main analysis: log_a, log_b, md_a, md_b, segments, boundaries, dtw_results",
    "Setup complete!",
    sep="\n",
)

In [None]:
# =============================================================================
# CELL 2: LOAD REQUIRED DATA FROM MAIN ANALYSIS
# =============================================================================

from pyCoreRelator import (
    load_log_data,
    run_comprehensive_dtw_analysis,
    calculate_interpolated_ages
)

print("=== LOADING REQUIRED DATA FROM MAIN ANALYSIS ===")

# Names that a previous run of the main analysis is expected to have left
# in the kernel namespace.
required_vars = [
    'log_a',
    'log_b',
    'md_a',
    'md_b',
    'segments_a',
    'segments_b',
    'depth_boundaries_a',
    'depth_boundaries_b',
    'dtw_results',
    'valid_dtw_pairs',
    'CORE_A',
    'CORE_B',
]

# Every expected name that is not defined yet, in declaration order.
missing_vars = [name for name in required_vars if name not in globals()]

def _load_picked_depths(core_name):
    """Load the picked depths of one core, or return None when unavailable.

    Reads ``pickeddepth/<core_name>_pickeddepth.csv`` and returns its
    ``picked_depths_cm`` column as a plain list.

    Only the failures that mean "no usable picked-depth file" are handled:
    a missing/unreadable file (OSError), a missing column (KeyError) and a
    malformed or empty CSV (pandas' parser errors derive from ValueError).
    Those are reported and yield ``None``; anything else, including
    KeyboardInterrupt, propagates instead of being silently swallowed.
    """
    picked_path = f'pickeddepth/{core_name}_pickeddepth.csv'
    try:
        picked_depths = pd.read_csv(picked_path)['picked_depths_cm'].values.tolist()
    except (OSError, KeyError, ValueError) as exc:
        print(f"⚠️  No picked depths found for {core_name}")
        # Show the actual reason so a bad path and a bad file can be told apart.
        print(f"    ({type(exc).__name__}: {exc})")
        return None
    print(f"✓ Loaded {len(picked_depths)} picked depths for {core_name}")
    return picked_depths


if missing_vars:
    print(f"Missing required variables: {missing_vars}")
    print("Loading data from scratch...")

    # =================================================================
    # STANDALONE DATA LOADING (modify paths and parameters as needed)
    # =================================================================

    # Define core names
    CORE_A = "M9907-23PC"  # Modify these
    CORE_B = "M9907-25PC"

    # Define log columns and paths
    LOG_COLUMNS = ['hiresMS']  # Modify as needed
    DEPTH_COLUMN = 'SB_DEPTH_cm'

    # Define directory paths (MODIFY THESE PATHS)
    mother_dir = '/path/to/your/data/'  # UPDATE THIS PATH

    # Core A paths
    core_a_log_paths = {
        'hiresMS': f'{mother_dir}_compiled_logs/{CORE_A}/ML_filled/{CORE_A}_hiresMS_MLfilled.csv',
        # Add more log types as needed
    }

    # Core B paths
    core_b_log_paths = {
        'hiresMS': f'{mother_dir}_compiled_logs/{CORE_B}/ML_filled/{CORE_B}_hiresMS_MLfilled.csv',
        # Add more log types as needed
    }

    # Column alternatives
    column_alternatives = {
        'hiresMS': ['MS'],
        'CT': ['CT_value'],
        # Add more alternatives as needed
    }

    try:
        # Load data for Core A
        print(f"Loading {CORE_A}...")
        log_a, md_a, available_columns_a, _, _ = load_log_data(
            core_a_log_paths,
            {},  # No images for null hypothesis
            LOG_COLUMNS,
            depth_column=DEPTH_COLUMN,
            normalize=True,
            column_alternatives=column_alternatives
        )

        # Load data for Core B
        print(f"Loading {CORE_B}...")
        log_b, md_b, available_columns_b, _, _ = load_log_data(
            core_b_log_paths,
            {},  # No images for null hypothesis
            LOG_COLUMNS,
            depth_column=DEPTH_COLUMN,
            normalize=True,
            column_alternatives=column_alternatives
        )

        print("✓ Log data loaded successfully")

        # Load picked depths (modify the path inside _load_picked_depths as
        # needed); None is passed on to the DTW analysis when a file is absent.
        picked_depths_a = _load_picked_depths(CORE_A)
        picked_depths_b = _load_picked_depths(CORE_B)

        # Run DTW analysis to get segments and boundaries
        print("Running DTW analysis...")
        dtw_results, valid_dtw_pairs, segments_a, segments_b, depth_boundaries_a, depth_boundaries_b, _ = run_comprehensive_dtw_analysis(
            log_a, log_b, md_a, md_b,
            picked_depths_a=picked_depths_a,
            picked_depths_b=picked_depths_b,
            top_bottom=True,
            top_depth=0.0,
            independent_dtw=False,
            exclude_deadend=True,
            create_dtw_matrix=False,  # Skip visualization for null hypothesis
            creategif=False,
            age_consideration=False,  # No age constraints for null hypothesis
            debug=False
        )

        print("✓ DTW analysis completed")

    except Exception as e:
        # Point the user at the placeholders above, then re-raise so the
        # notebook stops here rather than continuing with missing data.
        print(f"ERROR loading data: {e}")
        print("Please modify the paths and parameters in this cell")
        raise

else:
    print("✓ All required variables found!")

# Report what the rest of the notebook will work with.
print(
    f"\nFinal status:",
    f"Core A: {CORE_A} - {len(log_a)} data points",
    f"Core B: {CORE_B} - {len(log_b)} data points",
    f"Valid segment pairs: {len(valid_dtw_pairs)}",
    f"Segments A: {len(segments_a)}, Segments B: {len(segments_b)}",
    sep="\n",
)

# Dimensionality of the logs: second axis of log_a when it has one, else 1.
if log_a.ndim > 1:
    target_dimensions = log_a.shape[1]
else:
    target_dimensions = 1
print(f"Log dimensions: {target_dimensions}")

print("Data loading complete!")

In [None]:
# =============================================================================
# CELL 3: CREATE SEGMENT POOL FROM AVAILABLE CORES
# =============================================================================

print("=== CREATING SEGMENT POOL FROM AVAILABLE CORES ===")

# Step 1: Gather, per core, everything the pool builder needs.
core_inputs = [
    # (core name, log data, depth data, depth boundaries, segments)
    (CORE_A, log_a, md_a, depth_boundaries_a, segments_a),
    (CORE_B, log_b, md_b, depth_boundaries_b, segments_b),
    # Add more cores here if available, together with their boundaries:
    # ('CORE_C', log_c, md_c, depth_boundaries_c, segments_c),
]

# Log/depth arrays keyed by core name
all_cores_data = {
    core_name: {'log_data': core_log, 'md_data': core_md}
    for core_name, core_log, core_md, _bounds, _segs in core_inputs
}

# Matching boundaries and segments keyed by core name
all_boundaries_data = {
    core_name: {'depth_boundaries': core_bounds, 'segments': core_segs}
    for core_name, _log, _md, core_bounds, core_segs in core_inputs
}

print(f"Available cores: {list(all_cores_data)}")
print(f"Total cores in pool: {len(all_cores_data)}")

# Step 2: Build the segment pool from those cores
print("\nCreating segment pool from available cores...")
segment_pool = create_segment_pool_from_available_cores(all_cores_data, all_boundaries_data)

print(f"\n=== SEGMENT POOL STATISTICS ===")
print(f"Total segments extracted: {len(segment_pool)}")

if not segment_pool:
    print("ERROR: No segments were extracted!")
else:
    from collections import Counter

    # One list per segment attribute, in pool order
    lengths, depth_spans, dimensions, source_cores = (
        [pool_seg[field] for pool_seg in segment_pool]
        for field in ('length', 'depth_span', 'dimensions', 'source_core')
    )

    print(f"Segment lengths: min={min(lengths)}, max={max(lengths)}, mean={np.mean(lengths):.1f}")
    print(f"Depth spans: min={min(depth_spans):.1f}, max={max(depth_spans):.1f}, mean={np.mean(depth_spans):.1f}")
    print(f"Dimensions: {set(dimensions)}")

    # How many segments each source core contributed
    core_counts = Counter(source_cores)
    print(f"Segments by core: {dict(core_counts)}")

    # Preview the first three segments of the pool
    print(f"\nExample segments:")
    for seg in segment_pool[:3]:
        print(f"  {seg['segment_id']}: {seg['length']} points, span={seg['depth_span']:.1f}, dims={seg['dimensions']}")

print("Segment pool creation complete!")

In [None]:
# =============================================================================
# CELL 4: CONFIGURE TARGET CHARACTERISTICS FOR SYNTHETIC CORES
# =============================================================================

print("=== CONFIGURING TARGET CHARACTERISTICS ===")

# Step 3: Describe the synthetic cores so they mirror the real ones:
# same number of points per core, same number of log dimensions.
if log_a.ndim > 1:
    target_dimensions = log_a.shape[1]
else:
    target_dimensions = 1

core_a_config = dict(target_length=len(log_a), target_dimensions=target_dimensions)
core_b_config = dict(target_length=len(log_b), target_dimensions=target_dimensions)

for core_label, core_config in (("A", core_a_config), ("B", core_b_config)):
    print(f"Target Core {core_label}: {core_config['target_length']} points, {core_config['target_dimensions']} dimensions")

# Keep only the pool segments whose dimensionality matches the target
compatible_segments = []
for pool_segment in segment_pool:
    if pool_segment['dimensions'] == target_dimensions:
        compatible_segments.append(pool_segment)
print(f"\nCompatible segments in pool: {len(compatible_segments)}/{len(segment_pool)}")

if compatible_segments:
    print("✓ Segment pool is compatible with target characteristics")
else:
    print("ERROR: No segments in pool match the target dimensions!")

# Optional: generate one synthetic pair up front as a smoke test. Any failure
# is reported and the notebook carries on.
print("\n=== TESTING SYNTHETIC CORE GENERATION ===")
try:
    print("Generating test synthetic core pair...")
    synthetic_pair = generate_synthetic_core_pair(
        segment_pool,
        core_a_config['target_length'],
        core_b_config['target_length'],
        target_dimensions,
    )
    (test_log_a, test_log_b,
     test_md_a, test_md_b,
     test_bounds_a, test_bounds_b) = synthetic_pair

    print(f"✓ Test successful!")
    for core_label, synthetic_log in (("A", test_log_a), ("B", test_log_b)):
        print(f"  Generated Core {core_label}: {len(synthetic_log)} points, shape={synthetic_log.shape}")
    for core_label, synthetic_bounds in (("A", test_bounds_a), ("B", test_bounds_b)):
        print(f"  Boundaries {core_label}: {len(synthetic_bounds)} boundaries")

except Exception as e:
    print(f"ERROR in synthetic core generation: {e}")

print("Configuration complete!")

In [None]:
# =============================================================================
# CELL 5: CONFIGURE NULL HYPOTHESIS PARAMETERS
# =============================================================================

print("=== CONFIGURING NULL HYPOTHESIS PARAMETERS ===")

# The settings below drive the null-hypothesis run; keep them in step with
# the parameters used for the real DTW analysis.

# Number of synthetic core pairs to evaluate. Adjust to the available
# compute; recommended: 1000 for testing, 10000+ for publication.
n_iterations = 1000

# DTW parameters (should match the main analysis)
exponent = 0.3                 # DTW exponent parameter
dtw_distance_threshold = None  # set to the main-analysis value if one was used

# Show a progress bar while the distribution is computed
progress_bar = True

print(
    f"Null hypothesis configuration:",
    f"  Iterations: {n_iterations}",
    f"  DTW exponent: {exponent}",
    f"  DTW distance threshold: {dtw_distance_threshold}",
    f"  Progress bar: {progress_bar}",
    sep="\n",
)

# Rough runtime estimate, based on a guessed 0.5 s per iteration
segments_per_core = len(segment_pool) / len(all_cores_data)
estimated_time_per_iteration = 0.5  # seconds (rough estimate)
estimated_total_time = estimated_time_per_iteration * n_iterations

print(f"\nEstimated computation time: {estimated_total_time/60:.1f} minutes")
print(f"Average segments per core: {segments_per_core:.1f}")

if n_iterations > 5000:
    print(
        "WARNING: Large number of iterations may take significant time",
        "Consider reducing n_iterations for initial testing",
        sep="\n",
    )

print("Parameter configuration complete!")

In [None]:
# =============================================================================
# CELL 6: COMPUTE NULL HYPOTHESIS DISTRIBUTION
# =============================================================================

print("=== COMPUTING NULL HYPOTHESIS DISTRIBUTION ===")
print("This may take several minutes depending on n_iterations...")
print(f"Processing {n_iterations} synthetic core pairs...")

# Step 4: Run the null-hypothesis computation with the configured settings
null_run_settings = dict(
    segment_pool=segment_pool,
    core_a_config=core_a_config,
    core_b_config=core_b_config,
    n_iterations=n_iterations,
    exponent=exponent,
    dtw_distance_threshold=dtw_distance_threshold,
    progress_bar=progress_bar,
)
null_hypothesis_results = compute_pycorerelator_null_hypothesis(**null_run_settings)

# Step 5: Pull out the r-value distribution and its summary statistics
r_values_null = null_hypothesis_results['r_values_distribution']
distribution_stats = null_hypothesis_results['distribution_stats']

print(f"\n=== NULL HYPOTHESIS RESULTS ===")
print(f"Successful iterations: {null_hypothesis_results['successful_iterations']}")
print(f"Failed iterations: {null_hypothesis_results['failed_iterations']}")
# Share of the requested iterations that produced a result, in percent
success_rate = null_hypothesis_results['successful_iterations'] / n_iterations * 100
print(f"Success rate: {success_rate:.1f}%")

if len(r_values_null) == 0:
    print("ERROR: No successful iterations - cannot create null hypothesis distribution")
else:
    print(f"\nDistribution statistics:")
    print(f"  Mean r-value: {distribution_stats['mean']:.4f} ± {distribution_stats['std']:.4f}")
    print(f"  Median r-value: {distribution_stats['median']:.4f}")
    print(f"  Range: [{distribution_stats['min']:.4f}, {distribution_stats['max']:.4f}]")
    for pct_label, pct_key in (
        ("95th", 'percentile_95'),
        ("97.5th", 'percentile_97_5'),
        ("99th", 'percentile_99'),
    ):
        print(f"  {pct_label} percentile: {distribution_stats[pct_key]:.4f}")

    # Quick look: density histogram with the mean and two upper percentiles
    plt.figure(figsize=(10, 6))
    plt.hist(r_values_null, bins=50, alpha=0.7, density=True, color='skyblue', edgecolor='black')
    for stat_key, line_color, line_style, legend_prefix in (
        ('mean', 'red', '-', 'Mean'),
        ('percentile_95', 'orange', '--', '95th %ile'),
        ('percentile_97_5', 'green', '--', '97.5th %ile'),
    ):
        plt.axvline(
            distribution_stats[stat_key],
            color=line_color,
            linestyle=line_style,
            linewidth=2,
            label=f"{legend_prefix}: {distribution_stats[stat_key]:.3f}",
        )
    plt.xlabel('Correlation Coefficient (r)')
    plt.ylabel('Density')
    plt.title(f'Null Hypothesis Distribution\n{CORE_A} vs {CORE_B} (n={len(r_values_null)})')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

print("Null hypothesis computation complete!")

In [None]:
# =============================================================================
# CELL 7: EXTRACT ACTUAL CORRELATIONS FROM DTW ANALYSIS
# =============================================================================

print("=== EXTRACTING ACTUAL CORRELATIONS ===")

# Step 6: Collect the correlation coefficient of every valid segment pair.
# actual_correlations maps a display name to r; segment_correlations keeps
# the same r-values as a flat list.
actual_correlations = {}
segment_correlations = []

for pair_key in valid_dtw_pairs:
    if pair_key not in dtw_results:
        continue

    # Each DTW result is a 3-tuple; only its third item is used here.
    _pair_paths, _unused, quality_metrics = dtw_results[pair_key]
    if not (quality_metrics and len(quality_metrics) > 0):
        continue

    qi = quality_metrics[0]  # first quality indicator entry
    if 'corr_coef' not in qi or np.isnan(qi['corr_coef']):
        continue

    # Display name uses 1-based segment numbers
    pair_name = f"Seg_{pair_key[0]+1}-{pair_key[1]+1}"
    actual_correlations[pair_name] = qi['corr_coef']
    segment_correlations.append(qi['corr_coef'])

print(f"Found {len(actual_correlations)} valid segment correlations")

if not actual_correlations:
    print("WARNING: No valid correlations found in DTW results")
    print("This may indicate issues with the DTW analysis or segment pairs")
else:
    # Overall correlation = mean of the per-segment correlations; it is
    # stored under the reserved key 'Overall'.
    overall_actual_correlation = np.mean(list(actual_correlations.values()))
    actual_correlations['Overall'] = overall_actual_correlation

    print(f"Segment correlations:")
    for pair_name, r_value in actual_correlations.items():
        if pair_name == 'Overall':
            continue
        print(f"  {pair_name}: r = {r_value:.4f}")

    print(f"\nOverall correlation: {overall_actual_correlation:.4f}")
    print(f"Range of segment correlations: [{min(segment_correlations):.4f}, {max(segment_correlations):.4f}]")
    print(f"Standard deviation: {np.std(segment_correlations):.4f}")

print("Actual correlation extraction complete!")

In [None]:
# =============================================================================
# CELL 8: PLOT NULL HYPOTHESIS DISTRIBUTION WITH ACTUAL CORRELATIONS
# =============================================================================

print("=== PLOTTING NULL HYPOTHESIS DISTRIBUTION ===")

# Float ndarray view of the null distribution, so the elementwise comparisons
# below work even if r_values_null is a plain list.
null_r_array = np.asarray(r_values_null, dtype=float)

temp_csv_path = None
try:
    # Step 7: Create temporary CSV for plot_correlation_distribution compatibility.
    # The file is written through the handle NamedTemporaryFile returns (no second
    # open by name) and is created inside the try, so the finally clause removes
    # it even when writing or plotting fails.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp_csv:
        temp_csv_path = temp_csv.name
        temp_csv.write("mapping_id,corr_coef\n")
        for i, r_val in enumerate(r_values_null):
            temp_csv.write(f"{i},{r_val}\n")

    # Step 8: Plot null hypothesis distribution with actual correlations
    # NOTE(review): with save_png=True the PNG is presumably written inside
    # plot_correlation_distribution, i.e. before the overlays below are drawn.
    # Confirm whether the saved image is meant to include them.
    fig, ax, fit_params = plot_correlation_distribution(
        csv_file=temp_csv_path,
        quality_index='corr_coef',
        save_png=True,
        png_filename=f'Null_Hypothesis_Distribution_{CORE_A}_{CORE_B}.png',
        core_a_name=CORE_A,
        core_b_name=CORE_B,
        pdf_method='KDE',
        kde_bandwidth=0.05
    )

    # Add actual correlations as vertical lines
    if actual_correlations:
        colors = ['red', 'orange', 'blue', 'green', 'purple']
        color_idx = 0

        for pair_name, r_value in actual_correlations.items():
            if np.isnan(r_value):
                continue

            # One-sided empirical p-value: share of null r-values >= observed r
            p_value = (null_r_array >= r_value).mean()

            # 'Overall' gets a heavy solid red line; segment pairs cycle
            # through the palette with lighter dashed lines.
            if pair_name == 'Overall':
                color = 'red'
                linestyle = 'solid'
                linewidth = 3
                alpha = 0.9
            else:
                color = colors[color_idx % len(colors)]
                linestyle = 'dashed'
                linewidth = 2
                alpha = 0.7
                color_idx += 1

            # Determine significance marker
            if p_value < 0.001:
                sig_marker = '***'
            elif p_value < 0.01:
                sig_marker = '**'
            elif p_value < 0.05:
                sig_marker = '*'
            else:
                sig_marker = ''

            # Plot vertical line
            ax.axvline(r_value, color=color, linestyle=linestyle, alpha=alpha, linewidth=linewidth)

            # Add text label. Segment labels step down by 5% of the axis
            # height each; the fraction is floored at 0.05 so labels stay
            # inside the axes when there are many segment pairs.
            if pair_name == 'Overall':
                label_height = 0.85
            else:
                label_height = max(0.05, 0.75 - color_idx * 0.05)
            y_pos = ax.get_ylim()[1] * label_height
            ax.text(r_value, y_pos, f'{pair_name}\nr={r_value:.3f}\np={p_value:.3f}{sig_marker}',
                    rotation=90, ha='right', va='top', fontsize=8,
                    bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

        # Legend explaining the significance markers used in the labels
        legend_elements = [
            plt.Line2D([0], [0], color='black', linestyle='-', label='*** p < 0.001'),
            plt.Line2D([0], [0], color='black', linestyle='-', label='** p < 0.01'),
            plt.Line2D([0], [0], color='black', linestyle='-', label='* p < 0.05')
        ]
        ax.legend(handles=legend_elements, loc='upper right', fontsize=8)

        plt.tight_layout()
        plt.show()

    else:
        print("No actual correlations to overlay on the plot")

finally:
    # Clean up temporary file (if it was created at all)
    if temp_csv_path is not None and os.path.exists(temp_csv_path):
        os.unlink(temp_csv_path)

print("Null hypothesis distribution plotting complete!")

In [None]:
# =============================================================================
# CELL 9: SIGNIFICANCE TESTING RESULTS AND INTERPRETATION
# =============================================================================

print("=== SIGNIFICANCE TESTING RESULTS ===")

if len(r_values_null) > 0 and actual_correlations:

    # Print detailed significance results
    print(f"Null hypothesis distribution (n={len(r_values_null)}):")
    print(f"  Mean: {distribution_stats['mean']:.4f} ± {distribution_stats['std']:.4f}")
    print(f"  Significance thresholds:")
    print(f"    p < 0.05: r > {distribution_stats['percentile_95']:.4f}")
    print(f"    p < 0.025: r > {distribution_stats['percentile_97_5']:.4f}")
    print(f"    p < 0.01: r > {distribution_stats['percentile_99']:.4f}")

    print(f"\nActual correlations vs null hypothesis:")
    print(f"{'Segment':<12} {'r-value':<8} {'Percentile':<10} {'p-value':<8} {'Significance':<12}")
    print("-" * 60)

    # Both lists hold every tested name, including the aggregate 'Overall'
    # entry; it is filtered out where per-segment counts are reported.
    significant_segments = []
    non_significant_segments = []

    for pair_name, r_value in actual_correlations.items():
        if np.isnan(r_value):
            continue

        # Percentile: share of null r-values strictly below the observed r.
        # p-value: one-sided share of null r-values at or above it.
        percentile = (r_values_null < r_value).mean() * 100
        p_value = (r_values_null >= r_value).mean()

        if p_value < 0.001:
            significance = "***"
            status = "Highly Sig."
        elif p_value < 0.01:
            significance = "**"
            status = "Very Sig."
        elif p_value < 0.05:
            significance = "*"
            status = "Significant"
        else:
            significance = ""
            status = "Not Sig."

        print(f"{pair_name:<12} {r_value:<8.4f} {percentile:<10.1f} {p_value:<8.4f} {status:<12} {significance}")

        if p_value < 0.05:
            significant_segments.append(pair_name)
        else:
            non_significant_segments.append(pair_name)

    # Summary. 'Overall' is an aggregate, not a segment pair, so it is left
    # out of all three counts; that keeps significant + non-significant
    # consistent with the total.
    total_segments = len(actual_correlations) - (1 if 'Overall' in actual_correlations else 0)
    significant_pairs_only = [s for s in significant_segments if s != 'Overall']
    non_significant_pairs_only = [s for s in non_significant_segments if s != 'Overall']
    print(f"\n=== SUMMARY ===")
    print(f"Total segment correlations tested: {total_segments}")
    print(f"Significant correlations (p < 0.05): {len(significant_pairs_only)}")
    print(f"Non-significant correlations: {len(non_significant_pairs_only)}")

    if 'Overall' in actual_correlations:
        overall_p = (r_values_null >= actual_correlations['Overall']).mean()
        overall_sig = "SIGNIFICANT" if overall_p < 0.05 else "NOT SIGNIFICANT"
        print(f"Overall correlation: {overall_sig} (p = {overall_p:.4f})")

        # Effect size interpretation from |r| using fixed cut-offs
        # (0.7 / 0.3 / 0.1)
        r_val = actual_correlations['Overall']
        if abs(r_val) >= 0.7:
            effect_size = "Large"
        elif abs(r_val) >= 0.3:
            effect_size = "Medium"
        elif abs(r_val) >= 0.1:
            effect_size = "Small"
        else:
            effect_size = "Negligible"

        print(f"Effect size: {effect_size} (|r| = {abs(r_val):.3f})")

else:
    print("ERROR: Cannot perform significance testing")
    print("Missing null hypothesis distribution or actual correlations")

print("\nSignificance testing complete!")

In [None]:
# =============================================================================
# CELL 10: SAVE RESULTS AND SUMMARY
# =============================================================================

print("=== SAVING RESULTS ===")

# Step 9: Persist the null-distribution r-values, one row per kept iteration
# (columns: a running 0-based index and the r-value), for later reuse.
results_filename = f'Null_Hypothesis_Results_{CORE_A}_{CORE_B}.csv'
null_df = pd.DataFrame(
    dict(
        iteration=range(len(r_values_null)),
        r_value=r_values_null,
    )
)
null_df.to_csv(results_filename, index=False)
print(f"Null hypothesis r-values saved to: {results_filename}")

# Save summary statistics as a human-readable text report
summary_filename = f'Null_Hypothesis_Summary_{CORE_A}_{CORE_B}.txt'
n_succeeded = null_hypothesis_results['successful_iterations']

# The report contains the non-ASCII '±' sign, so the encoding is pinned to
# UTF-8 rather than left to the platform/locale default.
with open(summary_filename, 'w', encoding='utf-8') as f:
    f.write(f"=== NULL HYPOTHESIS TESTING SUMMARY ===\n")
    f.write(f"Cores: {CORE_A} vs {CORE_B}\n")
    f.write(f"Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    f.write(f"ANALYSIS PARAMETERS:\n")
    f.write(f"  Iterations: {n_iterations}\n")
    f.write(f"  Success rate: {n_succeeded}/{n_iterations} ({n_succeeded/n_iterations*100:.1f}%)\n")
    f.write(f"  DTW exponent: {exponent}\n")
    f.write(f"  Segment pool size: {len(segment_pool)}\n\n")

    # Distribution section is only written when at least one iteration succeeded
    if len(r_values_null) > 0:
        f.write(f"NULL HYPOTHESIS DISTRIBUTION:\n")
        f.write(f"  Mean: {distribution_stats['mean']:.4f} ± {distribution_stats['std']:.4f}\n")
        f.write(f"  Median: {distribution_stats['median']:.4f}\n")
        f.write(f"  Range: [{distribution_stats['min']:.4f}, {distribution_stats['max']:.4f}]\n")
        f.write(f"  95th percentile: {distribution_stats['percentile_95']:.4f}\n")
        f.write(f"  97.5th percentile: {distribution_stats['percentile_97_5']:.4f}\n")
        f.write(f"  99th percentile: {distribution_stats['percentile_99']:.4f}\n\n")

    if actual_correlations:
        f.write(f"ACTUAL CORRELATIONS:\n")
        for pair_name, r_value in actual_correlations.items():
            if np.isnan(r_value):
                continue
            percentile = (r_values_null < r_value).mean() * 100
            # One-sided empirical p-value against the null distribution
            p_value = (r_values_null >= r_value).mean()
            significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else ""
            f.write(f"  {pair_name}: r = {r_value:.4f}, p = {p_value:.4f} {significance}\n")

print(f"Summary saved to: {summary_filename}")

# Save metadata
import json


def _to_json_safe(value):
    """Fallback for json.dumps: turn NumPy scalars/arrays into built-in types.

    json only calls this for objects it cannot serialise itself. NumPy
    scalars become the equivalent Python scalar and arrays become nested
    lists; anything else still raises TypeError, as json would by default.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


metadata_filename = f'Null_Hypothesis_Metadata_{CORE_A}_{CORE_B}.json'

metadata = {
    'analysis_date': pd.Timestamp.now().isoformat(),
    'cores': [CORE_A, CORE_B],
    'parameters': {
        'n_iterations': n_iterations,
        'exponent': exponent,
        'dtw_distance_threshold': dtw_distance_threshold,
        'target_dimensions': target_dimensions
    },
    'results': {
        'successful_iterations': null_hypothesis_results['successful_iterations'],
        'failed_iterations': null_hypothesis_results['failed_iterations'],
        # Stats are only meaningful when at least one iteration succeeded
        'distribution_stats': distribution_stats if len(r_values_null) > 0 else None
    },
    'segment_pool': {
        'total_segments': len(segment_pool),
        'cores_included': list(all_cores_data.keys())
    }
}

# Serialise before opening the file, so a serialisation error cannot leave a
# truncated JSON file behind. The counters and stats come from the
# null-hypothesis helper and may be NumPy values, hence the default= hook.
metadata_json = json.dumps(metadata, indent=2, default=_to_json_safe)
with open(metadata_filename, 'w') as f:
    f.write(metadata_json)

print(f"Metadata saved to: {metadata_filename}")

# Closing recap of the run and of the files written by this notebook
n_null_values = len(r_values_null)

print(f"\n=== FINAL SUMMARY ===")
print(f"✓ Null hypothesis testing completed for {CORE_A} vs {CORE_B}")
print(f"✓ {n_null_values} successful iterations out of {n_iterations}")
if n_null_values > 0:
    print(
        f"✓ Null hypothesis mean: {distribution_stats['mean']:.4f} ± {distribution_stats['std']:.4f}",
        f"✓ Significance threshold (p<0.05): r > {distribution_stats['percentile_95']:.4f}",
        sep="\n",
    )

if actual_correlations and 'Overall' in actual_correlations:
    # One-sided empirical p-value of the overall correlation
    overall_p = (r_values_null >= actual_correlations['Overall']).mean()
    print(f"✓ Overall correlation: r = {actual_correlations['Overall']:.4f}, p = {overall_p:.4f}")

print(f"\nFiles saved:")
for saved_file in (
    results_filename,
    summary_filename,
    metadata_filename,
    f"Null_Hypothesis_Distribution_{CORE_A}_{CORE_B}.png",
):
    print(f"  - {saved_file}")

print("\n=== NULL HYPOTHESIS TESTING COMPLETE ===")

In [None]:
# =============================================================================
# CELL 11: OPTIONAL ANALYSIS AND TROUBLESHOOTING
# =============================================================================

print("=== OPTIONAL ANALYSIS AND TROUBLESHOOTING ===")

# Everything in this cell is optional: diagnostics and troubleshooting aids.
# Run the sections that are relevant to your results.

# =============================================================================
# OPTION 1: ANALYZE FAILED ITERATIONS
# =============================================================================
n_failed_iterations = null_hypothesis_results['failed_iterations']
if n_failed_iterations > 0:
    print(f"\n--- ANALYZING FAILED ITERATIONS ---")
    # Failed share of the requested iterations, in percent
    failure_rate = n_failed_iterations / n_iterations * 100
    print(f"Failure rate: {failure_rate:.1f}% ({n_failed_iterations}/{n_iterations})")

    if failure_rate > 20:
        print(
            "WARNING: High failure rate may indicate:",
            "  - Insufficient segments in pool",
            "  - Incompatible segment dimensions",
            "  - DTW analysis parameter issues",
            "  - Target core lengths too large for available segments",
            sep="\n",
        )

# =============================================================================
# OPTION 2: COMPARE WITH DIFFERENT AGGREGATION METHODS
# =============================================================================
if len(r_values_null) > 0 and actual_correlations:
    print(f"\n--- TESTING DIFFERENT AGGREGATION METHODS ---")

    # Per-segment r-values only: drop the 'Overall' aggregate and any NaN
    segment_r_values = []
    for name, r in actual_correlations.items():
        if name != 'Overall' and not np.isnan(r):
            segment_r_values.append(r)

    if len(segment_r_values) > 1:
        # Alternative ways to collapse the segment correlations into one
        # number; RMS is the root of the mean squared r.
        aggregation_methods = dict(
            Mean=np.mean(segment_r_values),
            Median=np.median(segment_r_values),
            Max=np.max(segment_r_values),
            Min=np.min(segment_r_values),
            RMS=np.sqrt(np.mean(np.array(segment_r_values)**2)),
        )

        print("Different aggregation methods:")
        for method, value in aggregation_methods.items():
            percentile = (r_values_null < value).mean() * 100
            # One-sided empirical p-value of the aggregated r
            p_value = (r_values_null >= value).mean()
            if p_value < 0.05:
                sig = "*"
            else:
                sig = ""
            print(f"  {method}: r = {value:.4f}, p = {p_value:.4f} {sig}")

# =============================================================================
# OPTION 3: DISTRIBUTION DIAGNOSTICS
# =============================================================================
if len(r_values_null) > 0:
    from scipy.stats import shapiro, anderson

    print(f"\n--- DISTRIBUTION DIAGNOSTICS ---")

    # Normality check 1: Shapiro-Wilk, only run for up to 5000 values
    if len(r_values_null) <= 5000:
        shapiro_result = shapiro(r_values_null)
        stat, p_val = shapiro_result
        looks_normal = 'Yes' if p_val > 0.05 else 'No'
        print(f"Shapiro-Wilk normality test: p = {p_val:.4f}")
        print(f"  Normal distribution: {looks_normal}")

    # Normality check 2: Anderson-Darling against a normal distribution
    ad_result = anderson(r_values_null, dist='norm')
    print(f"Anderson-Darling test statistic: {ad_result.statistic:.4f}")

    # Outliers: values more than 1.5 * IQR outside the quartiles
    q1, q3 = np.percentile(r_values_null, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    too_low = r_values_null < lower_bound
    too_high = r_values_null > upper_bound
    outliers = r_values_null[too_low | too_high]
    print(f"Outliers (IQR method): {len(outliers)} ({len(outliers)/len(r_values_null)*100:.1f}%)")

# =============================================================================
# OPTION 4: SEGMENT POOL ANALYSIS
# =============================================================================
if segment_pool:
    print(f"\n--- SEGMENT POOL ANALYSIS ---")

    # Per-segment point counts and depth spans, in pool order
    lengths = []
    depth_spans = []
    for pool_entry in segment_pool:
        lengths.append(pool_entry['length'])
    for pool_entry in segment_pool:
        depth_spans.append(pool_entry['depth_span'])

    print(
        f"Segment length distribution:",
        f"  Mean: {np.mean(lengths):.1f} ± {np.std(lengths):.1f}",
        f"  Median: {np.median(lengths):.1f}",
        f"  Quartiles: Q1={np.percentile(lengths, 25):.1f}, Q3={np.percentile(lengths, 75):.1f}",
        sep="\n",
    )

    print(
        f"Depth span distribution:",
        f"  Mean: {np.mean(depth_spans):.1f} ± {np.std(depth_spans):.1f}",
        f"  Median: {np.median(depth_spans):.1f}",
        sep="\n",
    )

    # Diversity proxy: number of distinct segment lengths in the pool
    unique_lengths = len(set(lengths))
    print(f"Segment diversity: {unique_lengths} unique lengths from {len(segment_pool)} segments")

# =============================================================================
# OPTION 5: POWER ANALYSIS
# =============================================================================
if len(r_values_null) > 0:
    # Imported here (like the other optional sections do) so this section
    # runs on its own. The original referenced `stats.norm` without any
    # `stats` import in the notebook, which raised NameError.
    from scipy.stats import norm

    print(f"\n--- POWER ANALYSIS ---")

    # Estimate power to detect different effect sizes
    null_std = np.std(r_values_null)
    null_mean = np.mean(r_values_null)
    alpha = 0.05
    # One-sided rejection threshold: 95th percentile of the null, i.e. alpha = 0.05
    threshold = np.percentile(r_values_null, 95)

    effect_sizes = [0.1, 0.3, 0.5, 0.7]
    print("Power to detect effect sizes (assuming normal distribution):")

    for effect_size in effect_sizes:
        # Alternative hypothesis: the null distribution shifted up by the
        # effect size, with the same standard deviation.
        alt_mean = null_mean + effect_size
        # Power = probability that a draw from the shifted normal exceeds
        # the rejection threshold.
        z_score = (threshold - alt_mean) / null_std
        power = 1 - norm.cdf(z_score)
        print(f"  Effect size r = {effect_size}: Power ≈ {power:.3f}")

# =============================================================================
# OPTION 6: RECOMMENDATIONS
# =============================================================================
print(f"\n--- RECOMMENDATIONS ---")

# Health check of the run itself: the first matching condition wins.
if len(r_values_null) == 0:
    print("⚠️  No successful iterations - Check segment pool and parameters")
elif null_hypothesis_results['failed_iterations'] / n_iterations > 0.5:
    print(
        "⚠️  High failure rate - Consider:",
        "   - Increasing segment pool size",
        "   - Reducing target core lengths",
        "   - Checking DTW parameters",
        sep="\n",
    )
elif len(r_values_null) < 1000:
    print("⚠️  Small sample size - Consider increasing n_iterations for more reliable results")
else:
    print("✓ Analysis appears successful")

# Strength of evidence for the overall correlation (0 is used when there is
# no 'Overall' entry).
if actual_correlations and len(r_values_null) > 0:
    overall_p = (r_values_null >= actual_correlations.get('Overall', 0)).mean()
    evidence_levels = (
        (0.001, "✓ Very strong evidence against null hypothesis"),
        (0.01, "✓ Strong evidence against null hypothesis"),
        (0.05, "✓ Moderate evidence against null hypothesis"),
    )
    for p_cutoff, evidence_message in evidence_levels:
        if overall_p < p_cutoff:
            print(evidence_message)
            break
    else:
        print("⚠️  Weak evidence against null hypothesis - correlation may not be significant")

print("\nOptional analysis complete!")