In [None]:
# Cell 1: Import Required Packages
# Scientific stack plus the stdlib helpers the later cells rely on
# (os for file-existence checks and cleanup, tempfile for the scratch CSV
# that is handed to the plotting helper).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tempfile
from tqdm import tqdm
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the analysis
# code itself are hidden too - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Import pyCoreRelator functions
from pyCoreRelator import (
    create_segment_pool_from_available_cores,
    generate_synthetic_core_pair,
    compute_pycorerelator_null_hypothesis,
    plot_correlation_distribution,
    run_comprehensive_dtw_analysis,
    load_log_data
)

# Simple confirmation that every import above resolved.
print("Packages imported successfully")

In [None]:
# Cell 2: Load Core Data
# Define core names and log columns
CORE_A = "M9907-23PC"
CORE_B = "M9907-25PC"
LOG_COLUMNS = ['hiresMS']  # Single log for simplicity
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Root directory of the dataset.
# The original author's local path is kept as the default so existing runs are
# unaffected; set the PYCORERELATOR_DATA_DIR environment variable to point the
# notebook at a different checkout without editing this cell.
mother_dir = os.environ.get(
    'PYCORERELATOR_DATA_DIR',
    '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/OSU_dataset/'
)


def _hires_ms_log_paths(core_name):
    """Return the {log name: CSV path} mapping for one core's ML-filled hiresMS log.

    os.path.join is used instead of plain string concatenation so the result is
    correct whether or not ``mother_dir`` ends with a path separator.
    """
    return {
        'hiresMS': os.path.join(
            mother_dir,
            f'_compiled_logs/{core_name}/ML_filled/{core_name}_hiresMS_MLfilled.csv'
        )
    }


# Define paths for Core A and Core B
core_a_log_paths = _hires_ms_log_paths(CORE_A)
core_b_log_paths = _hires_ms_log_paths(CORE_B)

# Load data for both cores. The last two return values are discarded here;
# the empty dict is the image-path argument (no images needed).
log_a, md_a, available_columns_a, _, _ = load_log_data(
    core_a_log_paths,
    {},  # No images needed
    LOG_COLUMNS,
    depth_column=DEPTH_COLUMN,
    normalize=True
)

log_b, md_b, available_columns_b, _, _ = load_log_data(
    core_b_log_paths,
    {},  # No images needed
    LOG_COLUMNS,
    depth_column=DEPTH_COLUMN,
    normalize=True
)

print(f"Loaded {CORE_A}: {len(log_a)} points, columns: {available_columns_a}")
print(f"Loaded {CORE_B}: {len(log_b)} points, columns: {available_columns_b}")

In [None]:
# Cell 3: Load Boundaries and Run DTW Analysis to Get Segments
# Load picked depths from CSV files (or define them manually if needed)
pickeddepth_a_csv = f'pickeddepth/{CORE_A}_pickeddepth.csv'
pickeddepth_b_csv = f'pickeddepth/{CORE_B}_pickeddepth.csv'

# Number of evenly spaced boundaries substituted when a picked-depth CSV is missing.
N_DEFAULT_BOUNDARIES = 10

# Extract picked depths (category 1 only for simplicity).
# When the CSV is absent the cell falls back to evenly spaced boundaries from
# 0 to the last depth value; that substitution changes what the downstream
# segments mean, so it is announced explicitly instead of happening silently.
if os.path.exists(pickeddepth_a_csv):
    picked_data_a = pd.read_csv(pickeddepth_a_csv)
    all_depths_a_cat1 = picked_data_a.loc[
        picked_data_a['category'] == 1, 'picked_depths_cm'
    ].values.astype('float32')
else:
    print(f"WARNING: '{pickeddepth_a_csv}' not found; using {N_DEFAULT_BOUNDARIES} "
          f"evenly spaced boundaries for {CORE_A} instead of picked depths")
    all_depths_a_cat1 = np.linspace(0, md_a[-1], N_DEFAULT_BOUNDARIES)

if os.path.exists(pickeddepth_b_csv):
    picked_data_b = pd.read_csv(pickeddepth_b_csv)
    all_depths_b_cat1 = picked_data_b.loc[
        picked_data_b['category'] == 1, 'picked_depths_cm'
    ].values.astype('float32')
else:
    print(f"WARNING: '{pickeddepth_b_csv}' not found; using {N_DEFAULT_BOUNDARIES} "
          f"evenly spaced boundaries for {CORE_B} instead of picked depths")
    all_depths_b_cat1 = np.linspace(0, md_b[-1], N_DEFAULT_BOUNDARIES)

# Run DTW analysis to get segments (no age constraints for null hypothesis).
# Only the segments and depth boundaries are consumed by the later cells; the
# seventh return value is discarded.
print("Running DTW analysis to identify segments...")
(
    dtw_results,
    valid_dtw_pairs,
    segments_a,
    segments_b,
    depth_boundaries_a,
    depth_boundaries_b,
    _,
) = run_comprehensive_dtw_analysis(
    log_a, log_b, md_a, md_b,
    picked_depths_a=all_depths_a_cat1,
    picked_depths_b=all_depths_b_cat1,
    top_bottom=True,
    independent_dtw=False,
    exclude_deadend=True,
    create_dtw_matrix=False,  # Skip visualization for speed
    creategif=False,  # Skip animation for speed
    age_consideration=False,  # No age constraints for null hypothesis
    debug=False
)

print(f"Found {len(valid_dtw_pairs)} valid segment pairs")
print(f"Core A segments: {len(segments_a)}")
print(f"Core B segments: {len(segments_b)}")

In [None]:
# Cell 4: Create Segment Pool from Available Cores
print("=== Creating Segment Pool ===")

# One record per core: (name, log, depths, depth boundaries, segments).
# Both lookup tables below are derived from this single list.
core_records = [
    (CORE_A, log_a, md_a, depth_boundaries_a, segments_a),
    (CORE_B, log_b, md_b, depth_boundaries_b, segments_b),
]

# Per-core log values and their depth axis
all_cores_data = {
    name: {'log_data': log, 'md_data': md}
    for name, log, md, _bounds, _segs in core_records
}

# Per-core boundaries and segments produced by the DTW analysis
all_boundaries_data = {
    name: {'depth_boundaries': bounds, 'segments': segs}
    for name, _log, _md, bounds, segs in core_records
}

# Build the pool of segments that synthetic cores are assembled from
segment_pool = create_segment_pool_from_available_cores(all_cores_data, all_boundaries_data)

print(f"Created segment pool with {len(segment_pool)} segments")

# Summary statistics (skipped when the pool is empty)
if segment_pool:
    lengths = []
    depth_spans = []
    for seg in segment_pool:
        lengths.append(seg['length'])
        depth_spans.append(seg['depth_span'])

    shortest, longest, mean_length = min(lengths), max(lengths), np.mean(lengths)
    print(f"Segment lengths: min={shortest}, max={longest}, mean={mean_length:.1f}")

    narrowest, widest, mean_span = min(depth_spans), max(depth_spans), np.mean(depth_spans)
    print(f"Depth spans: min={narrowest:.1f}, max={widest:.1f}, mean={mean_span:.1f}")

In [None]:
# Cell 5: Compute Null Hypothesis Distribution
print("=== Computing Null Hypothesis Distribution ===")

# Tunable run parameters, kept in one place so the progress messages below
# always agree with what is actually passed to the computation.
N_ITERATIONS = 10000
DTW_EXPONENT = 0.3  # Match typical analysis parameters

# Configure target characteristics for synthetic cores: each synthetic core
# gets the same number of points as its real counterpart, and both share the
# dimensionality of log_a (1 when the log is a 1-D array).
target_dimensions = log_a.shape[1] if log_a.ndim > 1 else 1

core_a_config = {
    'target_length': len(log_a),
    'target_dimensions': target_dimensions
}

core_b_config = {
    'target_length': len(log_b),
    'target_dimensions': target_dimensions
}

print(f"Target Core A: {core_a_config['target_length']} points, {core_a_config['target_dimensions']} dimensions")
print(f"Target Core B: {core_b_config['target_length']} points, {core_b_config['target_dimensions']} dimensions")

# Compute null hypothesis distribution
# NOTE(review): no random seed is set anywhere before this call, so the
# distribution will presumably differ between runs - confirm how the library
# draws its random numbers and seed it if reproducibility is required.
print(f"Computing null hypothesis with {N_ITERATIONS:,} synthetic core pairs...")
print("This may take several minutes...")

null_hypothesis_results = compute_pycorerelator_null_hypothesis(
    segment_pool=segment_pool,
    core_a_config=core_a_config,
    core_b_config=core_b_config,
    n_iterations=N_ITERATIONS,
    exponent=DTW_EXPONENT,
    dtw_distance_threshold=None,
    progress_bar=True
)

# Extract results
r_values_null = null_hypothesis_results['r_values_distribution']
distribution_stats = null_hypothesis_results['distribution_stats']

print("\n=== NULL HYPOTHESIS RESULTS ===")
print(f"Successful iterations: {null_hypothesis_results['successful_iterations']}")
print(f"Failed iterations: {null_hypothesis_results['failed_iterations']}")
# \u00b1 is the plus-minus sign; written as an escape so the source file's
# encoding cannot garble it again.
print(f"Mean r-value: {distribution_stats['mean']:.4f} \u00b1 {distribution_stats['std']:.4f}")
print(f"95th percentile threshold: {distribution_stats['percentile_95']:.4f}")
print(f"97.5th percentile threshold: {distribution_stats['percentile_97_5']:.4f}")
print(f"99th percentile threshold: {distribution_stats['percentile_99']:.4f}")

In [None]:
# Cell 6: Plot R-value Distribution
print("=== Plotting R-value Distribution ===")

# plot_correlation_distribution is given a CSV path, so the r-values are
# staged in a temporary CSV first. The file is written through the single
# handle returned by NamedTemporaryFile (reopening the same path while that
# handle is still open fails on Windows), and its creation sits inside the
# try block so the file is removed even if writing or plotting raises.
temp_csv_path = None
try:
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp_csv:
        temp_csv_path = temp_csv.name
        # Write data in the format expected by plot_correlation_distribution
        temp_csv.write("mapping_id,corr_coef\n")
        temp_csv.writelines(
            f"{i},{r_val}\n" for i, r_val in enumerate(r_values_null)
        )
    # The with-block has closed (and flushed) the file before it is read back.

    # Plot null hypothesis distribution
    fig, ax, fit_params = plot_correlation_distribution(
        csv_file=temp_csv_path,
        quality_index='corr_coef',
        save_png=True,
        png_filename=f'Null_Hypothesis_Distribution_{CORE_A}_{CORE_B}.png',
        core_a_name=CORE_A,
        core_b_name=CORE_B,
        pdf_method='skew-normal',  # Use skew-normal for better fit
        no_bins=50
    )

    plt.show()

finally:
    # Clean up temporary file (delete=False means nothing else will do it)
    if temp_csv_path is not None and os.path.exists(temp_csv_path):
        os.unlink(temp_csv_path)

# Save null hypothesis results for future use
results_filename = f'Null_Hypothesis_Results_{CORE_A}_{CORE_B}.csv'
null_df = pd.DataFrame({
    'iteration': range(len(r_values_null)),
    'r_value': r_values_null
})
null_df.to_csv(results_filename, index=False)
print(f"Null hypothesis r-values saved to: {results_filename}")

print("=== NULL HYPOTHESIS TESTING COMPLETE ===")