In [None]:
# Cell 1: Import Required Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tempfile
from tqdm import tqdm
warnings.filterwarnings('ignore')

# Import pyCoreRelator functions
from pyCoreRelator import (
    create_segment_pool_from_available_cores,
    generate_synthetic_core_pair,
    compute_pycorerelator_null_hypothesis,
    create_synthetic_picked_depths,
    run_comprehensive_dtw_analysis,
    load_log_data,
    plot_correlation_distribution
)   

print("Packages imported successfully")

In [None]:
# Cell 2: Load Log Data for the Two Target Cores
# Names of the two cores being compared, the log column(s) to load, and the
# depth column shared by the log CSV files.
CORE_A = "M9907-23PC"
CORE_B = "M9907-25PC"
# LOG_COLUMNS = ['hiresMS', 'CT', 'R', 'G', 'B', 'Lumin', 'Den_gm/cc']  # Choose available logs
LOG_COLUMNS = ['hiresMS']  # Choose one log column
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Root directory of the dataset.
# The original author's location is kept as the default so existing runs are
# unchanged; set the PYCORERELATOR_DATA_DIR environment variable to point the
# notebook at a different copy of the data without editing this cell.
_DEFAULT_MOTHER_DIR = '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/OSU_dataset/'
# os.path.join(..., '') guarantees a trailing separator, which the path
# templates below rely on (they append '_compiled_logs/...' directly).
mother_dir = os.path.join(
    os.environ.get('PYCORERELATOR_DATA_DIR') or _DEFAULT_MOTHER_DIR, ''
)

# Log name -> tag used in the CSV file name. 'Lumin', 'R', 'G' and 'B' are all
# read from the same RGB file.
_LOG_FILE_TAGS = {
    'hiresMS': 'hiresMS',
    'CT': 'CT',
    'Lumin': 'RGB',
    'R': 'RGB',
    'G': 'RGB',
    'B': 'RGB',
    'Den_gm/cc': 'Den',
}


def build_log_paths(core_name, base_dir=None):
    """Return the {log name: CSV path} mapping for one core.

    Parameters
    ----------
    core_name : str
        Core identifier; used both as the sub-directory name and as the file
        name prefix.
    base_dir : str, optional
        Dataset root ending with a path separator. Defaults to ``mother_dir``.

    Returns
    -------
    dict
        One entry per key of ``_LOG_FILE_TAGS``, in that order, of the form
        ``{base_dir}_compiled_logs/{core}/ML_filled/{core}_{tag}_MLfilled.csv``.
    """
    root = mother_dir if base_dir is None else base_dir
    core_dir = f'{root}_compiled_logs/{core_name}/ML_filled/'
    return {
        log_name: f'{core_dir}{core_name}_{tag}_MLfilled.csv'
        for log_name, tag in _LOG_FILE_TAGS.items()
    }


# Paths to every known log type for each core; only the entries named in
# LOG_COLUMNS are requested from load_log_data below.
core_a_log_paths = build_log_paths(CORE_A)
core_b_log_paths = build_log_paths(CORE_B)

# Alternative column names to try for each log
column_alternatives = {
    'hiresMS': ['MS'],
    'CT': ['CT_value'],
    'R': ['R', 'red', 'Red'],
    'G': ['G', 'green', 'Green'],
    'B': ['B', 'blue', 'Blue'],
    'Lumin': ['luminance', 'Luminance'],
    'Den_gm/cc': ['Density', 'density']
}


def _load_core_logs(log_paths):
    """Load the LOG_COLUMNS logs for one core with the shared settings.

    Returns the first three values produced by ``load_log_data`` (log data,
    depth data, available columns); the remaining two are discarded.
    """
    log_data, md_data, available_columns, _, _ = load_log_data(
        log_paths,
        {},  # No images needed for null hypothesis
        LOG_COLUMNS,
        depth_column=DEPTH_COLUMN,
        normalize=True,
        column_alternatives=column_alternatives
    )
    return log_data, md_data, available_columns


# Load the selected logs for both cores
log_a, md_a, available_columns_a = _load_core_logs(core_a_log_paths)
log_b, md_b, available_columns_b = _load_core_logs(core_b_log_paths)

print(f"Loaded {CORE_A}: {len(log_a)} points, columns: {available_columns_a}")
print(f"Loaded {CORE_B}: {len(log_b)} points, columns: {available_columns_b}")
print(f"Log A shape: {log_a.shape}")
print(f"Log B shape: {log_b.shape}")

In [None]:
# Cell 2.5: Gather the Cores That Feed the Segment Pool
# Extra cores whose data is added to the pool when their files are present.
SEGMENT_POOL_CORES = ["M9907-24PC", "M9907-26PC", "M9907-27PC"]  # Add any other available cores

# The two target cores loaded earlier always contribute to the pool.
segment_pool_cores_data = {
    name: {'log_data': logs, 'md_data': depths, 'available_columns': columns}
    for name, logs, depths, columns in (
        (CORE_A, log_a, md_a, available_columns_a),
        (CORE_B, log_b, md_b, available_columns_b),
    )
}

# Add each extra core on a best-effort basis: a missing or unreadable core is
# reported and skipped rather than aborting the cell.
for core_name in SEGMENT_POOL_CORES:
    core_dir = f'{mother_dir}_compiled_logs/{core_name}/ML_filled/'

    # The hiresMS file doubles as the "does this core exist" probe.
    core_hiresMS_path = f'{core_dir}{core_name}_hiresMS_MLfilled.csv'

    if os.path.exists(core_hiresMS_path):
        try:
            # Same path layout as the target cores; R/G/B/Lumin share one file.
            core_log_paths = {
                'hiresMS': core_hiresMS_path,
                'CT': f'{core_dir}{core_name}_CT_MLfilled.csv',
                'Lumin': f'{core_dir}{core_name}_RGB_MLfilled.csv',
                'R': f'{core_dir}{core_name}_RGB_MLfilled.csv',
                'G': f'{core_dir}{core_name}_RGB_MLfilled.csv',
                'B': f'{core_dir}{core_name}_RGB_MLfilled.csv',
                'Den_gm/cc': f'{core_dir}{core_name}_Den_MLfilled.csv'
            }

            log_core, md_core, available_columns_core, _, _ = load_log_data(
                core_log_paths,
                {},
                LOG_COLUMNS,
                depth_column=DEPTH_COLUMN,
                normalize=True,
                column_alternatives=column_alternatives
            )

            # Register the core only after a successful load.
            segment_pool_cores_data[core_name] = {
                'log_data': log_core,
                'md_data': md_core,
                'available_columns': available_columns_core
            }

            print(f"Successfully loaded {core_name}: {len(log_core)} points, columns: {available_columns_core}")

        except Exception as e:
            print(f"Could not load {core_name}: {e}")
    else:
        print(f"Core files for {core_name} do not exist, skipping...")

pool_core_names = list(segment_pool_cores_data.keys())
print(f"\nTotal cores defined for segment pool: {len(segment_pool_cores_data)}")
print(f"Core names: {pool_core_names}")

In [None]:
# Cell 3: Collect Picked Depths for Every Core in the Segment Pool
print("=== Loading Picked Depths for ALL Segment Pool Cores ===")

# core name -> list of (depth, category) tuples
picked_depths_info = {}

for core_name, core_entry in segment_pool_cores_data.items():
    pickeddepth_csv = f'pickeddepth/{core_name}_pickeddepth.csv'
    loaded_from_csv = False

    if not os.path.exists(pickeddepth_csv):
        print(f"Picked depths file for {core_name} does not exist, creating default boundaries...")
    else:
        try:
            picked_data = pd.read_csv(pickeddepth_csv)
            # Every category is kept so the pool gets the widest variety of segments.
            all_depths = picked_data['picked_depths_cm'].values.astype('float32')
            all_categories = picked_data['category'].values.astype('int')
            picked_depths_info[core_name] = list(zip(all_depths.tolist(), all_categories.tolist()))
            print(f"Loaded {len(all_depths)} picked depths for {core_name} (categories: {np.unique(all_categories)})")
            loaded_from_csv = True
        except Exception as e:
            print(f"Error loading picked depths for {core_name}: {e}")

    if not loaded_from_csv:
        # Fallback shared by "file missing" and "file unreadable": 15 evenly
        # spaced boundaries from the first to the last depth, all category 1.
        core_md = core_entry['md_data']
        default_depths = np.linspace(core_md[0], core_md[-1], 15)
        default_categories = np.ones(len(default_depths), dtype=int)
        picked_depths_info[core_name] = list(zip(default_depths.tolist(), default_categories.tolist()))
        print(f"Created {len(default_depths)} default boundaries for {core_name}")

# Summary of what was collected
print(f"\nPicked depths loaded for {len(picked_depths_info)} cores:")
for core_name, depths in picked_depths_info.items():
    print(f"  {core_name}: {len(depths)} depth boundaries")

print("\nThese picked depths will be used:")
print("- For segment extraction from all cores during segment pool creation")
print(f"- For DTW analysis with synthetic cores (using {CORE_A} and {CORE_B} boundaries)")

In [None]:
# Cell 4: Build the Segment Pool from the Picked Depths
print("=== Creating Comprehensive Segment Pool Using Actual Picked Depths ===")

# core name -> {'depth_boundaries': array, 'segments': [(start_idx, end_idx, label), ...]}
all_boundaries_data = {}

for core_name, core_entry in segment_pool_cores_data.items():
    if core_name not in picked_depths_info:
        # No picked depths for this core: fall back to evenly spaced
        # boundaries (at least 15, or one per 100 samples if that is more).
        core_md = core_entry['md_data']
        n_boundaries = max(15, len(core_md) // 100)
        boundaries = np.linspace(core_md[0], core_md[-1], n_boundaries)

        simple_segments = []
        for i, (top, bottom) in enumerate(zip(boundaries[:-1], boundaries[1:])):
            start_idx = np.searchsorted(core_md, top)
            end_idx = np.searchsorted(core_md, bottom)
            # Skip boundary pairs that enclose no samples.
            if end_idx > start_idx:
                simple_segments.append((start_idx, end_idx, f"seg_{i}"))

        all_boundaries_data[core_name] = {
            'depth_boundaries': boundaries,
            'segments': simple_segments
        }

        print(f"Created {len(simple_segments)} segments for {core_name} using default boundaries")
        continue

    # Picked depths are available: keep only the depth of each
    # (depth, category) tuple and put them in ascending order.
    picked_depths = sorted(depth for depth, _category in picked_depths_info[core_name])

    core_md = core_entry['md_data']

    # Each pair of consecutive picked depths delimits one candidate segment,
    # expressed as sample indices into core_md.
    segments = []
    for i, (start_depth, end_depth) in enumerate(zip(picked_depths[:-1], picked_depths[1:])):
        start_idx = np.searchsorted(core_md, start_depth)
        end_idx = np.searchsorted(core_md, end_depth)

        # Skip boundary pairs that enclose no samples.
        if end_idx > start_idx:
            segments.append((start_idx, end_idx, f"seg_{i}"))

    all_boundaries_data[core_name] = {
        'depth_boundaries': np.array(picked_depths),
        'segments': segments
    }

    print(f"Created {len(segments)} segments for {core_name} using {len(picked_depths)} picked depths")

# Hand every core and its boundaries to pyCoreRelator to build the pool
segment_pool = create_segment_pool_from_available_cores(segment_pool_cores_data, all_boundaries_data)

print(f"\nCreated comprehensive segment pool with {len(segment_pool)} segments from {len(segment_pool_cores_data)} cores")

# Summary statistics (skipped when the pool is empty)
if segment_pool:
    lengths = []
    depth_spans = []
    dimensions = []
    core_counts = {}
    for seg in segment_pool:
        lengths.append(seg['length'])
        depth_spans.append(seg['depth_span'])
        # A 1-D log array counts as a single dimension.
        seg_log = seg['log_data']
        dimensions.append(seg_log.shape[1] if seg_log.ndim > 1 else 1)

    print(f"\n=== Segment Pool Statistics ===")
    print(f"Segment lengths: min={min(lengths)}, max={max(lengths)}, mean={np.mean(lengths):.1f}")
    print(f"Depth spans: min={min(depth_spans):.1f}, max={max(depth_spans):.1f}, mean={np.mean(depth_spans):.1f}")
    print(f"Log dimensions: min={min(dimensions)}, max={max(dimensions)}, mean={np.mean(dimensions):.1f}")

    # How many pool segments each core contributed
    for seg in segment_pool:
        core_name = seg.get('core_name', 'unknown')
        if core_name in core_counts:
            core_counts[core_name] += 1
        else:
            core_counts[core_name] = 1

    print(f"\nSegment distribution by core:")
    for core_name, count in core_counts.items():
        print(f"  {core_name}: {count} segments")

In [None]:
# Cell 5: Custom Null Hypothesis with DTW Analysis on Synthetic Cores
# For each iteration: build two synthetic cores from the segment pool, run the
# DTW analysis on them, and record the mean correlation coefficient over the
# valid segment pairs. The collected means form the null distribution.
print("=== Computing Null Hypothesis Distribution with DTW Analysis ===")

# Synthetic cores are sized like the real cores. A 1-D log array counts as a
# single dimension; Core A's dimensionality is used for both cores.
target_dimensions = log_a.shape[1] if log_a.ndim > 1 else 1

core_a_config = {
    'target_length': len(log_a),
    'target_dimensions': target_dimensions
}

core_b_config = {
    'target_length': len(log_b), 
    'target_dimensions': target_dimensions
}

print(f"Target Core A: {core_a_config['target_length']} points, {core_a_config['target_dimensions']} dimensions")
print(f"Target Core B: {core_b_config['target_length']} points, {core_b_config['target_dimensions']} dimensions")

# Number of synthetic core pairs to evaluate (lower this for a quick test run).
n_iterations = 10000

# NOTE(review): no random seed is set in this notebook. If
# generate_synthetic_core_pair draws from a global RNG (TODO confirm), the
# resulting distribution will differ from run to run.

# Manual null hypothesis computation with DTW analysis
# The count in the message is derived from n_iterations so the two cannot drift apart.
print(f"Computing null hypothesis with {n_iterations:,} synthetic core pairs and DTW analysis...")
print("This may take considerable time...")

r_values_null = []
successful_iterations = 0
failed_iterations = 0

for i in tqdm(range(n_iterations), desc="Null hypothesis iterations"):
    try:
        # Generate synthetic core pair - need to get segment info for picked depths
        synthetic_log_a, synthetic_md_a, segment_info_a = generate_synthetic_core_pair(
            segment_pool, core_a_config['target_length'], core_a_config['target_dimensions'],
            return_segment_info=True  # Get segment info to calculate picked depths
        )
        synthetic_log_b, synthetic_md_b, segment_info_b = generate_synthetic_core_pair(
            segment_pool, core_b_config['target_length'], core_b_config['target_dimensions'],
            return_segment_info=True  # Get segment info to calculate picked depths
        )
        
        # Create synthetic picked depths based on segment boundaries
        synthetic_picked_a = create_synthetic_picked_depths(synthetic_md_a, segment_info_a)
        synthetic_picked_b = create_synthetic_picked_depths(synthetic_md_b, segment_info_b)
        
        # Run DTW analysis on synthetic cores using synthetic picked depths
        dtw_results, valid_dtw_pairs, _, _, _, _, _ = run_comprehensive_dtw_analysis(
            synthetic_log_a, synthetic_log_b, synthetic_md_a, synthetic_md_b,
            picked_depths_a=synthetic_picked_a,
            picked_depths_b=synthetic_picked_b,
            top_bottom=True,
            independent_dtw=False,
            exclude_deadend=True,
            create_dtw_matrix=False,  # Skip visualization for speed
            creategif=False,  # Skip animation for speed
            age_consideration=False,  # No age constraints
            debug=False
        )
        
        # Collect the non-NaN 'corr_coef' of every valid pair that has a DTW result
        correlations = []
        for pair_key in valid_dtw_pairs:
            if pair_key in dtw_results:
                _, _, quality_metrics = dtw_results[pair_key]
                if 'corr_coef' in quality_metrics and not np.isnan(quality_metrics['corr_coef']):
                    correlations.append(quality_metrics['corr_coef'])
        
        # One value per iteration: the mean over its valid pairs. An iteration
        # with no usable correlation counts as failed.
        if correlations:
            mean_correlation = np.mean(correlations)
            r_values_null.append(mean_correlation)
            successful_iterations += 1
        else:
            failed_iterations += 1
            
    except Exception as e:
        # Best effort: a failing iteration is counted and skipped so one bad
        # synthetic pair does not abort the whole run.
        failed_iterations += 1
        if i < 10:  # Only print first few errors
            print(f"Error in iteration {i}: {e}")

# Calculate distribution statistics
r_values_null = np.array(r_values_null)
if r_values_null.size == 0:
    # Every iteration failed: statistics of an empty sample are undefined, so
    # report NaN explicitly instead of relying on NumPy's empty-input behavior.
    print("\nWARNING: no iteration produced a valid correlation; "
          "distribution statistics are undefined (NaN).")
    distribution_stats = {
        'mean': np.nan,
        'std': np.nan,
        'percentile_95': np.nan,
        'percentile_97_5': np.nan,
        'percentile_99': np.nan
    }
else:
    distribution_stats = {
        'mean': np.mean(r_values_null),
        'std': np.std(r_values_null),
        'percentile_95': np.percentile(r_values_null, 95),
        'percentile_97_5': np.percentile(r_values_null, 97.5),
        'percentile_99': np.percentile(r_values_null, 99)
    }

print(f"\n=== NULL HYPOTHESIS RESULTS ===")
print(f"Successful iterations: {successful_iterations}")
print(f"Failed iterations: {failed_iterations}")
# The plus-minus sign was previously stored as mis-encoded text ("Â±").
print(f"Mean r-value: {distribution_stats['mean']:.4f} ± {distribution_stats['std']:.4f}")
print(f"95th percentile threshold: {distribution_stats['percentile_95']:.4f}")
print(f"97.5th percentile threshold: {distribution_stats['percentile_97_5']:.4f}")
print(f"99th percentile threshold: {distribution_stats['percentile_99']:.4f}")

In [None]:
# Cell 6: Save and Plot the R-value Distribution

# Save the null hypothesis results first: they come from a long-running
# computation and must not be lost if plotting fails below.
results_filename = f'Null_Hypothesis_Results_{CORE_A}_{CORE_B}.csv'
null_df = pd.DataFrame({
    'iteration': range(len(r_values_null)),
    'r_value': r_values_null
})
null_df.to_csv(results_filename, index=False)
print(f"Null hypothesis r-values saved to: {results_filename}")

print("=== Plotting R-value Distribution ===")

# plot_correlation_distribution reads its input from a CSV file, so the
# r-values are staged in a temporary CSV that is removed afterwards.
temp_csv_path = None
try:
    # delete=False keeps the file on disk after the handle is closed so the
    # plotting function can open it by path. The data is written through this
    # single handle; the file is not opened a second time while still open.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp_csv:
        temp_csv_path = temp_csv.name
        # Write data in the format expected by plot_correlation_distribution
        temp_csv.write("mapping_id,corr_coef\n")
        for i, r_val in enumerate(r_values_null):
            temp_csv.write(f"{i},{r_val}\n")

    # Plot null hypothesis distribution
    fig, ax, fit_params = plot_correlation_distribution(
        csv_file=temp_csv_path,
        quality_index='corr_coef',
        save_png=True,
        png_filename=f'Null_Hypothesis_Distribution_{CORE_A}_{CORE_B}.png',
        core_a_name=CORE_A,
        core_b_name=CORE_B,
        pdf_method='skew-normal',  # Use skew-normal for better fit
        no_bins=50
    )

    plt.show()

finally:
    # Clean up the temporary file whether writing or plotting succeeded or not
    if temp_csv_path is not None and os.path.exists(temp_csv_path):
        os.unlink(temp_csv_path)

print("=== NULL HYPOTHESIS TESTING COMPLETE ===")