In [2]:
# Cell 1: Import Required Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tempfile
from tqdm import tqdm
import random
warnings.filterwarnings('ignore')

# Import pyCoreRelator functions
from pyCoreRelator import (
    create_segment_pool_from_available_cores,
    generate_synthetic_core_pair,
    create_synthetic_picked_depths,
    run_comprehensive_dtw_analysis,
    load_log_data,
    plot_correlation_distribution
)   

In [None]:
# Cell 2: Extract Core Lengths and Load Segment Pool
# Define core names and target parameters
CORE_A = "M9907-23PC"
CORE_B = "M9907-25PC"
LOG_COLUMNS = ['hiresMS']  # Choose one log column for segment pool
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Define directory paths
# The project root is machine-specific. Set the PYCORERELATOR_PROJECT_DIR
# environment variable to point at it on another machine; when the variable is
# unset or empty, the original location below is used unchanged.
mother_dir = os.environ.get('PYCORERELATOR_PROJECT_DIR') or '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites'

# Function to extract core length from depth data
def get_core_length(core_name, depth_column='SB_DEPTH_cm', base_dir=None):
    """Return the maximum value of the depth column in a core's hiresMS CSV.

    Parameters
    ----------
    core_name : str
        Core identifier; used both as a folder name and as the file-name prefix.
    depth_column : str, optional
        Name of the CSV column whose maximum is returned.
    base_dir : str, optional
        Project root that contains ``Cascadia_core_data``. Defaults to the
        notebook-level ``mother_dir``.

    Returns
    -------
    The maximum of ``depth_column``, or ``None`` when the file cannot be read
    or the column is missing (a warning is printed in that case).
    """
    if base_dir is None:
        base_dir = mother_dir

    # Try hiresMS file first (most common)
    depth_file = f'{base_dir}/Cascadia_core_data/OSU_dataset/_compiled_logs/{core_name}/ML_filled/{core_name}_hiresMS_MLfilled.csv'
    try:
        df = pd.read_csv(depth_file)
        return df[depth_column].max()
    except (OSError, KeyError, ValueError) as err:
        # OSError: missing/unreadable file; KeyError: column not present;
        # ValueError: pandas parser / empty-data / decoding errors.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print(f"Warning: Could not read depth from {depth_file}")
        print(f"  Reason: {type(err).__name__}: {err}")
        return None

# Measure the two target cores (each value is None when its depth file is unreadable)
core_a_length = get_core_length(CORE_A, DEPTH_COLUMN)
core_b_length = get_core_length(CORE_B, DEPTH_COLUMN)

print(f"Core A ({CORE_A}) length: {core_a_length} cm")
print(f"Core B ({CORE_B}) length: {core_b_length} cm")

# Cores that contribute turbidite segments to the pool
SEGMENT_POOL_CORES = ["M9907-22PC", "M9907-23PC", "M9907-25PC"]

# Containers filled while loading: per-core raw data plus the flat turbidite database
segment_pool_cores_data = {}
turb_logs = []
depth_logs = []

print("Loading segment pool from available cores...")

for core_name in SEGMENT_POOL_CORES:
    print(f"Processing {core_name}...")

    # Log file locations and accepted column aliases for this core
    core_log_paths = {
        'hiresMS': f'{mother_dir}/Cascadia_core_data/OSU_dataset/_compiled_logs/{core_name}/ML_filled/{core_name}_hiresMS_MLfilled.csv',
    }
    column_alternatives = {
        'hiresMS': ['MS'],
    }

    try:
        # Read the logs; the second argument is the (empty) image-path mapping
        log_data, md_data, available_columns, _unused_4, _unused_5 = load_log_data(
            core_log_paths,
            {},
            LOG_COLUMNS,
            depth_column=DEPTH_COLUMN,
            normalize=True,
            column_alternatives=column_alternatives
        )

        # Keep the full record for this core
        segment_pool_cores_data[core_name] = {
            'log_data': log_data,
            'md_data': md_data,
            'available_columns': available_columns
        }

        # Cut the core into turbidites using its picked category-1 boundaries
        picked_file = f'{mother_dir}/pyCoreRelator/pickeddepth/{core_name}_pickeddepth.csv'
        try:
            picked_df = pd.read_csv(picked_file)
            is_category_1 = picked_df['category'] == 1
            category_1_depths = np.sort(picked_df.loc[is_category_1, 'picked_depths_cm'].values)

            # Walk consecutive boundary pairs: (top, base) of each turbidite
            for top_depth, base_depth in zip(category_1_depths[:-1], category_1_depths[1:]):
                # Nearest sample index to each boundary depth
                start_idx = np.argmin(np.abs(md_data - top_depth))
                end_idx = np.argmin(np.abs(md_data - base_depth))

                if end_idx <= start_idx:
                    continue  # boundaries collapse onto the same sample: nothing to extract

                window = slice(start_idx, end_idx)
                turb_segment = log_data[window]
                turb_depth = md_data[window] - md_data[start_idx]  # depths relative to segment top

                turb_logs.append(turb_segment)
                depth_logs.append(turb_depth)

        except Exception as e:
            print(f"Warning: Could not load turbidite boundaries for {core_name}: {e}")

        print(f"  Loaded: {len(log_data)} points, columns: {available_columns}")

    except Exception as e:
        print(f"Error loading {core_name}: {e}")

print(f"Segment pool created with {len(turb_logs)} turbidites")
print(f"Total cores processed: {len(segment_pool_cores_data)}")

# Number of log columns per sample, taken from the first pooled segment (1 for 1-D data or an empty pool)
if len(turb_logs) > 0 and turb_logs[0].ndim > 1:
    target_dimensions = turb_logs[0].shape[1]
else:
    target_dimensions = 1

print(f"Target dimensions: {target_dimensions}")
print(f"Core A target length: {core_a_length} cm")
print(f"Core B target length: {core_b_length} cm")

In [None]:
# Cell 3: Plot All Turbidite Segments from Pool
print(f"Plotting {len(turb_logs)} turbidite segments from the pool...")

n_segments = len(turb_logs)

if n_segments == 0:
    # An empty pool cannot be drawn: plt.subplots() rejects a zero-row grid and
    # min()/max() below would fail on empty lists.
    print("No turbidite segments available - skipping plot and summary.")
else:
    # Create subplot grid
    n_cols = 8
    n_rows = int(np.ceil(n_segments / n_cols))

    # squeeze=False always yields a 2-D array of Axes, so flattening is valid for
    # every grid size (including a single segment, where the grid is still 1 x 8).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
    axes = axes.ravel()

    for i, (turb_segment, turb_depth) in enumerate(zip(turb_logs, depth_logs)):
        ax = axes[i]

        # For multi-column data only the first column is drawn
        trace = turb_segment[:, 0] if turb_segment.ndim > 1 else turb_segment
        ax.plot(trace, turb_depth, 'b-', linewidth=1)
        ax.set_xlabel(f'{LOG_COLUMNS[0]} (normalized)')

        ax.set_ylabel('Relative Depth (cm)')
        ax.set_title(f'Segment {i+1}\n({len(turb_segment)} pts, {turb_depth[-1]:.1f} cm)')
        ax.grid(True, alpha=0.3)
        ax.invert_yaxis()  # Depth increases downward

    # Hide unused subplots in the last row
    for unused_ax in axes[n_segments:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.suptitle(f'Turbidite Segment Pool ({len(turb_logs)} segments)', y=1.02, fontsize=16)
    plt.show()

    # Print summary statistics
    segment_lengths = [len(seg) for seg in turb_logs]
    segment_depths = [depth[-1] for depth in depth_logs]

    print(f"\nSegment Pool Summary:")
    print(f"  Total segments: {len(turb_logs)}")
    print(f"  Length range: {min(segment_lengths)}-{max(segment_lengths)} points")
    print(f"  Depth range: {min(segment_depths):.1f}-{max(segment_depths):.1f} cm")
    print(f"  Mean depth: {np.mean(segment_depths):.1f} cm")
    print(f"  Target dimensions: {target_dimensions}")

In [None]:
# Cell 4: Create and Plot Synthetic Core Pair

def create_synthetic_log_with_depths(thickness, turb_logs, depth_logs, exclude_inds=None, n_dimensions=None):
    """Build a synthetic log by stacking randomly drawn turbidite segments.

    Segments are drawn with replacement from the pool and appended one below
    the other until the accumulated depth exceeds ``thickness``; the result is
    then truncated to samples whose depth is <= ``thickness``. A picked depth
    (category 1) is recorded at depth 0, at the base of every complete
    turbidite, and at the final sample.

    Parameters
    ----------
    thickness : float
        Target depth of the synthetic log. Must be finite.
    turb_logs : sequence of ndarray
        Log values of each pooled segment, 1-D or (n_samples, n_columns).
    depth_logs : sequence of ndarray
        Depths of each pooled segment, parallel to ``turb_logs``.
    exclude_inds : iterable of int, optional
        Pool indices that must not be drawn.
    n_dimensions : int, optional
        Number of log columns in the output. Defaults to the notebook-level
        ``target_dimensions``. Segments with fewer columns are padded with
        Gaussian noise (sd 0.1); segments with more are truncated.

    Returns
    -------
    log : ndarray
        Synthetic log values; 1-D when ``n_dimensions`` is 1, otherwise 2-D.
    d : ndarray
        Depth of every sample in ``log``.
    inds : list
        Pool index of every segment drawn, in stacking order.
    valid_picked_depths : list of (depth, category) tuples

    Raises
    ------
    ValueError
        If ``thickness`` is None or not finite, if no segment can be drawn
        (empty pool, only empty segments, or everything excluded), or if no
        sample fits within ``thickness``.
    """
    if n_dimensions is None:
        n_dimensions = target_dimensions
    if thickness is None or not np.isfinite(thickness):
        raise ValueError(f"thickness must be a finite number, got {thickness!r}")

    # Resolve the drawable indices once. Sampling from this list (instead of
    # draw-and-reject) cannot spin forever when every index is excluded.
    excluded = set() if exclude_inds is None else set(exclude_inds)
    candidates = [
        idx for idx in range(len(turb_logs))
        if idx not in excluded and len(turb_logs[idx]) > 0
    ]
    if not candidates:
        raise ValueError("No usable turbidite segments: the pool is empty or every index is excluded")

    log_pieces = []
    depth_pieces = []
    inds = []
    picked_depths = [(0, 1)]  # initial boundary at the top of the log
    max_depth = 0

    while max_depth <= thickness:
        ind = random.choices(candidates, k=1)[0]
        inds.append(ind)

        # Get turbidite segment from database
        turb_segment = np.asarray(turb_logs[ind])
        turb_depths = np.asarray(depth_logs[ind])

        # Work on a 2-D (samples, columns) view
        if turb_segment.ndim == 1:
            turb_segment = turb_segment.reshape(-1, 1)

        n_have = turb_segment.shape[1]
        if n_have < n_dimensions:
            # Pad missing columns with noise
            padding = np.random.normal(0, 0.1, (len(turb_segment), n_dimensions - n_have))
            turb_segment = np.hstack([turb_segment, padding])
        elif n_have > n_dimensions:
            turb_segment = turb_segment[:, :n_dimensions]

        log_pieces.append(turb_segment)

        # Each segment starts one depth unit below the previous sample
        # (the very first one starts at depth 1).
        shifted_depths = 1 + max_depth + turb_depths
        depth_pieces.append(shifted_depths)
        max_depth = shifted_depths[-1]

        # Add picked depth at the base of this turbidite (current max_depth)
        if max_depth <= thickness:
            picked_depths.append((max_depth, 1))

    # Assemble once at the end; the leading empty float array keeps the
    # float dtype that incremental stacking onto np.array([]) would give.
    md_log = np.concatenate([np.empty(0)] + depth_pieces)
    if n_dimensions > 1:
        fake_log = np.vstack(log_pieces) if log_pieces else np.empty((0, n_dimensions))
    else:
        fake_log = np.concatenate([np.empty(0)] + [piece.ravel() for piece in log_pieces])

    # Truncate to target thickness
    valid_indices = md_log <= thickness
    log = fake_log[valid_indices]
    d = md_log[valid_indices]
    if d.size == 0:
        raise ValueError(f"thickness={thickness!r} is too small: no sample falls within it")

    # Filter picked depths to only include those within the valid range
    valid_picked_depths = [(depth, category) for depth, category in picked_depths if depth <= thickness]

    # Ensure we have an end boundary
    if len(valid_picked_depths) == 0 or valid_picked_depths[-1][0] != d[-1]:
        valid_picked_depths.append((d[-1], 1))

    return log, d, inds, valid_picked_depths

# Build one synthetic log per target core, each as thick as the real core
print("Generating synthetic core pair...")

synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a = create_synthetic_log_with_depths(
    core_a_length, turb_logs, depth_logs, exclude_inds=None
)
synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b = create_synthetic_log_with_depths(
    core_b_length, turb_logs, depth_logs, exclude_inds=None
)

# Side-by-side panels: core A (blue) on the left, core B (green) on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(4, 8))

panels = (
    (ax1, synthetic_log_a, synthetic_md_a, synthetic_picked_a, 'b-',
     f'Synthetic Core A\n({len(inds_a)} turbidites)'),
    (ax2, synthetic_log_b, synthetic_md_b, synthetic_picked_b, 'g-',
     f'Synthetic Core B\n({len(inds_b)} turbidites)'),
)

for panel_ax, panel_log, panel_md, panel_picks, line_format, panel_title in panels:
    # Multi-column logs are represented by their first column
    trace = panel_log[:, 0] if panel_log.ndim > 1 else panel_log
    panel_ax.plot(trace, panel_md, line_format, linewidth=1)

    # Mark every picked boundary with a dashed red line
    for boundary_depth, _boundary_category in panel_picks:
        panel_ax.axhline(y=boundary_depth, color='red', linestyle='--', alpha=0.7, linewidth=1)

    panel_ax.set_xlabel(f'{LOG_COLUMNS[0]}\n(normalized)')
    panel_ax.set_ylabel('Depth (cm)')
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

# Text summary; index lists longer than ten entries are abbreviated
print(f"Synthetic Core A: {len(synthetic_log_a)} points, {len(inds_a)} turbidites, {len(synthetic_picked_a)} boundaries")
print(f"Synthetic Core B: {len(synthetic_log_b)} points, {len(inds_b)} turbidites, {len(synthetic_picked_b)} boundaries")

if len(inds_a) > 10:
    print(f"Turbidite indices used in A: {[int(x) for x in inds_a[:10]]}...")
else:
    print(f"Turbidite indices used in A: {[int(x) for x in inds_a]}")

if len(inds_b) > 10:
    print(f"Turbidite indices used in B: {[int(x) for x in inds_b[:10]]}...")
else:
    print(f"Turbidite indices used in B: {[int(x) for x in inds_b]}")

In [None]:
import random

# Cell 5: Custom Null Hypothesis with DTW Analysis on Synthetic Cores
print("=== Computing Null Hypothesis Distribution with DTW Analysis ===")

# Reference logs of the two target cores. No earlier cell defines log_a/log_b,
# so take them explicitly from the data loaded in Cell 2; this keeps the cell
# runnable after Restart Kernel -> Run All. A KeyError here means the core
# failed to load in Cell 2 (see the "Error loading ..." message there).
log_a = segment_pool_cores_data[CORE_A]['log_data']
md_a = segment_pool_cores_data[CORE_A]['md_data']
log_b = segment_pool_cores_data[CORE_B]['log_data']
md_b = segment_pool_cores_data[CORE_B]['md_data']

# Configure target characteristics for synthetic cores
target_dimensions = log_a.shape[1] if log_a.ndim > 1 else 1

core_a_config = {
    'target_length': len(log_a),
    'target_dimensions': target_dimensions
}

core_b_config = {
    'target_length': len(log_b),
    'target_dimensions': target_dimensions
}

print(f"Target Core A: {core_a_config['target_length']} points, {core_a_config['target_dimensions']} dimensions")
print(f"Target Core B: {core_b_config['target_length']} points, {core_b_config['target_dimensions']} dimensions")

print(f"Available turbidites in database: {len(turb_logs)}")

# NOTE(review): this redefines the function of the same name from Cell 4 and
# shadows it once this cell runs. Keep the two definitions in sync, or move the
# function to a shared module and import it.
def create_synthetic_log_with_depths(thickness, turb_logs, depth_logs, exclude_inds=None, n_dimensions=None):
    """Build a synthetic log by stacking randomly drawn turbidite segments.

    Segments are drawn with replacement from the pool and appended one below
    the other until the accumulated depth exceeds ``thickness``; the result is
    then truncated to samples whose depth is <= ``thickness``. A picked depth
    (category 1) is recorded at depth 0, at the base of every complete
    turbidite, and at the final sample.

    Parameters
    ----------
    thickness : float
        Target depth of the synthetic log. Must be finite.
    turb_logs : sequence of ndarray
        Log values of each pooled segment, 1-D or (n_samples, n_columns).
    depth_logs : sequence of ndarray
        Depths of each pooled segment, parallel to ``turb_logs``.
    exclude_inds : iterable of int, optional
        Pool indices that must not be drawn.
    n_dimensions : int, optional
        Number of log columns in the output. Defaults to the notebook-level
        ``target_dimensions``. Segments with fewer columns are padded with
        Gaussian noise (sd 0.1); segments with more are truncated.

    Returns
    -------
    log : ndarray
        Synthetic log values; 1-D when ``n_dimensions`` is 1, otherwise 2-D.
    d : ndarray
        Depth of every sample in ``log``.
    inds : list
        Pool index of every segment drawn, in stacking order.
    valid_picked_depths : list of (depth, category) tuples

    Raises
    ------
    ValueError
        If ``thickness`` is None or not finite, if no segment can be drawn
        (empty pool, only empty segments, or everything excluded), or if no
        sample fits within ``thickness``.
    """
    if n_dimensions is None:
        n_dimensions = target_dimensions
    if thickness is None or not np.isfinite(thickness):
        raise ValueError(f"thickness must be a finite number, got {thickness!r}")

    # Resolve the drawable indices once. Sampling from this list (instead of
    # draw-and-reject) cannot spin forever when every index is excluded.
    excluded = set() if exclude_inds is None else set(exclude_inds)
    candidates = [
        idx for idx in range(len(turb_logs))
        if idx not in excluded and len(turb_logs[idx]) > 0
    ]
    if not candidates:
        raise ValueError("No usable turbidite segments: the pool is empty or every index is excluded")

    log_pieces = []
    depth_pieces = []
    inds = []
    picked_depths = [(0, 1)]  # initial boundary at the top of the log
    max_depth = 0

    while max_depth <= thickness:
        ind = random.choices(candidates, k=1)[0]
        inds.append(ind)

        # Get turbidite segment from database
        turb_segment = np.asarray(turb_logs[ind])
        turb_depths = np.asarray(depth_logs[ind])

        # Work on a 2-D (samples, columns) view
        if turb_segment.ndim == 1:
            turb_segment = turb_segment.reshape(-1, 1)

        n_have = turb_segment.shape[1]
        if n_have < n_dimensions:
            # Pad missing columns with noise
            padding = np.random.normal(0, 0.1, (len(turb_segment), n_dimensions - n_have))
            turb_segment = np.hstack([turb_segment, padding])
        elif n_have > n_dimensions:
            turb_segment = turb_segment[:, :n_dimensions]

        log_pieces.append(turb_segment)

        # Each segment starts one depth unit below the previous sample
        # (the very first one starts at depth 1).
        shifted_depths = 1 + max_depth + turb_depths
        depth_pieces.append(shifted_depths)
        max_depth = shifted_depths[-1]

        # Add picked depth at the base of this turbidite (current max_depth)
        if max_depth <= thickness:
            picked_depths.append((max_depth, 1))

    # Assemble once at the end; the leading empty float array keeps the
    # float dtype that incremental stacking onto np.array([]) would give.
    md_log = np.concatenate([np.empty(0)] + depth_pieces)
    if n_dimensions > 1:
        fake_log = np.vstack(log_pieces) if log_pieces else np.empty((0, n_dimensions))
    else:
        fake_log = np.concatenate([np.empty(0)] + [piece.ravel() for piece in log_pieces])

    # Truncate to target thickness
    valid_indices = md_log <= thickness
    log = fake_log[valid_indices]
    d = md_log[valid_indices]
    if d.size == 0:
        raise ValueError(f"thickness={thickness!r} is too small: no sample falls within it")

    # Filter picked depths to only include those within the valid range
    valid_picked_depths = [(depth, category) for depth, category in picked_depths if depth <= thickness]

    # Ensure we have an end boundary
    if len(valid_picked_depths) == 0 or valid_picked_depths[-1][0] != d[-1]:
        valid_picked_depths.append((d[-1], 1))

    return log, d, inds, valid_picked_depths

# Manual null hypothesis computation with DTW analysis
n_iterations = 10000
MAX_REPORTED_ERRORS = 10      # cap on printed error messages so the output stays readable
NULL_HYPOTHESIS_SEED = 42     # fixed seed so the null distribution is reproducible

print(f"Computing null hypothesis with {n_iterations:,} synthetic core pairs and DTW analysis...")
print("This may take considerable time...")

# Segment selection uses `random`; column padding uses `np.random`
random.seed(NULL_HYPOTHESIS_SEED)
np.random.seed(NULL_HYPOTHESIS_SEED)

# Target thickness of each synthetic core = last depth of the real core as
# loaded in Cell 2. Read here directly (and once, outside the loop) so this
# cell does not rely on md_a/md_b left over from an earlier kernel session.
thickness_a = segment_pool_cores_data[CORE_A]['md_data'][-1]
thickness_b = segment_pool_cores_data[CORE_B]['md_data'][-1]

r_values_null = []
successful_iterations = 0
failed_iterations = 0
errors_reported = 0

for i in tqdm(range(n_iterations), desc="Null hypothesis iterations"):
    try:
        # Generate synthetic logs using turbidite database approach
        synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a = create_synthetic_log_with_depths(
            thickness_a, turb_logs, depth_logs, exclude_inds=None
        )
        synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b = create_synthetic_log_with_depths(
            thickness_b, turb_logs, depth_logs, exclude_inds=None
        )

        # Run DTW analysis on synthetic cores using synthetic picked depths
        dtw_results, valid_dtw_pairs, _, _, _, _, _ = run_comprehensive_dtw_analysis(
            synthetic_log_a, synthetic_log_b, synthetic_md_a, synthetic_md_b,
            picked_depths_a=synthetic_picked_a,
            picked_depths_b=synthetic_picked_b,
            top_bottom=True,
            independent_dtw=False,
            exclude_deadend=True,
            create_dtw_matrix=False,  # Skip visualization for speed
            creategif=False,  # Skip animation for speed
            age_consideration=False,  # No age constraints
            debug=False
        )

        # Extract correlation coefficients from DTW results
        correlations = []
        for pair_key in valid_dtw_pairs:
            if pair_key in dtw_results:
                _, _, quality_metrics = dtw_results[pair_key]
                if 'corr_coef' in quality_metrics and not np.isnan(quality_metrics['corr_coef']):
                    correlations.append(quality_metrics['corr_coef'])

        # Use mean correlation if we have valid correlations
        if correlations:
            mean_correlation = np.mean(correlations)
            r_values_null.append(mean_correlation)
            successful_iterations += 1
        else:
            failed_iterations += 1

    except Exception as e:
        # Best effort: one bad synthetic pair must not abort the whole run.
        failed_iterations += 1
        # Report the first few failures whenever they occur, not only those
        # that happen during the first iterations.
        if errors_reported < MAX_REPORTED_ERRORS:
            print(f"Error in iteration {i}: {e}")
            errors_reported += 1

# Calculate distribution statistics
r_values_null = np.array(r_values_null)

print(f"Successful iterations: {successful_iterations} / {n_iterations}")
print(f"Failed iterations: {failed_iterations} / {n_iterations}")
if successful_iterations == 0:
    print("Warning: no iteration produced a valid correlation; the null distribution is empty.")


In [None]:
# Cell 6: Plot R-value Distribution
print("=== Plotting R-value Distribution ===")

if len(r_values_null) == 0:
    # Nothing to fit or draw; the (empty) results file is still written below.
    print("No valid r-values available - skipping distribution plot.")
else:
    # plot_correlation_distribution reads its data from a CSV file, so stage the
    # r-values in a temporary one. delete=False keeps the file on disk after the
    # handle is closed so it can be re-opened by path.
    temp_csv = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)
    temp_csv_path = temp_csv.name

    try:
        # Write through the handle we already hold (no second open() on the same
        # path) and close it before the plotting function reads the file.
        with temp_csv:
            temp_csv.write("mapping_id,corr_coef\n")
            for i, r_val in enumerate(r_values_null):
                temp_csv.write(f"{i},{r_val}\n")

        # Plot null hypothesis distribution
        fig, ax, fit_params = plot_correlation_distribution(
            csv_file=temp_csv_path,
            quality_index='corr_coef',
            save_png=True,
            png_filename=f'Null_Hypothesis_Distribution_{CORE_A}_{CORE_B}.png',
            core_a_name=CORE_A,
            core_b_name=CORE_B,
            pdf_method='skew-normal',  # Use skew-normal for better fit
            no_bins=50
        )

        plt.show()

    finally:
        # Clean up temporary file, whether writing or plotting failed or not
        if os.path.exists(temp_csv_path):
            os.unlink(temp_csv_path)

# Save null hypothesis results for future use
results_filename = f'Null_Hypothesis_Results_{CORE_A}_{CORE_B}.csv'
null_df = pd.DataFrame({
    'iteration': range(len(r_values_null)),
    'r_value': r_values_null
})
null_df.to_csv(results_filename, index=False)
print(f"Null hypothesis r-values saved to: {results_filename}")

print("=== NULL HYPOTHESIS TESTING COMPLETE ===")