In [1]:
# Cell 1: Import Required Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import tempfile
from tqdm import tqdm
warnings.filterwarnings('ignore')

# Import pyCoreRelator functions
from pyCoreRelator import (
    create_segment_pool_from_available_cores,
    generate_synthetic_core_pair,
    create_synthetic_picked_depths,
    run_comprehensive_dtw_analysis,
    load_log_data,
    plot_correlation_distribution
)   

In [None]:
# Cell 2: Load log data for the two target cores
# Names of the two cores whose correlation is being tested, the log columns
# to load for them, and the name of the depth column in the CSV files.
CORE_A = "M9907-23PC"
CORE_B = "M9907-25PC"
# LOG_COLUMNS = ['hiresMS', 'CT', 'R', 'G', 'B', 'Lumin', 'Den_gm/cc']  # Choose available logs
LOG_COLUMNS = ['hiresMS']  # Choose one log column
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Define directory paths
# The dataset root can be overridden with the PYCORERELATOR_DATA_DIR
# environment variable so the notebook is runnable on other machines; when the
# variable is unset the original location is used. os.path.join(..., '')
# guarantees the trailing separator that the path templates below rely on.
mother_dir = os.path.join(
    os.environ.get(
        'PYCORERELATOR_DATA_DIR',
        '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/OSU_dataset/',
    ),
    '',
)

# Log name -> stem used in the CSV file name. 'Lumin', 'R', 'G' and 'B' all
# point at the same RGB file.
_LOG_FILE_STEMS = {
    'hiresMS': 'hiresMS',
    'CT': 'CT',
    'Lumin': 'RGB',
    'R': 'RGB',
    'G': 'RGB',
    'B': 'RGB',
    'Den_gm/cc': 'Den',
}


def _build_core_log_paths(core_name):
    """Return the {log name: CSV path} table for one core under `mother_dir`."""
    core_dir = f'{mother_dir}_compiled_logs/{core_name}/ML_filled/'
    return {
        log_name: f'{core_dir}{core_name}_{stem}_MLfilled.csv'
        for log_name, stem in _LOG_FILE_STEMS.items()
    }


# Paths for every available log type of Core A and Core B
core_a_log_paths = _build_core_log_paths(CORE_A)
core_b_log_paths = _build_core_log_paths(CORE_B)

# Define column mapping for alternative column names
column_alternatives = {
    'hiresMS': ['MS'],
    'CT': ['CT_value'],
    'R': ['R', 'red', 'Red'],
    'G': ['G', 'green', 'Green'],
    'B': ['B', 'blue', 'Blue'],
    'Lumin': ['luminance', 'Luminance'],
    'Den_gm/cc': ['Density', 'density']
}

# Load data for Core A (only the columns listed in LOG_COLUMNS are requested)
log_a, md_a, available_columns_a, _, _ = load_log_data(
    core_a_log_paths,
    {},  # No images needed for null hypothesis
    LOG_COLUMNS,
    depth_column=DEPTH_COLUMN,
    normalize=True,
    column_alternatives=column_alternatives
)

# Load data for Core B (only the columns listed in LOG_COLUMNS are requested)
log_b, md_b, available_columns_b, _, _ = load_log_data(
    core_b_log_paths,
    {},  # No images needed for null hypothesis
    LOG_COLUMNS,
    depth_column=DEPTH_COLUMN,
    normalize=True,
    column_alternatives=column_alternatives
)

print(f"Loaded {CORE_A}: {len(log_a)} points, columns: {available_columns_a}")
print(f"Loaded {CORE_B}: {len(log_b)} points, columns: {available_columns_b}")
print(f"Log A shape: {log_a.shape}")
print(f"Log B shape: {log_b.shape}")

In [None]:
# Cell 2.5: Define All Cores for Segment Pool
# Define all available cores to be included in the segment pool
SEGMENT_POOL_CORES = ["M9907-22PC", "M9907-23PC", "M9907-25PC"]  # Add any other available cores

# Initialize data structure for all cores that will contribute to segment pool.
# The two target cores are always part of the pool and reuse the data that was
# already loaded for them.
segment_pool_cores_data = {
    CORE_A: {
        'log_data': log_a,
        'md_data': md_a,
        'available_columns': available_columns_a
    },
    CORE_B: {
        'log_data': log_b,
        'md_data': md_b,
        'available_columns': available_columns_b
    }
}

# Log name -> stem used in the CSV file name ('Lumin', 'R', 'G', 'B' share the RGB file)
pool_log_file_stems = {
    'hiresMS': 'hiresMS',
    'CT': 'CT',
    'Lumin': 'RGB',
    'R': 'RGB',
    'G': 'RGB',
    'B': 'RGB',
    'Den_gm/cc': 'Den',
}

# Try to load segment pool cores if they exist
for core_name in SEGMENT_POOL_CORES:
    # The target cores were loaded with identical settings above; reading the
    # same files a second time would only cost I/O.
    if core_name in segment_pool_cores_data:
        print(f"{core_name} is already loaded, reusing its data for the segment pool")
        continue

    # Define paths for segment pool core
    core_dir = f'{mother_dir}_compiled_logs/{core_name}/ML_filled/'
    core_log_paths = {
        log_name: f'{core_dir}{core_name}_{stem}_MLfilled.csv'
        for log_name, stem in pool_log_file_stems.items()
    }

    # Check if core files exist before attempting to load
    core_hiresMS_path = core_log_paths['hiresMS']
    if not os.path.exists(core_hiresMS_path):
        print(f"Core files for {core_name} do not exist, skipping...")
        continue

    try:
        # Load segment pool core data. The empty dict for the image paths
        # mirrors the call used for the target cores (no images needed).
        log_core, md_core, available_columns_core, _, _ = load_log_data(
            core_log_paths,
            {},
            log_columns=LOG_COLUMNS,
            depth_column=DEPTH_COLUMN,
            normalize=True,
            column_alternatives=column_alternatives
        )

        # Define this core for segment pool
        segment_pool_cores_data[core_name] = {
            'log_data': log_core,
            'md_data': md_core,
            'available_columns': available_columns_core
        }

        print(f"Successfully loaded {core_name}: {len(log_core)} points, columns: {available_columns_core}")

    except Exception as e:
        # Best effort: a core that cannot be read is left out of the pool.
        print(f"Could not load {core_name}: {e}")
        continue

print(f"\nTotal cores defined for segment pool: {len(segment_pool_cores_data)}")
print(f"Core names: {list(segment_pool_cores_data.keys())}")

In [None]:
# Cell 3: Load ALL Picked Depths for ALL Segment Pool Cores
print("=== Loading Picked Depths for ALL Segment Pool Cores ===")

# Boundary categories used when building the segment pool
# (only category 1 boundaries are used).
selected_categories = [1]

# core name -> picked depths kept for that core (filled by the loop below)
picked_depths_info = dict()

# Turbidite database, kept as three parallel lists: the log values of each
# turbidite, its depths, and a running turbidite number.
turb_logs, depth_logs, log_number = [], [], []

def add_turbidites_to_database(turb_logs, depth_logs, log_number, log, md, bases):
    """Add turbidites to database using base depths (picked depths represent bottoms of turbidites).

    Each pair of consecutive entries in `bases` delimits one turbidite: the
    samples with ``bases[i] <= md < bases[i+1]`` are appended to the database.

    Parameters
    ----------
    turb_logs : list
        Receives the log values of each turbidite (appended in place).
    depth_logs : list
        Receives the depths of each turbidite, shifted so that each
        turbidite starts at 0 (appended in place).
    log_number : list
        Receives a running turbidite number; numbering continues from
        ``len(log_number)`` so repeated calls extend the same database.
    log : array-like
        Log values, indexable with a boolean mask built from `md`.
    md : array-like
        Depth of every sample in `log`.
    bases : sequence of float
        Boundary depths in increasing order.

    Notes
    -----
    Intervals that contain no samples are skipped: an empty turbidite
    carries no information, and taking the minimum of an empty depth array
    would raise after earlier turbidites had already been appended.
    """
    next_number = len(log_number)  # Continue numbering from existing logs
    for top, bottom in zip(bases[:-1], bases[1:]):
        # Build the mask once and reuse it for both the log values and depths
        in_interval = (md >= top) & (md < bottom)
        if not np.any(in_interval):
            continue

        segment_depths = md[in_interval]
        turb_logs.append(log[in_interval])
        # Normalize depth to start from 0
        depth_logs.append(segment_depths - segment_depths.min())
        log_number.append(next_number)
        next_number += 1

# Load picked depths for ALL cores in segment pool
# Number of evenly spaced boundaries generated when a core has no usable picks
N_DEFAULT_BOUNDARIES = 15


def _read_picked_depths(csv_path, categories):
    """Read a picked-depth CSV and return (depth, category) pairs sorted by depth.

    Only rows whose 'category' is in `categories` are kept. Depth and category
    are sorted together so each depth keeps its own category.
    """
    picked_data = pd.read_csv(csv_path)
    all_depths = picked_data['picked_depths_cm'].values.astype('float32')
    all_categories = picked_data['category'].values.astype('int')
    keep = np.isin(all_categories, categories)
    return sorted(zip(all_depths[keep], all_categories[keep]), key=lambda pair: pair[0])


def _insert_turbidites(core_log, core_md, bases):
    """Add one core's turbidites to the database and return how many were added.

    If the insertion fails part-way, the entries appended so far are removed
    before the exception is re-raised, so the database never keeps a partial
    core.
    """
    stores = (turb_logs, depth_logs, log_number)
    sizes_before = [len(store) for store in stores]
    try:
        add_turbidites_to_database(turb_logs, depth_logs, log_number,
                                   core_log, core_md, bases)
    except Exception:
        for store, size in zip(stores, sizes_before):
            del store[size:]
        raise
    return len(turb_logs) - sizes_before[0]


for core_name, core_data in segment_pool_cores_data.items():
    pickeddepth_csv = f'pickeddepth/{core_name}_pickeddepth.csv'
    core_log_data = core_data['log_data']
    core_md_data = core_data['md_data']

    # Stays empty whenever the default boundaries have to be used
    picked_pairs = []
    n_added = 0

    if os.path.exists(pickeddepth_csv):
        try:
            picked_pairs = _read_picked_depths(pickeddepth_csv, selected_categories)
            if picked_pairs:
                # Add turbidites to database using picked depths as bases
                n_added = _insert_turbidites(
                    core_log_data, core_md_data,
                    [depth for depth, _ in picked_pairs])
            else:
                print(f"No depths found for selected categories {selected_categories} in {core_name}, creating default boundaries...")
        except Exception as e:
            # Unreadable CSV or failed insertion: fall back to default boundaries
            print(f"Error loading picked depths for {core_name}: {e}")
            picked_pairs = []
    else:
        print(f"Picked depths file for {core_name} does not exist, creating default boundaries...")

    if picked_pairs:
        print(f"Loaded {len(picked_pairs)} picked depths for {core_name} from categories {selected_categories}")
    else:
        # Evenly spaced boundaries between the first and last sample depth
        default_depths = np.linspace(core_md_data[0], core_md_data[-1], N_DEFAULT_BOUNDARIES)
        default_categories = [selected_categories[0]] * len(default_depths)
        picked_pairs = list(zip(default_depths.tolist(), default_categories))
        n_added = _insert_turbidites(core_log_data, core_md_data, default_depths.tolist())
        print(f"Created {len(default_depths)} default boundaries for {core_name}")

    picked_depths_info[core_name] = picked_pairs
    # Report the number actually stored rather than assuming one per interval
    print(f"Added {n_added} turbidites to database from {core_name}")

print(f"\nTotal turbidites in database: {len(turb_logs)}")
print(f"Total cores processed: {len(segment_pool_cores_data)}")


In [None]:
import random

# Cell 5: Custom Null Hypothesis with DTW Analysis on Synthetic Cores
print("=== Computing Null Hypothesis Distribution with DTW Analysis ===")

# Number of log columns the synthetic cores must have: the second axis of
# log_a when it is multi-dimensional, otherwise a single column.
if log_a.ndim > 1:
    target_dimensions = log_a.shape[1]
else:
    target_dimensions = 1

# Target characteristics of the two synthetic cores (same number of samples
# as the corresponding real core).
core_a_config = dict(target_length=len(log_a), target_dimensions=target_dimensions)
core_b_config = dict(target_length=len(log_b), target_dimensions=target_dimensions)

for core_label, core_config in (("A", core_a_config), ("B", core_b_config)):
    print(f"Target Core {core_label}: {core_config['target_length']} points, {core_config['target_dimensions']} dimensions")

print(f"Available turbidites in database: {len(turb_logs)}")

def create_synthetic_log_with_depths(thickness, turb_logs, depth_logs, exclude_inds=None, n_dimensions=None):
    """
    Create synthetic log using turbidite database approach with picked depths at turbidite bases.

    Turbidites are drawn at random (with replacement) from the database and
    stacked until the accumulated depth exceeds `thickness`; the result is
    then truncated to `thickness`. Depths start at 1 and each new turbidite
    starts 1 depth unit below the last sample of the previous one.

    Parameters
    ----------
    thickness : float
        Maximum depth of the synthetic log.
    turb_logs : sequence of numpy arrays
        Log values of each turbidite in the database (1-D or 2-D).
    depth_logs : sequence of numpy arrays
        Depths of each turbidite, aligned with `turb_logs`.
    exclude_inds : iterable of int, optional
        Database indices that must not be drawn.
    n_dimensions : int, optional
        Number of log columns of the synthetic log. Defaults to the
        notebook-level `target_dimensions`.

    Returns
    -------
    log : numpy array
        Synthetic log values (1-D when the number of columns is 1).
    d : numpy array
        Depth of every sample in `log`.
    inds : list of int
        Database indices of the turbidites that were drawn, in order.
    valid_picked_depths : list of (depth, category) tuples
        Boundary at depth 0, the base of every turbidite that fits within
        `thickness`, and the last retained depth; the category is always 1.

    Raises
    ------
    ValueError
        If no non-empty turbidite is left to draw from. Without this check
        an `exclude_inds` covering the whole database would loop forever.
    """
    n_dims = target_dimensions if n_dimensions is None else n_dimensions

    # Draw only from usable turbidites. Zero-length segments add no depth and
    # would otherwise produce duplicate boundaries (or an IndexError if drawn first).
    excluded = set(exclude_inds) if exclude_inds is not None else set()
    candidates = [
        idx for idx in range(len(turb_logs))
        if idx not in excluded and len(depth_logs[idx]) > 0
    ]
    if not candidates:
        raise ValueError("No non-empty turbidite segments available to build a synthetic log")

    fake_log = np.empty((0, n_dims)) if n_dims > 1 else np.array([])
    md_log = np.array([])
    max_depth = 0
    inds = []

    # Initial boundary at the top of the synthetic core
    picked_depths = [(0, 1)]

    while max_depth <= thickness:
        ind = random.choices(candidates, k=1)[0]
        inds.append(ind)

        # Get turbidite segment from database
        turb_segment = turb_logs[ind]
        turb_depths = depth_logs[ind]

        # Work with a 2-D (samples, columns) view of the segment
        if turb_segment.ndim == 1:
            turb_segment = turb_segment.reshape(-1, 1)

        n_columns = turb_segment.shape[1]
        if n_columns < n_dims:
            # Pad missing columns with low-amplitude Gaussian noise
            padding = np.random.normal(0, 0.1, (len(turb_segment), n_dims - n_columns))
            turb_segment = np.hstack([turb_segment, padding])
        elif n_columns > n_dims:
            # Drop surplus columns
            turb_segment = turb_segment[:, :n_dims]

        # Append log data
        if n_dims > 1:
            if len(fake_log) == 0:
                fake_log = turb_segment.copy()
            else:
                fake_log = np.vstack((fake_log, turb_segment))
        else:
            fake_log = np.hstack((fake_log, turb_segment.flatten()))

        # Append depth data: each turbidite starts 1 unit below the previous sample
        depth_offset = 1 if len(md_log) == 0 else 1 + md_log[-1]
        md_log = np.hstack((md_log, depth_offset + turb_depths))

        max_depth = md_log[-1]

        # Add picked depth at the base of this turbidite (current max_depth)
        if max_depth <= thickness:
            picked_depths.append((max_depth, 1))

    # Truncate to target thickness
    valid_indices = md_log <= thickness
    log = fake_log[valid_indices]
    d = md_log[valid_indices]

    # Keep only the picked depths within the valid range
    valid_picked_depths = [(depth, category) for depth, category in picked_depths if depth <= thickness]

    # Ensure we have an end boundary (nothing to close when no sample was kept)
    if d.size and (not valid_picked_depths or valid_picked_depths[-1][0] != d[-1]):
        valid_picked_depths.append((d[-1], 1))

    return log, d, inds, valid_picked_depths

# Manual null hypothesis computation with DTW analysis
n_iterations = 10000
RANDOM_SEED = 42          # fixed seed so the null distribution is reproducible
MAX_REPORTED_ERRORS = 10  # print at most this many iteration errors

# The synthetic cores draw from both `random` (turbidite selection) and
# `np.random` (noise padding), so both generators are seeded.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print(f"Computing null hypothesis with {n_iterations:,} synthetic core pairs and DTW analysis...")
print("This may take considerable time...")

r_values_null = []
successful_iterations = 0
failed_iterations = 0
reported_errors = 0

# Target thickness of each synthetic core: the deepest sample of the real core
thickness_a = md_a[-1]
thickness_b = md_b[-1]

for i in tqdm(range(n_iterations), desc="Null hypothesis iterations"):
    try:
        # Generate synthetic logs using turbidite database approach
        synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a = create_synthetic_log_with_depths(
            thickness_a, turb_logs, depth_logs, exclude_inds=None
        )
        synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b = create_synthetic_log_with_depths(
            thickness_b, turb_logs, depth_logs, exclude_inds=None
        )

        # Run DTW analysis on synthetic cores using synthetic picked depths
        dtw_results, valid_dtw_pairs, _, _, _, _, _ = run_comprehensive_dtw_analysis(
            synthetic_log_a, synthetic_log_b, synthetic_md_a, synthetic_md_b,
            picked_depths_a=synthetic_picked_a,
            picked_depths_b=synthetic_picked_b,
            top_bottom=True,
            independent_dtw=False,
            exclude_deadend=True,
            create_dtw_matrix=False,  # Skip visualization for speed
            creategif=False,  # Skip animation for speed
            age_consideration=False,  # No age constraints
            debug=False
        )

        # Extract correlation coefficients from DTW results
        correlations = []
        for pair_key in valid_dtw_pairs:
            if pair_key in dtw_results:
                _, _, quality_metrics = dtw_results[pair_key]
                if 'corr_coef' in quality_metrics and not np.isnan(quality_metrics['corr_coef']):
                    correlations.append(quality_metrics['corr_coef'])

        # Use mean correlation if we have valid correlations
        if correlations:
            r_values_null.append(np.mean(correlations))
            successful_iterations += 1
        else:
            failed_iterations += 1

    except Exception as e:
        # Best effort: a failed iteration is counted and skipped. Errors are
        # limited by count (not by iteration index) so that failures which
        # start late in the run are still visible.
        failed_iterations += 1
        if reported_errors < MAX_REPORTED_ERRORS:
            print(f"Error in iteration {i}: {e}")
            reported_errors += 1

# Calculate distribution statistics
r_values_null = np.array(r_values_null)

print(f"Successful iterations: {successful_iterations}/{n_iterations} ({failed_iterations} failed)")
if r_values_null.size:
    print(f"Null r-values: mean={r_values_null.mean():.4f}, std={r_values_null.std():.4f}")
else:
    print("WARNING: no iteration produced a valid correlation; the null distribution is empty")


In [None]:
# Cell 6: Plot R-value Distribution
print("=== Plotting R-value Distribution ===")

if len(r_values_null) == 0:
    # Nothing to fit or plot; the results file is still written below
    print("No valid r-values were produced, skipping the distribution plot")
else:
    # plot_correlation_distribution reads its input from a CSV file, so the
    # r-values are staged in a temporary file. The try/finally covers writing
    # as well as plotting so the file is removed on any failure.
    temp_csv_path = None
    try:
        # Write through the temporary file's own handle and close it before
        # plotting; opening the same path a second time while the handle is
        # still open fails on Windows.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as temp_csv:
            temp_csv_path = temp_csv.name
            # Write data in the format expected by plot_correlation_distribution
            temp_csv.write("mapping_id,corr_coef\n")
            temp_csv.writelines(f"{i},{r_val}\n" for i, r_val in enumerate(r_values_null))

        # Plot null hypothesis distribution
        fig, ax, fit_params = plot_correlation_distribution(
            csv_file=temp_csv_path,
            quality_index='corr_coef',
            save_png=True,
            png_filename=f'Null_Hypothesis_Distribution_{CORE_A}_{CORE_B}.png',
            core_a_name=CORE_A,
            core_b_name=CORE_B,
            pdf_method='skew-normal',  # Use skew-normal for better fit
            no_bins=50
        )

        plt.show()

    finally:
        # Clean up temporary file
        if temp_csv_path is not None and os.path.exists(temp_csv_path):
            os.unlink(temp_csv_path)

# Save null hypothesis results for future use
results_filename = f'Null_Hypothesis_Results_{CORE_A}_{CORE_B}.csv'
null_df = pd.DataFrame({
    'iteration': range(len(r_values_null)),
    'r_value': r_values_null
})
null_df.to_csv(results_filename, index=False)
print(f"Null hypothesis r-values saved to: {results_filename}")

print("=== NULL HYPOTHESIS TESTING COMPLETE ===")