In [1]:
# Cell 1: Import Required Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cell 2: Extract Core Lengths and Load Segment Pool
from pyCoreRelator import load_log_data

# Define core names and target parameters
CORE_A = "M9907-23PC"
CORE_B = "M9907-25PC"
LOG_COLUMNS = ['hiresMS']  # Choose one log column for segment pool
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Define directory paths
# The project root is machine-specific. Set the PYCORERELATOR_PROJECT_DIR
# environment variable to point the notebook at another location without
# editing this cell; when it is unset the original path below is used.
mother_dir = os.environ.get(
    'PYCORERELATOR_PROJECT_DIR',
    '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites',
)

# Function to extract core length from depth data
def get_core_length(core_name, depth_column='SB_DEPTH_cm'):
    """Return the maximum value of a core's depth column, or None on failure.

    The depth is read from the core's ``*_hiresMS_MLfilled.csv`` file located
    under the notebook-level ``mother_dir``.

    Parameters
    ----------
    core_name : str
        Core identifier used to build the CSV path.
    depth_column : str, optional
        Name of the column whose maximum is returned.

    Returns
    -------
    The maximum of ``depth_column``, or ``None`` if the file could not be read
    or the column is missing. A warning with the reason is printed in that case.
    """
    # Try hiresMS file first (most common)
    depth_file = f'{mother_dir}/Cascadia_core_data/OSU_dataset/_compiled_logs/{core_name}/ML_filled/{core_name}_hiresMS_MLfilled.csv'
    try:
        df = pd.read_csv(depth_file)
        return df[depth_column].max()
    except Exception as exc:
        # Best-effort lookup: keep returning None, but do not swallow
        # KeyboardInterrupt/SystemExit and do report why the read failed.
        print(f"Warning: Could not read depth from {depth_file} ({type(exc).__name__}: {exc})")
        return None

# Look up the lengths of the two target cores (A first, then B)
core_a_length, core_b_length = (
    get_core_length(target_core, DEPTH_COLUMN) for target_core in (CORE_A, CORE_B)
)

print(f"Core A ({CORE_A}) length: {core_a_length} cm")
print(f"Core B ({CORE_B}) length: {core_b_length} cm")

# Cores that contribute turbidite segments to the pool
SEGMENT_POOL_CORES = ["M9907-22PC", "M9907-23PC", "M9907-25PC"]

# Containers filled while walking through the pool cores (turbidite database)
segment_pool_cores_data = {}
turb_logs = []
depth_logs = []

print("Loading segment pool from available cores...")

for core_name in SEGMENT_POOL_CORES:
    print(f"Processing {core_name}...")

    # Log file locations for this core
    core_log_paths = {
        'hiresMS': f'{mother_dir}/Cascadia_core_data/OSU_dataset/_compiled_logs/{core_name}/ML_filled/{core_name}_hiresMS_MLfilled.csv',
    }

    # Alternative column names accepted for each log
    column_alternatives = {
        'hiresMS': ['MS'],
    }

    try:
        # Load the log for this core; the image dictionary is left empty
        log_data, md_data, available_columns, _, _ = load_log_data(
            core_log_paths,
            {},  # No images needed
            LOG_COLUMNS,
            depth_column=DEPTH_COLUMN,
            normalize=True,
            column_alternatives=column_alternatives
        )

        # Keep the loaded arrays per core
        segment_pool_cores_data[core_name] = dict(
            log_data=log_data,
            md_data=md_data,
            available_columns=available_columns,
        )

        # Picked boundary depths for this core
        picked_file = f'{mother_dir}/pyCoreRelator/pickeddepth/{core_name}_pickeddepth.csv'
        try:
            picked_df = pd.read_csv(picked_file)

            # Only category 1 boundaries are used, in ascending depth order
            is_category_1 = picked_df['category'] == 1
            category_1_depths = np.sort(picked_df[is_category_1]['picked_depths_cm'].values)

            # Each pair of consecutive boundaries delimits one segment
            for top_depth, base_depth in zip(category_1_depths[:-1], category_1_depths[1:]):
                # Nearest sample index to each boundary depth
                top_idx = np.argmin(np.abs(md_data - top_depth))
                base_idx = np.argmin(np.abs(md_data - base_depth))

                # Skip boundary pairs that do not span at least one sample
                if base_idx <= top_idx:
                    continue

                segment_values = log_data[top_idx:base_idx]
                # Depths are stored relative to the top of the segment
                segment_rel_depth = md_data[top_idx:base_idx] - md_data[top_idx]

                turb_logs.append(segment_values)
                depth_logs.append(segment_rel_depth)

        except Exception as e:
            print(f"Warning: Could not load turbidite boundaries for {core_name}: {e}")

        print(f"  Loaded: {len(log_data)} points, columns: {available_columns}")

    except Exception as e:
        print(f"Error loading {core_name}: {e}")

print(f"Segment pool created with {len(turb_logs)} turbidites")
print(f"Total cores processed: {len(segment_pool_cores_data)}")

# Number of log columns, taken from the first pooled segment (1 for 1-D or empty pool)
if len(turb_logs) > 0 and turb_logs[0].ndim > 1:
    target_dimensions = turb_logs[0].shape[1]
else:
    target_dimensions = 1

print(f"Target dimensions: {target_dimensions}")
print(f"Core A target length: {core_a_length} cm")
print(f"Core B target length: {core_b_length} cm")

In [None]:
# Cell 3: Plot All Turbidite Segments from Pool
print(f"Plotting {len(turb_logs)} turbidite segments from the pool...")

n_segments = len(turb_logs)

if n_segments == 0:
    # An empty pool would make plt.subplots(0, ...) and min()/max() below fail.
    print("No turbidite segments available to plot; check the segment pool loading step.")
else:
    # Create subplot grid
    n_cols = 8
    n_rows = int(np.ceil(n_segments / n_cols))

    # squeeze=False always yields a 2-D array of Axes, so flattening works for
    # any segment count (including a single segment, where the grid is 1 x 8).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
    axes = axes.ravel()

    for i, (turb_segment, turb_depth) in enumerate(zip(turb_logs, depth_logs)):
        ax = axes[i]

        # For multi-column data only the first column is drawn
        values = turb_segment[:, 0] if turb_segment.ndim > 1 else turb_segment
        ax.plot(values, turb_depth, 'b-', linewidth=1)
        ax.set_xlabel(f'{LOG_COLUMNS[0]} (normalized)')

        ax.set_ylabel('Relative Depth (cm)')
        ax.set_title(f'Segment {i+1}\n({len(turb_segment)} pts, {turb_depth[-1]:.1f} cm)')
        ax.grid(True, alpha=0.3)
        ax.invert_yaxis()  # Depth increases downward

    # Hide unused subplots
    for unused_ax in axes[n_segments:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.suptitle(f'Turbidite Segment Pool ({len(turb_logs)} segments)', y=1.02, fontsize=16)
    plt.show()

    # Print summary statistics
    segment_lengths = [len(seg) for seg in turb_logs]
    segment_depths = [depth[-1] for depth in depth_logs]

    print(f"\nSegment Pool Summary:")
    print(f"  Total segments: {len(turb_logs)}")
    print(f"  Length range: {min(segment_lengths)}-{max(segment_lengths)} points")
    print(f"  Depth range: {min(segment_depths):.1f}-{max(segment_depths):.1f} cm")
    print(f"  Mean depth: {np.mean(segment_depths):.1f} cm")
    print(f"  Target dimensions: {target_dimensions}")

In [None]:
# Cell 4: Create and Plot Synthetic Core Pair
import random

def create_synthetic_log_with_depths(thickness, turb_logs, depth_logs, exclude_inds=None, n_dimensions=None):
    """Build a synthetic log by stacking randomly drawn segments from a pool.

    Segments are drawn (with replacement) and appended until the accumulated
    depth exceeds ``thickness``; the result is then truncated to ``thickness``.
    A picked depth of category 1 is recorded at depth 0, at the base of every
    segment that ends within ``thickness``, and at the last retained depth.

    Parameters
    ----------
    thickness : float
        Target thickness of the synthetic log. Must be a positive number.
    turb_logs : sequence of numpy.ndarray
        Log values of each pooled segment (1-D, or 2-D with one column per log).
    depth_logs : sequence of numpy.ndarray
        Depths of each pooled segment, relative to the segment top.
    exclude_inds : iterable of int, optional
        Pool indices that must not be drawn.
    n_dimensions : int, optional
        Number of log columns in the output. When omitted, the notebook-level
        ``target_dimensions`` is used. Segments with fewer columns are padded
        with N(0, 0.1) noise; segments with more columns are truncated.

    Returns
    -------
    log : numpy.ndarray
        Synthetic log values (1-D when ``n_dimensions`` is 1, otherwise 2-D).
    d : numpy.ndarray
        Depth of each sample in ``log``.
    inds : list of int
        Pool indices of the drawn segments, in stacking order (includes the
        last segment even if it was partly or wholly truncated).
    valid_picked_depths : list of (depth, category) tuples
        Picked boundaries that lie within ``thickness``.

    Raises
    ------
    ValueError
        If ``thickness`` is not a positive number, or if no non-empty segment
        remains to draw from after applying ``exclude_inds``.
    """
    # `not thickness > 0` also rejects NaN, which would otherwise skip the loop
    if thickness is None or not thickness > 0:
        raise ValueError(f"thickness must be a positive number, got {thickness!r}")

    if n_dimensions is None:
        n_dimensions = target_dimensions

    # Resolve the drawable indices once. Rejection sampling would spin forever
    # if every index were excluded, and empty segments never advance the depth.
    excluded = set() if exclude_inds is None else set(exclude_inds)
    candidates = [
        i for i in range(len(turb_logs))
        if i not in excluded and len(depth_logs[i]) > 0
    ]
    if not candidates:
        raise ValueError("No turbidite segments available to draw from (pool is empty or fully excluded)")

    fake_log = np.array([]).reshape(0, n_dimensions) if n_dimensions > 1 else np.array([])
    md_log = np.array([])
    max_depth = 0
    inds = []

    # Initial boundary at the top of the synthetic log
    picked_depths = [(0, 1)]

    while max_depth <= thickness:
        ind = random.choice(candidates)
        inds.append(ind)

        # Get turbidite segment from database
        turb_segment = turb_logs[ind]
        turb_depths = depth_logs[ind]

        # Work on a 2-D view so column handling below is uniform
        if turb_segment.ndim == 1:
            turb_segment = turb_segment.reshape(-1, 1)

        # Match the requested number of columns
        if turb_segment.shape[1] < n_dimensions:
            # Pad missing columns with noise
            padding = np.random.normal(0, 0.1, (len(turb_segment), n_dimensions - turb_segment.shape[1]))
            turb_segment = np.hstack([turb_segment, padding])
        elif turb_segment.shape[1] > n_dimensions:
            turb_segment = turb_segment[:, :n_dimensions]

        # Append log data
        if n_dimensions > 1:
            fake_log = turb_segment.copy() if len(fake_log) == 0 else np.vstack((fake_log, turb_segment))
        else:
            fake_log = np.hstack((fake_log, turb_segment.flatten()))

        # Append depth data: each segment starts 1 depth unit below the
        # previous sample (the first one starts at depth 1)
        offset = 1 if len(md_log) == 0 else 1 + md_log[-1]
        md_log = np.hstack((md_log, offset + turb_depths))

        max_depth = md_log[-1]

        # Add picked depth at the base of this turbidite (current max_depth)
        if max_depth <= thickness:
            picked_depths.append((max_depth, 1))

    # Truncate to target thickness
    valid_indices = md_log <= thickness
    log = fake_log[valid_indices]
    d = md_log[valid_indices]

    # Filter picked depths to only include those within the valid range
    valid_picked_depths = [(depth, category) for depth, category in picked_depths if depth <= thickness]

    # Ensure we have an end boundary. `d` is empty when thickness is smaller
    # than the first sample depth, in which case there is no end depth to add.
    if len(d) > 0 and (len(valid_picked_depths) == 0 or valid_picked_depths[-1][0] != d[-1]):
        valid_picked_depths.append((d[-1], 1))

    return log, d, inds, valid_picked_depths

# Build one synthetic log per target core, using each core's measured length
print("Generating synthetic core pair...")

synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a_tuples = create_synthetic_log_with_depths(
    core_a_length, turb_logs, depth_logs, exclude_inds=None
)
synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b_tuples = create_synthetic_log_with_depths(
    core_b_length, turb_logs, depth_logs, exclude_inds=None
)

# Keep only the depth part of each (depth, category) tuple
synthetic_picked_a = [pick[0] for pick in synthetic_picked_a_tuples]
synthetic_picked_b = [pick[0] for pick in synthetic_picked_b_tuples]

# Side-by-side panels: core A on the left, core B on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(4, 8))

panels = (
    (ax1, synthetic_log_a, synthetic_md_a, synthetic_picked_a, 'b-',
     f'Synthetic Core A\n({len(inds_a)} turbidites)'),
    (ax2, synthetic_log_b, synthetic_md_b, synthetic_picked_b, 'g-',
     f'Synthetic Core B\n({len(inds_b)} turbidites)'),
)

for panel_ax, panel_log, panel_md, panel_picks, line_style, panel_title in panels:
    # Multi-column logs are represented by their first column
    trace = panel_log[:, 0] if panel_log.ndim > 1 else panel_log
    panel_ax.plot(trace, panel_md, line_style, linewidth=1)

    # Mark every picked depth with a dashed horizontal line
    for depth in panel_picks:
        panel_ax.axhline(y=depth, color='red', linestyle='--', alpha=0.7, linewidth=1)

    panel_ax.set_xlabel(f'{LOG_COLUMNS[0]}\n(normalized)')
    panel_ax.set_ylabel('Depth (cm)')
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

print(f"Synthetic Core A: {len(synthetic_log_a)} points, {len(inds_a)} turbidites, {len(synthetic_picked_a)} boundaries")
print(f"Synthetic Core B: {len(synthetic_log_b)} points, {len(inds_b)} turbidites, {len(synthetic_picked_b)} boundaries")

# Show at most the first ten pool indices used for each core
if len(inds_a) > 10:
    print(f"Turbidite indices used in A: {[int(x) for x in inds_a[:10]]}...")
else:
    print(f"Turbidite indices used in A: {[int(x) for x in inds_a]}")

if len(inds_b) > 10:
    print(f"Turbidite indices used in B: {[int(x) for x in inds_b[:10]]}...")
else:
    print(f"Turbidite indices used in B: {[int(x) for x in inds_b]}")

In [None]:
# Cell 5: DTW Analysis on Synthetic Pair
from pyCoreRelator import run_comprehensive_dtw_analysis, find_complete_core_paths

# Run DTW analysis on the synthetic pair generated in Cell 4.
# The synthetic boundary depths are passed as the picked depths of each core.
# Only the first four return values are kept; the last three are discarded.
dtw_results, valid_dtw_pairs, segments_a, segments_b, _, _, _ = run_comprehensive_dtw_analysis(
    synthetic_log_a, synthetic_log_b, synthetic_md_a, synthetic_md_b,
    picked_depths_a=synthetic_picked_a,
    picked_depths_b=synthetic_picked_b,
    independent_dtw=False,
    top_bottom=False,
    mute_mode=False
)

# Find complete core paths and extract r-values.
# The return value is discarded; the results are consumed through the CSV file.
# NOTE(review): Cell 6 reads 'outputs/synthetic_core_pair_metrics.csv', so this
# call presumably writes `output_csv` under an 'outputs/' directory - confirm
# against the pyCoreRelator documentation.
_ = find_complete_core_paths(
    valid_dtw_pairs,
    segments_a, 
    segments_b, 
    synthetic_log_a, 
    synthetic_log_b,
    synthetic_picked_a, 
    synthetic_picked_b,
    dtw_results,
    output_csv="synthetic_core_pair_metrics.csv",
    output_metric_only=True,
    shortest_path_search=True,
    shortest_path_level=2,
    max_search_path=50000,
    mute_mode=False
)

In [None]:
# Cell 6: Plot R-Values Distribution from Synthetic Pair
from pyCoreRelator import plot_correlation_distribution

# Define quality index and parameters.
# `targeted_quality_index` is also read by Cells 7 and 9 for axis labels/limits.
targeted_quality_index = 'corr_coef'
# NOTE(review): Cell 5 passes output_csv="synthetic_core_pair_metrics.csv" (no
# directory); this path assumes the file ends up under 'outputs/' - confirm.
csv_filename = 'outputs/synthetic_core_pair_metrics.csv'

# Plot correlation distribution.
# Only the third return value (fit_params) is kept; Cell 7 reads it as a dict.
_, _, fit_params = plot_correlation_distribution(
    csv_file=csv_filename,
    quality_index=targeted_quality_index,
    no_bins=30,
    save_png=False,
    pdf_method='normal',  # 'KDE', 'skew-normal', 'normal'
    kde_bandwidth=0.05,
    mute_mode=False
)

In [None]:
# Cell 7: Re-plot Distribution Using Fit Parameters
if 'fit_params' not in locals() or fit_params is None:
    # Nothing to draw until Cell 6 has produced the fit parameters
    print("No fit_params available. Please run Cell 6 first to generate the distribution.")
else:
    print("Re-plotting fitted curve only from 'fit_params'...")

    fig, ax = plt.subplots(figsize=(6, 4))

    # The curve is drawn only when both coordinate arrays are present
    has_curve = 'x_range' in fit_params and 'y_values' in fit_params
    if has_curve:
        x = fit_params['x_range']
        y = fit_params['y_values']

        method = fit_params.get('method', 'unknown')

        # Build the legend text for the fitting method; unknown methods draw nothing
        curve_label = None
        if method == 'normal':
            mean_val = fit_params['mean']
            std_val = fit_params['std']
            n_points = fit_params['n_points']
            curve_label = f'Normal Fit\n(mean = {mean_val:.3f})\n(σ = {std_val:.3f})\nn = {n_points:,}'
        elif method == 'skew-normal':
            shape = fit_params['shape']
            location = fit_params['location']
            scale = fit_params['scale']
            n_points = fit_params['n_points']
            curve_label = f'Skew-Normal Fit\n(α = {shape:.3f})\n(μ = {location:.3f})\n(σ = {scale:.3f})\nn = {n_points:,}'
        elif method == 'KDE':
            bandwidth = fit_params['bandwidth']
            n_points = fit_params['n_points']
            curve_label = f'KDE\n(bandwidth = {bandwidth})\nn = {n_points:,}'

        if curve_label is not None:
            ax.plot(x, y, 'r-', linewidth=2, alpha=0.8,
                    label=curve_label)

    # Vertical marker at the median, when one was reported
    if 'median' in fit_params:
        median_val = fit_params['median']
        ax.axvline(median_val, color='green', linestyle='dashed', linewidth=2,
                  label=f'Median: {median_val:.3f}')

    # Axis labels, title, legend and grid
    ax.set_xlabel(f'{targeted_quality_index}')
    ax.set_ylabel('Density (%)')
    ax.set_title(f'Fitted Distribution Curve\nSynthetic Cores {CORE_A} vs {CORE_B}')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Correlation coefficients are shown on a fixed 0-1 axis
    if targeted_quality_index == 'corr_coef':
        ax.set_xlim(0, 1.0)

    plt.tight_layout()
    plt.show()

In [None]:
# Cell 8: Run Repeated Iterations for Synthetic Logs R-Value Findings
import gc
import sys

# Number of synthetic core pairs to analyse. Kept small for a quick run;
# increase it (e.g. to 10000) for the full analysis.
N_ITERATIONS = 5

# Prepare CSV file for incremental saving
output_csv_filename = 'outputs/synthetic_iterations_fit_params.csv'
os.makedirs('outputs', exist_ok=True)

# Start from a clean file so that results of an earlier run are never mixed
# with (or mistaken for) the results of this run.
if os.path.exists(output_csv_filename):
    os.remove(output_csv_filename)

# Number of iterations whose fit parameters were written to the CSV
n_saved = 0

for iteration in tqdm(range(N_ITERATIONS), desc="Running synthetic analysis"):

    # Generate synthetic core pair (from cell 4)
    synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a_tuples = create_synthetic_log_with_depths(
        core_a_length, turb_logs, depth_logs, exclude_inds=None
    )
    synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b_tuples = create_synthetic_log_with_depths(
        core_b_length, turb_logs, depth_logs, exclude_inds=None
    )

    # Extract depths from tuples
    synthetic_picked_a = [depth for depth, category in synthetic_picked_a_tuples]
    synthetic_picked_b = [depth for depth, category in synthetic_picked_b_tuples]

    # Run DTW analysis (from cell 5)
    dtw_results, valid_dtw_pairs, segments_a, segments_b, _, _, _ = run_comprehensive_dtw_analysis(
        synthetic_log_a, synthetic_log_b, synthetic_md_a, synthetic_md_b,
        picked_depths_a=synthetic_picked_a,
        picked_depths_b=synthetic_picked_b,
        independent_dtw=False,
        top_bottom=False,
        mute_mode=True
    )

    # Find complete core paths
    _ = find_complete_core_paths(
        valid_dtw_pairs,
        segments_a,
        segments_b,
        synthetic_log_a,
        synthetic_log_b,
        synthetic_picked_a,
        synthetic_picked_b,
        dtw_results,
        output_csv="synthetic_core_pair_metrics.csv",
        output_metric_only=True,
        shortest_path_search=True,
        shortest_path_level=2,
        max_search_path=50000,
        mute_mode=True
    )

    # Extract fit_params (from cell 6) with mute_mode enabled
    _, _, fit_params = plot_correlation_distribution(
        csv_file="outputs/synthetic_core_pair_metrics.csv",
        quality_index='corr_coef',
        no_bins=30,
        save_png=False,
        pdf_method='normal',
        kde_bandwidth=0.05,
        mute_mode=True
    )

    # Store fit_params with iteration number and incrementally save to CSV
    if fit_params is not None:
        fit_params_copy = fit_params.copy()
        fit_params_copy['iteration'] = iteration

        df_single = pd.DataFrame([fit_params_copy])
        # The header is written with the first row that is actually saved,
        # which is not necessarily iteration 0 (fit_params may be None).
        # Array-valued entries are written as text; lifting numpy's print
        # threshold keeps long arrays from being summarised with "...".
        with np.printoptions(threshold=sys.maxsize):
            df_single.to_csv(output_csv_filename, mode='a', index=False, header=(n_saved == 0))
        n_saved += 1

        del df_single, fit_params_copy

    # Clear memory after each iteration
    del synthetic_log_a, synthetic_md_a, inds_a, synthetic_picked_a_tuples
    del synthetic_log_b, synthetic_md_b, inds_b, synthetic_picked_b_tuples
    del synthetic_picked_a, synthetic_picked_b
    del dtw_results, valid_dtw_pairs, segments_a, segments_b
    del fit_params

    # Force garbage collection
    gc.collect()

print(f"\nCompleted {N_ITERATIONS} iterations ({n_saved} with fit parameters saved)")
print(f"All distribution curves parameters saved to: {output_csv_filename}")

In [None]:
# Cell 9: Plot all distribution curves

# Read back the per-iteration fit parameters written by Cell 8
output_csv_filename = 'outputs/synthetic_iterations_fit_params.csv'
df_fit_params = pd.read_csv(output_csv_filename)


def _parse_array_cell(row, column):
    """Turn a space-separated '[...]' text cell into an array, or None if absent/NaN."""
    if column in row and pd.notna(row[column]):
        return np.fromstring(row[column].strip('[]'), sep=' ')
    return None


# One dictionary per CSV row, holding only the curve coordinates
all_fit_params = [
    {
        'x_range': _parse_array_cell(row, 'x_range'),
        'y_values': _parse_array_cell(row, 'y_values'),
    }
    for _, row in df_fit_params.iterrows()
]

# Overlay every curve on a single axis (adapted from cell 7)
fig, ax = plt.subplots(figsize=(6, 4))

# Each curve is a thin, semi-transparent red line; rows without data are skipped
for fit_params in all_fit_params:
    x = fit_params.get('x_range')
    y = fit_params.get('y_values')
    if x is None or y is None:
        continue
    ax.plot(x, y, 'r-', linewidth=1, alpha=0.3)

# Labels, title and grid
ax.set_xlabel(f"Pearson's r\n(Correlation Coefficient)")
ax.set_ylabel('Probability Density (%)')
ax.set_title(f'Synthetic Core Correlation: {len(all_fit_params)} Iterations')
ax.grid(True, alpha=0.3)

# Correlation coefficients are shown on a fixed 0-1 axis
if targeted_quality_index == 'corr_coef':
    ax.set_xlim(0, 1.0)

plt.tight_layout()
plt.savefig('synthetic_iterations.png', dpi=150, bbox_inches='tight')
plt.show()