# Correlating well log pairs: Complex Dynamic Time Warping with boundary constraints

## Introduction to dynamic time warping

In [1]:
# Data manipulation and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import csv
import glob
from IPython.display import Image as IPImage, display
from mpl_toolkits.mplot3d import Axes3D
warnings.filterwarnings('ignore')

from pyCoreRelator import (
    run_comprehensive_dtw_analysis,
    find_complete_core_paths,
    diagnose_chain_breaks,
    calculate_interpolated_ages,
    visualize_combined_segments,
    visualize_dtw_results_from_csv,
    load_log_data,
    plot_core_data,
    plot_dtw_matrix_with_paths,
    plot_correlation_distribution
)

<hr>

### Test with Cascadia hi-res MS logs

### Define core pairs

In [2]:
# Identifiers of the two cores to correlate; the cells below use them to
# build input/output file names and plot labels.
# Alternative pair: "M9907-22PC" (A) with "M9907-23PC" (B).
CORE_A, CORE_B = "M9907-23PC", "M9907-25PC"

#### Data structures and core images

In [None]:
%matplotlib inline

def _load_picked_depths(csv_path, core_name, list_name):
    """Load the manually picked depths of one core.

    Args:
        csv_path: CSV file holding 'picked_depths_cm' and 'category' columns.
        core_name: Core identifier shown in the status message.
        list_name: Name of the destination variable, shown in the warning.

    Returns:
        Tuple ``(table, picks)``: the raw DataFrame (``None`` when the file is
        missing) and a list of ``(depth, category)`` tuples (empty when the
        file is missing).
    """
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found. Using empty list for {list_name}.")
        return None, []

    table = pd.read_csv(csv_path)
    # Pair every picked depth with its category.
    picks = list(zip(table['picked_depths_cm'].values.tolist(),
                     table['category'].values.tolist()))
    print(f"Loaded {len(picks)} picked depths for {core_name}")
    return table, picks


# Paths to the picked-depth CSV files (relative to the working directory)
pickeddepth_a_csv = f'pickeddepth/{CORE_A}_pickeddepth.csv'
pickeddepth_b_csv = f'pickeddepth/{CORE_B}_pickeddepth.csv'

# Core B is loaded before Core A, so its status line is printed first.
picked_data_b, picked_b = _load_picked_depths(pickeddepth_b_csv, CORE_B, 'picked_b')
picked_data_a, picked_a = _load_picked_depths(pickeddepth_a_csv, CORE_A, 'picked_a')

# One uncertainty value per picked depth.
# NOTE(review): the value is 1 although this cell was described as using an
# "uncertainty size" of 2 cm - presumably a +/-1 cm half-width; confirm the
# convention expected by plot_core_data.
picked_uncertainty_b = [1] * len(picked_b)
picked_uncertainty_a = [1] * len(picked_a)

In [None]:
# Logs to load for both cores. Other combinations that have been used:
#   ['hiresMS', 'CT', 'R', 'G', 'B']  or  ['hiresMS', 'CT', 'Lumin']
LOG_COLUMNS = ['hiresMS']
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Root directory of the core dataset. Set the CORE_DATA_DIR environment
# variable to run the notebook on another machine; without it the original
# absolute location is used.
_DEFAULT_MOTHER_DIR = '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/OSU_dataset/'
# os.path.join(..., '') guarantees the trailing separator the f-strings below rely on.
mother_dir = os.path.join(os.environ.get('CORE_DATA_DIR', _DEFAULT_MOTHER_DIR), '')


def _core_file_paths(core_name):
    """Build the log-CSV mapping and the two image paths for one core.

    Returns:
        Tuple ``(log_paths, rgb_img_path, ct_img_path)`` where ``log_paths``
        maps a log name to its CSV file. 'Lumin', 'R', 'G' and 'B' all point
        to the same RGB CSV.
    """
    core_dir = f'{mother_dir}_compiled_logs/{core_name}'
    filled_prefix = f'{core_dir}/ML_filled/{core_name}'
    rgb_csv = f'{filled_prefix}_RGB_MLfilled.csv'
    log_paths = {
        'hiresMS': f'{filled_prefix}_hiresMS_MLfilled.csv',
        'CT': f'{filled_prefix}_CT_MLfilled.csv',
        'Lumin': rgb_csv,
        'R': rgb_csv,
        'G': rgb_csv,
        'B': rgb_csv,
    }
    return log_paths, f'{core_dir}/{core_name}_RGB.tiff', f'{core_dir}/{core_name}_CT.tiff'


def _load_core_logs(core_label, log_var_name, log_paths, rgb_img_path, ct_img_path):
    """Load one core with load_log_data and print a short debug summary.

    Args:
        core_label: 'A' or 'B', used in the debug header.
        log_var_name: Variable name shown in the debug lines (e.g. 'log_a').
        log_paths: Mapping of log name to CSV path.
        rgb_img_path: Path of the RGB core image.
        ct_img_path: Path of the CT core image.

    Returns:
        The 5-tuple returned by load_log_data, unpacked by the caller as
        (log, md, available_columns, rgb_img, ct_img).
    """
    loaded = load_log_data(
        log_paths,
        {'rgb': rgb_img_path, 'ct': ct_img_path},
        LOG_COLUMNS,
        depth_column=DEPTH_COLUMN,
        normalize=True,
        column_alternatives=column_alternatives
    )
    log, _, available_columns, _, _ = loaded

    print(f"\n=== DEBUG: Core {core_label} Loading ===")
    print(f"LOG_COLUMNS requested: {LOG_COLUMNS}")
    print(f"Available columns loaded: {available_columns}")
    print(f"Shape of {log_var_name}: {log.shape}")
    print(f"Type of {log_var_name}: {type(log)}")
    if hasattr(log, 'ndim'):
        print(f"{log_var_name} dimensions: {log.ndim}")
        if log.ndim > 1:
            print(f"{log_var_name} has {log.shape[1]} columns\n")
        else:
            print(f"{log_var_name} is 1D (single column)\n")
    return loaded


# Input files for Core A and Core B
core_a_log_paths, core_a_rgb_img_path, core_a_ct_img_path = _core_file_paths(CORE_A)
core_b_log_paths, core_b_rgb_img_path, core_b_ct_img_path = _core_file_paths(CORE_B)

# Alternative column names passed to load_log_data for each requested log
column_alternatives = {
    'hiresMS': ['MS'],
    'CT': ['CT_value'],
    'R': ['R', 'red', 'Red'],
    'G': ['G', 'green', 'Green'],
    'B': ['B', 'blue', 'Blue'],
    'Lumin': ['luminance', 'Luminance'],
    'Den_gm/cc': ['Density', 'density']
}

# Load data for Core A, then Core B
log_a, md_a, available_columns_a, rgb_img_a, ct_img_a = _load_core_logs(
    'A', 'log_a', core_a_log_paths, core_a_rgb_img_path, core_a_ct_img_path
)
log_b, md_b, available_columns_b, rgb_img_b, ct_img_b = _load_core_logs(
    'B', 'log_b', core_b_log_paths, core_b_rgb_img_path, core_b_ct_img_path
)

In [None]:
# Split the (depth, category) tuples into parallel lists for plotting.
picked_depths_a = [pick[0] for pick in picked_a]
picked_categories_a = [pick[1] for pick in picked_a]

picked_depths_b = [pick[0] for pick in picked_b]
picked_categories_b = [pick[1] for pick in picked_b]

# Core A: a log array with more than one column is plotted as a multilog.
is_multilog = log_a.ndim >= 2 and log_a.shape[1] >= 2
core_a_plot_kwargs = dict(
    rgb_img=rgb_img_a,
    ct_img=ct_img_a,
    figsize=(20, 4),
    available_columns=available_columns_a,
    is_multilog=is_multilog,
    picked_depths=picked_depths_a,
    picked_categories=picked_categories_a,
    picked_uncertainties=picked_uncertainty_a,
    show_category=[1],  # only category-1 picks are drawn
    show_bed_number=True,
)
fig_a, ax_a = plot_core_data(md_a, log_a, CORE_A, **core_a_plot_kwargs)

# Core B: same layout, with the multilog flag recomputed for its own log array.
is_multilog = log_b.ndim >= 2 and log_b.shape[1] >= 2
core_b_plot_kwargs = dict(
    rgb_img=rgb_img_b,
    ct_img=ct_img_b,
    figsize=(20, 4),
    available_columns=available_columns_b,
    is_multilog=is_multilog,
    picked_depths=picked_depths_b,
    picked_categories=picked_categories_b,
    picked_uncertainties=picked_uncertainty_b,
    show_category=[1],
    show_bed_number=True,
)
fig_b, ax_b = plot_core_data(md_b, log_b, CORE_B, **core_b_plot_kwargs)

plt.tight_layout()
plt.show()

<hr>

# Usage Examples and Executions

In [8]:
def _category_depths(picks, wanted_category=1):
    """Return the depths of the picks in `wanted_category` as a float32 array."""
    selected = [pick_depth for pick_depth, pick_category in picks
                if pick_category == wanted_category]
    return np.array(selected).astype('float32')


# Category-1 picked depths of both cores
all_depths_a_cat1 = _category_depths(picked_a)
all_depths_b_cat1 = _category_depths(picked_b)

#### Extract ages

In [46]:
def load_age_constraints(core_name, consider_adjacent_core=False, base_path=None):
    """
    Load age constraints for a specific core, optionally including data from adjacent cores.

    Args:
        core_name: Name of the core to load data for. The primary file is
            '<base_path>/<core_name>_age.csv'.
        consider_adjacent_core: If True, also load every other '*_age.csv'
            file in `base_path` whose name starts with `core_name` minus its
            last two characters.
        base_path: Directory holding the '*_age.csv' files. Defaults to the
            Goldfinger2012 folder this notebook was written against.

    Returns:
        Dictionary containing all age constraint data. 'depths' is a pandas
        Series of interval mid-depths with a 0..n-1 index (an empty list when
        no file was found); every other entry is a plain list. The
        'in_sequence_*' / 'out_sequence_*' lists split the rows on
        ``in_sequence == 1``.

    Raises:
        KeyError: If a CSV file lacks one of the required columns.
    """
    if base_path is None:
        base_path = '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/Age constraints/Goldfinger2012'

    # Primary core CSV first, adjacent cores afterwards
    csv_files = [f'{base_path}/{core_name}_age.csv']
    if consider_adjacent_core and os.path.exists(base_path):
        # Base part of the core name (without the last two characters)
        core_base = core_name[:-2]
        # sorted() keeps the file order, and hence the row order, reproducible
        for file_name in sorted(os.listdir(base_path)):
            if not (file_name.endswith('_age.csv') and file_name.startswith(core_base)):
                continue
            if file_name.split('_age.csv')[0] != core_name:  # skip the primary core
                csv_files.append(f'{base_path}/{file_name}')

    result = {
        'depths': [],
        'ages': [],
        'pos_errors': [],
        'neg_errors': [],
        'in_sequence_flags': [],
        'in_sequence_depths': [],
        'in_sequence_ages': [],
        'in_sequence_pos_errors': [],
        'in_sequence_neg_errors': [],
        'out_sequence_depths': [],
        'out_sequence_ages': [],
        'out_sequence_pos_errors': [],
        'out_sequence_neg_errors': [],
        'core': [],
        'interpreted_bed': []
    }

    # Rows missing any of these columns are discarded
    required_columns = ['calib502_agebp', 'calib502_2sigma_pos', 'calib502_2sigma_neg',
                        'mindepth_cm', 'maxdepth_cm', 'in_sequence', 'core', 'interpreted_bed']

    # Collect the frames and concatenate once instead of growing a DataFrame
    frames = [
        pd.read_csv(csv_file).dropna(subset=required_columns)
        for csv_file in csv_files
        if os.path.exists(csv_file)
    ]

    if not frames:
        print(f"Warning: No age constraint files found for {core_name}")
        return result

    all_data = pd.concat(frames)
    print(f"Loaded {len(all_data)} age constraints for {core_name}")

    # Order the combined rows by depth when several cores were merged;
    # a stable sort keeps equal-depth rows in file order.
    if consider_adjacent_core:
        all_data = all_data.sort_values(by='mindepth_cm', kind='stable')

    # Concatenated files carry duplicate index labels; reset them so that
    # label-based and positional access to 'depths' agree.
    all_data = all_data.reset_index(drop=True)

    # Extract all age constraints (depth = midpoint of the sampled interval)
    result['depths'] = (all_data['mindepth_cm'] + all_data['maxdepth_cm']) / 2
    result['ages'] = all_data['calib502_agebp'].tolist()
    result['pos_errors'] = all_data['calib502_2sigma_pos'].tolist()
    result['neg_errors'] = all_data['calib502_2sigma_neg'].tolist()
    result['in_sequence_flags'] = all_data['in_sequence'].tolist()
    result['core'] = all_data['core'].tolist()
    result['interpreted_bed'] = all_data['interpreted_bed'].tolist()

    # Separate in-sequence and out-of-sequence constraints
    rows = zip(result['depths'].tolist(), result['ages'], result['pos_errors'],
               result['neg_errors'], result['in_sequence_flags'])
    for depth, age, pos_error, neg_error, flag in rows:
        prefix = 'in_sequence' if flag == 1 else 'out_sequence'
        result[f'{prefix}_depths'].append(depth)
        result[f'{prefix}_ages'].append(age)
        result[f'{prefix}_pos_errors'].append(pos_error)
        result[f'{prefix}_neg_errors'].append(neg_error)

    return result

In [None]:
# CORE_A and CORE_B come from the configuration cell near the top of the
# notebook; they are deliberately not redefined here so that switching the
# core pair there cannot be silently overridden by this cell.

# Load age constraints for both cores (adjacent cores included)
consider_adjacent_core = True
age_data_a = load_age_constraints(CORE_A, consider_adjacent_core)
age_data_b = load_age_constraints(CORE_B, consider_adjacent_core)

# Unpack the result dictionaries into flat variables used by the cells below
age_constraint_a_depths = age_data_a['depths']
age_constraint_a_ages = age_data_a['ages']
age_constraint_a_pos_errors = age_data_a['pos_errors']
age_constraint_a_neg_errors = age_data_a['neg_errors']
age_constraint_a_in_sequence_flags = age_data_a['in_sequence_flags']
age_constraint_a_in_sequence_depths = age_data_a['in_sequence_depths']
age_constraint_a_in_sequence_ages = age_data_a['in_sequence_ages']
age_constraint_a_in_sequence_pos_errors = age_data_a['in_sequence_pos_errors']
age_constraint_a_in_sequence_neg_errors = age_data_a['in_sequence_neg_errors']
age_constraint_a_out_sequence_depths = age_data_a['out_sequence_depths']
age_constraint_a_out_sequence_ages = age_data_a['out_sequence_ages']
age_constraint_a_out_sequence_pos_errors = age_data_a['out_sequence_pos_errors']
age_constraint_a_out_sequence_neg_errors = age_data_a['out_sequence_neg_errors']
age_constraint_a_source_cores = age_data_a['core']
age_constraint_a_interpreted_beds = age_data_a['interpreted_bed']

age_constraint_b_depths = age_data_b['depths']
age_constraint_b_ages = age_data_b['ages']
age_constraint_b_pos_errors = age_data_b['pos_errors']
age_constraint_b_neg_errors = age_data_b['neg_errors']
age_constraint_b_in_sequence_flags = age_data_b['in_sequence_flags']
age_constraint_b_in_sequence_depths = age_data_b['in_sequence_depths']
age_constraint_b_in_sequence_ages = age_data_b['in_sequence_ages']
age_constraint_b_in_sequence_pos_errors = age_data_b['in_sequence_pos_errors']
age_constraint_b_in_sequence_neg_errors = age_data_b['in_sequence_neg_errors']
age_constraint_b_out_sequence_depths = age_data_b['out_sequence_depths']
age_constraint_b_out_sequence_ages = age_data_b['out_sequence_ages']
age_constraint_b_out_sequence_pos_errors = age_data_b['out_sequence_pos_errors']
age_constraint_b_out_sequence_neg_errors = age_data_b['out_sequence_neg_errors']
age_constraint_b_source_cores = age_data_b['core']
age_constraint_b_interpreted_beds = age_data_b['interpreted_bed']

In [None]:
# Interpolate an age (with uncertainties) for every category-1 pick of Core A.
pickeddepth_ages_a = calculate_interpolated_ages(
    picked_depths=all_depths_a_cat1,
    # Age constraints of the core
    age_constraints_depths=age_constraint_a_depths,
    age_constraints_ages=age_constraint_a_ages,
    age_constraints_pos_errors=age_constraint_a_pos_errors,
    age_constraints_neg_errors=age_constraint_a_neg_errors,
    age_constraints_in_sequence_flags=age_constraint_a_in_sequence_flags,  # optional. If not provided, all age constraints are treated as in-sequence.
    age_constraint_source_core=age_constraint_a_source_cores,
    # Core top / bottom handling
    top_bottom=True,  # whether to include top and bottom depths/ages in the results
    top_age=0,  # Default age at top of core
    top_age_pos_error=75,  # Default uncertainty of the top age
    top_age_neg_error=75,  # Default uncertainty of the top age
    top_depth=0.0,  # Assuming top of core is 0 cm depth
    bottom_depth=md_a[-1],  # Max depth of core a
    # Uncertainty propagation
    uncertainty_method='MonteCarlo',  # 'MonteCarlo', 'Linear', or 'Gaussian'. Default is 'MonteCarlo'. Linear is the most conservative.
    n_monte_carlo=10000,  # number of Monte Carlo sampling iterations. Default is 10000.
    # Output
    show_plot=True,
    core_name=CORE_A,
    export_csv=True
)

# List the age constraints of Core A
print("\nAge Constraints for Core A:")
if len(age_constraint_a_depths) == 0:
    print(f"No age constraints available in {CORE_A}")
else:
    # Positional access works the same for a Series (via its array) and a list.
    depth_values_a = (age_constraint_a_depths.to_numpy()
                      if isinstance(age_constraint_a_depths, pd.Series)
                      else age_constraint_a_depths)
    for i, depth_val in enumerate(depth_values_a):
        age_val = age_constraint_a_ages[i]
        pos_err_val = age_constraint_a_pos_errors[i]
        neg_err_val = age_constraint_a_neg_errors[i]
        in_seq = age_constraint_a_in_sequence_flags[i]

        # Source core and interpreted bed are appended only when available
        source_core_info = ""
        if 'age_constraint_a_source_cores' in locals() and i < len(age_constraint_a_source_cores):
            source_core_info = f", Source Core: {age_constraint_a_source_cores[i]}"
        bed_info = ""
        if 'age_constraint_a_interpreted_beds' in locals() and i < len(age_constraint_a_interpreted_beds):
            bed_info = f", Interpreted Bed: {age_constraint_a_interpreted_beds[i]}"

        print(f"Depth: {depth_val:.2f} cm, Age: {age_val:.1f} years BP (+{pos_err_val:.1f} ; -{neg_err_val:.1f}), In Sequence: {in_seq}{source_core_info}{bed_info}")

# List the interpolated ages of the picked depths
print(f"\nEstimated Ages for picked depths in {CORE_A}:")
for i, depth in enumerate(pickeddepth_ages_a['depths']):
    est_age = pickeddepth_ages_a['ages'][i]
    est_pos = pickeddepth_ages_a['pos_uncertainties'][i]
    est_neg = pickeddepth_ages_a['neg_uncertainties'][i]
    print(f"Depth: {depth:.2f} cm, Age: {est_age:.1f} years BP (+{est_pos:.1f} ; -{est_neg:.1f})")

In [None]:
# Interpolate an age (with uncertainties) for every category-1 pick of Core B.
pickeddepth_ages_b = calculate_interpolated_ages(
    picked_depths=all_depths_b_cat1,
    # Age constraints of the core
    age_constraints_depths=age_constraint_b_depths,
    age_constraints_ages=age_constraint_b_ages,
    age_constraints_pos_errors=age_constraint_b_pos_errors,
    age_constraints_neg_errors=age_constraint_b_neg_errors,
    age_constraints_in_sequence_flags=age_constraint_b_in_sequence_flags,  # optional. If not provided, all age constraints are treated as in-sequence.
    age_constraint_source_core=age_constraint_b_source_cores,
    # Core top / bottom handling
    top_bottom=True,  # whether to include top and bottom depths/ages in the results
    top_age=0,  # Default age at top of core
    top_age_pos_error=75,  # Default uncertainty of the top age
    top_age_neg_error=75,  # Default uncertainty of the top age
    top_depth=0.0,  # Assuming top of core is 0 cm depth
    bottom_depth=md_b[-1],  # Max depth of core b
    # Uncertainty propagation
    uncertainty_method='MonteCarlo',  # 'MonteCarlo', 'Linear', or 'Gaussian'. Default is 'MonteCarlo'. Linear is the most conservative.
    n_monte_carlo=10000,  # number of Monte Carlo sampling iterations. Default is 10000.
    # Output
    show_plot=True,
    core_name=CORE_B,
    export_csv=True
)

# List the age constraints of Core B
print("\nAge Constraints for Core B:")
if len(age_constraint_b_depths) == 0:
    print(f"No age constraints available in {CORE_B}")
else:
    # Positional access works the same for a Series (via its array) and a list.
    depth_values_b = (age_constraint_b_depths.to_numpy()
                      if isinstance(age_constraint_b_depths, pd.Series)
                      else age_constraint_b_depths)
    for i, depth_val in enumerate(depth_values_b):
        age_val = age_constraint_b_ages[i]
        pos_err_val = age_constraint_b_pos_errors[i]
        neg_err_val = age_constraint_b_neg_errors[i]
        in_seq = age_constraint_b_in_sequence_flags[i]

        # Source core and interpreted bed are appended only when available
        source_core_info = ""
        if 'age_constraint_b_source_cores' in locals() and i < len(age_constraint_b_source_cores):
            source_core_info = f", Source Core: {age_constraint_b_source_cores[i]}"
        bed_info = ""
        if 'age_constraint_b_interpreted_beds' in locals() and i < len(age_constraint_b_interpreted_beds):
            bed_info = f", Interpreted Bed: {age_constraint_b_interpreted_beds[i]}"

        print(f"Depth: {depth_val:.2f} cm, Age: {age_val:.1f} years BP (+{pos_err_val:.1f} ; -{neg_err_val:.1f}), In Sequence: {in_seq}{source_core_info}{bed_info}")

# List the interpolated ages of the picked depths
print(f"\nEstimated Ages for picked depths in {CORE_B}:")
for i, depth in enumerate(pickeddepth_ages_b['depths']):
    est_age = pickeddepth_ages_b['ages'][i]
    est_pos = pickeddepth_ages_b['pos_uncertainties'][i]
    est_neg = pickeddepth_ages_b['neg_uncertainties'][i]
    print(f"Depth: {depth:.2f} cm, Age: {est_age:.1f} years BP (+{est_pos:.1f} ; -{est_neg:.1f})")

In [None]:
def _read_picked_ages(csv_path):
    """Read a '<core>_pickeddepth_age.csv' file.

    Returns:
        Tuple ``(table, ages)``: the raw DataFrame and a dictionary with the
        keys 'depths', 'ages', 'pos_uncertainties' and 'neg_uncertainties',
        each a list of floats converted through float32.
    """
    table = pd.read_csv(csv_path)
    column_of = {
        'depths': 'picked_depths_cm',
        'ages': 'est_age',
        'pos_uncertainties': 'est_age_poserr',
        'neg_uncertainties': 'est_age_negerr',
    }
    ages = {key: table[column].values.astype('float32').tolist()
            for key, column in column_of.items()}
    return table, ages


# Load the age data from CSV files in the working directory.
# When a file is missing only a warning is printed, so the corresponding
# pickeddepth_ages_* variable keeps whatever value it already had.

# Load age data for Core A
core_a_age_csv = f"{CORE_A}_pickeddepth_age.csv"
if os.path.exists(core_a_age_csv):
    df_ages_a, pickeddepth_ages_a = _read_picked_ages(core_a_age_csv)
    print(f"Loaded age data for {CORE_A} from CSV file")
else:
    print(f"Warning: Could not find age data CSV for {CORE_A}")

# Load age data for Core B
core_b_age_csv = f"{CORE_B}_pickeddepth_age.csv"
if os.path.exists(core_b_age_csv):
    df_ages_b, pickeddepth_ages_b = _read_picked_ages(core_b_age_csv)
    print(f"Loaded age data for {CORE_B} from CSV file")
else:
    print(f"Warning: Could not find age data CSV for {CORE_B}")


#### Find all segment pairs between boundaries

In [51]:
# Run switches for the age handling of the correlation
age_consideration = True
restricted_age_correlation = True

# Tag inserted into every output file name, selected by the two switches above:
# (age_consideration, restricted_age_correlation) -> tag
_AGE_MODE_TAGS = {
    (True, True): 'restricted_age',
    (True, False): 'loose_age',
    (False, True): 'no_age_restricted',
    (False, False): 'no_age_full',
}
YES_NO_AGE = _AGE_MODE_TAGS[(bool(age_consideration), bool(restricted_age_correlation))]

# DTW mode: False (default) performs dependent DTW, True performs independent DTW.
independent_dtw = False

In [None]:
%matplotlib inline

# Example usage:
# Set picked_depths_a and picked_depths_b to None to use auto-segmentation

# Folder that receives the per-frame PNG files
frames_folder = "outputs/SegmentPair_DTW_frames"

if not os.path.exists(frames_folder):
    # First run: create the folder
    print(f"Folder '{frames_folder}' does not exist. Creating it...")
    os.makedirs(frames_folder, exist_ok=True)
else:
    # Later runs: remove the PNG frames left in the folder
    png_files = glob.glob(os.path.join(frames_folder, "*.png"))
    for stale_frame in png_files:
        try:
            os.remove(stale_frame)
        except Exception as e:
            # A file that cannot be removed is reported and skipped
            print(f"Error deleting {stale_frame}: {e}")

    print(f"Cleaned up {len(png_files)} PNG files from {frames_folder}")
    
# Output and visualisation options of the segment-pair DTW run
dtw_output_kwargs = dict(
    visualize_pairs=True,
    visualize_segment_labels=False,
    create_dtw_matrix=True,
    dtwmatrix_output_filename=f'SegmentPair_DTW_matrix_{CORE_A}_{CORE_B}_{YES_NO_AGE}.png',
    creategif=True,
    gif_output_filename=f'SegmentPair_DTW_animation_{CORE_A}_{CORE_B}_{YES_NO_AGE}.gif',
    max_frames=50,
    color_interval_size=5,
    keep_frames=True,
    debug=False,
)

# Age information: interpolated pick ages plus the in-sequence age constraints.
# NOTE(review): the *_source_cores lists cover ALL constraints while the other
# constraint lists are in-sequence only, so their lengths can differ - confirm
# how run_comprehensive_dtw_analysis pairs them.
dtw_age_kwargs = dict(
    age_consideration=age_consideration,
    restricted_age_correlation=restricted_age_correlation,
    ages_a=pickeddepth_ages_a,
    ages_b=pickeddepth_ages_b,
    all_constraint_ages_a=age_constraint_a_in_sequence_ages,
    all_constraint_ages_b=age_constraint_b_in_sequence_ages,
    all_constraint_depths_a=age_constraint_a_in_sequence_depths,
    all_constraint_depths_b=age_constraint_b_in_sequence_depths,
    all_constraint_pos_errors_a=age_constraint_a_in_sequence_pos_errors,
    all_constraint_pos_errors_b=age_constraint_b_in_sequence_pos_errors,
    all_constraint_neg_errors_a=age_constraint_a_in_sequence_neg_errors,
    all_constraint_neg_errors_b=age_constraint_b_in_sequence_neg_errors,
    age_constraint_a_source_cores=age_constraint_a_source_cores,
    age_constraint_b_source_cores=age_constraint_b_source_cores,
    core_a_name=CORE_A,
    core_b_name=CORE_B,
)

# Run comprehensive DTW analysis on the category-1 boundaries of both cores
dtw_analysis_outputs = run_comprehensive_dtw_analysis(
    log_a, log_b, md_a, md_b,
    picked_depths_a=all_depths_a_cat1,
    picked_depths_b=all_depths_b_cat1,
    top_bottom=True,
    top_depth=0.0,
    independent_dtw=independent_dtw,
    exclude_deadend=True,
    **dtw_output_kwargs,
    **dtw_age_kwargs,
)
(dtw_results, valid_dtw_pairs,
 segments_a, segments_b,
 depth_boundaries_a, depth_boundaries_b,
 dtw_distance_matrix_full) = dtw_analysis_outputs

In [None]:
# Run the chain-break diagnostic on the valid segment pairs, segments and
# depth boundaries produced by the DTW analysis.
chain_inputs = (valid_dtw_pairs, segments_a, segments_b,
                depth_boundaries_a, depth_boundaries_b)
diagnostic_result = diagnose_chain_breaks(*chain_inputs)

<hr>

## Find complete core paths

In [None]:
# Options of the complete-path search
path_search_options = dict(
    output_csv=f"sequential_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.csv",
    start_from_top_only=True,
    batch_size=1000,
    n_jobs=-1,                  # Number of CPU cores to use for parallel processing. -1 means to use all available cores.
    debug=False,
    batch_grouping=False,       # A faster way to quickly assess complete paths for large datasets, at the risk of missing several complete paths
    n_groups=4,                 # The larger the number, the faster the process but the higher the risk of missing some complete paths
    shortest_path_search=True,  # Minimizes the number of segments in the path - preferring fewer pinch-outs
    shortest_path_level=2,      # The higher the number, the more segments in the path - preferring more pinch-outs
    max_search_path=100000,
)

complete_path_search_result = find_complete_core_paths(
    valid_dtw_pairs, segments_a, segments_b,
    log_a, log_b,
    depth_boundaries_a, depth_boundaries_b,
    dtw_results,
    **path_search_options,
)

In [None]:
%matplotlib inline

# Output GIF names (displayed from the outputs/ folder at the end of this cell)
correlation_save_path = f'CombinedDTW_correlation_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.gif'
matrix_save_path = f'CombinedDTW_matrix_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.gif'

# 1. CSV with the sequential mappings to visualise
#    (presumably the file written by find_complete_core_paths - confirm).
sequential_mappings_csv = f"outputs/sequential_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.csv"

# GIF output options
mapping_gif_kwargs = dict(
    creategif=True,
    correlation_gif_output_filename=correlation_save_path,
    matrix_gif_output_filename=matrix_save_path,
    max_frames=50,  # frame limit passed to the visualisation
    keep_frames=True,
)

# Depth / age markers
mapping_marker_kwargs = dict(
    visualize_pairs=False,
    visualize_segment_labels=False,
    mark_depths=True,
    mark_ages=True,
    ages_a=pickeddepth_ages_a,  # Age data for core A
    ages_b=pickeddepth_ages_b,  # Age data for core B
)

# In-sequence age constraints, their source cores and the core names.
# NOTE(review): the *_source_cores lists cover ALL constraints while the other
# lists are in-sequence only, so their lengths can differ - confirm how the
# function pairs them.
mapping_constraint_kwargs = dict(
    all_constraint_depths_a=age_constraint_a_in_sequence_depths,
    all_constraint_depths_b=age_constraint_b_in_sequence_depths,
    all_constraint_ages_a=age_constraint_a_in_sequence_ages,
    all_constraint_ages_b=age_constraint_b_in_sequence_ages,
    all_constraint_pos_errors_a=age_constraint_a_in_sequence_pos_errors,
    all_constraint_pos_errors_b=age_constraint_b_in_sequence_pos_errors,
    all_constraint_neg_errors_a=age_constraint_a_in_sequence_neg_errors,
    all_constraint_neg_errors_b=age_constraint_b_in_sequence_neg_errors,
    age_constraint_a_source_cores=age_constraint_a_source_cores,
    age_constraint_b_source_cores=age_constraint_b_source_cores,
    core_a_name=CORE_A,
    core_b_name=CORE_B,
)

# 2. Visualise the mappings
visualize_dtw_results_from_csv(
    sequential_mappings_csv,
    log_a, log_b, md_a, md_b,
    dtw_results, valid_dtw_pairs,
    segments_a, segments_b,
    depth_boundaries_a, depth_boundaries_b,
    dtw_distance_matrix_full,
    color_interval_size=5,
    debug=False,
    **mapping_gif_kwargs,
    **mapping_marker_kwargs,
    **mapping_constraint_kwargs,
)

# 3. Display the GIFs
for gif_title, gif_name in (
    ("DTW Correlation Mappings GIF:", correlation_save_path),
    ("DTW Matrix Mappings GIF:", matrix_save_path),
):
    print(gif_title)
    display(IPImage(f"outputs/{gif_name}"))

In [None]:
# File names for this cell: the saved sequential mappings (input) and the
# coloured DTW-matrix figure (output).
output_matrix_png_filename = f'CombinedDTW_matrix_mappings_colored_{CORE_A}_{CORE_B}_{YES_NO_AGE}.png'
sequential_mappings_csv = f'outputs/sequential_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.csv'

%matplotlib inline

# In-sequence age constraints drawn on the matrix, plus depth axes and names.
# NOTE(review): the *_source_cores lists cover ALL constraints while depths and
# ages are in-sequence only, so their lengths can differ - confirm how the
# function pairs them.
matrix_constraint_kwargs = dict(
    age_constraint_a_depths=age_constraint_a_in_sequence_depths,
    age_constraint_a_ages=age_constraint_a_in_sequence_ages,
    age_constraint_a_source_cores=age_constraint_a_source_cores,
    age_constraint_b_depths=age_constraint_b_in_sequence_depths,
    age_constraint_b_ages=age_constraint_b_in_sequence_ages,
    age_constraint_b_source_cores=age_constraint_b_source_cores,
    md_a=md_a,
    md_b=md_b,
    core_a_name=CORE_A,
    core_b_name=CORE_B,
)

# color_metric: default is None, which means the mapping_id is used for coloring.
# Available quality indices: 'corr_coef', 'norm_dtw', 'dtw_ratio', 'perc_diag',
# 'variance_deviation', 'match_min', 'match_mean', 'perc_age_overlap'
_ = plot_dtw_matrix_with_paths(
    dtw_distance_matrix_full,
    mode='all_paths_colored',
    color_metric='perc_age_overlap',
    sequential_mappings_csv=sequential_mappings_csv,
    output_filename=output_matrix_png_filename,
    n_jobs=-1,
    **matrix_constraint_kwargs,
)

In [None]:
%matplotlib inline
# %matplotlib widget

# Load the DTW results
sequential_mappings_csv = f'outputs/sequential_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.csv'
raw_mappings_df = pd.read_csv(sequential_mappings_csv)

# Keep only rows whose three plotted metrics are finite numbers:
# infinities are turned into NaN first, then NaN rows are dropped.
plotted_metrics = ['corr_coef', 'perc_diag', 'perc_age_overlap']
dtw_results_df = (
    raw_mappings_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=plotted_metrics)
)

# The three metrics to visualize
corr_coef, perc_diag, perc_age_overlap = (
    dtw_results_df[metric_name] for metric_name in plotted_metrics
)

# 99.9th percentile of perc_diag: mappings at or above it count as "high diagonality"
perc_diag_threshold = np.percentile(perc_diag, 99.9)

# 3D axes: x = correlation coefficient, y = age overlap, z = diagonality
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# Boolean mask and mapping ids of the high-diagonality mappings
high_diag_indices = perc_diag >= perc_diag_threshold
high_diag_mappings = dtw_results_df.loc[high_diag_indices, 'mapping_id'].tolist()

# Background cloud in gray, high-diagonality mappings on top in red
other_points = ~high_diag_indices
ax.scatter(corr_coef[other_points], perc_age_overlap[other_points], perc_diag[other_points],
           color='gray', s=10, alpha=0.3)
ax.scatter(corr_coef[high_diag_indices], perc_age_overlap[high_diag_indices], perc_diag[high_diag_indices],
           color='red', s=30, alpha=0.9)

# Labels and title
ax.set(
    xlabel='Correlation Coefficient',
    ylabel='Percent Age Overlap',
    zlabel='Percent Diagonality',
    title='3D Visualization of DTW Quality Indices',
)

# Axis limits: full data range padded by 0.5% of the range on each side
axis_ranges = [(series.min(), series.max())
               for series in (corr_coef, perc_age_overlap, perc_diag)]
(x_min, x_max), (y_min, y_max), (z_min, z_max) = axis_ranges
x_margin, y_margin, z_margin = (0.005 * (upper - lower) for lower, upper in axis_ranges)

ax.set_xlim(x_min - x_margin, x_max + x_margin)
ax.set_ylim(y_min - y_margin, y_max + y_margin)
ax.set_zlim(z_min - z_margin, z_max + z_margin)

# Viewing angle: elevation 25 degrees, azimuth -15 degrees
ax.view_init(elev=25, azim=-15)

# Legend built from empty proxy scatters
ax.scatter([], [], [], c='red', s=50, label=f'High Diagonality (≥ {perc_diag_threshold:.1f}%)')
ax.scatter([], [], [], c='gray', s=10, alpha=0.3, label='Other Points')
ax.legend()

# Total number of mappings reported by the complete-path search
total_mappings = complete_path_search_result['total_complete_paths_theoretical']

# Statistics box on the figure
n_analyzed = len(dtw_results_df)
n_high_diag = len(high_diag_mappings)
stats_text = (
    f"Analyzed mappings: {n_analyzed}/{total_mappings}\n"
    f"High diagonality mappings: {n_high_diag} ({n_high_diag/n_analyzed*100:.1f}%)"
)
plt.figtext(0.27, 0.825, stats_text, bbox=dict(facecolor='white', alpha=0.8))

plt.show()

# Summary of the high-diagonality mappings
high_count = len(high_diag_mappings)
analyzed_count = len(dtw_results_df)
print(f"=== High Diagonality Mappings (≥ {perc_diag_threshold:.1f}%) ===")
print(f"Found {high_count} mappings out of {analyzed_count} total ({high_count/analyzed_count*100:.1f}%)")
print("\nMapping IDs with high diagonality:")

# Details of each high-diagonality mapping, most diagonal first
high_diag_df = dtw_results_df.loc[high_diag_indices].sort_values(by='perc_diag', ascending=False)

for idx, row in high_diag_df.iterrows():
    mapping_label = int(row['mapping_id'])
    print(f"Mapping ID {mapping_label}: perc_diag={row['perc_diag']:.1f}%, corr_coef={row['corr_coef']:.3f}, perc_age_overlap={row['perc_age_overlap']:.1f}%")
# Rank the shortest-DTW-path mappings by a weighted, min-max-normalized combined score
print("\n=== Top 5 Overall Best Mappings (Shortest DTW Path Length) ===")

# Keep only the mappings whose DTW path length equals the minimum
if 'length' in dtw_results_df.columns:
    # Expose the CSV 'length' column under a descriptive name (idempotent on re-run)
    dtw_results_df['dtw_path_length'] = dtw_results_df['length']
    min_length = dtw_results_df['dtw_path_length'].min()
    shortest_mappings = dtw_results_df[dtw_results_df['dtw_path_length'] == min_length]
else:
    print("Warning: No 'length' column found in the dataframe. Using all mappings.")
    shortest_mappings = dtw_results_df
    min_length = "N/A"

print("Considering only the shortest DTW path length mappings")
print(f"Number of mappings considered: {len(shortest_mappings)} out of {len(dtw_results_df)}")
print(f"DTW path length: {min_length}")

# Work on a copy so the score columns never leak into dtw_results_df
df_for_ranking = shortest_mappings.copy()

# Scoring table: direction of "better" and relative weight of each metric.
# A weight of 0.0 keeps the per-metric score column but removes the metric
# from the combined score.
metrics = {
    'perc_diag': {'higher_is_better': True, 'weight': 1.0},
    'norm_dtw': {'higher_is_better': False, 'weight': 1.0},
    'dtw_ratio': {'higher_is_better': False, 'weight': 0.0},
    'corr_coef': {'higher_is_better': True, 'weight': 3.0},
    'wrapping_deviation': {'higher_is_better': False, 'weight': 0.0},
    'mean_matching_function': {'higher_is_better': False, 'weight': 0.0},
    'perc_age_overlap': {'higher_is_better': True, 'weight': 1.0}
}


def _min_max_score(values, higher_is_better):
    """Rescale a metric column to [0, 1] so that 1 is always the best value.

    Parameters
    ----------
    values : pandas.Series
        Raw metric values; NaN entries are ignored when finding the range
        and stay NaN in the result.
    higher_is_better : bool
        True if larger raw values are better; False inverts the scale.

    Returns
    -------
    pandas.Series or None
        The normalized scores, a constant 1.0 series when all valid values
        are equal, or None when the column has no valid values at all.
    """
    valid = values.dropna()
    if valid.empty:
        return None
    lowest, highest = valid.min(), valid.max()
    if not highest > lowest:
        # No spread: every mapping is equally good on this metric
        return pd.Series(1.0, index=values.index)
    scaled = (values - lowest) / (highest - lowest)
    return scaled if higher_is_better else 1 - scaled


# Accumulate the weighted sum of per-metric scores
df_for_ranking['combined_score'] = 0.0
total_weight = 0.0
contributing_weights = []  # (metric, weight) pairs that actually influence the ranking

for metric, props in metrics.items():
    if metric not in df_for_ranking.columns:
        continue
    metric_score = _min_max_score(df_for_ranking[metric], props['higher_is_better'])
    if metric_score is None:
        continue  # column present but entirely NaN

    weight = props['weight']
    df_for_ranking[f'{metric}_score'] = metric_score
    # Rows with a missing metric contribute 0 for that metric
    df_for_ranking['combined_score'] += metric_score.fillna(0) * weight
    total_weight += weight
    if weight > 0:
        contributing_weights.append((metric, weight))

# Normalize the combined score by the total weight
if total_weight > 0:
    df_for_ranking['combined_score'] = df_for_ranking['combined_score'] / total_weight
else:
    print("Warning: No valid metrics found for scoring.")
    df_for_ranking['combined_score'] = 0.0

# Verify we don't have NaN values in the combined score
if df_for_ranking['combined_score'].isna().any():
    print("Warning: NaN values detected in combined scores. Replacing with zeros.")
    df_for_ranking['combined_score'] = df_for_ranking['combined_score'].fillna(0)

# Get top 5 mappings by combined score
top_overall = df_for_ranking.sort_values(by='combined_score', ascending=False).head(5)

# Report the weights that were really applied, derived from the metrics table,
# so the message cannot drift out of sync with the configuration.
weights_description = ", ".join(f"{name}={weight:g}" for name, weight in contributing_weights)
print(f"\nTop 5 mappings by weighted combined score (weights: {weights_description or 'none'}):")

# (column, printed label, format spec, unit suffix) in display order
detail_fields = [
    ('dtw_path_length', 'dtw_path_length', '.1f', ''),
    ('corr_coef', 'correlation coefficient r', '.3f', ''),
    ('perc_diag', 'perc_diag', '.1f', '%'),
    ('norm_dtw', 'norm_dtw', '.3f', ''),
    ('dtw_ratio', 'dtw_ratio', '.3f', ''),
    ('perc_age_overlap', 'perc_age_overlap', '.1f', '%'),
    ('wrapping_deviation', 'wrapping_deviation', '.3f', ''),
    ('post_wrap_corr', 'post_wrap_corr', '.3f', ''),
    ('mean_matching_function', 'mean_matching_function', '.3f', ''),
]

for _row_label, row in top_overall.iterrows():
    if 'mapping_id' in row:
        print(f"Mapping ID {int(row['mapping_id'])}: Combined Score={row['combined_score']:.3f}")
    # Print only the metrics that exist in this results file
    for column, label, spec, unit in detail_fields:
        if column in row:
            print(f"  {label}={row[column]:{spec}}{unit}")
    print("")

# Save the top 1 mapping ID from the best overall combined score
if not top_overall.empty and 'mapping_id' in top_overall.columns:
    best_mapping_id = int(top_overall.iloc[0]['mapping_id'])
    print(f"\nSaving best mapping ID: {best_mapping_id}")
else:
    best_mapping_id = None
    print("\nWarning: No valid mappings found to save as best mapping ID")


In [None]:
# Segment pairs to combine are read from the sequential-mappings CSV for the selected mapping ID.
# Pairs in the CSV use 1-based indices, for example: [(1,2), (4,5), ...]


def parse_compact_path(compact_path_str):
    """Parse a compact path string such as "2,3;4,5;6,7" into a list of int tuples.

    Parameters
    ----------
    compact_path_str : str or None
        Pairs separated by ';', with the integers inside a pair separated by ','.

    Returns
    -------
    list of tuple of int
        One tuple per pair, in input order. Empty or None input yields [].

    Raises
    ------
    ValueError
        If a non-empty pair contains something that is not an integer.
    """
    if not compact_path_str:
        return []

    pairs = []
    for chunk in compact_path_str.split(';'):
        chunk = chunk.strip()
        if not chunk:
            # Tolerate trailing/doubled ';' and whitespace-only input
            continue
        pairs.append(tuple(int(part) for part in chunk.split(',')))
    return pairs

# Look up the segment-pair path of the selected mapping in the sequential-mappings CSV
csv_file = f'outputs/sequential_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.csv'
target_mapping_id = best_mapping_id

# Fail fast with an actionable message instead of scanning the whole CSV and
# then reporting a confusing "Mapping ID None not found".
if target_mapping_id is None:
    raise ValueError(
        "best_mapping_id is None: the ranking step selected no mapping to load"
    )

target_data_row = None
with open(csv_file, newline='') as f:
    for row in csv.DictReader(f):
        if int(row['mapping_id']) == target_mapping_id:
            # The 'path' column is stored in compact "a,b;c,d" form
            target_data_row = parse_compact_path(row['path'])
            break

if target_data_row is None:
    raise KeyError(f"Mapping ID {target_mapping_id} not found in {csv_file}")

print("Using mapping ID", target_mapping_id)
print(target_data_row)

# Convert the 1-based pairs to 0-based indices for Python
valid_pairs_to_combine = [(a - 1, b - 1) for a, b in target_data_row]

In [None]:
%matplotlib inline

# Rendering mode switch: True draws each segment pair, False draws the full path.
# Segment labels and depth markers are enabled only in per-pair mode.
visualize_pairs = False
visualize_type = 'pairs' if visualize_pairs else 'fullpath'
visualize_segment_labels = bool(visualize_pairs)
mark_depths = bool(visualize_pairs)

# Age-related inputs, grouped so the call below stays readable
age_marker_kwargs = dict(
    mark_ages=True,
    ages_a=pickeddepth_ages_a,
    ages_b=pickeddepth_ages_b,
    all_constraint_ages_a=age_constraint_a_in_sequence_ages,
    all_constraint_ages_b=age_constraint_b_in_sequence_ages,
    all_constraint_depths_a=age_constraint_a_in_sequence_depths,
    all_constraint_depths_b=age_constraint_b_in_sequence_depths,
    all_constraint_pos_errors_a=age_constraint_a_in_sequence_pos_errors,
    all_constraint_pos_errors_b=age_constraint_b_in_sequence_pos_errors,
    all_constraint_neg_errors_a=age_constraint_a_in_sequence_neg_errors,
    all_constraint_neg_errors_b=age_constraint_b_in_sequence_neg_errors,
    age_constraint_a_source_cores=age_constraint_a_source_cores,
    age_constraint_b_source_cores=age_constraint_b_source_cores,
)

# Output image names encode the core pair, age mode, mapping ID and rendering mode
output_kwargs = dict(
    correlation_save_path=f'CombinedDTW_correlation_{CORE_A}_{CORE_B}_{YES_NO_AGE}_{target_mapping_id}_{visualize_type}.png',
    matrix_save_path=f'CombinedDTW_matrix_{CORE_A}_{CORE_B}_{YES_NO_AGE}_{target_mapping_id}_{visualize_type}.png',
)

# Draw the combined segments; only the first two of the four returned values are kept
combined_wp, combined_quality, _, _ = visualize_combined_segments(
    log_a, log_b, md_a, md_b,
    dtw_results, valid_dtw_pairs,
    segments_a, segments_b,
    depth_boundaries_a, depth_boundaries_b,
    dtw_distance_matrix_full,
    valid_pairs_to_combine,
    color_interval_size=5,
    visualize_pairs=visualize_pairs,
    visualize_segment_labels=visualize_segment_labels,
    mark_depths=mark_depths,
    core_a_name=CORE_A,
    core_b_name=CORE_B,
    **output_kwargs,
    **age_marker_kwargs,
)

In [None]:
# CORE_A and CORE_B come from the configuration cell near the top of the notebook.
# They are deliberately not redefined here: a second hard-coded copy would silently
# diverge from target_mapping_id (computed above) whenever the configured pair changes.

# Derive the file-name tag that encodes how ages were used in the correlation
if age_consideration:
    YES_NO_AGE = "restricted_age" if restricted_age_correlation else "loose_age"
# NOTE(review): when age_consideration is False, YES_NO_AGE keeps whatever value an
# earlier cell assigned; confirm that this is the intended tag for the no-age case.

# Quality index whose distribution is plotted
targeted_quality_index = 'corr_coef'
# Quality indices listed by the original author (confirm against plot_correlation_distribution):
# 'corr_coef', 'norm_dtw', 'dtw_ratio', 'perc_diag', 'variance_deviation',
# 'match_min', 'match_mean', 'perc_age_overlap'

# Correlation coefficients are labelled "r-values" in the output file name
distribution_label = "r-values" if targeted_quality_index == "corr_coef" else targeted_quality_index

# Plot the distribution of the chosen index over all mappings in the CSV,
# passing the selected mapping ID to the plotting helper
plot_correlation_distribution(
    f'outputs/sequential_mappings_{CORE_A}_{CORE_B}_{YES_NO_AGE}.csv',
    target_mapping_id,
    targeted_quality_index,
    no_bins=None,
    save_png=True,
    png_filename=f'{distribution_label}_distribution_{CORE_A}_{CORE_B}_{YES_NO_AGE}.png',
    pdf_method='skew-normal',  # alternatives: 'KDE', 'normal'
    kde_bandwidth=0.05,
)