# Correlating well log pairs: Complex Dynamic Time Warping with boundary constraints

## Introduction to dynamic time warping

In [1]:
# Data manipulation and analysis
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from pyCoreRelator import (
    load_log_data,
    load_core_age_constraints,
    load_pickeddepth_ages_from_csv,
    run_multi_parameter_analysis,
    calculate_quality_comparison_t_statistics,
    plot_quality_comparison_t_statistics
)

<hr>

### Define basic parameters

### Define core pairs

In [2]:
CORE_A = "M9907-25PC"
# CORE_A = "M9907-23PC"
# CORE_A = "M9907-11PC"

In [3]:
CORE_B = "M9907-23PC"
# CORE_B = "M9907-11PC"

#### Log data paths and column name structure

In [4]:
# Define log columns to extract
LOG_COLUMNS = ['hiresMS', 'CT', 'Lumin']  # Choose which logs to include
# LOG_COLUMNS = ['hiresMS']  # Choose which logs to include
# LOG_COLUMNS = ['CT']  # Choose which logs to include
# LOG_COLUMNS = ['Lumin']  # Choose which logs to include

# Define depth column
DEPTH_COLUMN = 'SB_DEPTH_cm'

# Define paths for Core A
core_a_log_paths = {
    'hiresMS': f'example_data/processed_data/{CORE_A}/ML_filled/{CORE_A}_hiresMS_MLfilled.csv',
    'CT': f'example_data/processed_data/{CORE_A}/ML_filled/{CORE_A}_CT_MLfilled.csv',
    'Lumin': f'example_data/processed_data/{CORE_A}/ML_filled/{CORE_A}_RGB_MLfilled.csv',
}

# Define paths for Core B
core_b_log_paths = {
    'hiresMS': f'example_data/processed_data/{CORE_B}/ML_filled/{CORE_B}_hiresMS_MLfilled.csv',
    'CT': f'example_data/processed_data/{CORE_B}/ML_filled/{CORE_B}_CT_MLfilled.csv',
    'Lumin': f'example_data/processed_data/{CORE_B}/ML_filled/{CORE_B}_RGB_MLfilled.csv',
}

# Define column mapping for alternative column names
column_alternatives = {
    'hiresMS': ['MS'],
    'CT': ['CT_value'],
    'Lumin': ['luminance', 'Luminance'],
}

<hr>

### Load log data

In [5]:
# Load data for Core A
log_a, md_a, _, _, _ = load_log_data(
    core_a_log_paths,
    log_columns=LOG_COLUMNS,
    depth_column=DEPTH_COLUMN,
    normalize=True,
    column_alternatives=column_alternatives
)

# Load data for Core B
log_b, md_b, _, _, _ = load_log_data(
    core_b_log_paths,
    log_columns=LOG_COLUMNS,
    depth_column=DEPTH_COLUMN,
    normalize=True,
    column_alternatives=column_alternatives
)

Error loading example_data/processed_data/M9907-25PC/ML_filled/M9907-25PC_hiresMS_MLfilled.csv: [Errno 2] No such file or directory: 'example_data/processed_data/M9907-25PC/ML_filled/M9907-25PC_hiresMS_MLfilled.csv'
Error loading example_data/processed_data/M9907-25PC/ML_filled/M9907-25PC_CT_MLfilled.csv: [Errno 2] No such file or directory: 'example_data/processed_data/M9907-25PC/ML_filled/M9907-25PC_CT_MLfilled.csv'
Error loading example_data/processed_data/M9907-25PC/ML_filled/M9907-25PC_RGB_MLfilled.csv: [Errno 2] No such file or directory: 'example_data/processed_data/M9907-25PC/ML_filled/M9907-25PC_RGB_MLfilled.csv'
No log datasets were loaded
Error loading example_data/processed_data/M9907-23PC/ML_filled/M9907-23PC_hiresMS_MLfilled.csv: [Errno 2] No such file or directory: 'example_data/processed_data/M9907-23PC/ML_filled/M9907-23PC_hiresMS_MLfilled.csv'
Error loading example_data/processed_data/M9907-23PC/ML_filled/M9907-23PC_CT_MLfilled.csv: [Errno 2] No such file or directory

### Load picked depth boundaries

In [6]:
%matplotlib inline

# Define paths to the CSV files
pickeddepth_a_csv = f'pickeddepth/{CORE_A}_pickeddepth.csv'
pickeddepth_b_csv = f'pickeddepth/{CORE_B}_pickeddepth.csv'

# Load picked depths and extract category 1 depths
if os.path.exists(pickeddepth_a_csv):
    picked_data_a = pd.read_csv(pickeddepth_a_csv)
    all_depths_a_cat1 = picked_data_a[picked_data_a['category'] == 1]['picked_depths_cm'].values.astype('float32')
    intepreted_bed_a_cat1 = picked_data_a[picked_data_a['category'] == 1]['interpreted_bed'].fillna('').values.astype('str')
else:
    print(f"Warning: {pickeddepth_a_csv} not found. Using empty array for all_depths_a_cat1.")
    all_depths_a_cat1 = np.array([]).astype('float32')
    intepreted_bed_a_cat1 = np.array([]).astype('str').fillna('')

if os.path.exists(pickeddepth_b_csv):
    picked_data_b = pd.read_csv(pickeddepth_b_csv)
    all_depths_b_cat1 = picked_data_b[picked_data_b['category'] == 1]['picked_depths_cm'].values.astype('float32')
    intepreted_bed_b_cat1 = picked_data_b[picked_data_b['category'] == 1]['interpreted_bed'].fillna('').values.astype('str')
else:
    print(f"Warning: {pickeddepth_b_csv} not found. Using empty array for all_depths_b_cat1.")
    all_depths_b_cat1 = np.array([]).astype('float32')
    intepreted_bed_b_cat1 = np.array([]).astype('str').fillna('')

### Load age data

In [7]:
# Load age constraints for both cores
consider_adjacent_core = True

data_columns = {
    'age': 'calib810_agebp',
    'pos_error': 'calib810_2sigma_pos', 
    'neg_error': 'calib810_2sigma_neg',
    'min_depth': 'mindepth_cm',
    'max_depth': 'maxdepth_cm',
    'in_sequence': 'in_sequence',
    'core': 'core',
    'interpreted_bed': 'interpreted_bed'
}

# Configuration: Define the path to the age constraints csv file
age_base_path = '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/Age constraints/Goldfinger2012'

# Load age constraints for both cores: Load the age constraints for both cores from the csv file
age_data_a = load_core_age_constraints(CORE_A, age_base_path, consider_adjacent_core, data_columns, mute_mode=True)
age_data_b = load_core_age_constraints(CORE_B, age_base_path, consider_adjacent_core, data_columns, mute_mode=True)

### Load estimated ages for each picked depth boundary from csv

In [8]:
# Load estimated boundary age data for both cores from CSV files using the imported function
cores = [CORE_A, CORE_B]
pickeddepth_ages = {}

# Define the uncertainty method: 'MonteCarlo', 'Linear', or 'Gaussian'
uncertainty_method='MonteCarlo'   

for core in cores:
    core_age_csv = f"pickeddepth_ages/{core}_pickeddepth_ages_{uncertainty_method}.csv"
    pickeddepth_ages[core] = load_pickeddepth_ages_from_csv(core_age_csv)

# Assign to individual variables for backward compatibility
if CORE_A in pickeddepth_ages:
    pickeddepth_ages_a = pickeddepth_ages[CORE_A]
if CORE_B in pickeddepth_ages:
    pickeddepth_ages_b = pickeddepth_ages[CORE_B]

Loaded 20 pickeddepth ages from M9907-25PC_pickeddepth_ages_MonteCarlo.csv
Loaded 21 pickeddepth ages from M9907-23PC_pickeddepth_ages_MonteCarlo.csv


<hr>

## Compute quality metric distribution for all stituation

In [12]:
# Cell: Multi-Parameter Distribution Analysis

# Run all parameter combinations and plot distribution curves together

# Define all parameter combinations to test
parameter_combinations = [
    {'age_consideration': True, 'restricted_age_correlation': True, 'shortest_path_search': True},
    {'age_consideration': False, 'restricted_age_correlation': False, 'shortest_path_search': True}
]

# Define all quality indices to process
target_quality_indices = ['corr_coef', 'norm_dtw']

output_csv_filenames = {}
for quality_index in target_quality_indices:
    output_csv_filenames[quality_index] = f'example_outputs/{CORE_A}_{CORE_B}/{"_".join(LOG_COLUMNS)}/{quality_index}_fit_params.csv'

# Define synthetic CSV filenames for consistent bin sizing
synthetic_csv_filenames = {}
for quality_index in target_quality_indices:
    synthetic_csv_filenames[quality_index] = f'example_outputs/synthetic_PDFs_{"_".join(LOG_COLUMNS)}_{quality_index}.csv'

# Execute the analysis function
run_multi_parameter_analysis(
    # Core data inputs
    log_a=log_a, 
    log_b=log_b, 
    md_a=md_a, 
    md_b=md_b,
    all_depths_a_cat1=all_depths_a_cat1,
    all_depths_b_cat1=all_depths_b_cat1,
    pickeddepth_ages_a=pickeddepth_ages_a,
    pickeddepth_ages_b=pickeddepth_ages_b,
    age_data_a=age_data_a,
    age_data_b=age_data_b,
    uncertainty_method=uncertainty_method,
    
    # Analysis parameters
    parameter_combinations=parameter_combinations,
    target_quality_indices=target_quality_indices,
    test_age_constraint_removal=True,       # Set to False to disable age constraint removal testing,
    
    # Core identifiers
    core_a_name=CORE_A,
    core_b_name=CORE_B,
    
    # Output configuration
    output_csv_filenames=output_csv_filenames,
    
    # Optional parameter
    pca_for_dependent_dtw=False,  # For multidimensional DTW, choose DTW method

    # Maximum number of scenarios per constraint removal layer
    max_search_per_layer= 50,   # Higher number of scenarios to be searched per layer, better coverage of the parameter space with the cost of longer runtime

    # Number of CPU cores used in parallel processing
    n_jobs=-1     # -1: uses all available CPU cores
)

Validating age data for age-based analysis...
✓ Age data validation passed - age-based analysis is possible
Running 2 parameter combinations for 2 quality indices...
Using all available CPU cores for parallel processing
Age constraint removal testing enabled:
- Core B has 5 age constraints
- Additional 30 scenarios exist
- As max_search_per_layer is defined: 50 scenarios are randomly sampled from each constraint removal layer
-     This just provides statistical approximation while maintaining computational feasibility

=== PHASE 1: Running original parameter combinations ===


Original parameter combinations: 100%|██████████| 2/2 [00:00<00:00, 2906.66it/s]


✗ Error in restricted_age_optimal: index -1 is out of bounds for axis 0 with size 0
✗ Error in no_age_optimal: index -1 is out of bounds for axis 0 with size 0
✓ All original parameter combinations processed

=== PHASE 2: Running age constraint removal scenarios ===
- Core B has 5 age constraints
- Processing 30 additional constraint removal scenarios
max_search_per_layer is defined: randomly sampling up to 50 scenarios per layer of search
- Layer 1 constraints: 5 scenarios (all processed)
- Layer 2 constraints: 10 scenarios (all processed)
- Layer 3 constraints: 10 scenarios (all processed)
- Layer 4 constraints: 5 scenarios (all processed)


Age constraint removal scenarios: 100%|██████████| 30/30 [00:00<00:00, 221.79it/s]


✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_optimal_subset_error: index -1 is out of bounds for axis 0 with size 0
✗ Error in restricted_age_opti

### Plotting: compare the quality metric to the null hypothesis

In [10]:
# Define file names outside the function
target_quality_indices = ['corr_coef', 'norm_dtw']

# Define input path for master CSV filenames
master_csv_filenames = {}
for quality_index in target_quality_indices:
    master_csv_filenames[quality_index] = f'example_outputs/{CORE_A}_{CORE_B}/{"_".join(LOG_COLUMNS)}/{quality_index}_fit_params.csv'

# Define input path for synthetic CSV filenames
synthetic_csv_filenames = {}
for quality_index in target_quality_indices:
    synthetic_csv_filenames[quality_index] = f'example_outputs/synthetic_PDFs_{"_".join(LOG_COLUMNS)}_{quality_index}.csv'

# Step 1: Calculate statistics (run once)
calculate_quality_comparison_t_statistics(
    target_quality_indices=target_quality_indices,
    master_csv_filenames=master_csv_filenames,
    synthetic_csv_filenames=synthetic_csv_filenames,
    CORE_A=CORE_A,
    CORE_B=CORE_B,
    mute_mode=False
)

✗ Error: Master CSV file not found: example_outputs/M9907-25PC_M9907-23PC/hiresMS_CT_Lumin/corr_coef_fit_params.csv
✗ Error: Master CSV file not found: example_outputs/M9907-25PC_M9907-23PC/hiresMS_CT_Lumin/norm_dtw_fit_params.csv


In [11]:
# Define mapping CSV filename: Try restricted_age_optimal first, fallback to no_age_optimal if it doesn't exist
sequential_mappings_csv = f'example_outputs/{CORE_A}_{CORE_B}/{"_".join(LOG_COLUMNS)}/mappings_restricted_age_optimal.csv'
if not os.path.exists(sequential_mappings_csv):
    sequential_mappings_csv = f'example_outputs/{CORE_A}_{CORE_B}/{"_".join(LOG_COLUMNS)}/mappings_no_age_optimal.csv'
    if not os.path.exists(sequential_mappings_csv):
        sequential_mappings_csv = None

# Define output figure filenames
output_figure_filenames = {}
for quality_index in target_quality_indices:
    output_figure_filenames[quality_index] = f'example_outputs/{CORE_A}_{CORE_B}/{"_".join(LOG_COLUMNS)}/{quality_index}_compare2null.png'

# Step 2: Plot results
plot_quality_comparison_t_statistics(
    target_quality_indices=target_quality_indices,
    master_csv_filenames=master_csv_filenames,
    synthetic_csv_filenames=synthetic_csv_filenames,
    CORE_A=CORE_A,
    CORE_B=CORE_B,
    mute_mode=True,
    save_fig=False,
    output_figure_filenames=output_figure_filenames,   # Acceptable image formats: png, jpg, svg, pdf.
    save_gif=False, 
    max_frames=40,
    plot_real_data_histogram=True,
    plot_age_removal_step_pdf=False,
    show_best_datum_match=True,
    sequential_mappings_csv=sequential_mappings_csv
)

Error reading example_outputs/M9907-25PC_M9907-23PC/hiresMS_CT_Lumin/corr_coef_fit_params.csv: [Errno 2] No such file or directory: 'example_outputs/M9907-25PC_M9907-23PC/hiresMS_CT_Lumin/corr_coef_fit_params.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'example_outputs/M9907-25PC_M9907-23PC/hiresMS_CT_Lumin/corr_coef_fit_params.csv'

In [None]:
# Define animated gifs showing progressive constraint addition
output_gif_filenames = {}
for quality_index in target_quality_indices:
    output_gif_filenames[quality_index] = f'example_outputs/{CORE_A}_{CORE_B}/{"_".join(LOG_COLUMNS)}/{quality_index}_compare2null.gif'

plot_quality_comparison_t_statistics(
    target_quality_indices=target_quality_indices,
    master_csv_filenames=master_csv_filenames,
    synthetic_csv_filenames=synthetic_csv_filenames,
    CORE_A=CORE_A,
    CORE_B=CORE_B,
    mute_mode=True,
    save_gif=True, 
    output_gif_filenames=output_gif_filenames,
    max_frames=40,
    plot_real_data_histogram=False,
    plot_age_removal_step_pdf=True,
    show_best_datum_match=True,
    sequential_mappings_csv=sequential_mappings_csv
)