# **Log Data Processing**

### **Load Packages**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.collections as mcoll  # For PolyCollection
import numpy as np
import os
import warnings
from scipy.signal import correlate, find_peaks
from scipy.ndimage import gaussian_filter1d
from PIL import Image

# Create interaction features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from joblib import Parallel, delayed

## **Functions for Data Cleaning**

### Function for cleaning artifacts and noise

In [None]:
# Comparison operators allowed in a config threshold, mapped to their implementation.
# A lookup table replaces the previous eval() of a string built from config values.
_THRESHOLD_COMPARATORS = {
    '>': lambda values, limit: values > limit,
    '<': lambda values, limit: values < limit,
    '<=': lambda values, limit: values <= limit,
    '>=': lambda values, limit: values >= limit,
}

# RGB readings at or beyond these limits are treated as artifacts, together with
# this many neighbouring rows on each side.
_RGB_LOW_LIMIT = 35
_RGB_HIGH_LIMIT = 220
_RGB_BUFFER_ROWS = 2


def _threshold_mask(series, condition, threshold_value):
    """Return the boolean result of `series <condition> threshold_value`."""
    if isinstance(threshold_value, str):
        # The eval-based implementation accepted numeric strings such as "1.5"; keep doing so.
        threshold_value = float(threshold_value)
    return _THRESHOLD_COMPARATORS[condition](series, threshold_value)


def _buffered_mask(hit_mask, buffer_size):
    """Return a boolean array flagging every hit row plus `buffer_size` rows on each side."""
    hits = np.asarray(hit_mask, dtype=bool)
    n_rows = len(hits)
    flagged = np.zeros(n_rows, dtype=bool)
    for pos in np.flatnonzero(hits):
        flagged[max(0, pos - buffer_size):min(n_rows, pos + buffer_size + 1)] = True
    return flagged


def _rescale_depth(frame, depth_col, core_length):
    """Stretch `frame[depth_col]` in place so that its maximum equals `core_length`."""
    frame[depth_col] = frame[depth_col] * (core_length / frame[depth_col].max())


def preprocess_core_data(data_config):
    """
    Clean the CT, RGB, MST and HRMS tables of one core and rescale their depths.

    Each input CSV is processed only if it exists. Values that violate the
    configured thresholds are replaced with NaN together with a buffer of
    neighbouring rows, the depth column is rescaled so that its maximum equals
    ``data_config['core_length']``, and the result is written to the clean
    output folder.

    Args:
        data_config (dict): Configuration. Keys read here: 'depth_column',
            'column_configs' (with 'mst', 'hrms' and, if an RGB file exists,
            'rgb'), 'mother_dir', 'data_folder', 'clean_output_folder',
            'clean_file_paths', 'core_name', 'core_length'. A threshold is a
            3-item sequence ``(condition, value, buffer_size)`` where
            ``condition`` is one of '>', '<', '<=', '>='.

    Raises:
        ValueError: If a configured threshold uses an unsupported condition.
    """
    depth_col = data_config['depth_column']
    column_configs = data_config['column_configs']
    valid_conditions = tuple(_THRESHOLD_COMPARATORS)

    # Validate every threshold before touching the file system.
    for log_type, config in column_configs['mst'].items():
        if 'threshold' in config:
            threshold = config['threshold']
            if threshold[0] not in valid_conditions:
                raise ValueError(f"Invalid condition '{threshold[0]}' for {log_type}.")

    hrms_config = column_configs['hrms']
    if 'threshold' in hrms_config:
        threshold = hrms_config['threshold']
        if threshold[0] not in valid_conditions:
            raise ValueError(f"Invalid condition '{threshold[0]}' for hrms.")

    # Paths are built by plain concatenation, so the configured folders are
    # expected to carry their own trailing separators.
    mother_dir = data_config['mother_dir']
    clean_dir = mother_dir + data_config['clean_output_folder']
    os.makedirs(clean_dir, exist_ok=True)

    core_name = data_config['core_name']
    raw_dir = mother_dir + data_config['data_folder']

    # --- CT: depth rescaling only ---
    ct_path = raw_dir + f"{core_name}_CT.csv"
    if os.path.exists(ct_path):
        ct_data = pd.read_csv(ct_path).astype('float64')
        _rescale_depth(ct_data, depth_col, data_config['core_length'])
        ct_data.to_csv(clean_dir + data_config['clean_file_paths']['ct'], index=False)

    # --- RGB: blank out extreme readings, then rescale depth ---
    rgb_path = raw_dir + f"{core_name}_RGB.csv"
    if os.path.exists(rgb_path):
        rgb_data = pd.read_csv(rgb_path).astype('float64')
        rgb_columns = column_configs['rgb']['data_cols']

        # A row is extreme if any configured channel present in the file is out of range.
        extreme_rows = np.zeros(len(rgb_data), dtype=bool)
        for col in rgb_columns:
            if col in rgb_data.columns:
                channel = rgb_data[col]
                extreme_rows |= ((channel <= _RGB_LOW_LIMIT) | (channel >= _RGB_HIGH_LIMIT)).to_numpy()

        flagged = _buffered_mask(extreme_rows, _RGB_BUFFER_ROWS)
        if flagged.any():
            # All channels and their '<channel>_std' companions are blanked together.
            rgb_data.loc[flagged, rgb_columns + [f'{col}_std' for col in rgb_columns]] = np.nan

        _rescale_depth(rgb_data, depth_col, data_config['core_length'])
        rgb_data.to_csv(clean_dir + data_config['clean_file_paths']['rgb'], index=False)

    # Cruise-specific source folders; any core that is not an 'RR02' core uses
    # the Melville99 folders.
    if core_name.startswith('RR02'):
        mst_subfolder = "OSU orignal dataset/R-V_Revelle02/Calibrated_MST/"
        hrms_subfolder = "OSU orignal dataset/R-V_Revelle02/RR0207_point_mag/"
    else:
        mst_subfolder = "OSU orignal dataset/R-V_Melville99/Calibrated_MST/"
        hrms_subfolder = "OSU orignal dataset/R-V_Melville99/M9907_point_mag/"

    # --- MST: per-column thresholds, always combined with the density mask ---
    mst_path = mother_dir + mst_subfolder + f"{core_name}_MST.csv"
    if os.path.exists(mst_path):
        mst_data = pd.read_csv(mst_path).astype('float64')
        mst_configs = column_configs['mst']

        # Map each file column name back to its config key.
        column_to_config_map = {
            config['data_col']: log_type
            for log_type, config in mst_configs.items()
            if 'data_col' in config
        }

        density_config = mst_configs['density']
        density_col = density_config['data_col']
        density_flagged = np.zeros(len(mst_data), dtype=bool)
        if density_col in mst_data.columns and 'threshold' in density_config:
            condition, threshold_value, buffer_size = density_config['threshold']
            density_flagged = _buffered_mask(
                _threshold_mask(mst_data[density_col], condition, threshold_value),
                buffer_size,
            )

        for column in mst_data.columns:
            if column not in column_to_config_map:
                continue
            column_config = mst_configs[column_to_config_map[column]]
            if 'threshold' not in column_config:
                continue
            condition, threshold_value, buffer_size = column_config['threshold']
            own_flagged = _buffered_mask(
                _threshold_mask(mst_data[column], condition, threshold_value),
                buffer_size,
            )
            # Rows with extreme density are dropped from every thresholded column.
            flagged = own_flagged | density_flagged
            if flagged.any():
                mst_data.loc[flagged, column] = np.nan

        _rescale_depth(mst_data, depth_col, data_config['core_length'])
        mst_data.to_csv(clean_dir + data_config['clean_file_paths']['mst'], index=False)

    # --- HRMS: single-column threshold; files with 3 or fewer rows are skipped ---
    hrms_path = mother_dir + hrms_subfolder + f"{core_name}_point_mag.csv"
    if os.path.exists(hrms_path):
        hrms_data = pd.read_csv(hrms_path).astype('float64')
        if len(hrms_data) > 3:
            hrms_col = hrms_config['data_col']

            if 'threshold' in hrms_config:
                condition, threshold_value, buffer_size = hrms_config['threshold']
                flagged = _buffered_mask(
                    _threshold_mask(hrms_data[hrms_col], condition, threshold_value),
                    buffer_size,
                )
                if flagged.any():
                    hrms_data.loc[flagged, hrms_col] = np.nan

            _rescale_depth(hrms_data, depth_col, data_config['core_length'])
            hrms_data.to_csv(clean_dir + data_config['clean_file_paths']['hrms'], index=False)

### Function for plotting cleaned core images and logs

In [None]:
def _add_gradient_fill(ax, values, depths, cmap, norm):
    """Fill the area between x=0 and a curve with one coloured quad per depth step.

    Steps where either end value is NaN are left empty. Each quad is coloured
    by the mean of its two end values, mapped through `norm` and `cmap`.
    """
    polys = []
    facecolors = []
    for i in range(len(depths) - 1):
        upper, lower = values[i], values[i + 1]
        if np.isnan(upper) or np.isnan(lower):
            continue
        polys.append([
            (0, depths[i]),
            (upper, depths[i]),
            (lower, depths[i + 1]),
            (0, depths[i + 1]),
        ])
        facecolors.append(cmap(norm((upper + lower) / 2)))

    if polys:
        ax.add_collection(
            mcoll.PolyCollection(polys, facecolors=facecolors, edgecolors='none', alpha=0.95)
        )


def plot_core_logs(data_config, file_type='clean', title=None):
    """Plot the core images and logs side by side, sharing a depth axis.

    Panels are created, left to right, for: CT image + CT curve, RGB image +
    RGB channels (with a luminance fill if a 'Lumin' column exists), magnetic
    susceptibility (MST and/or HRMS), and every other MST log that has data.

    Args:
        data_config (dict): Configuration. Keys read here: 'depth_column',
            'mother_dir', 'ct_image_path', 'rgb_image_path', 'core_length',
            'core_name', 'column_configs', and either 'clean_file_paths' /
            'clean_output_folder' or 'filled_file_paths' / 'filled_output_folder'.
        file_type (str): 'clean' selects the cleaned files; any other value
            selects the filled files.
        title (str): Figure title. Built from the core name when None.

    Returns:
        tuple: (fig, axes), where axes is always indexable, including for a single panel.

    Raises:
        ValueError: If no data file could be loaded or nothing is plottable.
    """
    depth_col = data_config['depth_column']

    # Pick the file set to plot.
    if file_type == 'clean':
        data_paths = data_config.get('clean_file_paths', {})
        output_folder = data_config['clean_output_folder']
    else:
        data_paths = data_config.get('filled_file_paths', {})
        output_folder = data_config['filled_output_folder']

    available_columns = data_config.get('column_configs', {})

    # Only data types that have both a file path and a column config are usable.
    valid_data_types = set(data_paths.keys()) & set(available_columns.keys())
    full_paths = {
        data_type: data_config['mother_dir'] + output_folder + data_paths[data_type]
        for data_type in valid_data_types
    }

    # Images are optional; a missing image disables its two panels.
    ct_img_path = data_config['mother_dir'] + data_config['ct_image_path']
    rgb_img_path = data_config['mother_dir'] + data_config['rgb_image_path']
    ct_img = plt.imread(ct_img_path) if os.path.exists(ct_img_path) else None
    rgb_img = plt.imread(rgb_img_path) if os.path.exists(rgb_img_path) else None

    core_length = data_config['core_length']
    core_name = data_config['core_name']

    if title is None:
        file_type_title = 'Cleaned' if file_type == 'clean' else 'ML-Filled'
        title = f'{core_name} {file_type_title} Logs'

    # Load every existing file that contains the depth column.
    data = {}
    for key, path in full_paths.items():
        if os.path.exists(path):
            loaded_data = pd.read_csv(path)
            if depth_col in loaded_data.columns:
                data[key] = loaded_data

    if not data:
        raise ValueError("No valid data files found to plot")

    # Decide once which panels exist, so that the panel count and the plotting
    # below can never disagree.
    show_ct = ct_img is not None and 'ct' in data
    show_rgb = rgb_img is not None and 'rgb' in data
    has_mst = 'mst' in data and 'mst' in available_columns

    # MS curve from the MST table (None when unavailable or entirely NaN).
    mst_ms_col = None
    if has_mst:
        candidate = available_columns['mst'].get('ms', {}).get('data_col')
        if candidate in data['mst'].columns and not data['mst'][candidate].isna().all():
            mst_ms_col = candidate

    # MS curve from the HRMS table (None when unavailable or entirely NaN).
    hrms_ms_col = None
    if 'hrms' in data and 'hrms' in available_columns:
        candidate = available_columns['hrms'].get('data_col')
        if candidate in data['hrms'].columns and not data['hrms'][candidate].isna().all():
            hrms_ms_col = candidate

    has_ms = mst_ms_col is not None or hrms_ms_col is not None

    # Every other MST log with a configured column that holds at least one value.
    other_mst_logs = []
    if has_mst:
        for log_type, config in available_columns['mst'].items():
            if log_type == 'ms' or 'data_col' not in config:
                continue
            column = config['data_col']
            if column in data['mst'].columns and not data['mst'][column].isna().all():
                other_mst_logs.append((log_type, config))

    n_plots = 2 * show_ct + 2 * show_rgb + int(has_ms) + len(other_mst_logs)
    if n_plots == 0:
        raise ValueError("No data available to plot")

    fig, axes = plt.subplots(1, n_plots, figsize=(1.2*n_plots, 12))
    if n_plots == 1:
        axes = [axes]  # plt.subplots returns a bare Axes for a single panel

    fig.suptitle(title, fontweight='bold', fontsize=14)

    current_ax = 0

    # --- CT image and CT brightness curve ---
    if show_ct:
        ax = axes[current_ax]
        ax.imshow(ct_img, aspect='auto', extent=[0, 1, core_length, 0], cmap='gray')
        ax.set_xticks([])
        ax.set_xlabel('Sediment\nCore\nCT Scan', fontweight='bold', fontsize='small')
        current_ax += 1

        ax = axes[current_ax]
        ct_config = available_columns['ct']
        ct_col = ct_config['data_col']
        ct_std_col = ct_config.get('std_col')
        ct_depth = data['ct'][depth_col].astype(np.float64)
        ct_series = data['ct'][ct_col].astype(np.float64)

        ax.plot(ct_series, ct_depth, color='black', linewidth=0.7)

        # +/- one standard deviation band, when the column exists.
        if ct_std_col in data['ct'].columns:
            ct_std = data['ct'][ct_std_col].astype(np.float64)
            ax.fill_betweenx(ct_depth, ct_series - ct_std, ct_series + ct_std,
                             color='black', alpha=0.2, linewidth=0)

        # Colour-coded fill under the CT curve; colours span the fixed range 300-1600.
        _add_gradient_fill(ax, ct_series.values, ct_depth.values,
                           plt.cm.jet, plt.Normalize(300, 1600))

        ax.set_xlabel('CT\nBrightness', fontweight='bold', fontsize='small')
        ax.grid(True)
        ax.set_xlim(300, None)
        ax.tick_params(axis='x', labelsize='x-small')
        current_ax += 1

    # --- RGB image and RGB channel curves ---
    if show_rgb:
        ax = axes[current_ax]
        ax.imshow(rgb_img, aspect='auto', extent=[0, 0.5, core_length, 0])
        ax.set_xticks([])
        ax.set_xlabel('Sediment\nCore\nPhoto', fontweight='bold', fontsize='small')
        current_ax += 1

        ax = axes[current_ax]
        rgb_config = available_columns['rgb']
        rgb_cols = rgb_config['data_cols']
        rgb_stds = rgb_config['std_cols']
        rgb_depth = data['rgb'][depth_col].astype(np.float64)
        colors = ['red', 'green', 'blue']

        # Only the first three configured channels are drawn, as red/green/blue.
        for col, std, color in zip(rgb_cols[:3], rgb_stds[:3], colors):
            if col not in data['rgb'].columns:
                continue
            channel = data['rgb'][col].astype(np.float64)
            ax.plot(channel, rgb_depth, color=color, linewidth=0.7)
            if std in data['rgb'].columns:
                channel_std = data['rgb'][std].astype(np.float64)
                ax.fill_betweenx(rgb_depth, channel - channel_std, channel + channel_std,
                                 color=color, alpha=0.2, linewidth=0)

        # Luminance fill on the same panel, coloured over its own value range.
        if 'Lumin' in data['rgb'].columns:
            lumin_values = data['rgb']['Lumin'].astype(np.float64).values
            valid_lumin = lumin_values[~np.isnan(lumin_values)]
            if len(valid_lumin) > 0:
                vmin, vmax = valid_lumin.min(), valid_lumin.max()
                # A constant luminance has no range to normalise over; skip the fill.
                if not np.isclose(vmin, vmax):
                    _add_gradient_fill(ax, lumin_values, rgb_depth.values,
                                       plt.cm.inferno, plt.Normalize(vmin, vmax))

        ax.set_xlabel('RGB\nChannels', fontweight='bold', fontsize='small')
        ax.grid(True)
        ax.tick_params(axis='x', labelsize='x-small')
        current_ax += 1

    # --- Magnetic susceptibility: MST (green) and HRMS (red) on one panel ---
    if has_ms:
        ax = axes[current_ax]
        if mst_ms_col is not None:
            ax.plot(data['mst'][mst_ms_col].astype(np.float64),
                    data['mst'][depth_col].astype(np.float64),
                    color='green', linewidth=0.7)
        if hrms_ms_col is not None:
            ax.plot(data['hrms'][hrms_ms_col].astype(np.float64),
                    data['hrms'][depth_col].astype(np.float64),
                    color='red', linewidth=0.7)

        # '\u03bc' is the Greek letter mu; written as an escape so the label
        # survives file re-encoding.
        ax.set_xlabel('Magnetic\nSusceptibility\n(\u03bcSI)', fontweight='bold', fontsize='small')
        ax.tick_params(axis='x', labelsize='x-small')
        ax.grid(True)
        current_ax += 1

    # --- Remaining MST logs, one panel each ---
    for log_type, config in other_mst_logs:
        ax = axes[current_ax]
        data_col = config['data_col']
        plot_label = config.get('plot_label', data_col)  # fall back to the column name
        plot_color = config.get('plot_color', 'black')   # fall back to black

        ax.plot(
            data['mst'][data_col].astype(np.float64),
            data['mst'][depth_col].astype(np.float64),
            color=plot_color,
            linewidth=0.7
        )
        ax.set_xlabel(plot_label, fontweight='bold', fontsize='small')
        ax.tick_params(axis='x', labelsize='x-small')
        ax.grid(True)
        if log_type == 'density':
            ax.set_xlim(1, 2)
        current_ax += 1

    # Shared depth axis: 0 at the top, core_length at the bottom. Only the
    # leftmost panel carries the depth label and tick labels.
    axes[0].set_ylabel('Depth (cm)', fontweight='bold')
    for i, ax in enumerate(axes):
        ax.set_ylim(core_length, 0)
        if i > 0:
            ax.tick_params(axis='y', labelleft=False)

    plt.tight_layout()
    return fig, axes

## **Functions for Machine Learning to fill data gaps**

### Function for plotting filled data

In [None]:
def plot_filled_data(target_log, original_data, filled_data, core_length, core_name, data_config, ML_type='ML'):
    """
    Draw one log against depth, overlaying the ML-filled curve when the original has gaps.

    Args:
        target_log (str): Column name of the log to plot
        original_data (pd.DataFrame): Table holding the original (possibly gappy) log
        filled_data (pd.DataFrame): Table holding the same log after gap filling
        core_length (int): Upper limit of the depth axis
        core_name (str): Core name used in the figure title
        data_config (dict): Configuration; only 'depth_column' is read here
        ML_type (str): Method name shown in the title when gaps exist
    """
    depth_col = data_config['depth_column']

    # The filled curve and the method name are only shown when something was missing.
    gaps_present = original_data[target_log].isna().any()
    if gaps_present:
        title_suffix = f'Use {ML_type} for Data Gap Filling'
    else:
        title_suffix = "(No Data Gap to be filled by ML)"

    fig, ax = plt.subplots(figsize=(15, 3))
    fig.suptitle(f'{core_name} {target_log} Values {title_suffix}', fontweight='bold')

    # Filled curve goes first so the original is drawn on top of it.
    if gaps_present:
        ax.plot(
            filled_data[depth_col],
            filled_data[target_log],
            color='red',
            label=f'ML Predicted {target_log}',
            linewidth=0.7,
            alpha=0.7,
        )

    ax.plot(
        original_data[depth_col],
        original_data[target_log],
        color='black',
        label=f'Original {target_log}',
        linewidth=0.7,
    )

    # Shaded +/- one standard deviation band, if a '<log>_std' column is present.
    std_col = f'{target_log}_std'
    if std_col in original_data.columns:
        centre = original_data[target_log]
        spread = original_data[std_col]
        ax.fill_between(
            original_data[depth_col],
            centre - spread,
            centre + spread,
            color='black',
            alpha=0.2,
            linewidth=0,
        )

    # Axis cosmetics.
    ax.set_ylabel(f'{target_log}\nBrightness', fontweight='bold', fontsize='small')
    ax.set_xlabel('Depth (cm)')
    ax.grid(True)
    # NOTE(review): the explicit limits below run 0 -> core_length, which overrides
    # this inversion - confirm which direction is intended.
    ax.invert_xaxis()
    ax.set_xlim(0, core_length)
    ax.tick_params(axis='y', labelsize='x-small')
    ax.legend()

    plt.tight_layout()
    plt.show()

### Functions for Machine Learning Data Gap filling

#### Helper Functions for fill_gaps_with_ml

In [None]:
def prepare_feature_data(target_log, All_logs, merge_tolerance, data_config):
    """Build the depth-aligned table of target and feature logs used for ML training.

    Args:
        target_log (str): Name of the column to be predicted.
        All_logs (dict): Maps a dataset name to a ``(DataFrame, column_list)`` pair.
        merge_tolerance (float): Largest depth difference accepted when pairing rows.
        data_config (dict): Configuration; only 'depth_column' is read here.

    Returns:
        tuple: (target_data, merged_data, features) - a copy of the dataset that
        lists the target, the merged table, and the feature column names
        (prefixed with their dataset name, followed by the depth column).

    Raises:
        ValueError: If no dataset lists `target_log` among its columns.
    """
    depth_col = data_config['depth_column']

    # The first dataset that lists the target supplies the target rows.
    for frame, listed_cols in All_logs.values():
        if target_log in listed_cols:
            target_data = frame.copy()
            break
    else:
        raise ValueError(f"Target log '{target_log}' not found in any dataset")

    target_data[depth_col] = target_data[depth_col].astype('float64')

    merged_data = target_data[[depth_col, target_log]].copy()
    features = []

    for df_name, (frame, listed_cols) in All_logs.items():
        # Datasets that list the target are not used as feature sources.
        if target_log in listed_cols:
            continue

        feature_cols = [col for col in listed_cols if col != depth_col]
        prefixed_names = {col: f'{df_name}_{col}' for col in feature_cols}
        # Each feature table keeps its own depth under a temporary name so it
        # cannot clash with the target depth during the merge.
        temp_depth_col = f'{depth_col}_{df_name}'

        work = frame.copy()
        work[depth_col] = work[depth_col].astype('float64')
        work = work.rename(columns={depth_col: temp_depth_col})

        # Numeric feature columns are promoted to float64.
        for col in feature_cols:
            if work[col].dtype.kind in 'biufc':
                work[col] = work[col].astype('float64')

        work = work.rename(columns=prefixed_names).sort_values(temp_depth_col)

        # Pair each target row with the nearest feature row within the tolerance.
        merged_data = pd.merge_asof(
            merged_data.sort_values(depth_col),
            work,
            left_on=depth_col,
            right_on=temp_depth_col,
            direction='nearest',
            tolerance=merge_tolerance
        )

        # A missing temporary depth marks a target row that found no partner.
        unmatched = merged_data[temp_depth_col].isna().sum()
        if unmatched > 0:
            warnings.warn(f"{unmatched} rows did not have a matching depth within tolerance for log '{df_name}'.")

        features.extend(prefixed_names[col] for col in feature_cols)
        merged_data = merged_data.drop(columns=[temp_depth_col])

    # Depth itself is the final feature.
    features.append(depth_col)

    return target_data, merged_data, features

In [None]:
def apply_feature_weights(X, data_config):
    """Return a copy of X whose feature columns are scaled by their configured weights.

    Weights come from ``data_config['column_configs']``: a 'feature_weights'
    list paired with 'data_cols' for RGB, and a single 'feature_weight' per MST
    log type and for HRMS. Every scaled column is stored as float32; columns
    that match no weighted name are left as they are.
    """
    weighted = X.copy()
    column_configs = data_config['column_configs']

    def scale_matching(name_fragment, factor):
        # A weight applies to every column whose name CONTAINS the data column
        # name, so a column matching several names is scaled once per match.
        # NOTE(review): short names such as 'R' or 'B' may match more columns
        # than intended - confirm against the actual feature names.
        for feature in [name for name in weighted.columns if name_fragment in name]:
            weighted[feature] = (weighted[feature] * factor).astype('float32')

    # RGB: one weight per channel, paired by position.
    rgb_config = column_configs.get('rgb', {})
    if 'feature_weights' in rgb_config:
        for channel, factor in zip(rgb_config['data_cols'], rgb_config['feature_weights']):
            scale_matching(channel, factor)

    # MST: each log type may carry its own weight.
    for log_config in column_configs.get('mst', {}).values():
        if 'feature_weight' in log_config:
            scale_matching(log_config['data_col'], log_config['feature_weight'])

    # HRMS: a single weight.
    hrms_config = column_configs.get('hrms', {})
    if 'feature_weight' in hrms_config:
        scale_matching(hrms_config['data_col'], hrms_config['feature_weight'])

    return weighted

In [None]:
def adjust_gap_predictions(df, gap_mask, ml_preds, target_log, data_config):
    """
    Blend ML predictions with linear interpolation inside each contiguous gap.

    For every run of consecutive gap rows that has an observed value on both
    sides, the prediction is pulled towards the straight line joining the two
    boundary values: fully at the gap edges, not at all at the gap centre.
    Runs touching the first or last row of `df`, or whose boundary value is
    NaN, keep their raw predictions.

    Args:
        df (pd.DataFrame): Table containing `target_log` and the depth column.
        gap_mask (pd.Series or array-like of bool): True for each row of `df` that is a gap.
        ml_preds (array-like): One prediction per True entry of `gap_mask`, in row order.
        target_log (str): Name of the column being filled.
        data_config (dict): Configuration; only 'depth_column' is read here.

    Returns:
        np.ndarray: Adjusted predictions as float64, in the same order as
        `ml_preds`; empty when there are no gaps.

    Raises:
        ValueError: If the number of predictions differs from the number of gap rows.
    """
    depth_col = data_config['depth_column']

    gap_positions = np.flatnonzero(np.asarray(gap_mask, dtype=bool))
    # Work in float64 so blended values are never truncated by an integer dtype.
    raw_preds = np.asarray(ml_preds, dtype='float64')
    if len(raw_preds) != len(gap_positions):
        raise ValueError(
            f"Got {len(raw_preds)} predictions for {len(gap_positions)} gap rows"
        )

    adjusted = raw_preds.copy()
    if gap_positions.size == 0:
        # No gaps: splitting an empty array would yield one empty segment,
        # which cannot be indexed.
        return adjusted

    # Pull the two columns out once instead of building a row object per lookup.
    target_values = df[target_log].to_numpy(dtype='float64')
    depth_values = df[depth_col].to_numpy(dtype='float64')
    last_row = len(df) - 1

    # Split the gap rows wherever consecutive positions are not adjacent.
    run_breaks = np.flatnonzero(np.diff(gap_positions) != 1) + 1
    offset = 0  # index into the prediction arrays of the current run's first row
    for run in np.split(gap_positions, run_breaks):
        run_slice = slice(offset, offset + len(run))
        offset += len(run)

        first_row, final_row = run[0], run[-1]
        # A run touching either end of the table has only one boundary.
        if first_row == 0 or final_row == last_row:
            continue

        left_value = target_values[first_row - 1]
        right_value = target_values[final_row + 1]
        if np.isnan(left_value) or np.isnan(right_value):
            continue

        left_depth = depth_values[first_row - 1]
        right_depth = depth_values[final_row + 1]

        # Relative position of each gap row between the boundaries (0 = left, 1 = right).
        if right_depth == left_depth:
            rel_pos = np.full(len(run), 0.5)
        else:
            rel_pos = (depth_values[run] - left_depth) / (right_depth - left_depth)

        interpolated = left_value + (right_value - left_value) * rel_pos
        # Triangular weight: 0 at the boundaries, 1 at the midpoint.
        ml_weight = np.clip(1 - 2 * np.abs(rel_pos - 0.5), 0, 1)
        adjusted[run_slice] = interpolated + ml_weight * (raw_preds[run_slice] - interpolated)

    return adjusted

In [None]:
def train_model(model):
    """Wrap `model` in a callable that fits it and then predicts in one call.

    The returned function takes (X_train, y_train, X_pred), calls
    ``model.fit(X_train, y_train)`` and returns ``model.predict(X_pred)``.
    """
    def fit_then_predict(X_train, y_train, X_pred):
        # The captured model is fitted in place, then queried.
        model.fit(X_train, y_train)
        predictions = model.predict(X_pred)
        return predictions

    return fit_then_predict

#### ML functions

In [None]:
def fill_gaps_with_ml(target_log, All_logs, output_csv=True, output_dir=None, core_name=None, 
                      merge_tolerance=3.0, ml_method='xgblgbm', data_config=None):
    """
    Predict the missing values of one log column with the selected ML method.

    Args:
        target_log (str): Column whose NaN entries should be filled.
        All_logs (dict): Dataframes (with their column lists) supplying the
            target and the feature logs; forwarded to prepare_feature_data.
        output_csv (bool): Write the filled table to a CSV file when True.
        output_dir (str): Folder for the CSV (required when output_csv is True).
        core_name (str): Core name used in the CSV file name (required when
            output_csv is True).
        merge_tolerance (float): Depth tolerance forwarded to
            prepare_feature_data when merging logs.
        ml_method (str): One of 'rf', 'rftc', 'xgb', 'xgblgbm'.
        data_config (dict): Configuration forwarded to the feature preparation
            and to the individual ML back-ends.

    Returns:
        tuple: (target_data_filled, gap_mask) - the target dataframe with the
        gaps replaced by predictions, and the boolean mask of the rows that
        were missing.

    Raises:
        ValueError: If target_log/All_logs is None, if CSV output is requested
            without output_dir and core_name, or if ml_method is unknown.
    """
    # --- Argument checks ---
    if target_log is None or All_logs is None:
        raise ValueError("Both target_log and All_logs must be provided")

    if output_csv and (output_dir is None or core_name is None):
        raise ValueError("output_dir and core_name must be provided when output_csv is True")

    if ml_method not in ('rf', 'rftc', 'xgb', 'xgblgbm'):
        raise ValueError("ml_method must be one of: 'rf', 'rftc', 'xgb', 'xgblgbm'")

    # --- Build the merged feature table and locate the gaps ---
    target_data, merged_data, features = prepare_feature_data(target_log, All_logs, merge_tolerance, data_config)
    target_data_filled = target_data.copy()
    gap_mask = target_data[target_log].isna()

    # Only train when there is something to predict; otherwise the copy of
    # the original data is returned (and optionally written) as-is.
    if gap_mask.any():
        X = merged_data[features].copy()
        y = merged_data[target_log].copy()

        # Promote every numeric-like feature column to float64.
        for feature_name in X.columns:
            if X[feature_name].dtype.kind in 'biufc':
                X[feature_name] = X[feature_name].astype('float64')
        y = y.astype('float64')

        # Observed rows train the model; gap rows are what gets predicted.
        observed = ~gap_mask
        X_train, y_train = X[observed], y[observed]
        X_pred = X[gap_mask]

        # One zero-argument callable per method; only the selected one runs.
        predictors = {
            'rf': lambda: _apply_random_forest(X_train, y_train, X_pred),
            'rftc': lambda: _apply_random_forest_with_trend_constraints(
                X_train, y_train, X_pred, merged_data, gap_mask, target_log, data_config),
            'xgb': lambda: _apply_xgboost(X_train, y_train, X_pred, data_config),
            'xgblgbm': lambda: _apply_xgboost_lightgbm(X_train, y_train, X_pred, data_config),
        }
        target_data_filled.loc[gap_mask, target_log] = predictors[ml_method]()

    # --- Optional CSV export ---
    if output_csv:
        # The file name keeps only the part of the column name before the
        # first underscore.
        file_stem = target_log.split('_')[0]
        csv_path = os.path.join(output_dir, f"{core_name}_{file_stem}_MLfilled.csv")
        target_data_filled.to_csv(csv_path, index=False)

    return target_data_filled, gap_mask


def _apply_random_forest(X_train, y_train, X_pred):
    """
    Predict gap values with the average of a Random Forest and a
    histogram-based gradient boosting regressor.

    Args:
        X_train: Feature rows with an observed target value.
        y_train: Observed target values (pandas Series aligned with X_train).
        X_pred: Feature rows whose target value is missing.

    Returns:
        numpy.ndarray: Element-wise mean of the two models' predictions for
        X_pred.
    """
    # Drop extreme targets: keep values within 1.5 x the spread between the
    # 2.5% and 97.5% quantiles, measured outward from those quantiles.
    tail_fraction = 0.025
    lower_q = y_train.quantile(tail_fraction)
    upper_q = y_train.quantile(1 - tail_fraction)
    spread = upper_q - lower_q
    keep = y_train.between(lower_q - 1.5 * spread, upper_q + 1.5 * spread)
    X_train = X_train[keep]
    y_train = y_train[keep]

    def fit_and_predict(estimator):
        estimator.fit(X_train, y_train)
        return estimator.predict(X_pred)

    forest = RandomForestRegressor(
        n_estimators=1000,
        max_depth=30,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features='sqrt',
        bootstrap=True,
        random_state=42,
        n_jobs=-1,
    )
    boosting = HistGradientBoostingRegressor(
        max_iter=800,
        learning_rate=0.05,
        max_depth=5,
        min_samples_leaf=50,
        l2_regularization=1.0,
        random_state=42,
        verbose=0,
    )

    # Fit both estimators concurrently, then average their outputs.
    per_model_predictions = Parallel(n_jobs=-1)(
        delayed(fit_and_predict)(estimator) for estimator in (forest, boosting)
    )
    return np.mean(per_model_predictions, axis=0)


def _apply_random_forest_with_trend_constraints(X_train, y_train, X_pred, merged_data, gap_mask, target_log,
                                                data_config=None):
    """
    Apply the Random Forest ensemble and pull the predictions toward the
    linear trend between the observed values on either side of each gap.

    Args:
        X_train: Feature rows with an observed target value.
        y_train: Observed target values (pandas Series aligned with X_train).
        X_pred: Feature rows whose target value is missing.
        merged_data (pd.DataFrame): Full merged table containing the depth
            column and ``target_log``; used to look up gap boundaries.
        gap_mask (pd.Series): Boolean mask of the gap rows in ``merged_data``.
        target_log (str): Name of the target column.
        data_config (dict): Configuration providing ``'depth_column'``. It is
            required by the trend adjustment; it is declared with a default
            only so the original six-argument signature stays valid.

    Returns:
        numpy.ndarray: Trend-adjusted predictions, one per gap row.

    Raises:
        ValueError: If ``data_config`` is not provided.
    """
    # Fail before the (expensive) training if the adjustment cannot run.
    if data_config is None:
        raise ValueError("data_config must be provided for the 'rftc' method")

    # Drop extreme targets: keep values within 1.5 x the spread between the
    # 15% and 85% quantiles, measured outward from those quantiles.
    quantile_cutoff = 0.15
    Q1 = y_train.quantile(quantile_cutoff)
    Q3 = y_train.quantile(1 - quantile_cutoff)
    IQR = Q3 - Q1
    outlier_mask = (y_train >= Q1 - 1.5 * IQR) & (y_train <= Q3 + 1.5 * IQR)
    X_train = X_train[outlier_mask]
    y_train = y_train[outlier_mask]

    def train_model(model):
        model.fit(X_train, y_train)
        return model.predict(X_pred)

    # Initialize two ensemble models
    models = [
        RandomForestRegressor(n_estimators=1000,
                              max_depth=30,
                              min_samples_split=5,
                              min_samples_leaf=5,
                              max_features='sqrt',
                              bootstrap=True,
                              random_state=42,
                              n_jobs=-1),
        # verbose must be a non-negative integer for scikit-learn estimators;
        # 0 keeps training silent (same setting as _apply_random_forest).
        HistGradientBoostingRegressor(max_iter=800,
                                      learning_rate=0.05,
                                      max_depth=5,
                                      min_samples_leaf=50,
                                      l2_regularization=1.0,
                                      random_state=42,
                                      verbose=0)
    ]

    # Train models in parallel and average their predictions
    predictions = Parallel(n_jobs=-1)(delayed(train_model)(model) for model in models)
    ensemble_predictions = np.mean(predictions, axis=0)

    # Blend the ensemble output with the boundary-to-boundary interpolation.
    adjusted_predictions = _adjust_gap_predictions(merged_data, gap_mask, ensemble_predictions, target_log, data_config)

    return adjusted_predictions


def _adjust_gap_predictions(df, gap_mask, ml_preds, target_log, data_config):
    """
    Adjust ML predictions for gap rows in 'df' so that for each contiguous gap
    segment (with both left and right boundaries available) the predictions are
    blended with the linear interpolation between the boundary values.

    The blend weight is 0 at the segment boundaries (pure interpolation) and
    1 at the middle of the segment (pure ML prediction):
    ``weight = 1 - 2 * |x - 0.5|`` with ``x`` the depth normalized to [0, 1]
    between the two boundary rows.

    Args:
        df (pd.DataFrame): Table containing the depth column and ``target_log``.
        gap_mask: Boolean mask (Series or array) over the rows of ``df``; True
            marks a gap row.
        ml_preds: One ML prediction per gap row, in row order.
        target_log (str): Name of the target column in ``df``.
        data_config (dict): Configuration providing ``'depth_column'``.

    Returns:
        numpy.ndarray: float64 array of adjusted predictions, one per gap row.
        Segments touching the first or last row of ``df``, or whose boundary
        value is missing, keep their ML predictions unchanged. An empty array
        is returned when there are no gap rows.

    Raises:
        ValueError: If ``ml_preds`` is not one-dimensional with exactly one
            entry per gap row.
    """
    # Get primary depth column from config
    depth_col = data_config['depth_column']

    # Integer row positions of the gap rows.
    gap_positions = np.flatnonzero(np.asarray(gap_mask, dtype=bool))
    adjusted = np.array(ml_preds, dtype='float64')
    if adjusted.ndim != 1 or adjusted.shape[0] != gap_positions.shape[0]:
        raise ValueError(
            f"Length of ml_preds ({adjusted.shape[0] if adjusted.ndim else 0}) "
            f"does not match the number of gap rows ({gap_positions.shape[0]})"
        )
    if gap_positions.size == 0:
        return adjusted

    # Pull the two columns out once instead of materializing a row per lookup.
    depths = df[depth_col].to_numpy(dtype='float64')
    observed = df[target_log].to_numpy(dtype='float64')
    last_row = len(df) - 1

    # Split wherever consecutive gap positions are not adjacent. The same
    # split is applied to the slots of `adjusted` so both stay aligned.
    breaks = np.flatnonzero(np.diff(gap_positions) != 1) + 1
    segments = np.split(gap_positions, breaks)
    slot_groups = np.split(np.arange(gap_positions.size), breaks)

    for seg, slots in zip(segments, slot_groups):
        start_pos, end_pos = seg[0], seg[-1]

        # Enforce trend constraints only if both boundaries exist.
        if start_pos == 0 or end_pos == last_row:
            continue

        left_value = observed[start_pos - 1]
        right_value = observed[end_pos + 1]
        # Skip if boundaries are missing (should not happen if gap_mask is correct)
        if np.isnan(left_value) or np.isnan(right_value):
            continue
        left_depth = depths[start_pos - 1]
        right_depth = depths[end_pos + 1]

        # Normalize the depth position (x in [0, 1]); identical boundary
        # depths would divide by zero, so fall back to the midpoint.
        if right_depth == left_depth:
            x = np.full(seg.shape, 0.5)
        else:
            x = (depths[seg] - left_depth) / (right_depth - left_depth)

        interp_val = left_value + (right_value - left_value) * x
        weight = np.clip(1 - 2 * np.abs(x - 0.5), 0, 1)
        # Blend: final = interpolation + weight*(ML_prediction - interpolation)
        adjusted[slots] = interp_val + weight * (adjusted[slots] - interp_val)

    return adjusted


def _apply_xgboost(X_train, y_train, X_pred, data_config):
    """
    Predict gap values with a single XGBoost regressor trained on weighted,
    imputed, standardized and interaction-expanded features.

    Args:
        X_train: Feature rows with an observed target value.
        y_train: Observed target values (pandas Series aligned with X_train).
        X_pred: Feature rows whose target value is missing.
        data_config (dict): Configuration forwarded to apply_feature_weights.

    Returns:
        numpy.ndarray: float32 predictions for X_pred.
    """
    # Feature weighting happens before any other processing step.
    weighted_train = apply_feature_weights(X_train, data_config)
    weighted_pred = apply_feature_weights(X_pred, data_config)

    # Drop extreme targets: keep values within 1.5 x the spread between the
    # 2.5% and 97.5% quantiles, measured outward from those quantiles.
    tail_fraction = 0.025
    lower_q = y_train.quantile(tail_fraction)
    upper_q = y_train.quantile(1 - tail_fraction)
    spread = upper_q - lower_q
    inliers = (y_train >= lower_q - 1.5 * spread) & (y_train <= upper_q + 1.5 * spread)
    weighted_train = weighted_train[inliers]
    y_train = y_train[inliers]

    # Median imputation -> standardization -> pairwise interaction terms.
    # The selector is configured with k='all', i.e. it keeps every feature.
    preprocessing_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=True)),
        ('selector', SelectKBest(score_func=f_regression, k='all')),
    ]
    feature_pipeline = Pipeline(preprocessing_steps)

    # Fit the preprocessing on the training rows only, then reuse it.
    train_matrix = feature_pipeline.fit_transform(weighted_train, y_train).astype('float32')
    pred_matrix = feature_pipeline.transform(weighted_pred).astype('float32')
    y_train = y_train.astype('float32')

    xgb_params = {
        'n_estimators': 5000,
        'learning_rate': 0.003,
        'max_depth': 10,
        'min_child_weight': 5,
        'subsample': 0.75,
        'colsample_bytree': 0.75,
        'gamma': 0.2,
        'reg_alpha': 0.3,
        'reg_lambda': 3.0,
        'random_state': 42,
        'n_jobs': -1,
    }
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(train_matrix, y_train)

    return model.predict(pred_matrix).astype('float32')

def _apply_xgboost_lightgbm(X_train, y_train, X_pred, data_config):
    """
    Apply XGBoost + LightGBM ensemble method with configurable feature weights.

    Features are weighted, median-imputed, standardized and expanded with
    pairwise interaction terms; the best-scoring features (f_regression) are
    then kept and both models are trained on them. The result is the simple
    average of the two models' predictions.

    Args:
        X_train: Feature rows with an observed target value.
        y_train: Observed target values aligned with X_train.
        X_pred: Feature rows whose target value is missing.
        data_config (dict): Configuration forwarded to apply_feature_weights.

    Returns:
        numpy.ndarray: float32 ensemble predictions for X_pred.
    """
    # Apply feature weights BEFORE processing
    X_train_weighted = apply_feature_weights(X_train, data_config)
    X_pred_weighted = apply_feature_weights(X_pred, data_config)

    # Create feature pipeline
    feature_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=True))
    ])

    # Process features without selector first to get actual feature count
    X_train_processed = feature_pipeline.fit_transform(X_train_weighted, y_train)

    # Keep at most 50 features and roughly one feature per 10 training rows,
    # never more than exist. With fewer than 10 training rows the integer
    # division yields 0, which would make the selector drop every feature,
    # so at least one feature is always retained.
    max_features = max(1, min(50, X_train.shape[0] // 10, X_train_processed.shape[1]))
    selector = SelectKBest(score_func=f_regression, k=max_features)
    X_train_processed = selector.fit_transform(X_train_processed, y_train)
    X_pred_processed = selector.transform(feature_pipeline.transform(X_pred_weighted))

    # Convert processed arrays to float32
    X_train_processed = X_train_processed.astype('float32')
    X_pred_processed = X_pred_processed.astype('float32')
    y_train = y_train.astype('float32')

    # Initialize models
    xgb_model = xgb.XGBRegressor(
        n_estimators=3000,
        learning_rate=0.003,
        max_depth=10,
        min_child_weight=5,
        subsample=0.75,
        colsample_bytree=0.75,
        gamma=0.2,
        reg_alpha=0.3,
        reg_lambda=3.0,
        random_state=42,
        n_jobs=-1,
    )

    lgb_model = lgb.LGBMRegressor(
        n_estimators=3000,
        learning_rate=0.003,
        max_depth=6,
        num_leaves=20,
        min_child_samples=50,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.3,
        reg_lambda=3.0,
        random_state=42,
        n_jobs=-1,
        force_col_wise=True,
        verbose=-1
    )

    # Train and predict with library warnings suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        xgb_model.fit(X_train_processed, y_train)
        lgb_model.fit(X_train_processed, y_train, feature_name='auto')
        xgb_predictions = xgb_model.predict(X_pred_processed).astype('float32')
        lgb_predictions = lgb_model.predict(X_pred_processed).astype('float32')

    # Ensemble predictions (simple average)
    predictions = (xgb_predictions + lgb_predictions) / 2

    return predictions

### Function to process and fill logs with chosen ML methods

In [None]:
def process_and_fill_logs(data_config, ml_method='xgblgbm'):
    """
    Process and fill gaps in log data using ML methods with configurable parameters.

    Workflow:
      1. Load every cleaned CSV that is listed in ``clean_file_paths`` and has
         an entry in ``column_configs``.
      2. Build the feature dictionary (CT, RGB, MST, HRMS) from the configured
         columns that actually exist in the loaded files.
      3. Fill each target column with fill_gaps_with_ml and plot the result
         with plot_filled_data.
      4. Write one consolidated ``*_MLfilled.csv`` per data type and remove
         the per-column RGB/HRMS intermediates.

    Args:
        data_config (dict): Configuration. Required keys: 'depth_column',
            'mother_dir', 'core_name', 'core_length', 'clean_output_folder',
            'filled_output_folder'. Optional keys: 'clean_file_paths',
            'column_configs'. Folders are joined by string concatenation, so
            'mother_dir' and the folder entries must end with a path separator.
        ml_method (str): 'rf', 'rftc', 'xgb' or 'xgblgbm' (validated by
            fill_gaps_with_ml).

    Returns:
        None. Results are written to the filled-output folder; progress is
        reported with print(). Returns early (after printing a message) when
        no data file or no usable feature column is found.
    """
    # Get configurable parameters
    depth_col = data_config['depth_column']
    mother_dir = data_config['mother_dir']
    core_name = data_config['core_name']
    core_length = data_config['core_length']
    clean_dir = mother_dir + data_config['clean_output_folder']
    filled_dir = mother_dir + data_config['filled_output_folder']

    os.makedirs(filled_dir, exist_ok=True)

    clean_paths = data_config.get('clean_file_paths', {})
    available_columns = data_config.get('column_configs', {})

    def filled_csv_path(column):
        # Must match the per-column file name written by fill_gaps_with_ml,
        # which keeps only the part of the column name before the first '_'.
        return filled_dir + f"{core_name}_{column.split('_')[0]}_MLfilled.csv"

    # Load every configured data type whose cleaned file exists and is non-empty.
    data_dict = {}
    for data_type in set(clean_paths) & set(available_columns):
        full_path = clean_dir + clean_paths[data_type]
        if os.path.exists(full_path):
            data = pd.read_csv(full_path)
            if not data.empty:
                data_dict[data_type] = data

    if not data_dict:
        print("No valid data files found for processing")
        return

    # Build the feature dictionary: data type -> (dataframe, [depth, columns...]).
    # Insertion order (ct, rgb, mst, hrms) is kept as in the original code.
    feature_data = {}
    ct_col = None
    rgb_present = []
    mst_present = []
    hrms_col = None

    if 'ct' in data_dict:
        ct_col = available_columns['ct']['data_col']
        if ct_col in data_dict['ct'].columns:
            feature_data['ct'] = (data_dict['ct'], [depth_col, ct_col])

    if 'rgb' in data_dict:
        rgb_present = [col for col in available_columns['rgb']['data_cols']
                       if col in data_dict['rgb'].columns]
        if rgb_present:
            feature_data['rgb'] = (data_dict['rgb'], [depth_col] + rgb_present)

    if 'mst' in data_dict:
        # Entries without a 'data_col' are ignored instead of raising KeyError.
        mst_present = [config['data_col'] for config in available_columns['mst'].values()
                       if 'data_col' in config and config['data_col'] in data_dict['mst'].columns]
        if mst_present:
            feature_data['mst'] = (data_dict['mst'], [depth_col] + mst_present)

    if 'hrms' in data_dict:
        hrms_col = available_columns['hrms']['data_col']
        if hrms_col in data_dict['hrms'].columns:
            feature_data['hrms'] = (data_dict['hrms'], [depth_col, hrms_col])

    if not feature_data:
        print("No valid feature data found for ML processing")
        return

    # ML method names for plotting
    ml_names = {
        'rf': 'Random Forest', 
        'rftc': 'Random Forest with Trend Constraints',
        'xgb': 'XGBoost', 
        'xgblgbm': 'XGBoost + LightGBM'
    }

    # Targets are processed in the order RGB, CT, MST, HRMS.
    target_logs = []
    if 'rgb' in feature_data:
        target_logs.extend((col, 'rgb') for col in rgb_present)
    if 'ct' in feature_data:
        target_logs.append((ct_col, 'ct'))
    if 'mst' in feature_data:
        target_logs.extend((col, 'mst') for col in mst_present)
    if 'hrms' in feature_data:
        target_logs.append((hrms_col, 'hrms'))

    mst_filled_results = {}  # Filled MST columns, consolidated into one file below
    ct_processed = False  # Track if CT was processed

    for target_log, data_type in target_logs:
        print(f"Processing {target_log}...")
        data = data_dict[data_type]

        if data_type == 'rgb':
            # For an RGB target, the other RGB channels are excluded from the
            # features and, when a density column is available, MST
            # contributes only that column.
            logs_for_target = {k: v for k, v in feature_data.items() if k != 'rgb'}
            logs_for_target['rgb'] = (data, [depth_col, target_log])
            if 'mst' in feature_data:
                mst_df, mst_cols = feature_data['mst']
                density_col = available_columns['mst'].get('density', {}).get('data_col')
                if density_col is not None and density_col in mst_cols and density_col in mst_df.columns:
                    logs_for_target['mst'] = (mst_df, [depth_col, density_col])
        else:
            logs_for_target = feature_data

        # MST columns are gathered in memory and written once as a single
        # file; every other target is written to its own CSV first.
        is_mst = data_type == 'mst'
        filled_data, _ = fill_gaps_with_ml(
            target_log=target_log,
            All_logs=logs_for_target,
            output_csv=not is_mst,
            output_dir=None if is_mst else filled_dir,
            core_name=None if is_mst else core_name,
            ml_method=ml_method,
            data_config=data_config
        )

        if is_mst:
            mst_filled_results[target_log] = filled_data[target_log]
        elif data_type == 'ct':
            # The CT message is printed after the loop.
            ct_processed = True

        plot_filled_data(target_log, data, filled_data, core_length, core_name, data_config, ML_type=ml_names[ml_method])

    # Create consolidated MST file directly from results
    if mst_filled_results and 'mst' in data_dict:
        mst_data = data_dict['mst'].copy()
        for col, filled_values in mst_filled_results.items():
            mst_data[col] = filled_values
        mst_data.to_csv(filled_dir + f'{core_name}_MST_MLfilled.csv', index=False)
        print(f"Saved [{', '.join(mst_filled_results)}] to {core_name}_MST_MLfilled.csv")

    # Print CT message if it was processed
    if ct_processed:
        print(f"Saved [CT] to {core_name}_CT_MLfilled.csv")

    # Consolidate RGB data: merge the per-channel files into one table.
    if 'rgb' in data_dict and 'rgb' in available_columns:
        rgb_data = data_dict['rgb'].copy()
        rgb_columns = available_columns['rgb']['data_cols']
        updated_columns = []

        for col in rgb_columns:
            if col not in rgb_data.columns:
                continue
            filled_file = filled_csv_path(col)
            if not os.path.exists(filled_file):
                continue
            filled_data = pd.read_csv(filled_file)
            if col in filled_data.columns:
                rgb_data[col] = filled_data[col]
                updated_columns.append(col)

        if updated_columns:
            rgb_data.to_csv(filled_dir + f'{core_name}_RGB_MLfilled.csv', index=False)
            print(f"Saved [{', '.join(updated_columns)}] to {core_name}_RGB_MLfilled.csv")
            # The per-channel files are intermediates only.
            for col in rgb_columns:
                filled_file = filled_csv_path(col)
                if os.path.exists(filled_file):
                    os.remove(filled_file)

    # Consolidate HRMS data
    if 'hrms' in data_dict and 'hrms' in available_columns:
        hrms_data = data_dict['hrms'].copy()
        hrms_col = available_columns['hrms']['data_col']

        if hrms_col in hrms_data.columns:
            filled_file = filled_csv_path(hrms_col)
            final_file = filled_dir + f'{core_name}_hiresMS_MLfilled.csv'

            if os.path.exists(filled_file):
                filled_data = pd.read_csv(filled_file)
                if hrms_col in filled_data.columns:
                    hrms_data[hrms_col] = filled_data[hrms_col]
                    hrms_data.to_csv(final_file, index=False)
                    print(f"Saved [{hrms_col}] to {core_name}_hiresMS_MLfilled.csv")
                    # Only delete individual file if it's different from final file
                    if filled_file != final_file:
                        os.remove(filled_file)

    print("ML-based gap filling completed for all target logs.")

<hr>

### **Define data structure**

#### Define core name and core length

In [None]:
# Select the core to process. Exactly one (core_name, total_length_cm) pair
# should be active; the remaining pairs below are kept commented out as
# ready-made alternatives.
core_name = "M9907-11PC"  # Core name
total_length_cm = 439     # Core length in cm

# core_name = "M9907-12PC"  # Core name
# total_length_cm = 488     # Core length in cm

# core_name = "M9907-14TC"  # Core name
# total_length_cm = 199     # Core length in cm

# core_name = "M9907-22PC"  # Core name
# total_length_cm = 501     # Core length in cm

# core_name = "M9907-22TC"  # Core name
# total_length_cm = 173     # Core length in cm

# core_name = "M9907-23PC"  # Core name
# total_length_cm = 783     # Core length in cm

# core_name = "M9907-25PC"  # Core name
# total_length_cm = 797     # Core length in cm

# core_name = "RR0207-56PC"  # Core name
# total_length_cm = 794     # Core length in cm

# core_name = "M9907-30PC"  # Core name
# total_length_cm = 781     # Core length in cm

# core_name = "M9907-31PC"  # Core name
# total_length_cm = 767     # Core length in cm

#### Define file paths, data configuration, and outlier cut-off thresholds for ML data processing

In [None]:
# Data configuration for cleaning and ML-based gap filling.
# Column names, folders and thresholds used by the processing functions are
# read from this dictionary.

data_config = {
    # Root folder and per-core sub-folders. The functions build paths by plain
    # string concatenation (mother_dir + folder + file name), so mother_dir and
    # every folder entry must end with a path separator.
    # NOTE(review): mother_dir is a machine-specific absolute path; adjust it
    # before running on another machine.
    'mother_dir': '/Users/larryslai/Library/CloudStorage/Dropbox/My Documents/University of Texas Austin/(Project) NWP turbidites/Cascadia_core_data/OSU_dataset/',
    'core_name': core_name,
    'core_length': total_length_cm,
    'data_folder': f'_compiled_logs/{core_name}/',
    'clean_output_folder': f'_compiled_logs/{core_name}/ML_clean/',
    'filled_output_folder': f'_compiled_logs/{core_name}/ML_filled/',
    
    # File names of the cleaned logs, relative to 'clean_output_folder'.
    # A data type is only processed when it appears both here and in
    # 'column_configs'.
    'clean_file_paths': {
        'ct': f'{core_name}_CT_clean.csv',
        'rgb': f'{core_name}_RGB_clean.csv',
        'mst': f'{core_name}_MST_clean.csv',
        'hrms': f'{core_name}_hiresMS_clean.csv'
    },
    
    # File names of the gap-filled logs.
    # presumably read by the plotting function for file_type='filled' - TODO confirm
    'filled_file_paths': {
        'ct': f'{core_name}_CT_MLfilled.csv',
        'rgb': f'{core_name}_RGB_MLfilled.csv',
        'mst': f'{core_name}_MST_MLfilled.csv',
        'hrms': f'{core_name}_hiresMS_MLfilled.csv'
    },
    
    # Core images, given relative to mother_dir like the folders above.
    'ct_image_path': f'_compiled_logs/{core_name}/{core_name}_CT.tiff',
    'rgb_image_path': f'_compiled_logs/{core_name}/{core_name}_RGB.tiff',
    
    # Primary depth column name used throughout all functions
    'depth_column': 'SB_DEPTH_cm',
    
    # Per-data-type column settings.
    # 'threshold' entries are lists whose first element is a comparison
    # operator; preprocess_core_data accepts only '>', '<', '<=' and '>='.
    # NOTE(review): the second element looks like the cut-off value and the
    # third like a buffer size around flagged samples - confirm against
    # preprocess_core_data.
    # NOTE(review): 'feature_weight(s)' are presumably consumed by
    # apply_feature_weights - confirm there, including the behaviour for
    # entries that define no weight (e.g. 'elecres').
    'column_configs': {
        'ct': {
            'data_col': 'CT', 
            'std_col': 'CT_std', 
            'depth_col': 'SB_DEPTH_cm'
        },
        'rgb': {
            'data_cols': ['R', 'G', 'B', 'Lumin'],
            'std_cols': ['R_std', 'G_std', 'B_std', 'Lumin_std'],
            'depth_col': 'SB_DEPTH_cm',
            'feature_weights': [0.3, 0.3, 0.3, 0.3]  # corresponds to ['R', 'G', 'B', 'Lumin']
        },
        'mst': {
            # The 'density' key is looked up by name when filling RGB targets
            # (only the density column of MST is used as a feature there).
            'density': {
                'data_col': 'Den_gm/cc', 
                'depth_col': 'SB_DEPTH_cm',
                'plot_label': 'Density\n(g/cc)',
                'plot_color': 'orange',
                'feature_weight': 0.5,
                'threshold': ['<', 1.14, 1]
            },
            'pwvel': {
                'data_col': 'PWVel_m/s', 
                'depth_col': 'SB_DEPTH_cm',
                'plot_label': 'P-wave\nVelocity\n(m/s)',
                'plot_color': 'purple',
                'feature_weight': 0.01,
                'threshold': ['>=', 1076, 1]
            },
            'pwamp': {
                'data_col': 'PWAmp', 
                'depth_col': 'SB_DEPTH_cm',
                'plot_label': 'P-wave\nAmplitude',
                'plot_color': 'purple',
                'feature_weight': 0.01,
                'threshold': ['>=', 30, 1]
            },
            'elecres': {
                'data_col': 'ElecRes_ohmm', 
                'depth_col': 'SB_DEPTH_cm',
                'plot_label': 'Electrical\nResistivity\n(ohm-m)',
                'plot_color': 'brown',
                'threshold': ['<', 0, 1]
            },
            'ms': {
                'data_col': 'MS', 
                'depth_col': 'SB_DEPTH_cm',
                'feature_weight': 0.05,
                'threshold': ['>', 180, 1]
            }
        },
        'hrms': {
            'data_col': 'hiresMS', 
            'depth_col': 'SB_DEPTH_cm',
            'feature_weight': 3.0,
            'threshold': ['<=', 19, 1]
        }
    }
}

### Data cleaning

In [None]:
# Run the data cleaning step; preprocess_core_data validates the configured
# thresholds and creates the clean-output folder before processing the logs.
print("Starting data cleaning...")
preprocess_core_data(data_config)

# Plot the cleaned logs; plot_core_logs returns the figure and its axes.
fig, axes = plot_core_logs(
    data_config,                           # Data configuration containing all parameters
    file_type='clean',                     # Type of data files to plot ('clean' or 'filled')
    title=f'{core_name} Cleaned Logs'      # Title for the plot figure
)
plt.show()

### ML-based data gap filling

In [None]:
# Fill the gaps of every configured log and write the consolidated
# *_MLfilled.csv files to the filled-output folder.
process_and_fill_logs(data_config,              # Data configuration containing all parameters
                      ml_method='xgblgbm')      # Available ml_method options: 'rf', 'rftc', 'xgb', 'xgblgbm'
                                                # - 'rf': Random Forest ML
                                                # - 'rftc': Random Forest ML with trend constraints
                                                # - 'xgb': XGBoost ML
                                                # - 'xgblgbm': XGBoost + LightGBM ML

#### Plot ML-based gap-filled log diagram

In [None]:
# Plot ML-based gap-filled log diagram
# NOTE: the method name in the title is hard-coded; update it when a
# different ml_method was used for the gap filling above.
fig, axes = plot_core_logs(
    data_config,                                              # Data configuration containing all parameters
    file_type='filled',                                       # Type of data files to plot ('filled' for gap-filled data)
    title=f'{core_name} XGBoost + LightGBM ML-Filled Logs'    # Title for the plot figure
)
plt.show()