# Multi-node training
> Grid search training on multiple nodes

In [92]:
#| default_exp training.multi_node

In [93]:
#| hide
%load_ext autoreload
%autoreload 2


In [94]:
!which python

In [95]:
#| export
import sys
from pathlib import Path
from typing import List, Dict, Any, Union, Optional, Tuple
from fastcore.script import call_parse


In [96]:
#| export
from dataclasses import dataclass, asdict
import json
import configparser
from dotenv import load_dotenv

In [97]:
#| export
# Load environment variables from the project's .env file.
# NOTE(review): the path is hard-coded to one machine, and the `f` prefix is
# redundant because the string contains no placeholders. This cell is marked
# for export, so the call presumably runs whenever the module is imported.
load_dotenv(dotenv_path=f'/home/ai_dsx.work/data/projects/be-vision-ad-tools/be-vision-ad-tools/.env')

In [98]:
# Directory holding the project's notebooks; not referenced by any other
# visible cell.
# NOTE(review): the name looks like a typo of CURRENT_NB - confirm nothing
# outside this notebook relies on it before renaming.
CURRETNT_NB='/home/ai_dsx.work/data/projects/be-vision-ad-tools/nbs'

In [99]:
#| export
# Import our training functions
from be_vision_ad_tools.training.flexible_trainer import (
    FlexibleTrainingConfig, train_anomaly_model, ModelType, BackboneType
)
from be_vision_ad_tools.training.hyperparameter_search import (
    simple_hyperparameter_search, create_modular_batch_comparison_poster
)


In [100]:
#| export
from typing import List, Dict, Any, Union, Optional, Tuple
from pathlib import Path
from datetime import datetime
import json
import time
import subprocess
from itertools import product
import configparser
import shutil
import os
import yaml
from dataclasses import dataclass, asdict

In [101]:
#| export
def generate_training_tasks(
    data_root: Union[str, Path],  # Root directory containing data
    normal_dir: str = "good",     # Normal images subdirectory
    abnormal_dir: str = "bad",    # Abnormal images subdirectory
    class_name: str = "multinode_search",  # Base class name

    # Parameter combinations to distribute
    model_names: List[str] = None,        # Models to test
    backbones: List[str] = None,          # Backbones to test
    n_features_list: List[int] = None,    # Features to test
    layers: List[List[str]] = None,       # Layers to test

    # Training settings
    max_epochs: int = 10,                 # Training epochs
    output_base: Union[str, Path] = None, # Base output directory

) -> List[Dict[str, Any]]:
    """
    Expand a hyperparameter grid into independent training task descriptions.

    The Cartesian product of ``model_names`` x ``backbones`` x
    ``n_features_list`` x ``layers`` is enumerated; every combination becomes
    one plain dict holding the parameters plus the output locations derived
    from ``output_base / <task_id>``. No directories are created here.

    Returns:
        One dict per combination, in product order, with keys ``task_id``,
        ``index``, ``model_name``, ``backbone``, ``n_features``, ``layers``,
        ``data_root``, ``normal_dir``, ``abnormal_dir``, ``class_name``,
        ``max_epochs``, ``output_folder``, ``save_path`` and
        ``task_output_base``.
    """

    # Fall back to the built-in search space for every axis left unspecified.
    model_names = ['padim'] if model_names is None else model_names
    backbones = ['wide_resnet50_2'] if backbones is None else backbones
    n_features_list = [64, 128] if n_features_list is None else n_features_list
    layers = [['layer1'], ['layer1', 'layer2', 'layer3']] if layers is None else layers

    # Results live next to the data unless the caller picked another location.
    output_root = Path(data_root) / 'multinode_results' if output_base is None else Path(output_base)

    grid = list(product(model_names, backbones, n_features_list, layers))

    print(f"üß™ Generating {len(grid)} training tasks")
    print(f"üì¶ Models: {model_names}")
    print(f"üèóÔ∏è Backbones: {backbones}")
    print(f"üî¢ Features: {n_features_list}")
    print(f"üìä Layers: {layers}")

    def _describe(index: int, combo: tuple) -> Dict[str, Any]:
        """Turn one grid point into its task dict."""
        model_name, backbone, n_features, layer_list = combo

        # The zero-padded index keeps identifiers unique and sortable.
        task_id = f"task_{index:03d}_{model_name}_{backbone}_{n_features}_{'-'.join(layer_list)}"
        task_dir = output_root / task_id

        return {
            'task_id': task_id,
            'index': index,
            'model_name': model_name,
            'backbone': backbone,
            'n_features': n_features,
            'layers': layer_list,
            'data_root': str(data_root),
            'normal_dir': normal_dir,
            'abnormal_dir': abnormal_dir,
            'class_name': f"{class_name}_{task_id}",
            'max_epochs': max_epochs,
            'output_folder': str(task_dir / "results"),
            'save_path': str(task_dir / "model"),
            'task_output_base': str(task_dir)
        }

    tasks = [_describe(index, combo) for index, combo in enumerate(grid)]

    print(f"‚úÖ Generated {len(tasks)} training tasks")
    return tasks


In [46]:
#| eval: false
#tasks = generate_training_tasks(
    #data_root="/home/ai_dsx.work/data/projects/AD_tool_test/images",
    #normal_dir="good",
    #abnormal_dir="bad",
    #class_name="test_hyperparam",
    #output_base="/home/ai_dsx.work/data/projects/AD_tool_test/images/multinode_results"
#)
#tasks

In [102]:
#| eval: true
def test_generate_training_tasks():
    """Exercise generate_training_tasks with the default grid and a custom single-point grid."""
    from fastcore.test import test_eq, test_ne, test
    import tempfile

    expected_keys = (
        'task_id', 'index', 'model_name', 'backbone', 'n_features',
        'layers', 'data_root', 'normal_dir', 'abnormal_dir',
        'class_name', 'max_epochs', 'output_folder', 'save_path',
        'task_output_base',
    )

    # A throw-away directory stands in for the data root.
    with tempfile.TemporaryDirectory() as scratch:

        # --- default grid -------------------------------------------------
        default_tasks = generate_training_tasks(
            data_root=scratch,
            normal_dir="good",
            abnormal_dir="bad",
            class_name="test_hyperparam",
            output_base=None  # Should auto-generate path
        )

        # 1 model * 1 backbone * 2 n_features * 2 layer combinations = 4 tasks
        test_eq(len(default_tasks), 4)

        # Every documented key must be present on a generated task.
        sample = default_tasks[0]
        for key in expected_keys:
            assert key in sample, f"Task should contain '{key}' key"

        # Identifiers must not collide.
        seen_ids = [entry['task_id'] for entry in default_tasks]
        test_eq(len(seen_ids), len(set(seen_ids)))

        # Indices count up from zero without gaps.
        test_eq([entry['index'] for entry in default_tasks], list(range(len(default_tasks))))

        # --- custom single-combination grid --------------------------------
        custom_tasks = generate_training_tasks(
            data_root=scratch,
            normal_dir="normal",
            abnormal_dir="anomaly",
            class_name="custom_test",
            model_names=['padim'],
            backbones=['resnet18'],
            n_features_list=[32],
            layers=[['layer1']],
            max_epochs=5
        )

        test_eq(len(custom_tasks), 1)
        only_task = custom_tasks[0]
        test_eq(only_task['normal_dir'], "normal")
        test_eq(only_task['abnormal_dir'], "anomaly")
        test_eq(only_task['max_epochs'], 5)
        print("‚úÖ All tests passed for generate_training_tasks!")


In [48]:
# Run the test
#test_generate_training_tasks()

In [49]:
#| export
def create_lsf_config(
    session_name: str,  # Session name for identification
    tasks: List[Dict[str, Any]],  # Tasks generated by generate_training_tasks
    output_dir: Union[str, Path],  # Directory for LSF files

    # LSF Worker Configuration
    worker_ui: str = "python",        # User interface (e.g., "python", "R")
    worker_um: str = "background",    # User mode
    threads_per_task: int = 1,        # Threads per task
    mem_per_task: int = 8000,         # Memory per task (MB)
    os_constraint: str = "(LINUX80)", # OS constraint
    tasks_per_worker: int = 1,        # Tasks per worker (usually 1 for training)
    num_workers: int = None,          # Number of workers (auto-calculated if None)

    # Session Configuration
    keep_running: int = 0,            # Keep controller running after tasks
    linger_time: int = 300,           # Time to keep workers alive for dynamic imports

) -> Tuple[Path, Path, Path]:
    """
    Write the session configuration file for a multi-node training run.

    Only ``<session_name>_session.cfg`` is written (the output directory is
    created when missing). The task-file and connection-file paths are merely
    computed and referenced from the configuration: the task file is expected
    to be produced afterwards with create_task_file(), and the printed notice
    says the connection file is created by LSF.

    Returns:
        Tuple of (config_file_path, task_file_path, connection_file_path);
        only the first exists when this function returns.
    """

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Without an explicit worker count use one worker per task, capped at 50.
    if num_workers is None:
        num_workers = min(len(tasks), 50)

    print(f"üîß Creating LSF configuration for {len(tasks)} tasks")
    print(f"üë• Workers: {num_workers}")
    print(f"üìã Tasks per worker: {tasks_per_worker}")
    print(f"üíæ Memory per task: {mem_per_task}MB")

    # All three paths share the session-name prefix.
    config_file = target_dir / f"{session_name}_session.cfg"
    task_file = target_dir / f"{session_name}_tasks.txt"
    connection_file = target_dir / f"{session_name}_connection.file"

    worker_name = f"{session_name}_worker"
    workload_name = f"{session_name}_workload"

    # Section order below is the order in which sections appear in the file.
    sections = {
        'COMMON': {
            'connection_file': str(connection_file),
            'session_name': f'"{session_name}"'
        },
        'CONTROLLER': {
            'keep_running': str(keep_running)
        },
        f'WORKER {worker_name}': {
            'tasks': str(tasks_per_worker),
            'workload': workload_name,
            'linger_time': str(linger_time)
        },
        f'WORKLOAD {workload_name}': {
            'input_file': str(task_file),
            'worker_ui': worker_ui,
            'worker_um': worker_um,
            'threads_per_task': str(threads_per_task),
            'mem_per_task': str(mem_per_task),
            'os': os_constraint
        },
        'SCHEDULE _startup_': {
            'workloads': f'({workload_name})',
            'workers': f'({worker_name}={num_workers})'
        },
    }

    parser = configparser.ConfigParser()
    parser.optionxform = str  # Preserve option-name case
    parser.read_dict(sections)

    # Emit "key=value" without spaces around the delimiter.
    with open(config_file, 'w') as handle:
        parser.write(handle, space_around_delimiters=False)

    print(f"‚úÖ Configuration saved: {config_file}")
    print(f"üìù Task file path (not created yet): {task_file}")
    print(f"üîó Connection file path (created by LSF): {connection_file}")
    print(f"‚ö†Ô∏è  Note: Use create_task_file() to create the actual task file")

    return config_file, task_file, connection_file


In [50]:
#config_file, task_file, connection_file = create_lsf_config(
    #session_name="test_session",
    #tasks=tasks,
    #output_dir="/home/ai_dsx.work/data/projects/AD_tool_test/images/multinode_results"
#)

In [51]:
#| export
def create_task_file(
    tasks: List[Dict[str, Any]],  # Tasks generated by generate_training_tasks
    task_file_path: Union[str, Path],  # Path to task file
    training_script_path: Union[str, Path],  # Path to training script
) -> Path:
    """
    Create task file for LSF workload containing all training commands.

    One line is written per task, in the form
    ``<training_script_path> "<task as JSON>"``. The JSON is embedded in a
    double-quoted argument, so backslashes and double quotes inside it are
    backslash-escaped; backslashes are escaped first so that the escapes
    added for the quotes are not themselves doubled.

    The parent directory of ``task_file_path`` is created when missing.

    Returns:
        The task file path as a ``Path``.
    """

    task_file_path = Path(task_file_path)
    training_script_path = Path(training_script_path)

    print(f"üìù Creating task file with {len(tasks)} training tasks")

    def _quote_for_double_quotes(text: str) -> str:
        """Escape *text* for use inside a double-quoted command argument."""
        # Order matters: escaping quotes first would have its backslashes
        # doubled by the second replace.
        # NOTE(review): `$` and backticks are left untouched; they only occur
        # if a task value contains them - confirm how the LSF tool parses lines.
        return text.replace('\\', '\\\\').replace('"', '\\"')

    task_commands = [
        f'{training_script_path} "{_quote_for_double_quotes(json.dumps(task))}"'
        for task in tasks
    ]

    # Do not assume create_lsf_config() already created the directory.
    task_file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(task_file_path, 'w') as f:
        f.writelines(command + '\n' for command in task_commands)

    print(f"‚úÖ Task file created: {task_file_path}")
    print(f"üìã Contains {len(task_commands)} training commands")

    return task_file_path


In [52]:
#| export
def create_training_script(
    script_path: Union[str, Path],  # Path where to save the training script
) -> Path:
    """
    Create a Python training script that can be executed by LSF workers.

    The script text is embedded below verbatim. It takes one command-line
    argument (a task serialized as JSON), trains the model described by it and
    writes ``task_result.json`` or ``task_error.json`` into the task's output
    directory.

    The file is written as UTF-8 explicitly: the script contains non-ASCII
    characters and Python decodes source files as UTF-8 by default, so relying
    on the locale's encoding could produce a script the workers cannot load.
    Parent directories are created when missing and the file is made
    executable (mode 0o755).

    Returns:
        The script path as a ``Path``.
    """

    script_path = Path(script_path)
    script_path.parent.mkdir(parents=True, exist_ok=True)

    script_content = '''#!/usr/bin/env python3
"""
Multi-node training script for BE Vision AD Tools
Executed by LSF workers with task parameters as JSON argument
"""

import sys
import json
import time
from pathlib import Path
from datetime import datetime

# Add project paths
sys.path.append('/home/ai_dsx.work/data/projects/cv_tools')
sys.path.append('/home/ai_warstein/homes/goni/custom_libs')
sys.path.append('/home/ai_dsx.work/data/projects/be-vision-ad-tools')

from be_vision_ad_tools.training.flexible_trainer import (
    FlexibleTrainingConfig, train_anomaly_model, ModelType, BackboneType
)

def execute_single_training_task(task_json_str):
    """Execute a single training task from JSON parameters"""
    
    # Parse task parameters
    task = json.loads(task_json_str)
    
    print(f'üöÄ Starting task: {task["task_id"]}')
    print(f'üì¶ Model: {task["model_name"]} + {task["backbone"]}')
    print(f'üî¢ Features: {task["n_features"]}, Layers: {task["layers"]}')
    print(f'üìÅ Output: {task["task_output_base"]}')
    
    try:
        # Create output directories
        Path(task['output_folder']).mkdir(parents=True, exist_ok=True)
        Path(task['save_path']).mkdir(parents=True, exist_ok=True)
        
        # Create training configuration
        config = FlexibleTrainingConfig(
            data_root=task['data_root'],
            normal_dir=task['normal_dir'],
            abnormal_dir=task['abnormal_dir'],
            model_name=task['model_name'],
            backbone=task['backbone'],
            layers=task['layers'],
            n_features=task['n_features'],
            max_epochs=task['max_epochs'],
            class_name=task['class_name'],
            save_path=task['save_path']
        )
        
        # Train the model
        print(f'üéØ Training {task["model_name"]} model...')
        start_time = time.time()
        
        result = train_anomaly_model(config)
        
        training_time = time.time() - start_time
        
        if result and result.get('success', False):
            # Save detailed results
            task_result = {
                'task_id': task['task_id'],
                'success': True,
                'model_name': task['model_name'],
                'backbone': task['backbone'],
                'n_features': task['n_features'],
                'layers': task['layers'],
                'training_time_seconds': training_time,
                'model_path': result.get('model_path'),
                'config_used': task,
                'training_results': result,
                'timestamp': datetime.now().isoformat()
            }
            
            # Save result to JSON file
            result_file = Path(task['task_output_base']) / 'task_result.json'
            with open(result_file, 'w') as f:
                json.dump(task_result, f, indent=2, default=str)
            
            print(f'‚úÖ Task {task["task_id"]} completed successfully in {training_time:.1f}s')
            print(f'üíæ Results saved: {result_file}')
            
        else:
            raise Exception('Training failed - no success result returned')
            
    except Exception as e:
        print(f'‚ùå Task {task["task_id"]} failed: {str(e)}')
        
        # Save error result
        error_result = {
            'task_id': task['task_id'],
            'success': False,
            'error': str(e),
            'config_used': task,
            'timestamp': datetime.now().isoformat()
        }
        
        error_file = Path(task['task_output_base']) / 'task_error.json'
        error_file.parent.mkdir(parents=True, exist_ok=True)
        with open(error_file, 'w') as f:
            json.dump(error_result, f, indent=2, default=str)
        
        raise  # Re-raise to signal failure to LSF

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python training_script.py <task_json>")
        sys.exit(1)
    
    task_json = sys.argv[1]
    execute_single_training_task(task_json)
'''

    # Explicit UTF-8: the text above is not pure ASCII and the interpreter
    # running the script will decode it as UTF-8.
    with open(script_path, 'w', encoding='utf-8') as f:
        f.write(script_content)

    # Make it executable so the task file can invoke it directly
    script_path.chmod(0o755)

    print(f"‚úÖ Training script created: {script_path}")
    return script_path


In [33]:
# Scratch value: where the next cell writes the worker script.
# NOTE(review): machine-specific absolute path.
script_path = '/home/ai_dsx.work/data/projects/AD_tool_test/images/multinode_results/py_script.py'

In [None]:
# Write the worker script to `script_path` (creates parent directories and
# marks the file executable).
create_training_script(script_path)

In [53]:
#| hide
#create_task_file(
    #tasks=tasks,
    #task_file_path=task_file,
    #training_script_path=script_path
#)



In [56]:
#| eval: false
# Manual end-to-end walk-through of the multi-node pipeline:
# generate tasks -> LSF config -> worker script -> task file -> submit.
data_root = Path(r'/home/ai_dsx.work/data/projects/AD_tool_test/images')
normal_dir = "good"
abnormal_dir = "bad"
class_name = "test_hyperparam"
time_str = datetime.now().strftime("%Y%m%d_%H%M%S")

# Each run gets its own timestamped output tree.
output_base = Path(data_root / f'multinode_results_{time_str}')
Path(output_base).mkdir(parents=True, exist_ok=True)
model_names = ['padim']
backbones = ['wide_resnet50_2']
n_features_list = [64]
layers = [['layer1','layer2', 'layer3'],['layer1','layer2']]
max_epochs = 10

# part 1: one task dict per parameter combination

tasks = generate_training_tasks(
        data_root=data_root,
        normal_dir=normal_dir,
        abnormal_dir=abnormal_dir,
        class_name=class_name,
        model_names=model_names,
        backbones=backbones,
        n_features_list=n_features_list,
        layers=layers,
        max_epochs=max_epochs,
        output_base=output_base
    )
# part 2: session configuration (only the .cfg file is written here)
session_name =f"anomaly_search_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
lsf_dir = output_base / 'lsf_files'
mem_per_task = 8000
num_workers = None
config_file, task_file, connection_file = create_lsf_config(
        session_name=session_name,
        tasks=tasks,
        output_dir=lsf_dir,
        mem_per_task=mem_per_task,
        num_workers=num_workers
    )

# part 3: worker script
training_script = create_training_script(
    script_path=lsf_dir / 'multinode_training.py'
)

# part 4: task file, one command line per task
task_file_path = create_task_file(
    tasks=tasks,
    task_file_path=task_file,
    training_script_path=training_script
)
print(task_file_path)
# part 5: submit the session

cmd = f"lsf_tflex --session {config_file}"
print(f"Executing: {cmd}")
# No check=True: it would raise CalledProcessError on a non-zero exit and the
# failure branch below (which prints stderr) could never run.
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if result.returncode == 0:
    print("‚úÖ LSF session submitted successfully!")
    print(result.stdout)
else:
    print("‚ùå Failed to submit LSF session:")
    print(result.stderr)



In [57]:
#| eval: false

# Create output directories
# Scratch cell: runs one training task in-process, mirroring the body of the
# generated worker script.
# NOTE(review): `task` is not defined in any visible cell (only `tasks` is);
# presumably it is meant to be one element of `tasks`, e.g. tasks[0] - confirm.

# Create output directories
Path(task['output_folder']).mkdir(parents=True, exist_ok=True)
Path(task['save_path']).mkdir(parents=True, exist_ok=True)

# Create training configuration from the task's fields
config = FlexibleTrainingConfig(
    data_root=task['data_root'],
    normal_dir=task['normal_dir'],
    abnormal_dir=task['abnormal_dir'],
    model_name=task['model_name'],
    backbone=task['backbone'],
    layers=task['layers'],
    n_features=task['n_features'],
    max_epochs=task['max_epochs'],
    class_name=task['class_name'],
    save_path=task['save_path']
)

# Train the model; start_time is recorded but not used further in this cell
print(f'üéØ Training {task["model_name"]} model...')
start_time = time.time()

result = train_anomaly_model(config)

In [103]:
#| export
def _list_task_dirs(output_base: Path) -> List[Path]:
    """Return the ``task_*`` sub-directories of *output_base*, sorted by name."""
    if not output_base.is_dir():
        # Workers create the output tree themselves, so it may not exist yet.
        return []
    return sorted(
        (d for d in output_base.iterdir() if d.is_dir() and d.name.startswith('task_')),
        key=lambda d: d.name,
    )


def _read_task_json(json_file: Path) -> Optional[Dict[str, Any]]:
    """Load one task JSON file; report the problem and return ``None`` if unusable."""
    try:
        with open(json_file, 'r') as f:
            payload = json.load(f)
    except (OSError, ValueError) as e:  # ValueError covers JSONDecodeError
        print(f"‚ùå Error reading {json_file}: {e}")
        return None
    if not isinstance(payload, dict):
        print(f"‚ùå Error reading {json_file}: expected a JSON object")
        return None
    return payload


def _training_seconds(record: Dict[str, Any]) -> Optional[float]:
    """Return the record's ``training_time_seconds`` if it is a number, else ``None``."""
    value = record.get('training_time_seconds')
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    return value


def collect_training_results(
    output_base: Union[str, Path],  # Base directory containing task results
    wait_for_completion: bool = True,  # Wait for all tasks to complete
    max_wait_time: int = 3600,         # Maximum wait time in seconds
    check_interval: int = 30,          # Check interval in seconds
    expected_tasks: Optional[int] = None,  # Number of tasks submitted, if known
) -> Dict[str, Any]:
    """
    Collect and combine results from all distributed training tasks.

    A task is a ``task_*`` directory under ``output_base``. It counts as
    completed when it contains ``task_result.json``, as failed when it
    contains ``task_error.json`` and as pending otherwise.

    When ``wait_for_completion`` is true the directory is polled every
    ``check_interval`` seconds until nothing is pending or ``max_wait_time``
    seconds have elapsed. Because task directories may only appear once a
    worker starts, an empty listing is NOT treated as "all done"; pass
    ``expected_tasks`` so that tasks which have not started yet are counted
    as pending as well.

    The combined summary is also written to
    ``<output_base>/multinode_training_summary.json`` (the directory is
    created when missing).

    Returns:
        Dict with keys ``search_completed``, ``total_combinations_tested``,
        ``successful_trainings``, ``failed_trainings``, ``success_rate``,
        ``average_training_time``, ``total_training_time``, ``timestamp``,
        ``results``, ``successful_results`` and ``output_base``.
    """

    output_base = Path(output_base)
    expected = max(expected_tasks or 0, 0)

    print(f"üìä Collecting training results from: {output_base}")

    if wait_for_completion:
        print(f"‚è≥ Waiting for training completion (max {max_wait_time}s)...")

        start_time = time.time()
        while True:
            task_dirs = _list_task_dirs(output_base)

            completed_count = sum(1 for d in task_dirs if (d / 'task_result.json').exists())
            failed_count = sum(
                1 for d in task_dirs
                if not (d / 'task_result.json').exists() and (d / 'task_error.json').exists()
            )

            # Tasks whose directory has not been created yet are pending too.
            total_tasks = max(len(task_dirs), expected)
            pending_count = total_tasks - completed_count - failed_count

            print(f"üìà Progress: {completed_count}/{total_tasks} completed, "
                  f"{failed_count} failed, {pending_count} pending")

            # "Nothing pending" only means done once at least one task is known.
            if total_tasks > 0 and pending_count == 0:
                print("‚úÖ All tasks completed!")
                break

            elapsed = time.time() - start_time
            if elapsed > max_wait_time:
                print(f"‚è∞ Timeout reached ({max_wait_time}s). Collecting available results...")
                break

            time.sleep(check_interval)

    # Collect all results
    print("üìã Collecting all available results...")

    all_results = []
    successful_trainings = 0
    failed_trainings = 0

    task_dirs = _list_task_dirs(output_base)

    for task_dir in task_dirs:
        result_file = task_dir / 'task_result.json'
        error_file = task_dir / 'task_error.json'

        if result_file.exists():
            record = _read_task_json(result_file)
            if record is None:
                failed_trainings += 1
                continue
            all_results.append(record)
            successful_trainings += 1
            seconds = _training_seconds(record)
            timing = f"{seconds:.1f}s" if seconds is not None else "n/a"
            print(f"‚úÖ {record.get('task_id', task_dir.name)}: {timing}")

        elif error_file.exists():
            # Unreadable or not, an error file always counts as one failure.
            failed_trainings += 1
            record = _read_task_json(error_file)
            if record is None:
                continue
            all_results.append(record)
            print(f"‚ùå {record.get('task_id', task_dir.name)}: {record.get('error', 'Unknown error')}")
        else:
            print(f"‚è≥ {task_dir.name}: No result file found")

    # Calculate summary statistics
    total_tasks = max(len(task_dirs), expected)
    success_rate = (successful_trainings / total_tasks * 100) if total_tasks > 0 else 0

    # Get successful results only
    successful_results = [r for r in all_results if r.get('success', False)]

    # Timing statistics use only records that actually carry a numeric time.
    training_times = [
        seconds
        for seconds in (_training_seconds(r) for r in successful_results)
        if seconds is not None
    ]
    if training_times:
        total_training_time = sum(training_times)
        avg_time = total_training_time / len(training_times)
    else:
        avg_time = 0
        total_training_time = 0

    # Create combined results in the same format as simple_hyperparameter_search
    combined_results = {
        'search_completed': True,
        'total_combinations_tested': total_tasks,
        'successful_trainings': successful_trainings,
        'failed_trainings': failed_trainings,
        'success_rate': success_rate,
        'average_training_time': avg_time,
        'total_training_time': total_training_time,
        'timestamp': datetime.now().isoformat(),
        'results': all_results,
        'successful_results': successful_results,
        'output_base': str(output_base)
    }

    # Save combined results
    output_base.mkdir(parents=True, exist_ok=True)
    summary_file = output_base / 'multinode_training_summary.json'
    with open(summary_file, 'w') as f:
        json.dump(combined_results, f, indent=2, default=str)

    print(f"\n{'='*70}")
    print("üìä MULTI-NODE TRAINING RESULTS SUMMARY")
    print(f"{'='*70}")
    print(f"‚úÖ Successful trainings: {successful_trainings}/{total_tasks}")
    print(f"‚ùå Failed trainings: {failed_trainings}/{total_tasks}")
    print(f"üìà Success rate: {success_rate:.1f}%")
    if successful_trainings > 0:
        print(f"‚è±Ô∏è  Average training time: {avg_time:.1f}s")
        print(f"‚è±Ô∏è  Total training time: {total_training_time:.1f}s")
    print(f"üíæ Summary saved: {summary_file}")

    return combined_results


In [None]:
#| hide
# Scratch inputs for inspecting an existing results tree by hand.
# NOTE(review): machine-specific absolute paths.
data_root = Path(r'/home/ai_dsx.work/data/projects/AD_tool_test/images')
normal_dir = "good"
abnormal_dir = "bad"
class_name = "test_hyperparam"
output_base = Path(r'/home/ai_dsx.work/data/projects/AD_tool_test/images/multinode_results')




##### Collect training results after training completion

In [None]:
#| hide
# List the task_* directories under output_base and look at the first one the
# filesystem returns (iterdir order is not sorted).
task_dirs = [d for d in Path(output_base).iterdir() if d.is_dir() and d.name.startswith('task_')]
# Raises IndexError when no task directory exists.
sn_task = task_dirs[0]
print(sn_task)
# A task directory holds task_result.json on success or task_error.json on failure.
result_file = sn_task / 'task_result.json'
error_file = sn_task / 'task_error.json'
# Cell output: whether this task finished successfully.
result_file.exists()


In [60]:
#output_base="/home/ai_dsx.work/data/projects/AD_tool_test/multi_node_results"
#max_wait_time=3600
#check_interval=30
#trn_res = collect_training_results(
    #output_base=output_base,
    #wait_for_completion=True,
    #max_wait_time=max_wait_time,
    #check_interval=check_interval
#)


In [None]:
#| hide
# Scratch inputs for building a comparison poster by hand.
test_images='/home/ai_dsx.work/data/projects/AD_tool_test/images/bad'
# NOTE(review): `trn_res` is only assigned in the commented-out cell above, so
# this line raises NameError unless that cell is re-enabled and run first.
search_results=trn_res['results']


In [104]:
#| export

def multinode_diff_parameter_and_save_poster(
    data_root: Union[str, Path],  # Data root path
    normal_dir: str = "good",     # Normal directory name
    abnormal_dir: str = "bad",    # Abnormal directory name
    class_name: str = "multinode_search",  # Class name
    test_images: Union[str, Path, List] = None,  # Test images path

    # Parameter combinations for grid search
    model_names: List[str] = None,        # Model names
    backbones: List[str] = None,          # Backbones
    n_features_list: List[int] = None,    # Number of features
    layers: List[List[str]] = None,       # Layers
    max_epochs: int = 10,                 # Max epochs

    # Multi-node configuration
    session_name: str = None,             # LSF session name
    num_workers: int = None,              # Number of LSF workers
    mem_per_task: int = 8000,             # Memory per task (MB)

    # Output settings
    output_base: Union[str, Path] = None, # Base output directory
    max_models: int = 4,                  # Max models in poster
    max_test_images: int = 10,            # Max test images
    run_validation_tests: bool = False,   # Run validation tests
    show_original: bool = False,          # Show original images
    device: str = "auto",                 # Device for inference

    # Advanced settings
    wait_for_completion: bool = True,     # Wait for all tasks to complete
    max_wait_time: int = 3600,            # Maximum wait time
    auto_submit: bool = True,            # Automatically submit LSF jobs

) -> Dict[str, Any]:
    """
    Multi-node version of diff_parameter_and_save_poster.

    This function distributes hyperparameter search across multiple nodes
    using LSF, then combines results and creates comparison posters.

    WORKFLOW:
    1. Generate parameter combinations as individual tasks (their output
       directories are created up front so the collector can see tasks that
       have not produced a result yet)
    2. Create LSF configuration and task files
    3. Create training script for workers
    4. Submit jobs to LSF (if auto_submit=True), otherwise print the commands
    5. If wait_for_completion is False, return the submission details now
    6. Wait for completion and collect results
    7. Generate comparison poster from combined results

    Returns:
        A dict that always has a ``success`` flag. On failure it also has
        ``error``. With ``wait_for_completion=False`` it describes the
        submission (``config_file``, ``task_file``, ``training_script``,
        ``output_base``, ``session_name``, ``total_tasks``). Otherwise it
        holds ``training_results``, ``poster_results``, ``poster_file``,
        ``config_files``, ``session_name``, ``total_tasks`` and
        ``output_base``.
    """

    print("üöÄ MULTI-NODE HYPERPARAMETER SEARCH")
    print("="*70)
    print("üéØ Distributing training across multiple LSF nodes")
    print("üèóÔ∏è Using lsf_tflex for workload management")

    # Set defaults; one timestamp so the default directory and session name agree
    data_root = Path(data_root)
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if output_base is None:
        output_base = data_root / f'multinode_results_{run_stamp}'
    if session_name is None:
        session_name = f"anomaly_search_{run_stamp}"
    if test_images is None:
        test_images = data_root / "test"  # Default test directory

    output_base = Path(output_base)
    lsf_dir = output_base / 'lsf_files'

    # STEP 1: Generate training tasks
    print(f"\nüìã STEP 1: Generating training tasks")
    tasks = generate_training_tasks(
        data_root=data_root,
        normal_dir=normal_dir,
        abnormal_dir=abnormal_dir,
        class_name=class_name,
        model_names=model_names,
        backbones=backbones,
        n_features_list=n_features_list,
        layers=layers,
        max_epochs=max_epochs,
        output_base=output_base
    )

    # The collector discovers tasks by scanning for task_* directories. Workers
    # would only create them once they start, so create them now; otherwise a
    # scan right after submission finds nothing pending and stops waiting.
    for task in tasks:
        Path(task['task_output_base']).mkdir(parents=True, exist_ok=True)

    # STEP 2: Create LSF configuration
    print(f"\nüîß STEP 2: Creating LSF configuration")
    config_file, task_file, connection_file = create_lsf_config(
        session_name=session_name,
        tasks=tasks,
        output_dir=lsf_dir,
        mem_per_task=mem_per_task,
        num_workers=num_workers
    )

    # STEP 3: Create training script
    print(f"\nüìù STEP 3: Creating training script")
    training_script = create_training_script(
        script_path=lsf_dir / 'multinode_training.py'
    )

    # STEP 4: Create task file
    print(f"\nüìã STEP 4: Creating task file")
    create_task_file(
        tasks=tasks,
        task_file_path=task_file,
        training_script_path=training_script
    )

    # STEP 5: Provide instructions or auto-submit
    print(f"\nüéØ STEP 5: Job submission")
    if auto_submit:
        print("üöÄ Auto-submitting LSF jobs...")
        cmd = f"lsf_tflex --session {config_file}"
        print(f"Executing: {cmd}")
        try:
            # Argument list instead of a shell string: paths containing spaces
            # or shell metacharacters are passed through unchanged.
            result = subprocess.run(
                ["lsf_tflex", "--session", str(config_file)],
                capture_output=True,
                text=True,
            )
        except (OSError, subprocess.SubprocessError) as e:
            # e.g. lsf_tflex is not installed or not on PATH
            print(f"‚ùå Error submitting jobs: {e}")
            return {'success': False, 'error': str(e)}

        if result.returncode != 0:
            print("‚ùå Failed to submit LSF session:")
            print(result.stderr)
            return {'success': False, 'error': result.stderr}

        print("‚úÖ LSF session submitted successfully!")
        print(result.stdout)
    else:
        print("üìã Manual submission required. Run these commands:")
        print(f"   cd {lsf_dir}")
        print(f"   lsf_tflex --session {config_file}")
        print(f"   # Monitor with: lsf_tflex --session {config_file} --status")

    # Fire-and-forget: hand back the submission details whichever way the
    # session was (or will be) submitted. Collecting right after submitting
    # could only ever report "no successful training results".
    if not wait_for_completion:
        return {
            'success': True,
            'config_file': str(config_file),
            'task_file': str(task_file),
            'training_script': str(training_script),
            'output_base': str(output_base),
            'session_name': session_name,
            'total_tasks': len(tasks)
        }

    # STEP 6: Wait for completion and collect results
    print(f"\nüìä STEP 6: Collecting results")
    combined_results = collect_training_results(
        output_base=output_base,
        wait_for_completion=wait_for_completion,
        max_wait_time=max_wait_time
    )

    # STEP 7: Create comparison poster
    print(f"\nüé® STEP 7: Creating comparison poster")
    if combined_results['successful_trainings'] <= 0:
        print("‚ùå No successful training results to create poster from")
        return {'success': False, 'error': 'No successful training results'}

    output_file = output_base / f"multinode_comparison_poster_{session_name}.png"
    try:
        comparison_res = create_modular_batch_comparison_poster(
            search_results=combined_results,
            test_images=test_images,
            max_models=max_models,
            max_test_images=max_test_images,
            run_validation_tests=run_validation_tests,
            show_original=show_original,
            device=device,
            output_file=str(output_file)
        )

        if not comparison_res['success']:
            print("‚ùå Failed to create comparison poster")
            return {'success': False, 'error': 'Poster creation failed'}

        print(f"‚úÖ Comparison poster created: {output_file}")
        stats = comparison_res['batch_statistics']
        print(f"‚ö° {stats['total_inference_time']:.1f}s total inference time")

    except Exception as e:
        # Top-level boundary: report any poster failure as a result dict
        # instead of propagating it to the caller.
        print(f"‚ùå Error creating poster: {e}")
        return {'success': False, 'error': str(e)}

    # Combine all results
    return {
        'success': True,
        'session_name': session_name,
        'training_results': combined_results,
        'poster_results': comparison_res,
        'poster_file': str(output_file),
        'config_files': {
            'lsf_config': str(config_file),
            'task_file': str(task_file),
            'training_script': str(training_script)
        },
        'total_tasks': len(tasks),
        'output_base': str(output_base)
    }


In [105]:
#| export
def parse_layers_argument(layers_str: str) -> List[str]:
    """
    Parse a comma-separated layer string into a list of layer names.

    Whitespace around each name is stripped. Empty entries (from an empty
    string, a doubled comma or a trailing comma) are dropped, which matches
    what `parse_cli_list` does for the same input.

    Examples:
        "layer1" -> ["layer1"]
        "layer1,layer2" -> ["layer1", "layer2"]
        "layer1,layer2,layer3" -> ["layer1", "layer2", "layer3"]
        "layer1, ,layer2," -> ["layer1", "layer2"]
        "" -> []
    """
    # None / "" would otherwise produce [''] (or fail on .split for None)
    if not layers_str:
        return []

    stripped = (name.strip() for name in layers_str.split(','))
    return [name for name in stripped if name]


In [106]:
#| export
def print_configuration_summary(args):
    """Write a sectioned, human-readable overview of the run settings to stdout.

    `args` is any object exposing the attributes read below. The reported
    grid size is the product of the lengths of `model_names`, `backbones`,
    `n_features_list` and `layers`. Nothing is returned.
    """
    cfg = args

    # --- data locations
    print("üîß MULTI-NODE TRAINING CONFIGURATION")
    print("=" * 60)
    print(f"üìÅ Data Root: {cfg.data_root}")
    print(f"‚úÖ Normal Dir: {cfg.normal_dir}")
    print(f"‚ùå Abnormal Dir: {cfg.abnormal_dir}")
    print(f"üè∑Ô∏è  Class Name: {cfg.class_name}")
    print(f"üß™ Test Images: {cfg.test_images or 'Auto-detect'}")
    print()

    # --- model grid
    print("ü§ñ MODEL CONFIGURATION:")
    print(f"   Models: {cfg.model_names}")
    print(f"   Backbones: {cfg.backbones}")
    print(f"   Features: {cfg.n_features_list}")
    print(f"   Layers: {cfg.layers}")
    print()

    # Grid size = product of the number of choices along each axis
    grid_size = 1
    for axis in ("model_names", "backbones", "n_features_list", "layers"):
        grid_size *= len(getattr(cfg, axis))

    # --- scale and resources
    print("üìä TRAINING SCALE:")
    print(f"   Total Combinations: {grid_size}")
    print(f"   Max Epochs: {cfg.max_epochs}")
    print(f"   LSF Workers: {cfg.num_workers}")
    print(f"   Memory per Task: {cfg.mem_per_task} MB")
    print()

    # --- outputs (falsy values are shown as their fallback description)
    print("üìÇ OUTPUT CONFIGURATION:")
    print(f"   Output Base: {cfg.output_base or '<data_root>/multinode_results'}")
    print(f"   Session Name: {cfg.session_name or 'Auto-generated'}")
    print(f"   Max Models in Poster: {cfg.max_models}")
    print(f"   Max Test Images: {cfg.max_test_images}")
    print()

    # --- execution switches
    print("‚öôÔ∏è  EXECUTION FLAGS:")
    print(f"   Auto Submit: {cfg.auto_submit}")
    print(f"   Wait for Completion: {cfg.wait_for_completion}")
    print(f"   Run Validation: {cfg.run_validation}")
    print(f"   Show Original: {cfg.show_original}")
    print(f"   Device: {cfg.device}")
    print(f"   Dry Run: {cfg.dry_run}")



In [107]:
#| export


def validate_arguments(args):
    """Validate command line arguments

    Checks, in order, that `args.data_root`, `<data_root>/<normal_dir>` and
    `<data_root>/<abnormal_dir>` are existing directories, that
    `args.mem_per_task` is at least 1000 and that `args.num_workers` is at
    least 1. Prints a confirmation line when every check passes.

    Raises:
        ValueError: on the first check that fails.
    """
    data_root = Path(args.data_root)

    # is_dir() rather than exists(): a regular file at one of these paths
    # is not a usable image directory and must not pass validation.
    if not data_root.is_dir():
        raise ValueError(f"Data root directory does not exist: {args.data_root}")

    # Check normal directory exists
    normal_path = data_root / args.normal_dir
    if not normal_path.is_dir():
        raise ValueError(f"Normal directory does not exist: {normal_path}")

    # Check abnormal directory exists
    abnormal_path = data_root / args.abnormal_dir
    if not abnormal_path.is_dir():
        raise ValueError(f"Abnormal directory does not exist: {abnormal_path}")

    # Validate memory per task
    if args.mem_per_task < 1000:
        raise ValueError("Memory per task should be at least 1000 MB")

    # Validate number of workers
    if args.num_workers < 1:
        raise ValueError("Number of workers should be at least 1")

    print("‚úÖ All arguments validated successfully")


In [108]:
#| export
def parse_cli_list(list_str):
    """
    Turn a delimited string into a list of trimmed, non-empty items.

    If the string contains at least one comma it is split on commas only;
    otherwise it is split on runs of whitespace. Falsy input gives [].

    Examples:
        parse_cli_list("layer1,layer2,layer3")     # ['layer1', 'layer2', 'layer3']
        parse_cli_list("layer1 layer2 layer3")     # ['layer1', 'layer2', 'layer3']
        parse_cli_list("layer1")                   # ['layer1']
    """
    if not list_str:
        return []

    # Comma takes priority as the delimiter; whitespace is the fallback
    pieces = list_str.split(',') if ',' in list_str else list_str.split()

    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

In [109]:
#| export
def parse_cli_nested_lists(
    arg_str # "layer1,layer2;layer3,layer4"  -> [['layer1', 'layer2'], ['layer3', 'layer4']]
    # "layer1,layer2|layer3,layer4"  -> [['layer1', 'layer2'], ['layer3', 'layer4']]
    ):
    """
    Parse a string from command line into nested list structure.

    Supports:
    - Semicolon separates lists: "layer1,layer2;layer3,layer4"
    - Pipe separates lists: "layer1,layer2|layer3,layer4"
    - Both separators may appear in the same string: "a,b;c|d" -> [['a', 'b'], ['c'], ['d']]
    - Within each list, comma or space separation works (see `parse_cli_list`)

    Groups that contain no names (e.g. from ";;" or "; ,;") are dropped.
    Falsy input gives [].
    """
    if not arg_str: return []

    # Treat '|' as an alias of ';'. Picking only one separator would leave the
    # other embedded inside a layer name whenever both are present.
    group_strings = arg_str.replace('|', ';').split(';')

    nested = []
    for group_str in group_strings:
        names = parse_cli_list(group_str.strip())
        if names:  # skip groups that are empty after parsing
            nested.append(names)
    return nested



In [110]:
#| export
def parse_cli_models(
    arg_str # "padim;fastflow|patchcore"  -> ['padim', 'fastflow', 'patchcore']
    ):
    """
    Parse a string from command line into a list of model names.

    Supports:
    - Semicolon separates models: "padim;fastflow;patchcore"
    - Pipe separates models: "padim|fastflow|patchcore"
    - Both separators in one string: "padim;fastflow|patchcore"
    - Returns a flat list of strings for individual model names

    Names are stripped of surrounding whitespace and empty entries are
    dropped. Falsy input gives [].

    Examples:
        parse_cli_models("padim;fastflow")            # ['padim', 'fastflow']
        parse_cli_models("padim|fastflow")            # ['padim', 'fastflow']
        parse_cli_models("padim;fastflow|patchcore")  # ['padim', 'fastflow', 'patchcore']
        parse_cli_models("padim")                     # ['padim']
    """
    if not arg_str: return []

    # Normalise '|' to ';' so a mixed string splits on both, as documented
    # in the parameter comment above.
    candidates = (name.strip() for name in arg_str.replace('|', ';').split(';'))
    return [name for name in candidates if name]



In [111]:
#| export
def parse_cli_numbers(
    arg_str # "64;128|256"  -> [64, 128, 256]
    ):
    """
    Parse a string from command line into a list of integers.

    Supports:
    - Semicolon separates numbers: "64;128;256"
    - Pipe separates numbers: "64|128|256"
    - Comma separates numbers: "64,128,256"
    - Any mix of the three in one string: "64;128|256"
    - A plain integer (as produced by `load_config_from_file` for an
      all-digit INI value) is wrapped into a one-element list
    - Returns a flat list of integers

    Raises:
        ValueError: if a non-empty token is not a valid integer.

    Examples:
        parse_cli_numbers("64;128")      # [64, 128]
        parse_cli_numbers("64|128")      # [64, 128]
        parse_cli_numbers("64,128")      # [64, 128]
        parse_cli_numbers("64;128|256")  # [64, 128, 256]
        parse_cli_numbers("64")          # [64]
        parse_cli_numbers(64)            # [64]
    """
    if not arg_str: return []

    # Already numeric: nothing to split
    if isinstance(arg_str, int):
        return [int(arg_str)]

    # Fold every supported separator into ';' so mixed input splits cleanly
    normalized = arg_str.replace('|', ';').replace(',', ';')
    tokens = (token.strip() for token in normalized.split(';'))
    return [int(token) for token in tokens if token]

In [26]:
# Smoke check: ';' separates the layer groups, ',' separates names inside a group
parse_cli_nested_lists('layer1,layer2;layer1,layer2,layer3')

In [None]:
# Smoke check: parse_cli_models is also the parser applied to ';'-separated backbone names
backbones='wide_resnet50_2;resnet18'
parse_cli_models(backbones)

In [112]:
#| export
@dataclass
class MultiNodeConfig:
    """Default parameter set for a multi-node training run.

    Multi-valued options (models, backbones, feature sizes, layer groups) are
    held as delimited strings rather than lists. `create_sample_config_file`
    serialises an instance of this class to produce a starter config.
    """
    # --- dataset layout
    data_root: str = "data_root"
    normal_dir: str = "normal"
    abnormal_dir: str = "abnormal"
    class_name: Optional[str] = None
    test_images: str = "test_images"

    # --- search grid (delimited strings)
    model_names: str = "padim"
    backbones: str = "resnet18;wide_resnet50_2"
    n_features_list: str = "64;320"
    layers: str = "layer1,layer2;layer1,layer2,layer3"

    # --- training / output
    max_epochs: int = 100
    output_base: str = "output_base"

    # --- comparison poster
    max_models: int = 10
    max_test_images: int = 100
    run_validation: bool = False
    show_original: bool = True
    device: str = "auto"

    # --- job submission
    auto_submit: bool = True
    wait_for_completion: bool = True
    max_wait_time: int = 3600
    session_name: Optional[str] = None
    num_workers: int = 4
    mem_per_task: int = 8000

    # NOTE(review): the DEFAULTS dict in multi_node_train_with_config_precedence
    # uses verbose=False — confirm which default is intended.
    verbose: bool = True

In [113]:
#| export
def load_config_from_file(
    config_path: Union[str, Path]
    ) -> Dict[str, Any]:
    """
    Load configuration from JSON, YAML, or INI file.

    The format is chosen from the file suffix (case-insensitive): `.json`,
    `.yaml`/`.yml`, or `.ini`/`.cfg`.

    INI files are flattened into one dict. The `[DEFAULT]` section is read
    first, then each named section in file order, so a later section
    overrides a key set earlier. INI values are converted as follows:
    'true'/'false' -> bool, optionally negative digit strings -> int,
    'none' -> None (any letter case); everything else stays a string.

    Raises:
        FileNotFoundError: if `config_path` does not exist.
        ValueError: if the suffix is not one of the supported formats.
    """
    config_path = Path(config_path)

    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    suffix = config_path.suffix.lower()

    if suffix == '.json':
        with open(config_path, 'r') as f:
            return json.load(f)

    if suffix in ('.yaml', '.yml'):
        with open(config_path, 'r') as f:
            # An empty YAML document loads as None; normalise to a dict
            return yaml.safe_load(f) or {}

    if suffix in ('.ini', '.cfg'):
        config = configparser.ConfigParser()
        config.read(config_path)

        # ConfigParser.sections() never lists the default section, yet that is
        # exactly where create_sample_config_file writes every key, so it has
        # to be visited explicitly.
        config_dict = {}
        for section in [config.default_section, *config.sections()]:
            for key, value in config[section].items():
                config_dict[key] = _coerce_ini_value(value)

        return config_dict

    raise ValueError(f"Unsupported config file format: {suffix}. Supported formats: .json, .yaml, .yml, .ini, .cfg")


def _coerce_ini_value(value: str) -> Any:
    """Convert one raw INI string to bool / int / None where it clearly is one."""
    lowered = value.lower()
    if lowered in ('true', 'false'):
        return lowered == 'true'
    # INI has no null literal; str(None) written by the sample generator is "None"
    if lowered == 'none':
        return None
    if value.isdigit() or (value.startswith('-') and value[1:].isdigit()):
        try:
            return int(value)
        except ValueError:
            # isdigit() accepts some characters int() rejects; keep the text
            return value
    return value


In [29]:

# NOTE(review): this cell has no `#| export` directive, yet the exported
# `create_multinode_config` calls this function — confirm it reaches the module.
def create_sample_config_file(
    output_path: Union[str, Path], 
    format_type: str = 'json') -> None:
    """
    Create a sample configuration file with default parameters.

    The content is the field/default mapping of `MultiNodeConfig`, written to
    `output_path` in the requested format (case-insensitive): 'json',
    'yaml'/'yml', or 'ini'/'cfg'. INI output puts every key in the
    `[DEFAULT]` section and leaves out keys whose default is None, because
    INI has no null literal. Prints a confirmation line on success.

    Raises:
        ValueError: if `format_type` is not one of the supported formats
            (nothing is written in that case).
    """
    config_dict = asdict(MultiNodeConfig())
    output_path = Path(output_path)
    fmt = format_type.lower()

    if fmt == 'json':
        with open(output_path, 'w') as f:
            json.dump(config_dict, f, indent=2)

    elif fmt in ('yaml', 'yml'):
        with open(output_path, 'w') as f:
            yaml.dump(config_dict, f, default_flow_style=False, indent=2)

    elif fmt in ('ini', 'cfg'):
        config_parser = configparser.ConfigParser()
        # str(None) would be stored as the text "None", which reads back as a
        # non-empty string; omit such keys so the consumer's own default applies.
        config_parser['DEFAULT'] = {
            key: str(value)
            for key, value in config_dict.items()
            if value is not None
        }

        with open(output_path, 'w') as f:
            config_parser.write(f)

    else:
        raise ValueError(f"Unsupported format: {format_type}. Supported formats: json, yaml, ini")

    print(f"‚úÖ Sample config file created: {output_path}")

In [125]:
# Write a sample YAML config populated from the MultiNodeConfig defaults.
# create_sample_config_file is annotated `-> None`, so config_file_ ends up as None.
config_path = Path(r'/home/ai_dsx.work/data/projects/AD_tool_test/sample_multinode_config.yaml')
config_file_ = create_sample_config_file(config_path, 'yaml')

In [None]:
#| export
#@call_parse
def create_multinode_config(
    output_path: str, # Path where to save the config file
    format: str='json', # Config file format ('json', 'yaml', or 'ini')
    ):
    """
    Write a starter multi-node training config and print how to use it.

    Delegates the file creation to `create_sample_config_file`. Any exception
    raised along the way is reported on stdout and the process exits with
    status 1.

    Examples:
        create_multinode_config config.json
        create_multinode_config config.yaml --format yaml
        create_multinode_config config.ini --format ini
    """
    try:
        create_sample_config_file(output_path, format)

        # Follow-up instructions, one line each
        for message in (
            f"‚úÖ Sample configuration file created at: {output_path}",
            "üìù Edit this file with your specific parameters, then use:",
            f"   multi_node_train_ --config_file {output_path}",
        ):
            print(message)

    except Exception as exc:
        print(f"‚ùå Error creating config file: {exc}")
        sys.exit(1)

## Testing Config File Functionality

Let's test the new config file support:

In [None]:
#| export
@call_parse
def multi_node_train_with_config_precedence(
    # Config file support
    config_file: str = None,  # Path to configuration file (JSON, YAML, or INI format)
    
    # Original parameters with special sentinel values to detect CLI usage
    data_root: str = None,  # Path to the dataset root directory
    normal_dir: str = None,  # Path to the normal class directory
    abnormal_dir: str = None,  # Path to the abnormal class directory
    class_name: str = None,  # Name of the class to train on
    test_images: str = None,  # Path to the test images directory
    model_names: str = None,  # List of model names to train
    backbones: str = None,  # List of backbone architectures to use
    n_features_list: str = None,  # List of feature dimensions to use
    layers: str = None,  # List of layers
    max_epochs: int = None,  # Maximum number of training epochs
    output_base: str = None,  # Base directory for output files
    max_models: int = None,  # Maximum number of models to include in the poster
    max_test_images: int = None,  # Maximum number of test images to include in the poster
    run_validation: bool = None,  # Run validation tests after training
    show_original: bool = None,  # Show original images in the poster
    device: str = None,  # Device to use for training
    auto_submit: bool = None,  # Automatically submit jobs to LSF
    wait_for_completion: bool = None,  # Wait for jobs to complete
    max_wait_time: int = None,  # Maximum wait time for job completion
    session_name: str = None,  # Name of the LSF session
    num_workers: int = None,  # Number of workers per task
    mem_per_task: int = None,  # Memory per task (in MB)
    verbose: bool = False,  # Enable verbose output
):
    """
    Multi-node training with proper config file precedence using fastcore's call_parse.
    
    Precedence order (highest to lowest):
    1. CLI arguments (explicitly provided by user)
    2. Config file values
    3. Built-in defaults

    A parameter counts as "explicitly provided" when its value is not None.
    `verbose` defaults to False instead of None, so it only counts as provided
    when it is True; a False value lets the config file decide.

    Returns the result of `multinode_diff_parameter_and_save_poster`, or None
    when the config file cannot be loaded. Exits with status 1 on
    KeyboardInterrupt or on any other exception raised after argument
    collection.
    
    Examples:
        # Using config file only
        multi_node_train --config_file config.json
        
        # Using config file with CLI overrides  
        multi_node_train --config_file config.json --max_epochs 200 --device cuda
        
        # Traditional usage (backward compatible)
        multi_node_train --data_root /path/to/data --model_names padim
    """
    # Snapshot the call arguments before any other local exists, so this
    # mapping holds the parameters and nothing else.
    call_args = dict(locals())

    # Define our built-in defaults (these are the real defaults we want to use)
    DEFAULTS = {
        'data_root': 'data_root',
        'normal_dir': 'normal',
        'abnormal_dir': 'abnormal', 
        'class_name': None,
        'test_images': 'test_images',
        'model_names': 'padim',
        'backbones': 'resnet18;wide_resnet50_2',
        'n_features_list': '64;320',
        'layers': "layer1,layer2;layer1,layer2,layer3",
        'max_epochs': 100,
        'output_base': 'output_base',
        'max_models': 10,
        'max_test_images': 100,
        'run_validation': False,
        'show_original': True,
        'device': 'auto',
        'auto_submit': True,
        'wait_for_completion': True,
        'max_wait_time': 3600,
        'session_name': None,
        'num_workers': 4,
        'mem_per_task': 8000,
        'verbose': False
    }
    
    # Arguments the caller actually supplied (None is the "not given" sentinel)
    provided_args = {key: call_args[key] for key in DEFAULTS
                     if call_args[key] is not None}
    
    # verbose=False is indistinguishable from "flag not given"; treating it as
    # provided would make it override the config file on every run.
    if not verbose:
        provided_args.pop('verbose', None)
    
    if verbose:
        print(f"üñ•Ô∏è  CLI arguments explicitly provided: {list(provided_args.keys())}")
    
    try:
        # Start with built-in defaults
        final_config = DEFAULTS.copy()
        
        # Load and apply config file values (if provided)
        config_data = {}
        if config_file:
            if verbose:
                print(f"üìÅ Loading configuration from: {config_file}")
            
            try:
                config_data = load_config_from_file(config_file)
                
                # Without the CLI flag, the config file may switch verbose output on
                verbose = verbose or bool(config_data.get('verbose', False))
                
                if verbose:
                    print(f"üîß Config file values found: {list(config_data.keys())}")
                
                # Apply config file values (override defaults)
                for key, value in config_data.items():
                    if key in final_config:
                        if verbose and final_config[key] != value:
                            print(f"   {key}: {final_config[key]} -> {value} (from config)")
                        final_config[key] = value
                    elif verbose:
                        print(f"   ‚ö†Ô∏è  Unknown config parameter ignored: {key}")
                        
                if verbose:
                    print(f"‚úÖ Configuration loaded successfully")
                    
            except Exception as e:
                # NOTE(review): this path returns None (exit status 0) while every
                # other failure below exits with status 1 — confirm it is intended.
                print(f"‚ùå Error loading config file: {e}")
                return
        
        # Apply CLI arguments (highest precedence - overrides both defaults and config)
        if verbose and provided_args:
            print(f"üñ•Ô∏è  Applying CLI overrides:")
        
        for key, value in provided_args.items():
            if verbose and final_config.get(key) != value:
                print(f"   {key}: {final_config.get(key)} -> {value} (from CLI)")
            final_config[key] = value
        
        if verbose:
            print(f"\nüöÄ Final configuration:")
            for key, value in final_config.items():
                print(f"   {key}: {value}")
        
        # Parse CLI parameters (existing logic)
        parsed_layers = parse_cli_nested_lists(final_config['layers'])
        model_names_parsed = parse_cli_models(final_config['model_names'])
        backbones_parsed = parse_cli_models(final_config['backbones'])
        n_features_parsed = parse_cli_numbers(final_config['n_features_list'])
        
        if verbose:
            print(f"\nüîç PARSED CONFIGURATIONS:")
            print(f"   Model names: {model_names_parsed}")
            print(f"   Backbones: {backbones_parsed}")
            print(f"   Features: {n_features_parsed}")
            print(f"   Layers: {parsed_layers}")

        # Create output directory with timestamp
        from datetime import datetime 
        date_now = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(f"{final_config['output_base']}_multinode_results_{date_now}")
        
        print(f"\nüìä Training Summary:")
        print(f"   Auto submit: {final_config['auto_submit']}")
        print(f"   Wait for completion: {final_config['wait_for_completion']}")
        print(f"   Output path: {output_path}")
        
        # Call the main training function with final config
        result = multinode_diff_parameter_and_save_poster(
            data_root=Path(final_config['data_root']),
            normal_dir=final_config['normal_dir'],
            abnormal_dir=final_config['abnormal_dir'],
            class_name=final_config['class_name'],
            test_images=final_config['test_images'],
            model_names=model_names_parsed,
            backbones=backbones_parsed,
            n_features_list=n_features_parsed,
            layers=parsed_layers,
            max_epochs=final_config['max_epochs'],
            session_name=final_config['session_name'],
            num_workers=final_config['num_workers'],
            mem_per_task=final_config['mem_per_task'],
            output_base=output_path,
            max_models=final_config['max_models'],
            max_test_images=final_config['max_test_images'],
            run_validation_tests=final_config['run_validation'],
            show_original=final_config['show_original'],
            device=final_config['device'],
            auto_submit=final_config['auto_submit'],
            wait_for_completion=final_config['wait_for_completion'],
            max_wait_time=final_config['max_wait_time']
        )
        
        return result
        
    except KeyboardInterrupt:
        print("\n‚ö†Ô∏è  Training interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        if verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

In [127]:
# Location of the sample YAML config passed to the training entry point in the next cell
config_file_path = Path(r'/home/ai_dsx.work/data/projects/AD_tool_test/sample_multinode_config.yaml')

In [128]:
# Launch a run configured only through the config file, with verbose output requested
multi_node_train_with_config_precedence(
    config_file=config_file_path,
    verbose=True

)

In [129]:
import os
# Switch the working directory to the notebooks folder
# (presumably so the relative notebook name in the export cell resolves — confirm)
os.chdir(r'/home/ai_dsx.work/data/projects/be-vision-ad-tools/nbs')

In [134]:
#| hide
# Run nbdev's export on this notebook, addressed by its relative file name
import nbdev; nbdev.nbdev_export('08_training.multi_node.ipynb')