In [10]:
#| default_exp sync

# Sync Operations
> Remote synchronization engine for chasqui workflow automation

This module handles all communication between local and remote systems:

- Upload jobs from local queue to remote waiting directory
- Trigger remote agent to process waiting jobs
- Parse remote agent logs and update local database
- Download completed results

## Key Design Decisions

**Manual 2FA Requirement:** Each SSH login requires manual two-factor authentication,
so all remote operations happen in a single SSH session to minimize authentication overhead.

**Lightweight Remote State:** The remote side uses append-only log files rather than
a database, for simplicity and robustness.

**Self-Perpetuating Queue:** Completed PBS jobs trigger the agent to submit
waiting jobs, eliminating the need for cron.

## Workflow
```
1. Get QUEUED_LOCAL jobs from database
2. Upload VASP inputs to remote work directories
3. Generate and upload PBS scripts to waiting/
4. Trigger agent to submit jobs
5. Parse agent log to see what was submitted
6. Update local database with submission status
7. Check for completed jobs (flags)
8. Update database and optionally download results
```

In [11]:
#| export
import json
import os
import shlex
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any

from fastcore.basics import patch

from chasqui.database import ChasquiDB
from chasqui.ssh import SSHConnection
from chasqui.templates import generate_pbs_script_from_job
from chasqui.agent import deploy_agent, trigger_agent, parse_agent_log

## Architecture

The sync operation follows this flow:
```
Local DB → SSH Connection → Remote System
   ↓                            ↓
QUEUED_LOCAL              waiting/*.sh
   ↓                            ↓
UPLOADED                  agent.sh (triggered)
   ↓                            ↓
SUBMITTED ←── agent.log ←── PBS Queue
   ↓
COMPLETED
```

The `sync()` function orchestrates all operations in a single SSH session.

## Sync Configuration

Store sync parameters in a configuration object.

In [12]:
#| export

class SyncConfig:
    """
    Plain settings holder for sync operations.

    Every constructor argument is stored unchanged on the instance under the
    same name; the class performs no validation and no path expansion.

    Args:
        remote_host: Remote host to sync with (default ``'bebop'``).
        chasqui_remote_dir: chasqui directory on the remote side. The default
            contains a literal ``$HOME`` that this class does not expand.
        max_queued: Stored as-is; presumably the cap on queued jobs
            (not enforced here).
        max_running: Stored as-is; presumably the cap on running jobs
            (not enforced here).
        auto_deploy_agent: Stored as-is; flag for automatic agent deployment.
        download_results: Stored as-is; flag for downloading results.

    Example:
        >>> config = SyncConfig(
        ...     remote_host='bebop',
        ...     chasqui_remote_dir='$HOME/chasqui_remote',
        ...     max_queued=40,
        ...     max_running=30
        ... )
    """
    def __init__(
        self,
        remote_host: str = 'bebop',
        chasqui_remote_dir: str = '$HOME/chasqui_remote',
        max_queued: int = 40,
        max_running: int = 30,
        auto_deploy_agent: bool = True,
        download_results: bool = False
    ):
        # Remote endpoint: which host, and where chasqui lives on it.
        self.remote_host, self.chasqui_remote_dir = remote_host, chasqui_remote_dir
        # Queue limits.
        self.max_queued, self.max_running = max_queued, max_running
        # Behaviour toggles.
        self.auto_deploy_agent, self.download_results = auto_deploy_agent, download_results

## Helper Functions

Internal functions for sync operations.

In [13]:
#| export

def _upload_vasp_inputs(
    ssh: SSHConnection,
    job: Dict[str, Any],
    work_dir: str
) -> None:
    """
    Upload VASP input files to remote work directory.

    Creates ``work_dir`` on the remote with ``mkdir -p`` and then uploads each
    of POSCAR, INCAR, KPOINTS and POTCAR that exists in the local job
    directory, keeping the file name. Input files that are missing locally
    are skipped without error.

    Args:
        ssh: Active SSH connection; ``run(command)`` and
            ``upload(local_path, remote_path)`` are the only methods used.
        job: Job dictionary from database. Only ``job['local_path']`` is read;
            a leading ``~`` in it is expanded.
        work_dir: Remote work directory (expanded path). It is shell-quoted
            for the ``mkdir`` command, so it must already be a literal path
            and must not rely on remote ``$VAR`` or ``~`` expansion.

    Raises:
        FileNotFoundError: If the local job path does not exist or is not a
            directory.
    """
    local_path = Path(job['local_path']).expanduser()

    # is_dir() rather than exists(): a regular file at this path would pass an
    # exists() check, after which no input file would be found or uploaded.
    if not local_path.is_dir():
        raise FileNotFoundError(f"Local job directory not found: {local_path}")

    # Create remote work directory. The path is quoted so that spaces or shell
    # metacharacters in it reach mkdir as a single literal argument.
    ssh.run(f'mkdir -p {shlex.quote(work_dir)}')

    # Upload VASP input files
    for filename in ['POSCAR', 'INCAR', 'KPOINTS', 'POTCAR']:
        local_file = local_path / filename
        if local_file.exists():
            remote_file = f"{work_dir}/{filename}"
            ssh.upload(str(local_file), remote_file)
        # Note: POTCAR might not exist for some test cases

In [14]:
#| export

def _upload_pbs_script(
    ssh: SSHConnection,
    job: Dict[str, Any],
    work_dir: str,
    waiting_dir: str
) -> str:
    """
    Generate and upload PBS script to waiting directory.

    The job's ``vasp_config`` (a JSON string, or missing/empty) is extended
    with ``remote_work_dir = work_dir`` on a copy of the job, the copy is
    passed to ``generate_pbs_script_from_job``, and the resulting script text
    is uploaded as ``<waiting_dir>/<job_id>.sh``. The caller's ``job`` dict is
    not modified.

    The script is staged in a local temporary file that is always removed,
    whether or not writing or uploading succeeds. It is written as UTF-8 with
    ``\\n`` line endings regardless of the local platform, so the uploaded
    file does not pick up CRLF line endings or a locale-specific encoding.

    Args:
        ssh: Active SSH connection; only ``upload(local_path, remote_path)``
            is used.
        job: Job dictionary from database; ``job['job_id']`` and the optional
            ``job['vasp_config']`` are read.
        work_dir: Remote work directory (expanded)
        waiting_dir: Remote waiting directory (expanded)

    Returns:
        Remote script path

    Raises:
        json.JSONDecodeError: If ``vasp_config`` is present but is not valid
            JSON.
    """
    # Work on a shallow copy so the caller's job keeps its original vasp_config.
    job_with_workdir = job.copy()
    if job_with_workdir.get('vasp_config'):
        config = json.loads(job_with_workdir['vasp_config'])
        config['remote_work_dir'] = work_dir
        job_with_workdir['vasp_config'] = json.dumps(config)
    else:
        job_with_workdir['vasp_config'] = json.dumps({'remote_work_dir': work_dir})

    script = generate_pbs_script_from_job(job_with_workdir)
    remote_script = f"{waiting_dir}/{job['job_id']}.sh"

    # delete=False because the file must be closed (flushed) before upload reads
    # it by name; removal is handled by the finally clause below.
    # newline='\n' disables platform newline translation on write.
    tmp = tempfile.NamedTemporaryFile(
        mode='w', suffix='.sh', delete=False, encoding='utf-8', newline='\n'
    )
    try:
        with tmp:
            tmp.write(script)
        # Upload to waiting directory
        ssh.upload(tmp.name, remote_script)
        return remote_script
    finally:
        # Runs for write failures too, so the temp file never lingers.
        os.unlink(tmp.name)

In [15]:
#| export

def _check_completed_jobs(
    ssh: SSHConnection,
    completed_dir: str,
    jobs: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Check for completion flags and parse them.

    For every job, ``<completed_dir>/<job_id>.flag`` is looked up on the
    remote. When it exists its content is read with ``cat``: the first line is
    taken as the status and the second line, if present and non-blank, as the
    PBS id. A flag whose content is empty or whitespace-only carries no status
    and is skipped rather than reported with an empty status.

    Args:
        ssh: Active SSH connection; ``exists(path)`` and ``run(command)`` are
            used, and ``run`` is expected to return the command output as text.
        completed_dir: Remote completed directory (expanded). The flag path is
            shell-quoted for ``cat``, so it must be a literal path.
        jobs: List of job dictionaries to check; only ``job['job_id']`` is read.

    Returns:
        List of completed job updates in input order:
        [{'job_id': ..., 'status': 'DONE', 'pbs_id': ...}, ...]
        ``pbs_id`` is ``None`` when the flag has no usable second line.
    """
    completed = []

    for job in jobs:
        job_id = job['job_id']
        flag_file = f"{completed_dir}/{job_id}.flag"

        if not ssh.exists(flag_file):
            continue

        # Quote the path so spaces/metacharacters reach cat as one argument.
        flag_content = ssh.run(f'cat {shlex.quote(flag_file)}').strip()

        # str.split always yields at least one element, so emptiness has to be
        # tested on the content itself.
        if not flag_content:
            continue

        lines = [line.strip() for line in flag_content.split('\n')]
        status = lines[0]  # DONE or FAIL
        pbs_id = (lines[1] or None) if len(lines) > 1 else None

        completed.append({
            'job_id': job_id,
            'status': status,
            'pbs_id': pbs_id
        })

    return completed

In [7]:
#| export
def sync(
    local_db_path: str = "~/.chasqui/jobs.db",
    remote_host: Optional[str] = None,
    dry_run: bool = False
) -> Dict[str, Any]:
    """
    Synchronize local and remote job queues.

    NOTE: not implemented yet. The current body is a placeholder: it ignores
    every argument, performs no local or remote work, and returns zero counts
    together with the current UTC timestamp.

    Once implemented, this is the main orchestration function that:

    1. Uploads queued jobs to remote
    2. Triggers remote agent
    3. Syncs job status back to local DB
    4. Downloads completed results (optional)

    Args:
        local_db_path: Path to local SQLite database
        remote_host: SSH connection string (e.g., 'user@hpc.cluster.edu')
                     If None, reads from config
        dry_run: If True, show what would happen without executing

    Returns:
        Dictionary with sync statistics; ``timestamp`` is UTC in ISO 8601
        form with second precision and a ``Z`` suffix:
        {
            'uploaded': 5,
            'submitted': 3,
            'completed': 2,
            'failed': 0,
            'timestamp': '2025-10-28T10:30:00Z'
        }

    Example:
        >>> result = sync()
        >>> print(f"Uploaded {result['uploaded']} jobs")
    """
    # TODO: Implement
    # For now, return a placeholder
    return {
        'uploaded': 0,
        'submitted': 0,
        'completed': 0,
        'failed': 0,
        # Explicit UTC with 'Z', matching the documented format; a naive local
        # timestamp would be ambiguous between the local and remote machines.
        'timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    }

In [8]:
#| hide
# Smoke test for the sync() placeholder.
# This cell won't be exported, but runs during nbdev_test.

# The skeleton must be callable in dry-run mode and report the headline keys.
result = sync(dry_run=True)
assert all(key in result for key in ('uploaded', 'timestamp'))
print("✓ sync() skeleton works")

✓ sync() skeleton works
