In [1]:
#| default_exp agent

# Remote Agent Script
> Automatic job submission engine for PBS queue management

This module generates the remote agent script that manages job submission on the HPC cluster.

## Purpose

The agent script runs on the remote system (bebop) and:
- Monitors PBS queue status
- Checks against queue limits (70 queued, 30 running)
- Submits jobs from waiting/ when slots are available
- Moves submitted jobs to tracking directory
- Logs all activity

## Design

**Trigger mechanisms:**
1. Called by completing PBS jobs (self-perpetuating)
2. Called by sync operation (bootstrap/recovery)

**File-based locking:** Uses `flock` to prevent concurrent execution

**Stateless operation:** All state is in the filesystem (waiting/, submitted/, completed/)

## Directory Structure
```
~/chasqui_remote/
â”œâ”€â”€ waiting/           # Jobs ready to submit
â”œâ”€â”€ submitted/         # Jobs in PBS queue
â”œâ”€â”€ completed/         # Finished jobs
â”œâ”€â”€ logs/
â”‚   â””â”€â”€ agent.log      # Activity log
â”œâ”€â”€ agent.sh           # This script
â””â”€â”€ agent.lock         # Lock file for flock
```

In [2]:
#| export
from pathlib import Path
from typing import Optional
from string import Template

## Agent Bash Script

The core submission engine. This runs on the remote system.

In [12]:
#| export

AGENT_SCRIPT = """#!/bin/bash
# Chasqui Agent: Automatic PBS job submission
# This script submits waiting jobs when PBS queue has capacity

set -e  # Exit on error

# Configuration
CHASQUI_DIR="${CHASQUI_DIR:-$$HOME/chasqui_remote}"
WAITING_DIR="$$CHASQUI_DIR/waiting"
SUBMITTED_DIR="$$CHASQUI_DIR/submitted"
COMPLETED_DIR="$$CHASQUI_DIR/completed"
LOG_FILE="$$CHASQUI_DIR/logs/agent.log"

# Queue limits (adjust these for your cluster)
MAX_QUEUED=$MAX_QUEUED
MAX_RUNNING=$MAX_RUNNING
MAX_TOTAL=$$((MAX_QUEUED + MAX_RUNNING))

# Ensure directories exist
mkdir -p "$$WAITING_DIR" "$$SUBMITTED_DIR" "$$COMPLETED_DIR" "$$CHASQUI_DIR/logs"

# Logging function
log_message() {
    echo "$$(date -Iseconds) $$1" >> "$$LOG_FILE"
}

# Start of agent run
log_message "AGENT_START"

# Count current PBS jobs
QUEUED=$$(qstat -u $$USER 2>/dev/null | grep " Q " | wc -l || echo 0)
RUNNING=$$(qstat -u $$USER 2>/dev/null | grep " R " | wc -l || echo 0)
TOTAL=$$((QUEUED + RUNNING))

log_message "AGENT_CHECK queued=$$QUEUED running=$$RUNNING total=$$TOTAL"

# Check if we're at capacity
if [ $$TOTAL -ge $$MAX_TOTAL ]; then
    log_message "AGENT_AT_LIMIT total=$$TOTAL max=$$MAX_TOTAL"
    log_message "AGENT_END submitted=0"
    exit 0
fi

# Calculate available slots
SLOTS=$$((MAX_TOTAL - TOTAL))
log_message "AGENT_SLOTS available=$$SLOTS"

# Count waiting jobs
WAITING_COUNT=$$(ls "$$WAITING_DIR"/*.sh 2>/dev/null | wc -l || echo 0)
if [ $$WAITING_COUNT -eq 0 ]; then
    log_message "AGENT_NO_WAITING"
    log_message "AGENT_END submitted=0"
    exit 0
fi

log_message "AGENT_FOUND_WAITING count=$$WAITING_COUNT"

# Submit jobs (up to available slots)
SUBMITTED_COUNT=0
for job_script in $$(ls "$$WAITING_DIR"/*.sh 2>/dev/null | head -n $$SLOTS); do
    JOB_NAME=$$(basename "$$job_script" .sh)
    
    # Submit to PBS
    PBS_OUTPUT=$$(qsub "$$job_script" 2>&1)
    QSUB_EXIT=$$?
    
    if [ $$QSUB_EXIT -eq 0 ]; then
        # Extract PBS job ID (format: 123456.hostname)
        PBS_ID=$$(echo "$$PBS_OUTPUT" | tr -d '\\n')
        
        log_message "AGENT_SUBMIT job=$$JOB_NAME pbs_id=$$PBS_ID status=success"
        
        # Move script to submitted directory
        mv "$$job_script" "$$SUBMITTED_DIR/"
        
        SUBMITTED_COUNT=$$((SUBMITTED_COUNT + 1))
    else
        # Submission failed
        log_message "AGENT_SUBMIT job=$$JOB_NAME status=failed error=$$PBS_OUTPUT"
    fi
done

log_message "AGENT_END submitted=$$SUBMITTED_COUNT"

exit 0
"""

## Agent Script Generator

Generate the agent script with configurable parameters.

In [13]:
#| export

def generate_agent_script(
    chasqui_remote_dir: str = "$HOME/chasqui_remote",
    max_queued: int = 40,
    max_running: int = 30,
    output_path: Optional[str] = None
) -> str:
    """
    Generate the remote agent bash script.
    
    Args:
        chasqui_remote_dir: Remote chasqui directory (default: "$HOME/chasqui_remote")
        max_queued: Maximum queued jobs allowed (default: 40)
        max_running: Maximum running jobs allowed (default: 30)
        output_path: If provided, write script to this file
        
    Returns:
        Agent script content as string
        
    Example:
        >>> script = generate_agent_script(
        ...     max_queued=50,
        ...     max_running=20
        ... )
        >>> print(script[:100])
        #!/bin/bash
        # Chasqui Agent: Automatic PBS job submission
    """
    template = Template(AGENT_SCRIPT)
    script = template.safe_substitute(
        CHASQUI_DIR=chasqui_remote_dir,
        MAX_QUEUED=max_queued,
        MAX_RUNNING=max_running
    )
    
    # Write to file if requested
    if output_path:
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(script)
        # Make executable
        output_file.chmod(0o755)
    
    return script

## Agent Deployment

Deploy the agent script to the remote system.

In [14]:
#| export

def deploy_agent(
    ssh_connection,
    chasqui_remote_dir: str = "$HOME/chasqui_remote",
    max_queued: int = 40,
    max_running: int = 30
) -> str:
    """
    Deploy agent script to remote system.
    
    Args:
        ssh_connection: Active SSHConnection object
        chasqui_remote_dir: Remote chasqui directory
        max_queued: Maximum queued jobs allowed
        max_running: Maximum running jobs allowed
        
    Returns:
        Remote path where agent was deployed
        
    Example:
        >>> from chasqui.ssh import SSHConnection
        >>> from chasqui.agent import deploy_agent
        >>> with SSHConnection('bebop') as ssh:
        ...     path = deploy_agent(ssh, max_queued=50)
        ...     print(f"Agent deployed to: {path}")
    """
    import tempfile
    import os
    
    # Generate script
    script = generate_agent_script(
        chasqui_remote_dir=chasqui_remote_dir,
        max_queued=max_queued,
        max_running=max_running
    )
    
    # Create temporary local file
    with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as tmp:
        tmp.write(script)
        tmp_path = tmp.name
    
    try:
        # Expand remote path
        remote_path_expanded = ssh_connection.run(
            f'echo {chasqui_remote_dir}/agent.sh'
        ).strip()
        
        # Upload
        ssh_connection.upload(tmp_path, remote_path_expanded)
        
        # Make executable
        ssh_connection.run(f'chmod +x {remote_path_expanded}')
        
        return remote_path_expanded
        
    finally:
        os.unlink(tmp_path)

## Manual Agent Trigger

Manually trigger the agent (for testing or recovery).

In [15]:
#| export

def trigger_agent(
    ssh_connection,
    chasqui_remote_dir: str = "~/chasqui_remote"
) -> str:
    """
    Manually trigger the agent to submit waiting jobs.
    
    Args:
        ssh_connection: Active SSHConnection object
        chasqui_remote_dir: Remote chasqui directory
        
    Returns:
        Agent log output
        
    Example:
        >>> from chasqui.ssh import SSHConnection
        >>> from chasqui.agent import trigger_agent
        >>> with SSHConnection('bebop') as ssh:
        ...     output = trigger_agent(ssh)
        ...     print(output)
    """
    # Expand path
    agent_path = ssh_connection.run(
        f'echo {chasqui_remote_dir}/agent.sh'
    ).strip()
    
    # Run agent (with flock to prevent conflicts)
    lock_path = f"{chasqui_remote_dir}/agent.lock"
    command = f'flock -n {lock_path} bash {agent_path}'
    
    try:
        result = ssh_connection.run(command)
        return result
    except Exception as e:
        # Agent might already be running (flock fails)
        return f"Agent trigger failed (may already be running): {e}"

## Agent Log Parser

Parse agent logs for sync operations.

In [16]:
#| export

def parse_agent_log(log_content: str) -> list:
    """
    Parse agent log entries.
    
    Args:
        log_content: Raw agent.log content
        
    Returns:
        List of log entry dictionaries
        
    Example:
        >>> log = '''
        ... 2025-11-01T10:00:00Z AGENT_SUBMIT job=abc-123 pbs_id=12345.bebop status=success
        ... 2025-11-01T10:00:01Z AGENT_END submitted=1
        ... '''
        >>> entries = parse_agent_log(log)
        >>> len(entries)
        2
    """
    entries = []
    
    for line in log_content.strip().split('\n'):
        if not line.strip():
            continue
            
        # Parse format: TIMESTAMP ACTION key=value key=value ...
        parts = line.split(' ', 2)
        if len(parts) < 2:
            continue
            
        timestamp = parts[0]
        action = parts[1]
        
        entry = {
            'timestamp': timestamp,
            'action': action,
            'raw': line
        }
        
        # Parse key=value pairs
        if len(parts) > 2:
            pairs = parts[2].split()
            for pair in pairs:
                if '=' in pair:
                    key, value = pair.split('=', 1)
                    entry[key] = value
        
        entries.append(entry)
    
    return entries

## Tests

Verify script generation and parsing.

In [18]:
#| hide
import tempfile
import os

# Test 1: Generate agent script
print("=" * 60)
print("Test 1: Generate agent script")
print("=" * 60)

script = generate_agent_script(
    max_queued=50,
    max_running=20
)

assert len(script) > 500, "Script too short"
assert "#!/bin/bash" in script, "Missing shebang"
assert "MAX_QUEUED=50" in script, "MAX_QUEUED not set"
assert "MAX_RUNNING=20" in script, "MAX_RUNNING not set"
assert "qstat -u" in script, "qstat command missing"
assert "qsub" in script, "qsub command missing"
assert "AGENT_SUBMIT" in script, "Logging missing"
print("âœ“ Script generation works")
print(f"âœ“ Script length: {len(script)} bytes")

# Test 2: Write to file
print("\n" + "=" * 60)
print("Test 2: Write to file")
print("=" * 60)

with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as tmp:
    temp_path = tmp.name

try:
    script = generate_agent_script(output_path=temp_path)
    
    assert os.path.exists(temp_path), "File not created"
    
    # Check if executable
    stat_info = os.stat(temp_path)
    is_executable = bool(stat_info.st_mode & 0o111)
    assert is_executable, "File not executable"
    
    print(f"âœ“ File created: {temp_path}")
    print(f"âœ“ File is executable")
    
finally:
    os.unlink(temp_path)

# Test 3: Log parsing
print("\n" + "=" * 60)
print("Test 3: Log parsing")
print("=" * 60)

sample_log = """
2025-11-01T10:00:00Z AGENT_START
2025-11-01T10:00:00Z AGENT_CHECK queued=65 running=28 total=93
2025-11-01T10:00:00Z AGENT_SLOTS available=7
2025-11-01T10:00:01Z AGENT_SUBMIT job=abc-123 pbs_id=12345.bebop status=success
2025-11-01T10:00:02Z AGENT_SUBMIT job=def-456 pbs_id=12346.bebop status=success
2025-11-01T10:00:03Z AGENT_END submitted=2
"""

entries = parse_agent_log(sample_log)
assert len(entries) == 6, f"Expected 6 entries, got {len(entries)}"

submit_entries = [e for e in entries if e['action'] == 'AGENT_SUBMIT']
assert len(submit_entries) == 2, "Should have 2 submit entries"
assert submit_entries[0]['job'] == 'abc-123', "Job name not parsed"
assert submit_entries[0]['pbs_id'] == '12345.bebop', "PBS ID not parsed"

print(f"âœ“ Parsed {len(entries)} log entries")
print(f"âœ“ Found {len(submit_entries)} submissions")
print(f"âœ“ Job IDs extracted correctly")

# Test 4: Default values
print("\n" + "=" * 60)
print("Test 4: Default values")
print("=" * 60)

script_default = generate_agent_script()
assert "MAX_QUEUED=40" in script_default, "Default MAX_QUEUED not 40"
assert "MAX_RUNNING=30" in script_default, "Default MAX_RUNNING not 30"
print("âœ“ Default queue limits: 70 queued, 30 running")

print("\n" + "=" * 60)
print("ðŸŽ‰ ALL TESTS PASSED!")
print("=" * 60)

Test 1: Generate agent script
âœ“ Script generation works
âœ“ Script length: 2343 bytes

Test 2: Write to file
âœ“ File created: /var/folders/32/674cwdcj6dv58_6xwjvdnf6w0000gr/T/tmpcjcdqpgh.sh
âœ“ File is executable

Test 3: Log parsing
âœ“ Parsed 6 log entries
âœ“ Found 2 submissions
âœ“ Job IDs extracted correctly

Test 4: Default values
âœ“ Default queue limits: 70 queued, 30 running

ðŸŽ‰ ALL TESTS PASSED!


## Usage Examples

### Deploy Agent to Remote System
```python
from chasqui.ssh import SSHConnection
from chasqui.agent import deploy_agent, trigger_agent

# Deploy the agent
with SSHConnection('bebop') as ssh:
    # Deploy with custom limits
    agent_path = deploy_agent(
        ssh,
        max_queued=50,
        max_running=20
    )
    print(f"Agent deployed to: {agent_path}")
    
    # Trigger it manually (for testing)
    output = trigger_agent(ssh)
    print(output)
```

### Check Agent Logs
```python
from chasqui.ssh import SSHConnection
from chasqui.agent import parse_agent_log

with SSHConnection('bebop') as ssh:
    # Fetch agent log
    log_content = ssh.run('cat ~/chasqui_remote/logs/agent.log')
    
    # Parse entries
    entries = parse_agent_log(log_content)
    
    # Show recent submissions
    submissions = [e for e in entries if e['action'] == 'AGENT_SUBMIT']
    print(f"Total submissions: {len(submissions)}")
    
    for entry in submissions[-5:]:  # Last 5
        print(f"  {entry['timestamp']}: {entry['job']} â†’ {entry['pbs_id']}")
```

### Generate Script Locally (for inspection)
```python
from chasqui.agent import generate_agent_script

# Generate with custom settings
script = generate_agent_script(
    chasqui_remote_dir="~/my_chasqui",
    max_queued=80,
    max_running=40,
    output_path="agent_local.sh"
)

print("Agent script saved to: agent_local.sh")

# Inspect it
with open("agent_local.sh") as f:
    print(f.read()[:500])
```

### Integration with Sync

The sync module will use these functions:
```python
def sync():
    with SSHConnection('bebop') as ssh:
        # 1. Upload jobs
        # ...
        
        # 2. Trigger agent to submit them
        trigger_agent(ssh)
        
        # 3. Parse log to see what was submitted
        log = ssh.run('cat ~/chasqui_remote/logs/agent.log')
        entries = parse_agent_log(log)
        # Update local database based on entries
```

In [19]:
from chasqui.agent import generate_agent_script

# Generate the script
script = generate_agent_script(
    max_queued=40,
    max_running=30
)

# Show first 50 lines
lines = script.split('\n')
print(f"Total lines: {len(lines)}\n")
print("=== First 50 lines ===")
print('\n'.join(lines[:50]))

# Check key features
print("\n=== Feature Check ===")
checks = {
    'Has shebang': script.startswith('#!/bin/bash'),
    'Checks qstat': 'qstat -u' in script,
    'Submits with qsub': 'qsub' in script,
    'Uses flock': 'Not in script (called externally)',
    'Logs activity': 'log_message' in script,
    'Moves to submitted': 'mv' in script and 'SUBMITTED_DIR' in script,
}

for check, result in checks.items():
    print(f"âœ“ {check}: {result}")

Total lines: 86

=== First 50 lines ===
#!/bin/bash
# Chasqui Agent: Automatic PBS job submission
# This script submits waiting jobs when PBS queue has capacity

set -e  # Exit on error

# Configuration
CHASQUI_DIR="${CHASQUI_DIR:-$HOME/chasqui_remote}"
WAITING_DIR="$CHASQUI_DIR/waiting"
SUBMITTED_DIR="$CHASQUI_DIR/submitted"
COMPLETED_DIR="$CHASQUI_DIR/completed"
LOG_FILE="$CHASQUI_DIR/logs/agent.log"

# Queue limits (adjust these for your cluster)
MAX_QUEUED=40
MAX_RUNNING=30
MAX_TOTAL=$((MAX_QUEUED + MAX_RUNNING))

# Ensure directories exist
mkdir -p "$WAITING_DIR" "$SUBMITTED_DIR" "$COMPLETED_DIR" "$CHASQUI_DIR/logs"

# Logging function
log_message() {
    echo "$(date -Iseconds) $1" >> "$LOG_FILE"
}

# Start of agent run
log_message "AGENT_START"

# Count current PBS jobs
QUEUED=$(qstat -u $USER 2>/dev/null | grep " Q " | wc -l || echo 0)
RUNNING=$(qstat -u $USER 2>/dev/null | grep " R " | wc -l || echo 0)
TOTAL=$((QUEUED + RUNNING))

log_message "AGENT_CHECK queued=$QUEUED runnin