# Run AlphaFold 3 Prediction for Antibody-Antigen Complexes

This notebook runs AlphaFold 3 predictions using the same bind-mount layout as `00_test_af3_setup.ipynb`.

**Requirements:**
- Run on a GPU node: `srun --partition=dhvi-gpu --gres=gpu:1 --mem=10G -c 6 --pty bash`
- Load CUDA if needed: `module load CUDA/11.4`

**Bind mounts (matches working HPC script):**
- Host input dir → `/root/af_input`
- Host output dir → `/root/af_output`
- Model parameters → `/root/models`
- Databases → `/root/public_databases`
- AF3 program dir → `/root/AF3`
- Temp dir → `/tmp` (prevents cleanup errors)

In [1]:
import os
import subprocess
import json
import sys
import time
import shutil
from pathlib import Path
from datetime import datetime, timedelta

# =============================================================================
# Configuration - matches run_AF3_wk24_glycan.job and 00_test_af3_setup.ipynb
# =============================================================================

user = os.environ.get("USER", "unknown")

# Runtime / Apptainer directories
os.environ.setdefault("APPTAINER_TMPDIR", f"/scratch/{user}/tmp")
os.environ.setdefault("APPTAINER_CACHEDIR", f"/cwork/{user}/cache")

# IMPORTANT: Set TMPDIR for Python's tempfile module (used by Jackhmmer)
# This prevents cleanup errors when container/host have permission conflicts
TMPDIR = Path(f"/scratch/{user}/af3_tmp")
TMPDIR.mkdir(parents=True, exist_ok=True)
os.environ["TMPDIR"] = str(TMPDIR)
os.environ["TEMP"] = str(TMPDIR)
os.environ["TMP"] = str(TMPDIR)

# Ensure scratch/cache dirs exist
Path(os.environ["APPTAINER_TMPDIR"]).mkdir(parents=True, exist_ok=True)
Path(os.environ["APPTAINER_CACHEDIR"]).mkdir(parents=True, exist_ok=True)

# AF3 paths (same as run_AF3_wk24_glycan.job)
AF3_HOME = Path("/opt/apps/community/alphafold3")
DATABASES_DIR = AF3_HOME / "AF3_databases"
MODEL_PARAMETERS_DIR = AF3_HOME / "AF3_models"
AF3_PROGRAMS_DIR = AF3_HOME / "AF3_Programs" / "alphafold3"

# SIF image: find the first available
SIF_CANDIDATES = [
    AF3_HOME / "AF_v3.0.1.sif",
    AF3_HOME / "AF3_v3.0.1.sif",
    AF3_HOME / "AF3_111125.sif",
]
SIF_IMAGE = None
for candidate in SIF_CANDIDATES:
    if candidate.exists():
        SIF_IMAGE = candidate
        break

# Input/output directories (project-relative paths)
PROJECT_ROOT = Path(f"/cwork/{user}/ab_seq_bind_analysis")
AF3_INPUT_DIR = PROJECT_ROOT / "data" / "processed" / "af3_inputs"
AF3_OUTPUT_DIR = PROJECT_ROOT / "data" / "processed" / "af3_outputs"

# Ensure output directory exists
AF3_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("AF3 Configuration")
print("=" * 60)
print(f"APPTAINER_TMPDIR:   {os.environ.get('APPTAINER_TMPDIR')}")
print(f"APPTAINER_CACHEDIR: {os.environ.get('APPTAINER_CACHEDIR')}")
print(f"TMPDIR:             {os.environ.get('TMPDIR')}")
print(f"AF3_HOME:           {AF3_HOME}")
print(f"SIF_IMAGE:          {SIF_IMAGE}")
print(f"DATABASES_DIR:      {DATABASES_DIR}")
print(f"MODEL_PARAMS_DIR:   {MODEL_PARAMETERS_DIR}")
print(f"AF3_PROGRAMS_DIR:   {AF3_PROGRAMS_DIR}")
print(f"AF3_INPUT_DIR:      {AF3_INPUT_DIR}")
print(f"AF3_OUTPUT_DIR:     {AF3_OUTPUT_DIR}")

AF3 Configuration
APPTAINER_TMPDIR:   /scratch/hsb26/tmp
APPTAINER_CACHEDIR: /cwork/hsb26/cache
TMPDIR:             /scratch/hsb26/af3_tmp
AF3_HOME:           /opt/apps/community/alphafold3
SIF_IMAGE:          /opt/apps/community/alphafold3/AF3_v3.0.1.sif
DATABASES_DIR:      /opt/apps/community/alphafold3/AF3_databases
MODEL_PARAMS_DIR:   /opt/apps/community/alphafold3/AF3_models
AF3_PROGRAMS_DIR:   /opt/apps/community/alphafold3/AF3_Programs/alphafold3
AF3_INPUT_DIR:      /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_inputs
AF3_OUTPUT_DIR:     /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_outputs


## 1. Select Input Complex

Choose which antibody-antigen complex to predict.

In [2]:
# List available input JSON files
print("Available input complexes:")
print("=" * 60)

input_files = sorted(AF3_INPUT_DIR.glob("*.json"))
input_files = [f for f in input_files if not f.name.startswith(".")]  # skip hidden files

for i, f in enumerate(input_files):
    print(f"  {i:2d}. {f.name}")

print(f"\nTotal: {len(input_files)} input files")

Available input complexes:
   0. P10A1_1JPL_WT.json
   1. P10A9_1JPL_WT.json
   2. P12E11_H3_JHB_94.json
   3. P13C1_1JPLm414_C_G1.json
   4. P17A1_1JPLm414_T_G3.json
   5. P17E9_H3_JHB_94.json
   6. P1A8_1JPLm414_C_G1.json
   7. P22H12_1JPLm414_T_G3.json
   8. P23F10_1JPL_4I.json
   9. P24E7_H3_JHB_94.json
  10. P25G10_H3_JHB_94.json
  11. P27D10_H3_JHB_94.json
  12. P28A7_1JPLm414_C_G1.json
  13. P28A7_1JPLm414_T_G3.json
  14. P30A5_1JPLm414_T_G3.json
  15. P31A3_1JPLm414_C_G1.json
  16. P32B9_H3_JHB_94.json
  17. P32G11_1JPL_WT.json
  18. P33F7_HK68.json
  19. P34E9_1JPLm414_C_G1.json
  20. P35A1_1JPLm414_C_G1.json
  21. P35A1_1JPLm414_T_G3.json
  22. P35C12_1JPLm414_C_G1.json
  23. P36B2_1JPLm414_T_G3.json
  24. P36D8_1JPLm414_C_G1.json
  25. P36H10_1JPL_WT.json
  26. P37G10_HK68.json
  27. P38B10_1JPL_4I.json
  28. P38B10_HK68.json
  29. P38B11_1JPLm414_T_G3.json
  30. P38F11_1JPL_4I.json
  31. P38F11_HK68.json
  32. P39A3_1JPLm414_C_G1.json
  33. P39A3_1JPLm414_T_G3.json
  34. P4

In [3]:
# =============================================================================
# SELECT YOUR COMPLEX HERE
# =============================================================================
# Option 1: Select by index from the list above
# INPUT_INDEX = 0
# INPUT_JSON = input_files[INPUT_INDEX]

# Option 2: Specify by name directly (for first actual complex test)
INPUT_JSON = AF3_INPUT_DIR / "P10A1_1JPL_WT.json"

# =============================================================================

RUN_NAME = INPUT_JSON.stem

print(f"Selected complex: {INPUT_JSON.name}")
print(f"Run name: {RUN_NAME}")

# Load and display the input JSON
with open(INPUT_JSON, "r") as f:
    input_data = json.load(f)

print(f"\nComplex details:")
print(f"  Name: {input_data.get('name')}")
print(f"  Seeds: {input_data.get('modelSeeds')}")
print(f"  Chains:")
for seq in input_data.get("sequences", []):
    if "protein" in seq:
        chain_id = seq["protein"]["id"]
        chain_seq = seq["protein"]["sequence"]
        print(f"    {chain_id}: {len(chain_seq)} residues")

Selected complex: P10A1_1JPL_WT.json
Run name: P10A1_1JPL_WT

Complex details:
  Name: P10A1_1JPL_WT
  Seeds: [1]
  Chains:
    H: 121 residues
    L: 112 residues
    A: 166 residues


## 2. Validate Environment

In [4]:
def check_path(label, path, required=True):
    """Check if a path exists and print status."""
    exists = path.exists() if path else False
    status = "OK" if exists else "MISSING"
    symbol = "[+]" if exists else "[X]"
    print(f"{symbol} {label}: {path} - {status}")
    if required and not exists:
        raise FileNotFoundError(f"Required path missing: {label} = {path}")
    return exists

print("Environment Check")
print("=" * 60)

# Check critical paths
check_path("AF3_HOME", AF3_HOME)
check_path("SIF_IMAGE", SIF_IMAGE)
check_path("DATABASES_DIR", DATABASES_DIR)
check_path("MODEL_PARAMETERS_DIR", MODEL_PARAMETERS_DIR)
check_path("AF3_PROGRAMS_DIR", AF3_PROGRAMS_DIR)
check_path("INPUT_JSON", INPUT_JSON)

# Check for singularity/apptainer
print("\nSingularity/Apptainer Check")
print("-" * 40)
result = subprocess.run(["which", "singularity"], capture_output=True, text=True)
if result.returncode == 0:
    print(f"[+] singularity found: {result.stdout.strip()}")
else:
    result = subprocess.run(["which", "apptainer"], capture_output=True, text=True)
    if result.returncode == 0:
        print(f"[+] apptainer found: {result.stdout.strip()}")
    else:
        print("[X] Neither singularity nor apptainer found!")

# Check GPU access
print("\nGPU Check")
print("-" * 40)
result = subprocess.run(["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], 
                       capture_output=True, text=True)
if result.returncode == 0:
    print(f"[+] GPU available: {result.stdout.strip()}")
else:
    print("[X] No GPU detected! Run on a GPU node:")
    print("    srun --partition=dhvi-gpu --gres=gpu:1 --mem=10G -c 6 --pty bash")

print("\n" + "=" * 60)
print("Environment validation complete!")

Environment Check
[+] AF3_HOME: /opt/apps/community/alphafold3 - OK
[+] SIF_IMAGE: /opt/apps/community/alphafold3/AF3_v3.0.1.sif - OK
[+] DATABASES_DIR: /opt/apps/community/alphafold3/AF3_databases - OK
[+] MODEL_PARAMETERS_DIR: /opt/apps/community/alphafold3/AF3_models - OK
[+] AF3_PROGRAMS_DIR: /opt/apps/community/alphafold3/AF3_Programs/alphafold3 - OK
[+] INPUT_JSON: /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_inputs/P10A1_1JPL_WT.json - OK

Singularity/Apptainer Check
----------------------------------------
[+] singularity found: /usr/bin/singularity

GPU Check
----------------------------------------
[+] GPU available: NVIDIA RTX 5000 Ada Generation

Environment validation complete!


## 3. Build and Run AF3 Command

In [5]:
# =============================================================================
# Execution Control
# =============================================================================
RUN_DATA_PIPELINE = True   # CPU-only genetic/template search
RUN_INFERENCE = True       # GPU-required model inference
DRY_RUN = False            # Set True to only print command without executing

# =============================================================================
# Build command with proper bind mounts (matches 00_test_af3_setup.ipynb)
# =============================================================================

# Clean any leftover temp files from previous failed runs
for old_tmp in TMPDIR.glob("tmp*"):
    try:
        if old_tmp.is_dir():
            shutil.rmtree(old_tmp, ignore_errors=True)
        else:
            old_tmp.unlink(missing_ok=True)
    except:
        pass

# Build the singularity command
cmd = [
    "singularity", "exec", "--nv",
    "--bind", f"{AF3_INPUT_DIR}:/root/af_input",
    "--bind", f"{AF3_OUTPUT_DIR}:/root/af_output",
    "--bind", f"{MODEL_PARAMETERS_DIR}:/root/models",
    "--bind", f"{DATABASES_DIR}:/root/public_databases",
    "--bind", f"{AF3_PROGRAMS_DIR}:/root/AF3",
    # Bind temp directory to prevent cleanup errors
    "--bind", f"{TMPDIR}:/tmp",
    str(SIF_IMAGE),
    "python", "/root/AF3/run_alphafold.py",
    f"--json_path=/root/af_input/{INPUT_JSON.name}",
    "--model_dir=/root/models",
    "--db_dir=/root/public_databases",
    "--output_dir=/root/af_output",
]

# Add optional flags
if not RUN_DATA_PIPELINE:
    cmd.append("--run_data_pipeline=false")
if not RUN_INFERENCE:
    cmd.append("--run_inference=false")

print("AF3 Command")
print("=" * 60)
print(" ".join(cmd[:10]) + " ...")
print()
print("Full command:")
print(" ".join(cmd))

AF3 Command
singularity exec --nv --bind /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_inputs:/root/af_input --bind /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_outputs:/root/af_output --bind /opt/apps/community/alphafold3/AF3_models:/root/models --bind ...

Full command:
singularity exec --nv --bind /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_inputs:/root/af_input --bind /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_outputs:/root/af_output --bind /opt/apps/community/alphafold3/AF3_models:/root/models --bind /opt/apps/community/alphafold3/AF3_databases:/root/public_databases --bind /opt/apps/community/alphafold3/AF3_Programs/alphafold3:/root/AF3 --bind /scratch/hsb26/af3_tmp:/tmp /opt/apps/community/alphafold3/AF3_v3.0.1.sif python /root/AF3/run_alphafold.py --json_path=/root/af_input/P10A1_1JPL_WT.json --model_dir=/root/models --db_dir=/root/public_databases --output_dir=/root/af_output


In [6]:
# =============================================================================
# Run AF3 Prediction
# =============================================================================

run_info = {
    "timestamp_start": datetime.now().isoformat(),
    "input_json": str(INPUT_JSON),
    "run_name": RUN_NAME,
    "output_dir": str(AF3_OUTPUT_DIR),
    "run_data_pipeline": RUN_DATA_PIPELINE,
    "run_inference": RUN_INFERENCE,
    "cmd": " ".join(cmd),
}

if DRY_RUN:
    print("DRY RUN - Command not executed.")
    print("Set DRY_RUN = False to run the prediction.")
else:
    print(f"Running AF3 prediction for: {RUN_NAME}")
    print("=" * 60)
    print(f"Started: {run_info['timestamp_start']}")
    print("\nThis may take 30-60+ minutes for antibody-antigen complexes.\n")
    
    # Progress tracking
    start_time = time.time()
    stage_markers = {
        "Running data pipeline": "MSA Search",
        "Jackhmmer": "Jackhmmer",
        "Finished Jackhmmer": "Jackhmmer done",
        "Running model inference": "Model Inference", 
        "Running structure module": "Structure Module",
        "Writing output": "Writing Output",
        "Running fold job": "Starting fold job",
    }
    current_stage = "Initializing"
    jackhmmer_count = 0
    jackhmmer_total = 4  # uniref90, bfd, mgy, uniprot
    
    def format_elapsed(seconds):
        return str(timedelta(seconds=int(seconds)))
    
    def print_progress(stage, elapsed, extra=""):
        spinner = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        spin_char = spinner[int(elapsed) % len(spinner)]
        msg = f"\r{spin_char} [{format_elapsed(elapsed)}] {stage}"
        if extra:
            msg += f" {extra}"
        print(msg + " " * 20, end="", flush=True)
    
    # Run with live output and progress
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        env={**os.environ, "TMPDIR": "/tmp", "TEMP": "/tmp", "TMP": "/tmp"}
    )
    
    print("Progress:")
    print("-" * 60)
    
    last_progress_time = time.time()
    output_lines = []
    
    for line in proc.stdout:
        elapsed = time.time() - start_time
        output_lines.append(line)
        
        # Update stage based on output
        for marker, stage_name in stage_markers.items():
            if marker in line:
                if "Finished Jackhmmer" in line:
                    jackhmmer_count += 1
                    current_stage = f"Jackhmmer ({jackhmmer_count}/{jackhmmer_total})"
                else:
                    current_stage = stage_name
        
        # Print progress update every 2 seconds or on stage change
        if time.time() - last_progress_time >= 2:
            print_progress(current_stage, elapsed)
            last_progress_time = time.time()
        
        # Also print important log lines
        if any(x in line for x in ["[+]", "[X]", "Error", "error", "Finished", "Running fold job", "Output will be written"]):
            print()  # newline before log
            print(line.rstrip())
    
    rc = proc.wait()
    elapsed = time.time() - start_time
    
    run_info["timestamp_end"] = datetime.now().isoformat()
    run_info["return_code"] = rc
    run_info["elapsed_seconds"] = elapsed
    run_info["output_tail"] = output_lines[-100:]
    
    print()  # Final newline
    print("\n" + "=" * 60)
    print(f"Total time: {format_elapsed(elapsed)}")
    if rc == 0:
        print("[+] AF3 prediction completed successfully!")
    else:
        print(f"[X] AF3 prediction failed with code {rc}")
        # Cleanup any remaining temp files
        print("\nCleaning up temp files...")
        for tmp_dir in TMPDIR.glob("tmp*"):
            try:
                shutil.rmtree(tmp_dir, ignore_errors=True)
            except:
                pass
    
    print(f"Finished: {run_info['timestamp_end']}")

Running AF3 prediction for: P10A1_1JPL_WT
Started: 2026-02-03T15:46:06.405073

This may take 30-60+ minutes for antibody-antigen complexes.

Progress:
------------------------------------------------------------
⠋ [0:03:20] Jackhmmer (2/4)                    
I0203 15:49:26.677670 140293194106432 subprocess_utils.py:97] Finished Jackhmmer (bfd-first_non_consensus_sequences.fasta) in 168.875 seconds
⠏ [0:13:59] Jackhmmer (4/4)                    
I0203 16:00:05.620727 140293210891840 subprocess_utils.py:97] Finished Jackhmmer (uniref90_2022_05.fa) in 807.818 seconds
⠏ [0:16:49] Jackhmmer (6/4)                    
I0203 16:02:56.288481 140293185713728 subprocess_utils.py:97] Finished Jackhmmer (uniprot_all_2021_04.fa) in 978.485 seconds
⠹ [0:21:22] Jackhmmer (8/4)                    
I0203 16:07:28.586431 140293202499136 subprocess_utils.py:97] Finished Jackhmmer (mgy_clusters_2022_05.fa) in 1250.783 seconds

I0203 16:07:29.078211 140301133235328 subprocess_utils.py:97] Finished Hmmbuild

## 4. Inspect Outputs

In [7]:
# Check what outputs were generated
output_subdir = AF3_OUTPUT_DIR / RUN_NAME

print(f"Checking outputs for: {RUN_NAME}")
print(f"Output directory: {output_subdir}")
print("=" * 60)

if output_subdir.exists():
    files = list(output_subdir.rglob("*"))
    if files:
        print(f"Found {len(files)} files/directories:\n")
        for f in sorted(files)[:30]:
            rel = f.relative_to(output_subdir)
            if f.is_file():
                size = f.stat().st_size
                print(f"  {rel} ({size:,} bytes)")
            else:
                print(f"  {rel}/")
        if len(files) > 30:
            print(f"  ... and {len(files) - 30} more")
    else:
        print("No output files found.")
else:
    print("Output directory does not exist yet.")
    print("Run the prediction first!")

Checking outputs for: P10A1_1JPL_WT
Output directory: /cwork/hsb26/ab_seq_bind_analysis/data/processed/af3_outputs/P10A1_1JPL_WT
Output directory does not exist yet.
Run the prediction first!


In [8]:
# Save run info to output directory
if 'run_info' in dir() and output_subdir.exists():
    run_info_path = output_subdir / "run_info.json"
    with open(run_info_path, "w") as f:
        # Don't save output_tail to keep file small
        run_info_to_save = {k: v for k, v in run_info.items() if k != 'output_tail'}
        json.dump(run_info_to_save, f, indent=2)
    print(f"Saved run info: {run_info_path}")
else:
    print("No run info to save (prediction not run yet).")

No run info to save (prediction not run yet).


In [9]:
# Load and display confidence scores if available
summary_files = list(output_subdir.glob("**/seed-*_sample-*_summary_confidences.json")) if output_subdir.exists() else []

if summary_files:
    print("Confidence Summary")
    print("=" * 60)
    for sf in sorted(summary_files):
        with open(sf) as f:
            conf = json.load(f)
        print(f"\n{sf.parent.name}:")
        for key, value in conf.items():
            if isinstance(value, float):
                print(f"  {key}: {value:.4f}")
            else:
                print(f"  {key}: {value}")
else:
    print("No confidence summary files found.")
    print("Run the prediction first!")

No confidence summary files found.
Run the prediction first!


## Notes

### Running Options

**Full prediction (default):**
```python
RUN_DATA_PIPELINE = True
RUN_INFERENCE = True
```

**CPU-only MSA search (no GPU needed):**
```python
RUN_DATA_PIPELINE = True
RUN_INFERENCE = False
```

**GPU-only inference (after MSA is done):**
```python
RUN_DATA_PIPELINE = False
RUN_INFERENCE = True
```

### Temp Directory Cleanup Errors

If you see `OSError: [Errno 39] Directory not empty` errors:
1. This is often non-fatal - AF3 may have completed successfully
2. Check outputs anyway - look for `.cif` files in the output directory
3. The notebook binds `/tmp` to prevent most cleanup issues

### Expected Runtime

- Single protein (~50 residues): ~25 minutes
- Antibody-antigen complex (~400 residues total): ~45-90 minutes
- Most time is spent in Jackhmmer MSA search