# AlphaFold 3 Structure Prediction

This notebook runs AlphaFold 3 structure predictions for antibody-antigen complexes.

## Current Run

**Input**: `P1A8_1JPLm414_C_G1.json`  
**Complex**: Antibody P1A8 with immunogen 1JPLm414_C_G1

The JSON file contains all three chains (H, L, Ag) required for complex modeling.

## 1. Setup and Validation

**Validation Checkpoint**: Verify input file exists and paths are correct before running.

In [None]:
import json
import os
import shlex
from datetime import datetime
from pathlib import Path

# --- Run configuration ---------------------------------------------------
# Host-side paths, relative to the repository checkout.
input_json = "af3_inputs/P1A8_1JPLm414_C_G1.json"
sample_name = "P1A8_1JPLm414_C_G1"
output_base_dir = Path("af3_outputs")

# Shared AlphaFold 3 installation layout (mirrors the working HPC script).
AF3_HOME = Path("/opt/apps/community/alphafold3")
DATABASES_DIR = AF3_HOME / "AF3_databases"
MODEL_PARAMETERS_DIR = AF3_HOME / "AF3_models"
AF3_PROGRAMS_DIR = AF3_HOME / "AF3_Programs" / "alphafold3"

# Use the first known image name that exists on this system. When none exists,
# keep the first name so the later path validation reports it as missing.
sif_candidates = [
    AF3_HOME / image_name
    for image_name in ("AF3_v3.0.1.sif", "AF3_111125.sif", "AF_v3.0.1.sif")
]
SIF_IMAGE = next(
    (image for image in sif_candidates if image.exists()),
    sif_candidates[0],
)

# Mount points inside the container (same layout as the working script).
CONTAINER_INPUT_DIR = Path("/root/af_input")
CONTAINER_OUTPUT_DIR = Path("/root/af_output")
CONTAINER_MODEL_DIR = Path("/root/models")
CONTAINER_DB_DIR = Path("/root/public_databases")
CONTAINER_AF3_DIR = Path("/root/AF3")

AF3_SCRIPT_IN_CONTAINER = CONTAINER_AF3_DIR / "run_alphafold.py"

# Stage toggles, mapped onto the official AF3 flags of the same name:
#   --run_data_pipeline (default true): CPU-only genetic + template search
#   --run_inference     (default true): GPU inference
run_data_pipeline = True
run_inference = True

# Per-user Apptainer/Singularity scratch and cache locations (useful on HPC).
user = os.environ.get("USER", "unknown")
APPTAINER_TMPDIR = Path(f"/scratch/{user}/tmp")
APPTAINER_CACHEDIR = Path(f"/cwork/{user}/cache")

# Metadata record for this run; later cells add more keys to it.
run_info = {
    "timestamp": datetime.now().isoformat(),
    "input_json": input_json,
    "sample_name": sample_name,
    "output_dir": str((output_base_dir / sample_name).absolute()),
    "af3_home": str(AF3_HOME),
    "databases_dir": str(DATABASES_DIR),
    "model_parameters_dir": str(MODEL_PARAMETERS_DIR),
    "af3_programs_dir": str(AF3_PROGRAMS_DIR),
    "sif_image": str(SIF_IMAGE),
    "run_data_pipeline": run_data_pipeline,
    "run_inference": run_inference,
    "apptainer_tmpdir": str(APPTAINER_TMPDIR),
    "apptainer_cachedir": str(APPTAINER_CACHEDIR),
}

# Echo the effective configuration so it can be checked before running.
summary_lines = [
    "AlphaFold 3 Prediction Run Configuration",
    "=" * 80,
    f"Input JSON (host): {input_json}",
    f"Sample name: {sample_name}",
    f"Output directory (host): {run_info['output_dir']}",
    "",
    "AlphaFold 3 installation:",
    f"  AF3_HOME: {AF3_HOME}",
    f"  SIF: {SIF_IMAGE}",
    f"  DATABASES_DIR: {DATABASES_DIR}",
    f"  MODEL_PARAMETERS_DIR: {MODEL_PARAMETERS_DIR}",
    f"  AF3_PROGRAMS_DIR: {AF3_PROGRAMS_DIR}",
    "",
    "Execution flags:",
    f"  --run_data_pipeline: {run_data_pipeline}",
    f"  --run_inference: {run_inference}",
    "",
    f"Timestamp: {run_info['timestamp']}",
]
print("\n".join(summary_lines))

AlphaFold 3 Prediction Run Configuration
Input JSON: af3_inputs/P1A8_1JPLm414_C_G1.json
Sample name: P1A8_1JPLm414_C_G1
Output directory: af3_outputs/P1A8_1JPLm414_C_G1
Singularity image: /hpc/group/dhvi/WieheLab/MMH/alphafold_v3.0.1/alphafold3.0.1_111125_latest.sif
Database directory: /hpc/group/dhvi/WieheLab/AF3_dbs
Model directory: /hpc/group/dhvi/WieheLab/AF3_dbs

Execution flags:
  --run_data_pipeline: True
  --run_inference: True

Timestamp: 2026-01-28T14:08:28.532737


In [None]:
# Validate the AlphaFold 3 input JSON before launching anything expensive.
# Raises FileNotFoundError if the file is missing and ValueError if the chain
# IDs are not exactly one each of H, L and Ag, or if a chain has no sequence.
input_path = Path(input_json)
if not input_path.exists():
    raise FileNotFoundError(f"Input JSON file not found: {input_json}")

input_stat = input_path.stat()  # stat once and reuse for size and mtime
print(f"OK: input file exists: {input_json}")
print(f"  File size: {input_stat.st_size / 1024:.2f} KB")
print(f"  Last modified: {datetime.fromtimestamp(input_stat.st_mtime)}")

# Load and validate JSON structure (explicit encoding so the result does not
# depend on the platform's locale default)
with open(input_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

sequences = json_data.get('sequences', [])

print("\nOK: JSON file loaded")
print(f"  Name: {json_data.get('name', 'N/A')}")
print(f"  Version: {json_data.get('version', 'N/A')}")
print(f"  Dialect: {json_data.get('dialect', 'N/A')}")
print(f"  Number of sequences: {len(sequences)}")

# Verify required chains are present (H, L, Ag).
# Entries without a 'protein' key yield None here and fail the check below.
sequence_ids = [seq.get('protein', {}).get('id') for seq in sequences]
print(f"  Chain IDs: {sequence_ids}")

required = {"H", "L", "Ag"}
# Compare sorted lists rather than sets: a set comparison would accept a
# duplicated chain ID and would raise TypeError on a list-valued 'id'.
# str() keeps None and list IDs sortable so they fail with a clear message.
found = sorted(str(chain_id) for chain_id in sequence_ids)
if found != sorted(required):
    raise ValueError(f"Expected chain IDs {sorted(required)}, but found {sequence_ids}")

# Display sequence lengths (every entry is a protein chain at this point)
print("\nSequence lengths:")
for seq in sequences:
    chain_id = seq['protein']['id']
    sequence = seq['protein'].get('sequence')
    if not sequence:
        raise ValueError(f"Chain {chain_id} has no sequence in {input_json}")
    print(f"  {chain_id}: {len(sequence)} amino acids")

✓ Input file exists: af3_inputs/P1A8_1JPLm414_C_G1.json
  File size: 0.75 KB
  Last modified: 2026-01-28 00:51:19.587919

✓ JSON file is valid
  Name: P1A8_1JPLm414_C_G1
  Version: 4
  Dialect: alphafold3
  Number of sequences: 3
  Chain IDs: ['H', 'L', 'Ag']
  ✓ All required chains present (H, L, Ag)

Sequence lengths:
  H: 118 amino acids
  L: 106 amino acids
  Ag: 167 amino acids


In [None]:
# Point Apptainer at the per-user scratch/cache directories when they can be
# created. This is best-effort: a failure is reported and the run continues.
for env_name, env_dir in (
    ("APPTAINER_TMPDIR", APPTAINER_TMPDIR),
    ("APPTAINER_CACHEDIR", APPTAINER_CACHEDIR),
):
    try:
        env_dir.mkdir(parents=True, exist_ok=True)
        os.environ[env_name] = str(env_dir)
        print(f"OK: {env_name} set to {env_dir}")
    except Exception as e:
        print(f"NOTE: Could not set {env_name}: {e}")

# Every piece of the AF3 installation must exist; stop at the first missing one.
required_paths = {
    "SIF_IMAGE": SIF_IMAGE,
    "DATABASES_DIR": DATABASES_DIR,
    "MODEL_PARAMETERS_DIR": MODEL_PARAMETERS_DIR,
    "AF3_PROGRAMS_DIR": AF3_PROGRAMS_DIR,
}
for label, required_path in required_paths.items():
    if not required_path.exists():
        raise FileNotFoundError(f"Missing {label}: {required_path}")
    print(f"OK: {label} exists: {required_path}")

# Host output directory for this sample (created if absent).
output_dir = output_base_dir / sample_name
output_dir.mkdir(parents=True, exist_ok=True)
print(f"\nOK: output directory created: {output_dir}")

# Host directories that get bound into the container.
host_input_dir, host_output_dir = Path(input_json).parent, output_dir

print(f"Host input dir: {host_input_dir.absolute()}")
print(f"Host output dir: {host_output_dir.absolute()}")

✓ Singularity image exists: /hpc/group/dhvi/WieheLab/MMH/alphafold_v3.0.1/alphafold3.0.1_111125_latest.sif
  Image size: 4.92 GB
✓ Database directory exists: /hpc/group/dhvi/WieheLab/AF3_dbs

✓ Output directory created: af3_outputs/P1A8_1JPLm414_C_G1


**Validation Checkpoint**: Please verify:
- Input JSON file exists and is valid
- All three chains (H, L, Ag) are present
- Sequence lengths are reasonable
- AlphaFold 3 SIF + databases + model parameters exist
- Output directory created

If everything looks correct, proceed to run AlphaFold 3.

## 2. Run AlphaFold 3 Prediction

**Note**: This may take a significant amount of time depending on system resources. The prediction will run and output will be displayed as it becomes available.

**Execution Control**:
- `--run_data_pipeline` (default: true): Runs genetic and template search. This is CPU-only and time-consuming. Can be run separately on a machine without GPU.
- `--run_inference` (default: true): Runs the inference. This requires a GPU.

You can modify these flags in the configuration cell above to control which parts run.

In [None]:
# Build the Singularity command line (adapted from the working AF3 script).
# Host input/output/model/database/program directories are bind-mounted onto
# the fixed container-side locations.

# The container sees the input JSON under the container input directory.
json_filename = Path(input_json).name
container_json_path = str(CONTAINER_INPUT_DIR / json_filename)

# run_alphafold.py flags (official --flag=value format)
flags = [
    f"--json_path={container_json_path}",
    f"--model_dir={CONTAINER_MODEL_DIR}",
    f"--db_dir={CONTAINER_DB_DIR}",
    f"--output_dir={CONTAINER_OUTPUT_DIR}",
]

# Both stages default to true, so only pass a flag when a stage is disabled.
if not run_data_pipeline:
    flags.append("--run_data_pipeline=false")
if not run_inference:
    flags.append("--run_inference=false")

# (host path, container path) pairs, in the order they appear on the command line
bind_mounts = [
    (host_input_dir.absolute(), CONTAINER_INPUT_DIR),
    (host_output_dir.absolute(), CONTAINER_OUTPUT_DIR),
    (MODEL_PARAMETERS_DIR, CONTAINER_MODEL_DIR),
    (DATABASES_DIR, CONTAINER_DB_DIR),
    (AF3_PROGRAMS_DIR, CONTAINER_AF3_DIR),
]

command_args = ["singularity", "exec", "--nv"]
for host_path, container_path in bind_mounts:
    command_args += ["--bind", f"{host_path}:{container_path}"]
command_args += [str(SIF_IMAGE), "python", str(AF3_SCRIPT_IN_CONTAINER), *flags]

# The command string is executed through a shell, so quote each argument:
# a path containing spaces or shell metacharacters must stay a single word.
# Arguments made only of safe characters are left unchanged by shlex.quote.
command = " ".join(shlex.quote(arg) for arg in command_args)

print("AlphaFold 3 Command (Singularity)")
print("=" * 80)
print(command)
print("=" * 80)

# Save command to run info
run_info["command"] = command
run_info["container_json_path"] = container_json_path
run_info["flags"] = flags
run_info["host_input_dir"] = str(host_input_dir.absolute())
run_info["host_output_dir"] = str(host_output_dir.absolute())

AlphaFold 3 Command:
singularity exec --nv \
  -B /cwork:/cwork \
  -B /hpc/group/dhvi/WieheLab/AF3_dbs:/hpc/group/dhvi/WieheLab/AF3_dbs \
  /hpc/group/dhvi/WieheLab/MMH/alphafold_v3.0.1/alphafold3.0.1_111125_latest.sif \
  python /app/alphafold/run_alphafold.py \
    --json_path=/cwork/hsb26/ab_seq_bind_analysis/af3_inputs/P1A8_1JPLm414_C_G1.json --output_dir=/cwork/hsb26/ab_seq_bind_analysis/af3_outputs/P1A8_1JPLm414_C_G1 --db_dir=/hpc/group/dhvi/WieheLab/AF3_dbs --use_templates=false

Note: This command follows the official AF3 format.
Key flags:
  --run_data_pipeline: Run genetic and template search (CPU-only)
  --run_inference: Run the inference (requires GPU)
  Both default to true if not specified


In [9]:
# Show the GPUs visible to this session; the inference stage requires a GPU.
!nvidia-smi


Wed Jan 28 14:33:52 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.10              Driver Version: 570.86.10      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 5000 Ada Gene...    On  |   00000000:0C:00.0 Off |                  Off |
| 30%   18C    P8             17W /  250W |       2MiB /  32760MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Run AlphaFold 3, echoing the combined stdout/stderr of the command as it
# arrives. The start/completion times, exit status and captured log lines are
# stored in `run_info`. A non-zero exit status is reported but not raised, so
# the following cells can still inspect the outputs and save the run record.
import subprocess
import sys

# Record the start time alongside the completion time stored further down.
start_time = datetime.now().isoformat()
run_info["start_time"] = start_time

print("Starting AlphaFold 3 prediction...")
print("=" * 80)
print(f"Start time: {start_time}")
print()

output_lines = []
# The context manager closes the pipe and waits for the child on exit, also
# when streaming is cut short by an exception such as a kernel interrupt.
with subprocess.Popen(
    command,
    shell=True,  # `command` is a single shell command string
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # interleave stderr with stdout
    universal_newlines=True,
    errors="replace",  # undecodable bytes in tool output must not abort streaming
    bufsize=1,  # line-buffered
) as process:
    for line in process.stdout:
        print(line, end='')
        output_lines.append(line)
        sys.stdout.flush()

    return_code = process.wait()

run_info["completion_time"] = datetime.now().isoformat()
run_info["return_code"] = return_code
run_info["output_lines"] = output_lines

print()
print("=" * 80)
if return_code == 0:
    print("SUCCESS: AlphaFold 3 prediction completed")
else:
    print(f"FAILURE: AlphaFold 3 prediction failed (return code {return_code})")
print(f"Completion time: {run_info['completion_time']}")

Starting AlphaFold 3 prediction...
Start time: 2026-01-28T14:33:54.786548

FATAL Flags parsing error: Unknown command line flag 'use_templates'. Did you mean: max_template_date ?
Pass --helpshort or --helpfull to see help on flags.

✗ AlphaFold 3 prediction failed with return code: 1
Completion time: 2026-01-28T14:34:01.758945


## 3. Check Output Files

**Validation Checkpoint**: Verify that output files were generated correctly.

In [None]:
# Inspect what the run wrote to the host output directory and record it in
# `run_info` ("output_files": top-level entries, "structure_files": structures).
print("Output Directory Contents:")
print("=" * 80)
print(f"Output directory: {output_dir}")

files = []
structure_files = []
json_files = []
if output_dir.exists():
    files = sorted(output_dir.glob("*"))
    if files:
        print(f"\nFound {len(files)} files/directories:")
        for f in files:
            if f.is_file():
                size = f.stat().st_size
                # KB below 1 MiB, MB otherwise
                size_str = f"{size / 1024:.2f} KB" if size < 1024**2 else f"{size / (1024**2):.2f} MB"
                print(f"  - {f.name} ({size_str})")
            else:
                print(f"  - {f.name}/")

        # AlphaFold 3 writes structures as mmCIF (.cif) and nests its results in
        # a per-job subdirectory, so a top-level "*.pdb" glob never matches them.
        # Search recursively; .pdb is still accepted in case converted files exist.
        structure_files = sorted(
            p for p in output_dir.rglob("*")
            if p.is_file() and p.suffix.lower() in {".cif", ".pdb"}
        )
        if structure_files:
            print(f"\nFound {len(structure_files)} structure file(s):")
            for structure in structure_files:
                print(f"  - {structure.relative_to(output_dir)}")

        json_files = sorted(p for p in output_dir.rglob("*.json") if p.is_file())
        if json_files:
            print(f"\nFound {len(json_files)} JSON file(s):")
            for jf in json_files:
                print(f"  - {jf.relative_to(output_dir)}")
    else:
        print("\nNo files found in output directory")
else:
    print("\nOutput directory does not exist")

run_info["output_files"] = [str(f) for f in files]
run_info["structure_files"] = [str(p) for p in structure_files]

Output Directory Contents:
Output directory: af3_outputs/P1A8_1JPLm414_C_G1

⚠ No files found in output directory


In [7]:
# Persist the accumulated run metadata as JSON inside the output directory.
run_info_file = output_dir / "run_info.json"
with run_info_file.open("w") as handle:
    json.dump(run_info, handle, indent=2)

print(f"\n✓ Run information saved to: {run_info_file}")


✓ Run information saved to: af3_outputs/P1A8_1JPLm414_C_G1/run_info.json


## Summary

**Run Status**: Check the output above to verify:
- ✓ Prediction completed (return code 0)
- ✓ Output files generated
- ✓ mmCIF structure files (`*.cif`) present (if successful)

**Next Steps**:
- Review the generated mmCIF structure files
- Analyze predicted binding interface
- Compare with experimental binding data
- Run additional predictions if needed

**Note**: If you need to run only the data pipeline or only inference separately, modify the `run_data_pipeline` and `run_inference` flags in the configuration cell and re-run.