In the first cell you will write your base-structure JSONs. Enter the PDB IDs of solved structures whose TCRs are known to bind the epitope that the sequences were generated for. The standard TCR chain notation is as follows:

chain A: MHC
chain B: Beta-2 microglobulin
chain C: peptide
chain D: TCR alpha chain
chain E: TCR beta chain

Make sure that your base JSONs follow this structure!!

In [None]:
#!/usr/bin/env python3
import os
import json
import requests
from Bio import SeqIO
from io import StringIO

def fetch_fasta(pdb_id, timeout=30):
    """Download the FASTA entry for a PDB ID from RCSB and parse it.

    Args:
        pdb_id: PDB entry identifier; interpolated into the RCSB FASTA URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A list of the records that ``SeqIO.parse`` yields from the response.

    Raises:
        RuntimeError: If the HTTP status code is not 200.
    """
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        # Include the status so a bad ID (404) can be told apart from an outage.
        raise RuntimeError(
            f"Failed to fetch FASTA for {pdb_id} (HTTP {r.status_code})"
        )
    return list(SeqIO.parse(StringIO(r.text), "fasta"))

def make_af3_json(pdb_id, outdir=" ", peptide="GILGFVFTL"):
    """Create an AlphaFold3 input JSON from the FASTA sequences of a PDB entry.

    Each distinct sequence in the entry becomes one ``protein`` item. The item
    whose sequence equals ``peptide`` is moved to index 2 (third position) so
    the output follows the A/B/C/D/E chain layout described at the top of the
    notebook; all other sequences keep their FASTA order.

    Args:
        pdb_id: PDB entry identifier; also used for the job name and file name.
        outdir: Directory the JSON is written to (created if missing).
        peptide: Exact epitope sequence used to recognise the peptide chain.
            Defaults to the epitope this notebook was first written for; pass
            a different sequence when working on another epitope.

    Side effects:
        Writes ``<outdir>/<pdb_id>.json`` and prints a status line. Prints a
        warning (and still writes the file) when no sequence equals ``peptide``.
    """
    records = fetch_fasta(pdb_id)

    sequences = []
    seen = set()
    peptide_entry = None

    for rec in records:
        seq = str(rec.seq)
        # Identical sequences are kept only once.
        if seq in seen:
            continue
        seen.add(seq)

        entry = {"protein": {"sequence": seq, "description": rec.description}}

        # Hold the peptide back so it can be placed explicitly below.
        if seq == peptide:
            peptide_entry = entry
        else:
            sequences.append(entry)

    # Force peptide to index 2 (third position)
    if peptide_entry:
        if len(sequences) >= 2:
            sequences.insert(2, peptide_entry)
        else:
            sequences.append(peptide_entry)  # fallback if fewer than 2 entries
    else:
        print(f"Warning: No peptide found for {pdb_id}")

    job = {
        "name": f"{pdb_id}_tcr_pmhc",
        "modelSeeds": [1],
        "sequences": sequences,
        "dialect": "alphafold3",
        "version": 1
    }

    os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, f"{pdb_id}.json")
    with open(outfile, "w") as f:
        json.dump(job, f, indent=2)

    print(f"Wrote JSON for {pdb_id}")

# Entry point for this cell: write one AlphaFold3 input JSON per listed PDB ID.
if __name__ == "__main__":
    # Fill in the PDB IDs of the solved structures for the epitope of interest.
    pdb_ids = ["2vlj", "2vlk", "2vlr", "5isz", "1oga", "5euo"]
    for pdb_id in pdb_ids:
        # Uses make_af3_json's default output directory.
        make_af3_json(pdb_id)


In [None]:
import pandas as pd
import json
from pathlib import Path


# Inputs (the paths below are placeholders -- fill them in before running)
SOLVED_JSON_DIR = Path(" ")  # directory holding one "<pdb_id>.json" per solved structure (read by load_json)
NEEDLEMEN_FILES = []  # CSV files to process; see tcrdock/scripts for the alignment script that produces them

# Destination for the spliced JSONs; created, including parents, when this cell runs.
OUT_DIR = Path(" ")
OUT_DIR.mkdir(parents=True, exist_ok=True)


def load_json(pdb_id: str) -> dict:
    """Read and parse ``<pdb_id>.json`` from ``SOLVED_JSON_DIR``.

    Raises:
        FileNotFoundError: If the file is not present in ``SOLVED_JSON_DIR``.
    """
    solved_path = SOLVED_JSON_DIR / f"{pdb_id}.json"
    if solved_path.exists():
        with solved_path.open() as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Solved JSON not found: {solved_path}")


def replace_cdr3b(full_seq: str, old_cdr3: str, new_cdr3: str) -> str:
    """Return ``full_seq`` with its single ``old_cdr3`` span swapped for ``new_cdr3``.

    Args:
        full_seq: Full chain sequence that contains the CDR3 to replace.
        old_cdr3: CDR3 currently present in ``full_seq``; must be a non-empty
            string that occurs exactly once.
        new_cdr3: CDR3 to splice in at that position.

    Returns:
        The sequence with the CDR3 replaced.

    Raises:
        ValueError: If either CDR3 is not a string (e.g. the NaN pandas yields
            for an empty CSV cell), if ``old_cdr3`` is empty, if it does not
            occur in ``full_seq``, or if it occurs more than once so the
            splice position is ambiguous.
    """
    # An empty pattern is "found" everywhere: str.replace would insert the new
    # CDR3 between every residue. Non-strings would raise TypeError, which the
    # caller's `except ValueError` does not handle.
    if not isinstance(old_cdr3, str) or not old_cdr3:
        raise ValueError(f"Invalid CDR3 to replace: {old_cdr3!r}")
    if not isinstance(new_cdr3, str):
        raise ValueError(f"Invalid replacement CDR3: {new_cdr3!r}")

    first = full_seq.find(old_cdr3)
    if first == -1:
        raise ValueError(f"CDR3 {old_cdr3} not found in beta chain sequence")
    # find/rfind disagree whenever there is a second (even overlapping) match.
    if first != full_seq.rfind(old_cdr3):
        raise ValueError(
            f"CDR3 {old_cdr3} occurs more than once in beta chain sequence"
        )
    return full_seq[:first] + new_cdr3 + full_seq[first + len(old_cdr3):]


def find_tcr_beta_entry(solved_json: dict) -> dict:
    """Return the ``protein`` dict of the fifth sequence entry (index 4).

    That position is assumed to hold the TCR beta chain. The returned dict is
    the object stored inside ``solved_json`` (not a copy), so mutating it
    mutates the job.

    Raises:
        KeyError: If fewer than five sequence entries are present.
    """
    chains = solved_json["sequences"]
    try:
        beta_chain = chains[4]
    except IndexError:
        raise KeyError("Solved JSON does not have at least 5 sequences (expected TCR beta at index 4)")
    return beta_chain["protein"]


def process_needlemen(csv_file: str):
    """Write one spliced AlphaFold3 JSON per row of an alignment CSV.

    For every row, the solved-structure JSON named by ``Best_Solved_ID`` is
    loaded, the ``Best_Solved_TCR`` CDR3 inside its beta chain (entry index 4)
    is replaced by ``Generated_TCR``, and the result is written to
    ``OUT_DIR/<prefix>_<Generated_Index>.json`` with a matching job name.
    Rows whose CDR3 cannot be spliced are reported and skipped.

    NOTE(review): ``get_prefix_from_path`` is not defined in the visible part
    of this notebook -- confirm it is provided elsewhere before running.

    Args:
        csv_file: Path of the CSV, read with ``pd.read_csv``.
    """
    table = pd.read_csv(csv_file)
    prefix = get_prefix_from_path(csv_file)

    for _, record in table.iterrows():
        gen_idx, gen_cdr3 = record["Generated_Index"], record["Generated_TCR"]
        solved_id, solved_cdr3 = record["Best_Solved_ID"], record["Best_Solved_TCR"]

        # Loaded fresh for each row, so the in-place edit below never leaks
        # into another row's job.
        template = load_json(solved_id)
        beta_chain = find_tcr_beta_entry(template)
        template_seq = beta_chain["sequence"]

        try:
            spliced_seq = replace_cdr3b(template_seq, solved_cdr3, gen_cdr3)
        except ValueError:
            print(f"WARNING: could not splice {gen_cdr3} into {solved_id} (index {gen_idx})")
            continue

        job_name = f"{prefix}_{gen_idx}"
        beta_chain["sequence"] = spliced_seq
        template["name"] = job_name

        out_path = OUT_DIR / f"{job_name}.json"
        out_path.write_text(json.dumps(template, indent=2))

        print(f"Wrote {out_path}")


def main():
    """Run ``process_needlemen`` on every file in ``NEEDLEMEN_FILES``, in order."""
    for csv_path in NEEDLEMEN_FILES:
        print(f"Processing {csv_path}")
        process_needlemen(csv_path)


# Entry point for this cell: process every alignment CSV listed in NEEDLEMEN_FILES.
if __name__ == "__main__":
    main()


In [None]:
import os
import json
from pathlib import Path

# Input directory containing your current JSONs (placeholder -- fill in)
input_dir = Path(" ")
# Output directory for cleaned version 1 JSONs (placeholder -- fill in)
output_dir = Path("")
# parents=True so a nested output path works, as in the other cells that
# create their output directories.
output_dir.mkdir(parents=True, exist_ok=True)

# Function to clean a single JSON
def _chain_id(index):
    """Return an uppercase-letter chain ID for a zero-based position.

    Positions 0-25 map to "A".."Z"; later positions continue with multi-letter
    IDs "AA", "BA", "CA", ... (least-significant letter first), so every ID
    stays purely alphabetic instead of running into "[", "\\", ... after "Z".
    """
    letters = []
    remaining = index + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    return "".join(letters)


def clean_json(json_path, output_path):
    """Rewrite one AlphaFold3 input JSON into the reduced form this pipeline uses.

    The job is forced to ``version`` 1 and ``dialect`` "alphafold3"; every
    sequence entry keeps only the keys in the allow-list below and receives a
    fresh chain ID derived from its position; the top-level keys
    ``bondedAtomPairs``, ``userCCD`` and ``userCCDPath`` are dropped.

    Args:
        json_path: Path of the JSON file to read.
        output_path: Path the cleaned JSON is written to.

    Raises:
        KeyError: If the input has no ``sequences`` key.
    """
    with open(json_path, "r") as f:
        data = json.load(f)

    data["version"] = 1
    data["dialect"] = "alphafold3"

    # Everything outside this allow-list (e.g. 'description', 'templates',
    # 'modifications') is removed from each sequence entry.
    allowed_keys = {"id", "sequence", "unpairedMsa", "pairedMsa", "ccdCodes", "smiles"}

    new_sequences = []
    for i, seq_entry in enumerate(data["sequences"]):
        # Each entry is a single-key dict: 'protein', 'rna', 'dna' or 'ligand'.
        seq_type = next(iter(seq_entry))
        seq_data = {k: v for k, v in seq_entry[seq_type].items() if k in allowed_keys}

        # Any existing ID is overwritten with a position-based one.
        seq_data["id"] = _chain_id(i)

        new_sequences.append({seq_type: seq_data})

    data["sequences"] = new_sequences

    # Drop the optional top-level keys this pipeline does not carry over.
    for key in ("bondedAtomPairs", "userCCD", "userCCDPath"):
        data.pop(key, None)

    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)

# Clean every "*.json" directly inside input_dir (no recursion) and write the
# result under the same file name in output_dir.
for json_file in input_dir.glob("*.json"):
    output_file = output_dir / json_file.name
    clean_json(json_file, output_file)
    print(f"Cleaned {json_file.name} -> {output_file}")

# Printed once after the loop, even when no file matched.
print("All JSONs cleaned and written to", output_dir)


For controls, ambiguous amino acid codes can occur (X, B, Z, and J, each of which can correspond to several amino acids with different frequencies). AlphaFold does not know how to handle these, so below we rewrite them to discrete residues at random.

In [20]:
import json
from pathlib import Path
import random

# Background frequencies of the 20 standard amino acids, used as sampling
# weights when resolving 'X'. They need not sum to exactly 1: random.choices
# treats them as relative weights.
aa_freqs = {
    'A': 0.0825, 'R': 0.0553, 'N': 0.0406, 'D': 0.0545, 'C': 0.0137,
    'Q': 0.0393, 'E': 0.0675, 'G': 0.0707, 'H': 0.0227, 'I': 0.0593,
    'L': 0.0966, 'K': 0.0584, 'M': 0.0242, 'F': 0.0386, 'P': 0.0470,
    'S': 0.0651, 'T': 0.0534, 'W': 0.0108, 'Y': 0.0292, 'V': 0.0687
}

# Two-way ambiguity codes and the residues each one may stand for.
_TWO_WAY_CODES = {
    'B': ['D', 'N'],
    'Z': ['E', 'Q'],
    'J': ['I', 'L'],
}


def replace_ambiguous(seq, rng=None):
    """Return ``seq`` with ambiguous residue codes replaced by concrete ones.

    'X' is drawn from the 20 residues in ``aa_freqs``, weighted by frequency;
    'B', 'Z' and 'J' are each resolved uniformly to one of their two
    candidates. Every other character (including lowercase letters) is copied
    unchanged, so the length of the sequence is preserved.

    Args:
        seq: Sequence string to rewrite.
        rng: Optional ``random.Random`` instance. Pass a seeded one to make
            the rewrite reproducible; when omitted, the module-level
            ``random`` functions are used, as before.

    Returns:
        The rewritten sequence.
    """
    picker = random if rng is None else rng
    # Built once per call instead of once per 'X'.
    residues = list(aa_freqs)
    weights = list(aa_freqs.values())

    resolved = []
    for aa in seq:
        if aa == 'X':
            resolved.append(picker.choices(residues, weights=weights)[0])
        elif aa in _TWO_WAY_CODES:
            resolved.append(picker.choice(_TWO_WAY_CODES[aa]))
        else:
            resolved.append(aa)
    return ''.join(resolved)

# Path to a text file listing one JSON path per line (placeholder -- fill in)
json_list_file = Path(" ")

# Rewrite ambiguous residues in chain E of every listed JSON, in place.
with open(json_list_file) as f:
    for line in f:
        listed_path = line.strip()
        if not listed_path:
            # A blank line would become Path(""), i.e. the current directory,
            # which "exists" and then fails when opened as a file.
            continue

        json_path = Path(listed_path)
        if not json_path.exists():
            print(f"File does not exist: {json_path}")
            continue

        with json_path.open("r") as jf:
            data = json.load(jf)

        updated = False
        for entry in data.get("sequences", []):
            # Non-protein entries yield {} here and never match.
            protein = entry.get("protein", {})
            if protein.get("id") == "E":
                original_seq = protein["sequence"]
                protein["sequence"] = replace_ambiguous(original_seq)
                updated = True
                print(f"Updated chain E in {json_path}")
                # Only the first entry with id "E" is rewritten.
                break

        if updated:
            with json_path.open("w") as jf:
                json.dump(data, jf, indent=2)
        else:
            print(f"No chain E found in {json_path}")


Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/controls_prior_123.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/gen5_scp-random_147.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/gen5_scp-random_177.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/controls_random_244.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/gen5_fsp-fake_151.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/gen5_scp-random_144.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/gen5_scp-select_144.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/gen0_outsample_temper0.4_173.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/controls_prior_77.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/controls_random_168.json
Updated chain E in /scratch/ggrama/alphafold3/rewritten_jsons/controls_random_54.json
Updated chain E in /scratch/ggrama/alpha

In [2]:
import json
from pathlib import Path

# Solved structures whose AlphaFold3 runs provide the MSAs to extract.
SOLVED_IDS = [
    "2vlj", "2vlk", "2vlr", "5euo", "5isz", "1oga"
]

# Placeholders -- fill in before running.
AF3_OUTPUT_DIR = Path(" ")  # directory containing one AlphaFold3 output folder per job
MSA_DIR = Path(" ")  # destination root; one sub-directory per solved ID is created
MSA_DIR.mkdir(parents=True, exist_ok=True)

for solved_id in SOLVED_IDS:
    # Find output directories whose name starts with solved_id. iterdir()
    # yields entries in arbitrary order, so sort to make the choice repeatable.
    dirs = sorted(
        (d for d in AF3_OUTPUT_DIR.iterdir() if d.is_dir() and d.name.startswith(solved_id)),
        key=lambda d: d.name,
    )
    if not dirs:
        print(f"WARNING: No output dir found for {solved_id}, skipping")
        continue
    if len(dirs) > 1:
        print(f"WARNING: {len(dirs)} output dirs match {solved_id}, using {dirs[0].name}")
    out_dir = dirs[0]  # first match in name order

    # The data file is expected to be named after its directory.
    data_json = out_dir / f"{out_dir.name}_data.json"
    if not data_json.exists():
        print(f"WARNING: {data_json} not found, skipping")
        continue

    with open(data_json) as f:
        data = json.load(f)

    msa_solved_dir = MSA_DIR / solved_id
    msa_solved_dir.mkdir(parents=True, exist_ok=True)

    for seq in data.get("sequences", []):
        # Non-protein entries yield {} and are reported as having no MSA.
        prot = seq.get("protein", {})
        chain_id = prot.get("id")
        msa_content = prot.get("unpairedMsa")
        if not msa_content:
            print(f"No unpaired MSA for chain {chain_id} in {solved_id}")
            continue

        # One "<chain id>.a3m" file per chain.
        msa_path = msa_solved_dir / f"{chain_id}.a3m"
        with open(msa_path, "w") as msa_file:
            msa_file.write(msa_content)

        print(f"Wrote MSA for {solved_id} chain {chain_id} -> {msa_path}")


Wrote MSA for 2vlj chain A -> /scratch/ggrama/alphafold3/msa/2vlj/A.a3m
Wrote MSA for 2vlj chain B -> /scratch/ggrama/alphafold3/msa/2vlj/B.a3m
Wrote MSA for 2vlj chain C -> /scratch/ggrama/alphafold3/msa/2vlj/C.a3m
Wrote MSA for 2vlj chain D -> /scratch/ggrama/alphafold3/msa/2vlj/D.a3m
Wrote MSA for 2vlj chain E -> /scratch/ggrama/alphafold3/msa/2vlj/E.a3m
Wrote MSA for 2vlk chain A -> /scratch/ggrama/alphafold3/msa/2vlk/A.a3m
Wrote MSA for 2vlk chain B -> /scratch/ggrama/alphafold3/msa/2vlk/B.a3m
Wrote MSA for 2vlk chain C -> /scratch/ggrama/alphafold3/msa/2vlk/C.a3m
Wrote MSA for 2vlk chain D -> /scratch/ggrama/alphafold3/msa/2vlk/D.a3m
Wrote MSA for 2vlk chain E -> /scratch/ggrama/alphafold3/msa/2vlk/E.a3m
Wrote MSA for 2vlr chain A -> /scratch/ggrama/alphafold3/msa/2vlr/A.a3m
Wrote MSA for 2vlr chain B -> /scratch/ggrama/alphafold3/msa/2vlr/B.a3m
Wrote MSA for 2vlr chain C -> /scratch/ggrama/alphafold3/msa/2vlr/C.a3m
Wrote MSA for 2vlr chain D -> /scratch/ggrama/alphafold3/msa/2vl

In [12]:
from pathlib import Path

# Root directory holding one sub-directory of .a3m files per solved structure.
MSA_DIR = Path("/scratch/ggrama/alphafold3/msa")

def sanitize_a3m(msa_path: Path):
    """Upper-case every sequence line of an A3M file, rewriting it in place.

    Header lines (those starting with ">") are kept verbatim; every other line
    is upper-cased. Lines are re-joined with "\\n" and no trailing newline is
    written. An empty file is left untouched.

    NOTE(review): in the A3M convention lowercase letters usually mark
    insertions relative to the query; upper-casing them changes how a parser
    would align the rows -- confirm this is the intended sanitisation.

    Args:
        msa_path: Path of the .a3m file to rewrite.
    """
    rows = msa_path.read_text().splitlines()
    if not rows:
        return
    cleaned = [row if row.startswith(">") else row.upper() for row in rows]
    msa_path.write_text("\n".join(cleaned))

# Sanitize every "*.a3m" file found directly inside each sub-directory of
# MSA_DIR; plain files at the top level of MSA_DIR are ignored.
for solved_id_dir in MSA_DIR.iterdir():
    if not solved_id_dir.is_dir():
        continue
    for a3m_file in solved_id_dir.glob("*.a3m"):
        # Rewrites the file in place.
        sanitize_a3m(a3m_file)
        print(f"Sanitized {a3m_file}")


Sanitized /scratch/ggrama/alphafold3/msa/2vlr/D.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlr/B.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlr/E.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlr/C.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlr/A.a3m
Sanitized /scratch/ggrama/alphafold3/msa/5isz/D.a3m
Sanitized /scratch/ggrama/alphafold3/msa/5isz/E.a3m
Sanitized /scratch/ggrama/alphafold3/msa/5isz/A.a3m
Sanitized /scratch/ggrama/alphafold3/msa/5isz/C.a3m
Sanitized /scratch/ggrama/alphafold3/msa/5isz/B.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlj/C.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlj/B.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlj/A.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlj/D.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlj/E.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlk/C.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlk/B.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlk/E.a3m
Sanitized /scratch/ggrama/alphafold3/msa/2vlk/D.a3m
Sanitized /s