In [37]:
def get_fasta_sequences_and_smiles(file_path):
    """Extract protein sequences and one ligand SMILES from a FASTA-style file.

    Records are introduced by header lines starting with ``>``. A header
    containing ``|protein`` starts a protein record and a header containing
    ``|smiles`` starts a SMILES record; lines under any other header (or
    before the first header) are ignored. Body lines are stripped of
    surrounding whitespace and concatenated.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(protein_sequences, smiles_str)`` tuple. ``protein_sequences`` is
        a list with one string per non-empty protein record, in file order.
        ``smiles_str`` is the body of the first non-empty SMILES record, or
        ``None`` when the file contains none.
    """
    protein_sequences = []
    smiles_str = ""
    seq_chunks = []  # body lines of the protein record currently being read
    in_protein = False
    in_smiles = False

    with open(file_path) as f:
        for line in f:
            if line.startswith(">"):
                # A new header closes the record that was being read.
                if in_protein:
                    seq = "".join(seq_chunks)
                    if seq:
                        protein_sequences.append(seq)
                seq_chunks = []
                in_protein = "|protein" in line
                # Only the first non-empty SMILES record is kept. Appending
                # the bodies of later records would glue several molecules
                # into a single string that is not a valid SMILES.
                in_smiles = "|smiles" in line and not smiles_str
                continue

            text = line.strip()
            if in_protein:
                seq_chunks.append(text)
            if in_smiles:
                smiles_str += text

    # The last record has no following header to close it.
    if in_protein:
        seq = "".join(seq_chunks)
        if seq:
            protein_sequences.append(seq)

    # Report a missing SMILES as None rather than as an empty string.
    return protein_sequences, (smiles_str or None)

# Example usage:
# protein_seqs, smiles = get_fasta_sequences_and_smiles("ZINCgj000000VSmS.fasta")

In [38]:
# Key of the target (presumably the receptor name) processed by this run: it is
# interpolated into the input/output directory paths and used to look up the
# MSA file in a3m_filepath.
recp_name = "s2"

In [39]:
import sys
# Put the dudez_rescore source tree on the module search path so that the
# package can be imported on the next line.
sys.path.append("/lustre/fs6/lyu_lab/scratch/ichen/dudez_rescore/src")
# Batch supplies write_yaml_line_by_line, which writes the YAML config files.
from dudez_rescore.batch import Batch

In [40]:
# `os` is imported in this cell because the notebook's visible `import os` sits
# in a later cell; unless an earlier cell already imported it, a fresh-kernel
# "Run All" would otherwise stop here with a NameError.
import os

# Output directory for this receptor's generated YAML configs (despite the
# variable name, no FASTA files are written here). Created if it is missing.
fasta_dir = f"/lustre/fs6/lyu_lab/scratch/ichen/data/boltz2_runs/{recp_name}_config"
os.makedirs(fasta_dir, exist_ok=True)

In [41]:
# Directory of existing FASTA inputs for this receptor; every entry in it is
# read and converted to a YAML config by the conversion loop.
old_fasta_dir = f"/lustre/fs6/lyu_lab/scratch/ichen/data/boltz_rescore/{recp_name}/{recp_name}_fasta"

In [42]:
import os

In [43]:
# Maps each supported recp_name value to the .a3m file that is written into the
# "msa" field of the protein entry in every generated config.
a3m_filepath = {
    "d4": "/lustre/fs6/lyu_lab/scratch/ichen/data/boltz_rescore/d4/d4_raw/d4_uniref.a3m",
    "ampc": "/lustre/fs6/lyu_lab/scratch/ichen/data/boltz_rescore/ampc/ampc_raw/ampc.a3m",
    "s2": "/lustre/fs6/lyu_lab/scratch/ichen/data/boltz_rescore/s2/s2_raw/s2.a3m",
}

In [44]:
# Convert every FASTA file in old_fasta_dir into a YAML config in fasta_dir.

# Loop-invariant lookup; doing it up front also fails fast (KeyError) when
# recp_name has no entry in a3m_filepath, before any file is written.
msa_path = a3m_filepath[recp_name]

# (file name, reason) pairs for inputs that could not be converted.
skipped = []

# sorted() gives a reproducible processing order; os.listdir returns entries
# in arbitrary order.
for fasta_name in sorted(os.listdir(old_fasta_dir)):
    fasta_path = f"{old_fasta_dir}/{fasta_name}"
    if not os.path.isfile(fasta_path):
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as FASTA.
        continue

    protein_seqs, smiles = get_fasta_sequences_and_smiles(fasta_path)

    # Report incomplete inputs instead of crashing on protein_seqs[0] or
    # silently writing a ligand entry whose SMILES is None.
    if not protein_seqs:
        skipped.append((fasta_name, "no protein record"))
        continue
    if smiles is None:
        skipped.append((fasta_name, "no SMILES record"))
        continue

    without_ext = os.path.splitext(fasta_name)[0]
    boltz_config = {
        "version": 1,
        "sequences": [
            {
                "protein": {
                    "id": "A",
                    # Only the first protein record of the file is used.
                    "sequence": protein_seqs[0],
                    "msa": msa_path,
                }
            },
            {
                "ligand": {
                    "id": "B",
                    "smiles": smiles
                }
            }
        ],
        "properties": [
            {
                "affinity": {
                    # Refers to the ligand entry above by its id.
                    "binder": "B"
                }
            }
        ]
    }
    # The config keeps the input's base name, with a .yaml extension.
    yaml_file = f"{fasta_dir}/{without_ext}.yaml"

    Batch.write_yaml_line_by_line(yaml_file, boltz_config)

if skipped:
    print(f"Skipped {len(skipped)} file(s) in {old_fasta_dir}:")
    for skipped_name, reason in skipped:
        print(f"  {skipped_name}: {reason}")
