In [1]:
from pathlib import Path
from Bio.PDB import PDBParser, PDBIO
from Bio.PDB.Polypeptide import is_aa
import csv

# Main input directories
FOLDS_DIR = Path("/Users/lorenzosisti/Downloads/folds_2025_11_04_11_56")
MODELS_DIR = Path("/Users/lorenzosisti/Downloads/models")

# Where the chain-mapping summary table is written
CSV_OUTPUT = Path("chain_mapping_summary.csv")

# Start every run from a fresh CSV that holds only the header row
with CSV_OUTPUT.open("w", newline="") as csv_handle:
    csv.writer(csv_handle).writerow(["PDB file", "chain_target", "chain_reference"])


In [3]:
def get_chain_sequences(pdb_file):
    """Collect the amino-acid sequence of every chain in a PDB file.

    Args:
        pdb_file: Path (or file handle) passed to ``PDBParser.get_structure``.

    Returns:
        dict mapping each chain ID to a string made by concatenating, in
        chain order, the residue names (``get_resname()``) of the residues
        that ``is_aa(..., standard=True)`` accepts. A chain without any such
        residue maps to the empty string. When the file holds several models,
        chains sharing an ID overwrite each other, so the last model wins.
    """
    structure = PDBParser(QUIET=True).get_structure("struct", pdb_file)

    sequences_by_chain = {}
    for model in structure:
        for chain in model:
            residue_names = [
                residue.get_resname()
                for residue in chain
                if is_aa(residue, standard=True)
            ]
            sequences_by_chain[chain.id] = "".join(residue_names)

    return sequences_by_chain


In [4]:
def seq_matches_exact_or_trimmed(target_seq, reference_seq):
    """Tell whether a target sequence matches a reference sequence.

    A match means the two strings are identical, or the (non-empty) reference
    occurs inside the target as one contiguous, gap-free stretch, i.e. the
    reference is a trimmed version of the target.

    Args:
        target_seq: Sequence string of the chain being mapped.
        reference_seq: Sequence string of the candidate reference chain.

    Returns:
        bool: True on an exact or contained match, False otherwise.

    NOTE(review): this is a plain substring test. If the sequences are built
    from multi-letter residue codes, a hit is not guaranteed to start on a
    residue boundary -- confirm this is acceptable for the data at hand.
    """
    if target_seq == reference_seq:
        return True

    # `"" in s` is True for every string, so an empty reference (a chain with
    # no amino acids) must not count as "contained" in the target.
    return bool(reference_seq) and reference_seq in target_seq


def get_best_mapping_exact_or_trimmed(target_seqs, reference_seqs):
    """Map every target chain to a distinct reference chain by sequence.

    Exact matches are assigned first; the remaining target chains are then
    matched with ``seq_matches_exact_or_trimmed``. Each reference chain is
    handed out at most once, so chains with identical sequences (e.g. a
    homodimer) end up on different reference IDs instead of all collapsing
    onto the first one.

    Args:
        target_seqs: dict of target chain ID -> sequence string.
        reference_seqs: dict of reference chain ID -> sequence string.

    Returns:
        dict of target chain ID -> reference chain ID, in the iteration order
        of ``target_seqs``; or None (after printing a warning) as soon as one
        target chain cannot be assigned a free reference chain.
    """
    mapping = {}
    used_refs = set()

    def _assign(matches):
        # Give each still-unmapped target chain the first free reference
        # chain accepted by `matches`.
        for target_chain, target_seq in target_seqs.items():
            if target_chain in mapping:
                continue
            for ref_chain, ref_seq in reference_seqs.items():
                if ref_chain in used_refs:
                    continue
                if matches(target_seq, ref_seq):
                    mapping[target_chain] = ref_chain
                    used_refs.add(ref_chain)
                    break

    # Pass 1: identical sequences take priority over trimmed ones.
    _assign(lambda target_seq, ref_seq: target_seq == ref_seq)

    # Pass 2: only needed when some chain found no identical partner.
    if len(mapping) < len(target_seqs):
        _assign(seq_matches_exact_or_trimmed)

    for target_chain in target_seqs:
        if target_chain not in mapping:
            print(f"‚ö†Ô∏è  Nessuna corrispondenza per chain {target_chain}")
            return None

    # Rebuild in target order: the two passes may have filled `mapping`
    # out of order.
    return {target_chain: mapping[target_chain] for target_chain in target_seqs}


In [5]:
def rename_chains_two_step(target_pdb, mapping, output_pdb):
    """Rename the chains of a PDB file and save the result as a new file.

    Renaming goes through a temporary ID (old -> "<new>_TMP" -> new). This
    way a mapping that swaps IDs (A -> B together with B -> A) never tries to
    give a chain an ID that a sibling chain still holds, which Biopython
    rejects.

    Args:
        target_pdb: PDB file to read.
        mapping: dict of current chain ID -> new chain ID. Chains whose ID is
            not a key of the mapping keep their ID.
        output_pdb: Destination path; written with ``PDBIO``.
    """
    tmp_suffix = "_TMP"
    structure = PDBParser(QUIET=True).get_structure("struct", target_pdb)

    # First pass: park every mapped chain under "<new id>_TMP".
    for chain in structure.get_chains():
        if chain.id in mapping:
            chain.id = f"{mapping[chain.id]}{tmp_suffix}"

    # Second pass: drop the temporary suffix. Slice off only the trailing
    # marker so an ID that happens to contain "_TMP" elsewhere stays intact.
    for chain in structure.get_chains():
        if chain.id.endswith(tmp_suffix):
            chain.id = chain.id[: -len(tmp_suffix)]

    io = PDBIO()
    io.set_structure(structure)
    io.save(str(output_pdb))


In [None]:
# Driver: for every fold folder, align the chain IDs of each predicted PDB
# with those of the matching reference model and log the mapping to the CSV.
# NOTE: rows are appended here, while the CSV is reset where CSV_OUTPUT is
# defined; re-running only this cell therefore duplicates rows.
for subfolder in sorted(FOLDS_DIR.iterdir()):
    if not subfolder.is_dir():
        continue

    # The PDB ID is the part of the folder name before the first "_".
    pdb_id = subfolder.name.split("_")[0]

    # A bare next() would raise StopIteration and abort the whole run when a
    # single reference model is missing; skip that folder instead.
    reference_pdb = next(MODELS_DIR.glob(f"{pdb_id}_*.pdb"), None)
    if reference_pdb is None:
        print(f"Nessun modello di riferimento per {pdb_id} in {MODELS_DIR}, skip")
        continue

    # The reference is the same for every file of this folder: parse it once.
    reference_seqs = get_chain_sequences(reference_pdb)

    # Materialise the listing before writing anything: the *_renamed.pdb
    # outputs are created in this same folder and match the same pattern.
    # Sorting also makes the processing (and CSV row) order reproducible.
    for pdbfile in sorted(subfolder.glob(f"{pdb_id}_*.pdb")):

        # Skip files that were already renamed by a previous run
        if "_renamed" in pdbfile.name:
            continue

        print(f"\nüîç Processing: {pdbfile.name}")

        target_seqs = get_chain_sequences(pdbfile)
        mapping = get_best_mapping_exact_or_trimmed(target_seqs, reference_seqs)

        if mapping is None:
            print("‚ùå Skip (no mapping trovato)")
            continue

        print(f"üîÅ Renaming: {mapping}")

        output_pdb = pdbfile.with_name(pdbfile.stem + "_renamed.pdb")
        rename_chains_two_step(pdbfile, mapping, output_pdb)

        # One CSV row per (target chain, reference chain) pair of this file.
        with open(CSV_OUTPUT, "a", newline="") as f:
            writer = csv.writer(f)
            for t, r in mapping.items():
                writer.writerow([pdbfile.name, t, r])

        print(f"üíæ Salvato: {output_pdb}")