# Importing MMACHC proteins-like

In [5]:
import os
import requests
from Bio.PDB import PDBList
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.DSSP import DSSP

# === SETTINGS ===
INTERPRO_ID = "IPR012134"  # MMACHC-like domain
OUTPUT_DIR = "mmachc_family_dataset"
PDB_DIR = os.path.join(OUTPUT_DIR, "pdb")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PDB_DIR, exist_ok=True)

# === STEP 1: Fetch UniProt entries in JSON ===
def fetch_uniprot_entries(interpro_id):
    query = f"xref:InterPro-{interpro_id}"
    url = (
        "https://rest.uniprot.org/uniprotkb/search?"
        f"query={query}"
        "&fields=accession,id,structure_3d,sequence"
        "&format=json"
    )
    print(f"[INFO] Fetching UniProt entries for InterPro {interpro_id} in JSON...")
    resp = requests.get(url)
    if resp.status_code != 200:
        raise RuntimeError(f"UniProt request failed: {resp.status_code}, URL={url}")
    results = resp.json()
    data = []

    for entry in results.get("results", []):
        accession = entry["primaryAccession"]
        uniprot_id = entry["uniProtkbId"]
        seq = entry["sequence"]["value"]

        pdb_ids = []
        for struct in entry.get("uniProtKBCrossReferences", []):
            if struct["database"] == "PDB":
                pdb_ids.append(struct["id"])
        pdb_ids_str = ";".join(pdb_ids)

        data.append({
            "Entry": accession,
            "UniProt_ID": uniprot_id,
            "Sequence": seq,
            "PDB_IDs": pdb_ids_str
        })

    return data

# === STEP 2: Download PDB mmCIF files ===
def download_pdb(pdb_id):
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(pdb_id, pdir=PDB_DIR, file_format="mmCif")

# === STEP 3: Download AlphaFold mmCIF structures ===
def download_alphafold(uniprot_id):
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.cif"
    local_path = os.path.join(PDB_DIR, f"AF-{uniprot_id}.cif")
    if os.path.exists(local_path):
        print(f"[INFO] AlphaFold structure already downloaded: {local_path}")
        return local_path
    print(f"[INFO] Downloading AlphaFold structure for {uniprot_id}...")
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(local_path, "wb") as f:
            f.write(resp.content)
        return local_path
    else:
        print(f"[WARN] AlphaFold structure not found for {uniprot_id}")
        return None

# === STEP 4: Run DSSP and save CB513-like format ===
def run_dssp_and_save(uniprot_id, seq, pdb_ids):
    pdb_path = None

    # Try experimental PDB first
    if pdb_ids:
        pdb_id = pdb_ids.split(";")[0]
        try:
            download_pdb(pdb_id)
        except Exception as e:
            print(f"[ERROR] Could not download PDB {pdb_id}: {e}")
        else:
            # Find mmCIF file
            for root, dirs, files in os.walk(PDB_DIR):
                for file in files:
                    if pdb_id.lower() in file.lower() and file.endswith(".cif"):
                        pdb_path = os.path.join(root, file)
                        break
                if pdb_path:
                    break

    # If no PDB found, try AlphaFold
    if not pdb_path:
        pdb_path = download_alphafold(uniprot_id)

    if not pdb_path:
        print(f"[WARN] No structure for {uniprot_id}, skipping DSSP.")
        return

    try:
        parser = MMCIFParser(QUIET=True)
        structure = parser.get_structure(uniprot_id, pdb_path)
        model = structure[0]
        dssp = DSSP(model, pdb_path, file_type="MMCIF")

        ss_map = {"H": "H", "G": "H", "I": "H",  # Helix types
                  "E": "E", "B": "E",             # Beta strand types
                  "-": "C", "T": "C", "S": "C"}   # Coil/turn/bend
        seq_ss = []

        for aa in seq:
            ss_label = "C"  # Default coil
            for key in dssp.keys():
                if dssp[key][1] == aa:
                    raw_ss = dssp[key][2]
                    ss_label = ss_map.get(raw_ss, "C")
                    break
            seq_ss.append(ss_label)

        cb_lines = [f"{aa} {ss}" for aa, ss in zip(seq, seq_ss)]
        with open(os.path.join(OUTPUT_DIR, f"{uniprot_id}.txt"), "w") as f:
            f.write("\n".join(cb_lines))

        print(f"[OK] Saved CB513-like file for {uniprot_id}")
    except Exception as e:
        print(f"[ERROR] DSSP failed for {uniprot_id}: {e}")


In [10]:
import os
import subprocess

BLAST_BIN_DIR = "/usr/bin/"  # adjust path
DB_PATH = "/home/ubuntu/cullpdb+profile_5926_filtered.npy.gz"        # BLAST db base name (without extensions)
FASTA_DIR = "mmachc_family_dataset/fasta"         # where FASTA files are saved
PSSM_DIR = "mmachc_family_dataset/pssm"
os.makedirs(FASTA_DIR, exist_ok=True)
os.makedirs(PSSM_DIR, exist_ok=True)

def write_fasta(uniprot_id, sequence):
    fasta_path = os.path.join(FASTA_DIR, f"{uniprot_id}.fasta")
    with open(fasta_path, "w") as f:
        f.write(f">{uniprot_id}\n{sequence}\n")
    return fasta_path

def run_psiblast(fasta_file, output_pssm):
    psiblast_cmd = [
        os.path.join(BLAST_BIN_DIR, "psiblast"),
        "-query", fasta_file,
        "-db", DB_PATH,
        "-num_iterations", "3",
        "-out_ascii_pssm", output_pssm,
        "-evalue", "0.001",
        "-num_threads", "4",  # adjust as needed
        "-out", "/dev/null"  # suppress normal output
    ]
    print(f"[INFO] Running PSI-BLAST: {' '.join(psiblast_cmd)}")
    result = subprocess.run(psiblast_cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"[ERROR] PSI-BLAST failed on {fasta_file}:\n{result.stderr}")
    else:
        print(f"[OK] PSI-BLAST PSSM saved to {output_pssm}")

# Example usage: integrate with your existing data loop
def generate_pssms_for_entries(entries):
    for entry in entries:
        uniprot_id = entry["UniProt_ID"]
        seq = entry["Sequence"]
        fasta_file = write_fasta(uniprot_id, seq)
        pssm_out = os.path.join(PSSM_DIR)


In [11]:
# === MAIN ===
entries = fetch_uniprot_entries(INTERPRO_ID)
print(f"[INFO] Retrieved {len(entries)} UniProt entries.")

for entry in entries:
    uniprot_id = entry["UniProt_ID"]
    seq = entry["Sequence"]
    pdb_ids = entry["PDB_IDs"]

    # Run DSSP and save CB513-like SS file
    run_dssp_and_save(uniprot_id, seq, pdb_ids)

    # Save FASTA file
    fasta_file = write_fasta(uniprot_id, seq)

    # Prepare PSSM output path
    pssm_out = os.path.join(PSSM_DIR, f"{uniprot_id}.pssm")

    # Run PSI-BLAST to generate PSSM
    run_psiblast(fasta_file, pssm_out)


print("[DONE] Dataset saved in CB513-like format.")


[INFO] Fetching UniProt entries for InterPro IPR012134 in JSON...
[INFO] Retrieved 25 UniProt entries.
[INFO] Downloading AlphaFold structure for PROA_ECOLI...
[WARN] AlphaFold structure not found for PROA_ECOLI
[WARN] No structure for PROA_ECOLI, skipping DSSP.
[INFO] Running PSI-BLAST: /usr/bin/psiblast -query mmachc_family_dataset/fasta/PROA_ECOLI.fasta -db /home/ubuntu/cullpdb+profile_5926_filtered.npy.gz -num_iterations 3 -out_ascii_pssm mmachc_family_dataset/pssm/PROA_ECOLI.pssm -evalue 0.001 -num_threads 4 -out /dev/null
[ERROR] PSI-BLAST failed on mmachc_family_dataset/fasta/PROA_ECOLI.fasta:
BLAST Database error: No alias or index file found for protein database [/home/ubuntu/cullpdb+profile_5926_filtered.npy.gz] in search path [/home/ubuntu::]

Structure exists: 'mmachc_family_dataset/pdb/1vlu.cif' 





[OK] Saved CB513-like file for PROA_YEAST
[INFO] Running PSI-BLAST: /usr/bin/psiblast -query mmachc_family_dataset/fasta/PROA_YEAST.fasta -db /home/ubuntu/cullpdb+profile_5926_filtered.npy.gz -num_iterations 3 -out_ascii_pssm mmachc_family_dataset/pssm/PROA_YEAST.pssm -evalue 0.001 -num_threads 4 -out /dev/null
[ERROR] PSI-BLAST failed on mmachc_family_dataset/fasta/PROA_YEAST.fasta:
BLAST Database error: No alias or index file found for protein database [/home/ubuntu/cullpdb+profile_5926_filtered.npy.gz] in search path [/home/ubuntu::]

[INFO] Downloading AlphaFold structure for PROA_BACSU...
[WARN] AlphaFold structure not found for PROA_BACSU
[WARN] No structure for PROA_BACSU, skipping DSSP.
[INFO] Running PSI-BLAST: /usr/bin/psiblast -query mmachc_family_dataset/fasta/PROA_BACSU.fasta -db /home/ubuntu/cullpdb+profile_5926_filtered.npy.gz -num_iterations 3 -out_ascii_pssm mmachc_family_dataset/pssm/PROA_BACSU.pssm -evalue 0.001 -num_threads 4 -out /dev/null
[ERROR] PSI-BLAST failed 

In [2]:
def load_cb513_file(filepath):
    with open(filepath) as f:
        lines = f.read().strip().split("\n")
    seq = "".join(line.split()[0] for line in lines)
    ss = "".join(line.split()[1] for line in lines)
    return seq, ss

seq, ss = load_cb513_file("mmachc_family_dataset/PROA_YEAST.txt")
print(seq)
print(ss)


MSSSQQIAKNARKAGNILKTISNEGRSDILYKIHDALKANAHAIEEANKIDLAVAKETGLADSLLKRLDLFKGDKFEVMLQGIKDVAELEDPVGKVKMARELDDGLTLYQVTAPVGVLLVIFESRPEVIANITALSIKSGNAAILKGGKESVNTFREMAKIVNDTIAQFQSETGVPVGSVQLIETRQDVSDLLDQDEYIDLVVPRGSNALVRKIKDTTKIPVLGHADGICSIYLDEDADLIKAKRISLDAKTNYPAGCNAMETLLINPKFSKWWEVLENLTLEGGVTIHATKDLKTAYFDKLNELGKLTEAIQCKTVDADEEQDFDKEFLSLDLAAKFVTSTESAIQHINTHSSRHTDAIVTENKANAEKFMKGVDSSGVYWNASTRFADGFRYGFGAEVGISTSKIHARGPVGLDGLVSYQYQIRGDGQVASDYLGAGGNKAFVHKDLDIKTVTL
CCCCHHHHHHHHHHHHHHHCHCHHHHCHHHHHHCHHHHHHHCHHHHHHHHHHHHHHHCHHHHCHHHHHHHCHHHHCHHCHHHHHHHHHHHHCHHHHHCHHHHHHHHCHHHHCHCHHHHHHHCHCHCHHHHHHCHHCHHCHHHHHHHHHHHCHHCCHHCHHHHHHCHHHCHCHCHHCHHCHHHHHCHHHHCHHHHHHHHHHHHHCHHCHHHHHHHHHCCHHCHHHCHHHHECHHHHHHHHHHHHHHHCHHHHCHHCHHEHHCHCHHHHCHCCHHHHHHHHHCHHHHHCHCHCHHHHCHHCHHHHHHHHHCHHHHEHCHHHHHHHHCHHHCHCHHHHHHCHCCCHCHHHCHHCCCCHCCHHHHCHHHHHHHHCCHHHHCCHHHHHHCCHCHHHCHHHCHHHHHHCCCHHCHHHCHHHHHHHCHHHHHHHHHHHHCHHHHHHHHHHCHCHHHHHHCHCH
