# ESMFold structure prediction

In [14]:
import os

import pandas as pd
import requests, urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Load the `chains` dataframe from a CSV file

In [5]:
# Load the chain table from disk and display it.
# NOTE(review): the rendered output contains an "Unnamed: 0" column, which
# suggests the CSV was written together with its index - confirm whether
# that column is wanted downstream.
chains = pd.read_csv(filepath_or_buffer="proteins/chains.csv")
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta
...,...,...,...,...,...,...
1062,8FIT,synthetic,proteins/original_pdb_files/8FIT.pdb,proteins/original_fasta_files/8FIT.fasta,8FIT:A,proteins/inferred_fasta_files/8FIT.fasta
1063,8J1W,synthetic,proteins/original_pdb_files/8J1W.pdb,proteins/original_fasta_files/8J1W.fasta,8J1W:A,proteins/inferred_fasta_files/8J1W.fasta
1064,8HDV,synthetic,proteins/original_pdb_files/8HDV.pdb,proteins/original_fasta_files/8HDV.fasta,8HDV:A,proteins/inferred_fasta_files/8HDV.fasta
1065,8AO0,synthetic,proteins/original_pdb_files/8AO0.pdb,proteins/original_fasta_files/8AO0.fasta,8AO0:A,proteins/inferred_fasta_files/8AO0.fasta


## Predicting the structures
Prepare the output directory for the predicted PDB files:

In [34]:
# Directory that receives the PDB files written by the prediction loop.
EF_PREDICTED_PDB_FILES_DIRECTORY = "proteins/esmfold_predicted_pdb_files"
# exist_ok=True keeps this cell re-runnable: a plain `mkdir` reports an error
# once the directory exists. os.makedirs also creates missing parent
# directories and does not depend on a shell being available.
os.makedirs(EF_PREDICTED_PDB_FILES_DIRECTORY, exist_ok=True)

In [33]:
# Cap on how many chains are processed by the cells below. Every remaining
# chain triggers one request to the remote folding API, so a small value keeps
# trial runs short. Set to None to process the whole table.
MAX_CHAINS = 5

# .copy() detaches the subset from the loaded frame so that adding columns
# later does not operate on a view of the original data.
chains = chains.copy() if MAX_CHAINS is None else chains.head(MAX_CHAINS).copy()
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta


In [35]:
# Derive the target PDB path of every chain from its chain id:
# "<prediction directory>/<chain_id>.pdb".
def _predicted_pdb_path(chain_id):
    return f"{EF_PREDICTED_PDB_FILES_DIRECTORY}/{chain_id}.pdb"

chains["EF_prediction_pdb_path"] = chains["chain_id"].map(_predicted_pdb_path)
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,EF_prediction_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/esmfold_predicted_pdb_files/8XPV:A.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/esmfold_predicted_pdb_files/8GQ4:A.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/esmfold_predicted_pdb_files/8TIF:A.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/esmfold_predicted_pdb_files/8H3Z:A.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/esmfold_predicted_pdb_files/8ALL:A.pdb


In [36]:
def get_sequence(fasta_path, chain_id):
    """Return the sequence of the first FASTA record whose header mentions `chain_id`.

    The file is scanned for the first header line (a line starting with ">")
    that contains `chain_id` as a substring. All following lines up to the
    next header, or the end of the file, belong to that record, so a sequence
    wrapped over several lines is returned whole instead of being cut off
    after its first line.

    Note that the header test is a plain substring match: "8XPV:A" also
    matches a header containing "8XPV:AA". The first matching record wins.

    Args:
        fasta_path: Path of the FASTA file to read.
        chain_id: Text to look for in the header line, e.g. "8XPV:A".

    Returns:
        The sequence as one string without line breaks, or None when no
        header contains `chain_id` or the matching record holds no sequence.
    """
    sequence_parts = []
    inside_record = False
    with open(fasta_path, "r") as file:
        for line in file:
            if line.startswith(">"):
                if inside_record:
                    # Next header reached: the wanted record is complete.
                    break
                inside_record = chain_id in line
                continue
            if inside_record:
                # Blank lines contribute an empty string and vanish in the join.
                sequence_parts.append(line.strip())
    sequence = "".join(sequence_parts)
    # An empty record is reported like a missing one so callers can skip it.
    return sequence or None

# Disabled sanity checks: the triple-quoted block below is a bare string
# expression, so the assertions inside it are never executed.
"""
assert(get_sequence("proteins/inferred_fasta_files/5SSZ.fasta", "5SSZ:A") == "LAHSKMVPIPAGVFTMGTDDPQIKQDGEAPARRVTIDAFYMDAYEVSNTEFEKFVNSTGYLTEAEKFGDSFVFEGMLSEQVKTNIQQAVAAAPWWLPVKGANWRHPEGPDSTILHRPDHPVLHVSWNDAVAYCTWAGKRLPTEAEWEYSCRGGLHNRLFPWGNKLQPKGQHYANIWQGEFPVTNTGEDGFQGTAPVDAFPPNGYGLYNIVGNAWEWTSDWWTVHHSVEETLNPKGPPSGKDRVKKGGSYMCHRSYCYRYRCAARSQNTPDSSASNLGFRCAADRLPTMDS")
assert(get_sequence("proteins/inferred_fasta_files/8F8M.fasta", "8F8M:B") == "NALLRYLLDKD")
"""

# Request a structure prediction for every chain and store the response as a
# PDB file. Chains whose sequence cannot be read, or whose request fails, get
# `EF_prediction_pdb_path` set to None so the table only references files that
# were actually written.

# Upper bound in seconds for a single request; without a timeout an
# unresponsive server would block the loop indefinitely.
# NOTE(review): 300 s is a guess - adjust to the latency actually observed.
REQUEST_TIMEOUT_SECONDS = 300

for index, row in chains.iterrows():
    sequence = get_sequence(row["inferred_fasta_path"], row["chain_id"])
    if sequence is None:
        # `row` is a copy produced by iterrows(); assigning to it would not
        # change `chains`, so the update goes through .loc instead.
        chains.loc[index, "EF_prediction_pdb_path"] = None
        continue
    try:
        # NOTE(review): verify=False switches off TLS certificate validation.
        # Kept as in the original cell; re-enable it once the endpoint's
        # certificate validates.
        prediction = requests.post(url = "https://api.esmatlas.com/foldSequence/v1/pdb/",
                                   data = sequence,
                                   verify = False,
                                   timeout = REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        # A network failure for one chain is handled like a non-200 answer
        # instead of aborting the remaining predictions.
        chains.loc[index, "EF_prediction_pdb_path"] = None
        continue
    if prediction.status_code != 200:
        chains.loc[index, "EF_prediction_pdb_path"] = None
        continue
    with open(row["EF_prediction_pdb_path"], "w") as f:
        f.write(prediction.text)

chains

## Save the `chains` dataframe as a CSV file

In [None]:
# Write the table, including the prediction paths, without the row index.
# The comma separator is pandas' default and therefore not spelled out.
chains.to_csv(path_or_buf="proteins/chains_esmfold.csv", index=False)