# ESMFold structure prediction

In [1]:
import os
from time import sleep

import pandas as pd
import requests, urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Load `chains` dataframe from .csv file

In [2]:
# Read the table of chains; the bare `chains` expression below renders it as the cell output.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the CSV was
# written together with its index - confirm whether later steps rely on that column.
chains = pd.read_csv(filepath_or_buffer = "proteins/chains.csv")
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,masks_path,chain_inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta,proteins/chain_fasta_files/8XPV:A.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta,proteins/chain_fasta_files/8GQ4:A.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta,proteins/chain_fasta_files/8TIF:A.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta,proteins/chain_fasta_files/8H3Z:A.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta,proteins/chain_fasta_files/8ALL:A.fasta
...,...,...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta,proteins/chain_fasta_files/8HNE:A.fasta
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta,proteins/chain_fasta_files/8FIN:A.fasta
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta,proteins/chain_fasta_files/8J0A:A.fasta
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta,proteins/chain_fasta_files/8HDU:A.fasta


## Predicting the structures
Prepare directory:

In [3]:
# Directory that receives one predicted .pdb file per chain.
EF_PREDICTED_PDB_FILES_DIRECTORY = "proteins/esmfold_prediction_pdb_files"
# Create the directory (and any missing parent) without failing when it already exists,
# so the cell can be re-run safely and does not depend on a shell `mkdir` being available.
os.makedirs(EF_PREDICTED_PDB_FILES_DIRECTORY, exist_ok = True)

In [4]:
def _ef_prediction_pdb_path(row: pd.Series) -> str:
    """
    This function builds the path of the predicted pdb file for one row of the chains dataframe.

    Parameters:
        row (pd.Series): one row of the dataframe, must contain the 'chain_id' column

    Returns:
        path inside EF_PREDICTED_PDB_FILES_DIRECTORY named after the chain id
    """
    return f"{EF_PREDICTED_PDB_FILES_DIRECTORY}/{row['chain_id']}.pdb"


# Apply the helper row by row and store the resulting paths in a new column.
chains["EF_prediction_pdb_path"] = chains.apply(_ef_prediction_pdb_path, axis = 1)
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,masks_path,chain_inferred_fasta_path,EF_prediction_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta,proteins/chain_fasta_files/8XPV:A.fasta,proteins/esmfold_prediction_pdb_files/8XPV:A.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta,proteins/chain_fasta_files/8GQ4:A.fasta,proteins/esmfold_prediction_pdb_files/8GQ4:A.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta,proteins/chain_fasta_files/8TIF:A.fasta,proteins/esmfold_prediction_pdb_files/8TIF:A.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta,proteins/chain_fasta_files/8H3Z:A.fasta,proteins/esmfold_prediction_pdb_files/8H3Z:A.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta,proteins/chain_fasta_files/8ALL:A.fasta,proteins/esmfold_prediction_pdb_files/8ALL:A.pdb
...,...,...,...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta,proteins/chain_fasta_files/8HNE:A.fasta,proteins/esmfold_prediction_pdb_files/8HNE:A.pdb
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta,proteins/chain_fasta_files/8FIN:A.fasta,proteins/esmfold_prediction_pdb_files/8FIN:A.pdb
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta,proteins/chain_fasta_files/8J0A:A.fasta,proteins/esmfold_prediction_pdb_files/8J0A:A.pdb
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta,proteins/chain_fasta_files/8HDU:A.fasta,proteins/esmfold_prediction_pdb_files/8HDU:A.pdb


Predict the structure of all the chains in the chains dataframe:

In [5]:
def get_sequence(fasta_path: str) -> str:
    """
    This function obtains a sequence from given fasta file.

    The first non-blank line is treated as the header and everything after it
    as the sequence. The sequence may be wrapped over several lines and blank
    lines are ignored, so a file with exactly two lines (header + sequence)
    yields the second line, stripped.

    Parameters:
        fasta_path (str): path to a fasta file containing exactly one
                          sequence after the header line

    Returns:
        string of letters without new line at the end

    Raises:
        ValueError: if the file holds no sequence or more than one record
    """
    with open(fasta_path, "r") as file:
        # Blank lines (e.g. a trailing empty line) carry no information.
        lines = [line.strip() for line in file if line.strip()]

    # Explicit checks instead of `assert`, which is removed when Python runs with -O.
    if len(lines) < 2:
        raise ValueError(f"Expected a header line followed by a sequence in '{fasta_path}'.")

    sequence_lines = lines[1:]
    # A further '>' line would start a second record, which this function does not support.
    if any(line.startswith(">") for line in sequence_lines):
        raise ValueError(f"Expected exactly one sequence in '{fasta_path}', found several records.")

    return "".join(sequence_lines)


def predict_structure(chain_id: str, fasta_path: str, pdb_path: str, timeout: float = 300.0) -> bool:
    """
    This function obtains a prediction of structure using ESMFold API
    and saves it into a pdb file.

    Nothing is written to pdb_path unless the API answers with status code 200.
    Network problems (connection errors, timeouts, ...) are reported and
    treated as an unsuccessful prediction instead of raising.

    Parameters:
        chain_id (str): id of the predicted chain (used only in the printed messages)
        fasta_path (str): path to a fasta file containing exactly one
                          sequence on the second line of the file
        pdb_path (str): path to the file where the predicted structure will be saved
        timeout (float): maximum number of seconds to wait for the API before
                         giving up on this chain

    Returns:
        boolean indicating whether the prediction was successful or not
    """
    sequence = get_sequence(fasta_path)
    try:
        # NOTE(review): verify = False switches off TLS certificate verification for this request
        # (the resulting InsecureRequestWarning is silenced in the imports cell). Presumably this
        # works around a certificate problem of the API host - confirm and re-enable if possible.
        # The timeout keeps a stalled connection from blocking the whole batch forever.
        prediction = requests.post(url = "https://api.esmatlas.com/foldSequence/v1/pdb/", data = sequence,
                                   verify = False, timeout = timeout)
    except requests.exceptions.RequestException as error:
        print(f"Prediction of structure {chain_id} was unsuccessful - the request failed: {error}")
        return False
    if prediction.status_code != 200:
        print(f"Prediction of structure {chain_id} was unsuccessful - status code of the request: {prediction.status_code}")
        return False
    with open(pdb_path, "w") as f:
        f.write(prediction.text)
    return True


# Pause between two consecutive API requests, in seconds.
# NOTE(review): presumably there to avoid overloading / being rate-limited by the API - confirm.
REQUEST_DELAY_SECONDS = 5
# A progress line is printed after every PROGRESS_STEP processed chains and after the last one.
PROGRESS_STEP = 50

total_chains = len(chains)
unsuccessful_predictions = []
prediction_inputs = zip(chains["chain_id"], chains["chain_inferred_fasta_path"], chains["EF_prediction_pdb_path"])
# Count processed chains by position (starting at 1) so that the progress report does not
# depend on the dataframe index or on a hard-coded number of rows.
for processed, (chain_id, fasta_path, pdb_path) in enumerate(prediction_inputs, start = 1):
    if not predict_structure(chain_id, fasta_path, pdb_path):
        unsuccessful_predictions.append(chain_id)
    if processed % PROGRESS_STEP == 0 or processed == total_chains:
        successful = processed - len(unsuccessful_predictions)
        print(f"Processed {processed} of {total_chains} chains ({successful} predicted successfully).")
    # No need to wait after the final request.
    if processed < total_chains:
        sleep(REQUEST_DELAY_SECONDS)

print(f"List of unsuccesful predictions: {unsuccessful_predictions}")
chains

Successfully predicted the structure of 100 chains.
Successfully predicted the structure of 150 chains.
Successfully predicted the structure of 200 chains.
Successfully predicted the structure of 250 chains.
Successfully predicted the structure of 300 chains.
Successfully predicted the structure of 350 chains.
Successfully predicted the structure of 400 chains.
Successfully predicted the structure of 450 chains.
Successfully predicted the structure of 500 chains.
Successfully predicted the structure of 550 chains.
Successfully predicted the structure of 600 chains.
Successfully predicted the structure of 650 chains.
Successfully predicted the structure of 700 chains.
Successfully predicted the structure of 750 chains.
Successfully predicted the structure of 800 chains.
Successfully predicted the structure of 850 chains.
Successfully predicted the structure of 900 chains.
Successfully predicted the structure of 950 chains.
Successfully predicted the structure of 1000 chains.
Successfull

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,masks_path,chain_inferred_fasta_path,EF_prediction_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta,proteins/chain_fasta_files/8XPV:A.fasta,proteins/esmfold_prediction_pdb_files/8XPV:A.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta,proteins/chain_fasta_files/8GQ4:A.fasta,proteins/esmfold_prediction_pdb_files/8GQ4:A.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta,proteins/chain_fasta_files/8TIF:A.fasta,proteins/esmfold_prediction_pdb_files/8TIF:A.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta,proteins/chain_fasta_files/8H3Z:A.fasta,proteins/esmfold_prediction_pdb_files/8H3Z:A.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta,proteins/chain_fasta_files/8ALL:A.fasta,proteins/esmfold_prediction_pdb_files/8ALL:A.pdb
...,...,...,...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta,proteins/chain_fasta_files/8HNE:A.fasta,proteins/esmfold_prediction_pdb_files/8HNE:A.pdb
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta,proteins/chain_fasta_files/8FIN:A.fasta,proteins/esmfold_prediction_pdb_files/8FIN:A.pdb
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta,proteins/chain_fasta_files/8J0A:A.fasta,proteins/esmfold_prediction_pdb_files/8J0A:A.pdb
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta,proteins/chain_fasta_files/8HDU:A.fasta,proteins/esmfold_prediction_pdb_files/8HDU:A.pdb


## Save `chains` dataframe as .csv file

In [6]:
# Write the dataframe, now including the EF_prediction_pdb_path column, to a comma-separated
# file; the dataframe index is not written.
chains.to_csv(
    path_or_buf = "proteins/chains_esmfold.csv",
    sep = ",",
    index = False,
)