# ESMFold structure prediction

In [1]:
import os
from time import sleep
from typing import Optional

import pandas as pd
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Load `chains` dataframe from .csv file

In [2]:
# Root directory holding one sub-directory per PDB ID (fasta inputs and predictions)
PROTEIN_DIRECTORY = "data/proteins"

chains = pd.read_csv("data/chains.csv")
# Keep only the first five chains. The explicit copy turns the subset into an
# independent dataframe, so adding a column to it later does not operate on
# a slice of the original frame (no SettingWithCopyWarning).
chains = chains[:5].copy()
chains

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A


## Predicting the structures

Predict the structure of all the chains in the `chains` dataframe:

In [3]:
def get_sequence(fasta_path: str) -> str:
    """
    This function obtains a sequence from given fasta file.

    Parameters:
        fasta_path (str): path to a fasta file containing exactly one
                          sequence on the second line of the file

    Returns:
        the second line of the file with surrounding whitespace
        (including the new line at the end) removed

    Raises:
        ValueError: if the file does not consist of exactly two lines
    """
    with open(fasta_path, "r") as file:
        lines = file.readlines()

    # Validate explicitly: an `assert` would be stripped when Python runs with -O
    # and a malformed file would then be read silently.
    if len(lines) != 2:
        raise ValueError(
            f"Expected exactly 2 lines in fasta file {fasta_path}, found {len(lines)}"
        )
    return lines[1].strip()


def print_status(total_chains: int = 1465, report_every: int = 50, pause_seconds: float = 10) -> None:
    """
    This function counts one more processed chain, reports the progress
    and then pauses before returning.

    The module-level counter NUMBER_OF_PREDICTED_CHAINS is incremented by one
    on every call. A progress message is printed whenever the counter is
    a multiple of `report_every` or equals `total_chains`.

    Parameters:
        total_chains (int): counter value at which the final message is printed
                            (default 1465 - presumably the size of the full
                            dataset; pass the real number of chains when
                            running on a subset)
        report_every (int): print a message every `report_every` chains
        pause_seconds (float): how long to sleep at the end of the call
                               (presumably to avoid flooding the ESMFold API)

    Returns:
        None
    """
    global NUMBER_OF_PREDICTED_CHAINS
    NUMBER_OF_PREDICTED_CHAINS += 1
    if NUMBER_OF_PREDICTED_CHAINS % report_every == 0 or NUMBER_OF_PREDICTED_CHAINS == total_chains:
        print(f"Predicted the structure of {NUMBER_OF_PREDICTED_CHAINS} chains.")
    sleep(pause_seconds)


def esmfold_predict(pdb_id: str, chain_id: str, protein_directory: str, timeout: Optional[float] = 300) -> Optional[str]:
    """
    This function obtains a prediction of structure using ESMFold API
    and saves it into a pdb file.

    The sequence is read from {protein_directory}/{pdb_id}/{chain_id}.fasta and
    the prediction is written to {protein_directory}/{pdb_id}/esmfold/{chain_id}.pdb
    (the esmfold directory is created when missing).

    Parameters:
        pdb_id (str): PDB ID of the protein
        chain_id (str): id of the chain
        protein_directory (str): path to the directory containing all information about proteins
        timeout (float or None): number of seconds to wait for the API before
                                 the request is aborted; None waits indefinitely

    Returns:
        relative path to the pdb file containing the prediction,
        or None when the API answered with a status code other than 200

    Raises:
        requests.exceptions.RequestException: if the request itself fails
                                              (connection error, timeout)
    """
    sequence = get_sequence(f"{protein_directory}/{pdb_id}/{chain_id}.fasta")
    # NOTE(review): verify=False turns off TLS certificate verification, so the
    # server identity is not checked. Kept as in the original - confirm that it
    # is still required for api.esmatlas.com.
    prediction = requests.post(
        url="https://api.esmatlas.com/foldSequence/v1/pdb/",
        data=sequence,
        verify=False,
        # Without a timeout a stalled connection would block the whole run forever
        timeout=timeout,
    )
    # Count the attempt (successful or not) and pause before the next request
    print_status()
    if prediction.status_code != 200:
        print(f"Prediction of structure {chain_id} was unsuccessful - status code of the request: {prediction.status_code}")
        return None
    output_directory = f"{protein_directory}/{pdb_id}/esmfold"
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(output_directory, exist_ok=True)
    prediction_path = f"{output_directory}/{chain_id}.pdb"
    with open(prediction_path, "w") as f:
        f.write(prediction.text)
    return prediction_path


def _predict_chain(row):
    """Run the ESMFold prediction for one row of the `chains` dataframe."""
    return esmfold_predict(row["pdb_id"], row["chain_id"], PROTEIN_DIRECTORY)


# Reset the counter maintained by print_status() before the run starts
NUMBER_OF_PREDICTED_CHAINS = 0

# One request per row; rows whose prediction failed get None as the path
chains["EF_prediction_path"] = chains.apply(_predict_chain, axis=1)

In [4]:
# Show the dataframe including the new EF_prediction_path column
chains

Unnamed: 0,pdb_id,label,chain_id,EF_prediction_path
0,8P0E,monomer,8P0E:A,data/proteins/8P0E/esmfold/8P0E:A.pdb
1,8PX8,monomer,8PX8:A,data/proteins/8PX8/esmfold/8PX8:A.pdb
2,8B2E,monomer,8B2E:A,data/proteins/8B2E/esmfold/8B2E:A.pdb
3,8HOE,monomer,8HOE:A,data/proteins/8HOE/esmfold/8HOE:A.pdb
4,8TCE,monomer,8TCE:A,data/proteins/8TCE/esmfold/8TCE:A.pdb


## Save `chains` dataframe as .csv file

In [5]:
# Write the dataframe without its index; the comma separator is the to_csv default.
# NOTE(review): the input was read from "data/chains.csv" and predictions live under
# "data/proteins", while this writes to "proteins/" - confirm the target directory.
chains.to_csv("proteins/chains_esmfold.csv", index=False)