# OmegaFold structure prediction

Installation of necessary software:

In [1]:
from IPython.utils import io

In [2]:
# Install the dependencies and fetch the OmegaFold checkpoint. The output of
# the shell commands is captured so that the cell prints one status line only.
with io.capture_output() as captured:
    # Third-party packages installed through pip / conda.
    !pip install -q torch
    !conda install -y requests
    !conda install -y biopython=1.79
    !conda install -y pandas
    !pip install -q git+https://github.com/jvkersch/tmtools.git
    !conda install -y Prody
    # OmegaFold sources (branch "beta"); OmegaFold/main.py from this clone is
    # executed later in the notebook.
    !git clone --branch beta --quiet https://github.com/sokrypton/OmegaFold.git
    !pip -q install py3Dmol
    # Download release1.pt with aria2 and store it as
    # ~/.cache/omegafold_ckpt/model.pt
    # (presumably the location OmegaFold loads its weights from - TODO confirm).
    !apt-get install aria2 -qq > /dev/null
    !aria2c -q -x 16 https://helixon.s3.amazonaws.com/release1.pt
    !mkdir -p ~/.cache/omegafold_ckpt
    !mv release1.pt ~/.cache/omegafold_ckpt/model.pt

# Show the captured log only when it contains "Error" or "ERROR"; otherwise
# report success.
# NOTE(review): this is a plain substring test on the captured stdout, so a
# failing command that prints neither word is still reported as successful.
print(captured.stdout if ("Error" in captured.stdout) or ("ERROR" in captured.stdout) else "Installation successful")

Installation successful


In [3]:
import pandas as pd
import os, sys, re, torch, requests, json, Bio

## Load `chains` dataframe from .csv file

In [4]:
# Root directory with one sub-directory per PDB entry.
PROTEIN_DIRECTORY = "data/proteins"
# Table of chains to predict (columns seen below: pdb_id, label, chain_id).
CHAINS_CSV_PATH = "data/chains.csv"

chains = pd.read_csv(CHAINS_CSV_PATH)
chains

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A
...,...,...,...
1460,8G9J,synthetic,8G9J:A
1461,8OYV,synthetic,8OYV:A
1462,8TNO,synthetic,8TNO:A
1463,8FJE,synthetic,8FJE:A


## Predict the structures

Predict the structure of all the chains in the `chains` dataframe:

In [5]:
# Run OmegaFold on the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Running count of processed chains; incremented by print_status().
NUMBER_OF_PREDICTED_CHAINS = 0


def print_status(total: "int | None" = None, every: int = 50) -> None:
    """
    Count one more processed chain and occasionally print a progress message.

    Increments the module-level counter NUMBER_OF_PREDICTED_CHAINS and prints
    its value after every `every`-th call and when it reaches `total`.

    Parameters:
        total (int | None): number of chains expected in the whole run; when
            omitted, the length of the global `chains` dataframe is used if it
            is defined, otherwise only the periodic message is printed
        every (int): print a message each time the counter is a multiple of
            this value; values <= 0 disable the periodic message

    Returns:
        None
    """
    global NUMBER_OF_PREDICTED_CHAINS
    NUMBER_OF_PREDICTED_CHAINS += 1

    if total is None:
        # Derive the final count from the data instead of hard-coding a
        # dataset-specific number, so the last message still appears when the
        # number of chains changes.
        known_chains = globals().get("chains")
        if known_chains is not None:
            total = len(known_chains)

    done = NUMBER_OF_PREDICTED_CHAINS
    periodic = every > 0 and done % every == 0
    if periodic or done == total:
        print(f"Predicted the structure of {done} chains.")


def omegafold_predict(pdb_id: str, chain_id: str, protein_directory: str) -> "str | None":
    """
    Predict the structure of one chain with the OmegaFold model.

    Runs `OmegaFold/main.py` on `{protein_directory}/{pdb_id}/{chain_id}.fasta`
    and passes `{protein_directory}/{pdb_id}/omegafold` as its output
    directory (created when missing). Progress is reported via print_status().

    Parameters:
        pdb_id (str): PDB ID of the protein
        chain_id (str): id of the chain
        protein_directory (str): path to the directory containing all information about proteins

    Returns:
        relative path to the directory passed to OmegaFold as output location,
        or None when the OmegaFold run failed
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import subprocess

    fasta_path = f"{protein_directory}/{pdb_id}/{chain_id}.fasta"
    prediction_path = f"{protein_directory}/{pdb_id}/omegafold"
    os.makedirs(prediction_path, exist_ok=True)

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact. stderr is merged into stdout so that tracebacks
    # are inspected as well.
    command = [
        "python",
        "OmegaFold/main.py",
        f"--device={DEVICE}",
        fasta_path,
        prediction_path,
    ]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
        )
        # A non-zero exit status is the reliable failure signal; the substring
        # test is kept for runs that report an error but still exit with 0.
        failed = completed.returncode != 0 or "Error" in completed.stdout
    except OSError:
        # The interpreter could not be started at all.
        failed = True

    print_status()

    if failed:
        return None
    return prediction_path


def _predict_chain_row(row):
    """Run omegafold_predict for the chain described by one dataframe row."""
    return omegafold_predict(row["pdb_id"], row["chain_id"], PROTEIN_DIRECTORY)


# Row-wise apply: one OmegaFold run per chain; the result (output directory,
# or None on failure) is stored in a new column.
chains["OF_prediction_path"] = chains.apply(_predict_chain_row, axis="columns")

Predicted the structure of 50 chains.
Predicted the structure of 100 chains.
Predicted the structure of 150 chains.
Predicted the structure of 200 chains.
Predicted the structure of 250 chains.
Predicted the structure of 300 chains.
Predicted the structure of 350 chains.
Predicted the structure of 400 chains.
Predicted the structure of 450 chains.
Predicted the structure of 500 chains.
Predicted the structure of 550 chains.
Predicted the structure of 600 chains.
Predicted the structure of 650 chains.
Predicted the structure of 700 chains.
Predicted the structure of 750 chains.
Predicted the structure of 800 chains.
Predicted the structure of 850 chains.
Predicted the structure of 900 chains.
Predicted the structure of 950 chains.
Predicted the structure of 1000 chains.
Predicted the structure of 1050 chains.
Predicted the structure of 1100 chains.
Predicted the structure of 1150 chains.
Predicted the structure of 1200 chains.
Predicted the structure of 1250 chains.
Predicted the struct

In [6]:
# Display the dataframe again to inspect the new OF_prediction_path column.
chains

Unnamed: 0,pdb_id,label,chain_id,OF_prediction_path
0,8P0E,monomer,8P0E:A,data/proteins/8P0E/omegafold
1,8PX8,monomer,8PX8:A,data/proteins/8PX8/omegafold
2,8B2E,monomer,8B2E:A,data/proteins/8B2E/omegafold
3,8HOE,monomer,8HOE:A,data/proteins/8HOE/omegafold
4,8TCE,monomer,8TCE:A,data/proteins/8TCE/omegafold
...,...,...,...,...
1460,8G9J,synthetic,8G9J:A,data/proteins/8G9J/omegafold
1461,8OYV,synthetic,8OYV:A,data/proteins/8OYV/omegafold
1462,8TNO,synthetic,8TNO:A,data/proteins/8TNO/omegafold
1463,8FJE,synthetic,8FJE:A,data/proteins/8FJE/omegafold


Number of chains for which the prediction failed:

In [9]:
# omegafold_predict returns None for a failed run, so the number of missing
# values in the column is the number of failed predictions.
chains["OF_prediction_path"].isnull().sum()

0

## Save chain dataframe as .csv file

In [7]:
# Write the dataframe, including OF_prediction_path, as comma-separated text
# without the row index.
# NOTE(review): the input was read from "data/chains.csv" while this writes
# below "proteins/" - confirm that this output directory is intended.
chains.to_csv(path_or_buf="proteins/chains_omegafold.csv", index=False)