# Omegafold - test predictions, obtaining information about chains from fasta files

Installation of necessary software:

In [1]:
from IPython.utils import io

In [1]:
with io.capture_output() as captured:
    !pip install -q torch
    !conda install -y requests
    !conda install -y biopython
    !conda install -y pandas
    !pip install -q git+https://github.com/jvkersch/tmtools.git
    !conda install -y Prody
    !git clone --branch beta --quiet https://github.com/sokrypton/OmegaFold.git
    !pip -q install py3Dmol
    !apt-get install aria2 -qq > /dev/null
    !aria2c -q -x 16 https://helixon.s3.amazonaws.com/release1.pt
    !mkdir -p ~/.cache/omegafold_ckpt
    !mv release1.pt ~/.cache/omegafold_ckpt/model.pt

print(captured.stderr if captured.stderr != "" else "Installation successful")

Installation successful


In [2]:
import os, sys, re, torch, requests, json, Bio
import pandas as pd

## Load `proteins` and `chains` dataframes from csv files

In [3]:
proteins = pd.read_csv('proteins/proteins.csv')
proteins

Unnamed: 0,pdb_id,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path
0,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,"{'8EIO:B', '8EIO:A'}",proteins/inferred_fasta_files/8EIO.fasta
1,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,"{'8FMN:A', '8FMN:B', '8FMN:C'}",proteins/inferred_fasta_files/8FMN.fasta
2,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,"{'7XBG:A', '7XBG:B'}",proteins/inferred_fasta_files/7XBG.fasta
3,7UFZ,proteins/original_pdb_files/7UFZ.pdb,proteins/original_fasta_files/7UFZ.fasta,{'7UFZ:A'},proteins/inferred_fasta_files/7UFZ.fasta
4,8D20,proteins/original_pdb_files/8D20.pdb,proteins/original_fasta_files/8D20.fasta,{'8D20:A'},proteins/inferred_fasta_files/8D20.fasta


In [4]:
chains = pd.read_csv('proteins/chains.csv')
chains

Unnamed: 0,pdb_id,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path
0,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:B,proteins/inferred_fasta_files/8EIO.fasta
1,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:A,proteins/inferred_fasta_files/8EIO.fasta
2,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:A,proteins/inferred_fasta_files/8FMN.fasta
3,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:B,proteins/inferred_fasta_files/8FMN.fasta
4,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:C,proteins/inferred_fasta_files/8FMN.fasta
5,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:A,proteins/inferred_fasta_files/7XBG.fasta
6,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:B,proteins/inferred_fasta_files/7XBG.fasta
7,7UFZ,proteins/original_pdb_files/7UFZ.pdb,proteins/original_fasta_files/7UFZ.fasta,7UFZ:A,proteins/inferred_fasta_files/7UFZ.fasta
8,8D20,proteins/original_pdb_files/8D20.pdb,proteins/original_fasta_files/8D20.fasta,8D20:A,proteins/inferred_fasta_files/8D20.fasta


## Predicting the structures

Prepare directory:

In [5]:
OF_PREDICTED_PDB_FILES_DIRECTORY = "proteins/omegafold_predicted_pdb_files"
!mkdir {OF_PREDICTED_PDB_FILES_DIRECTORY}

mkdir: cannot create directory ‘proteins/omegafold_predicted_pdb_files’: File exists


In [7]:
chains["OF_prediction_pdb_path"] = chains.apply(lambda row: f"{OF_PREDICTED_PDB_FILES_DIRECTORY}/{row['chain_id']}.pdb", axis = 1)

Predict the structure of all the chains in the `chains` dataframe:

In [53]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for index, row in proteins.iterrows():
    fasta_path = row["original_fasta_path"]
    with io.capture_output() as captured:
        !python OmegaFold/main.py --device={device} {fasta_path} {OF_PREDICTED_PDB_FILES_DIRECTORY}
    if "Error" in captured.stdout:
        chains.loc[chains["pdb_id"] == row["pdb_id"], "OF_prediction_pdb_path"] = None
        print(f"Prediction of structure {row['pdb_id']} was unsuccessful")
    else:
        print(f"Successfully predicted the structure of {row['pdb_id']}")


Prediction of structure 8EIO was unsuccessful
Successfully predicted the structure of 8FMN
Successfully predicted the structure of 7XBG
Successfully predicted the structure of 7UFZ
Successfully predicted the structure of 8D20


In [9]:
chains.at[0, "OF_prediction_pdb_path"] = None
chains.at[1, "OF_prediction_pdb_path"] = None
chains

Unnamed: 0,pdb_id,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,OF_prediction_pdb_path
0,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:B,proteins/inferred_fasta_files/8EIO.fasta,
1,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:A,proteins/inferred_fasta_files/8EIO.fasta,
2,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:A,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:A.pdb
3,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:B,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:B.pdb
4,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:C,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:C.pdb
5,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:A,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:A.pdb
6,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:B,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:B.pdb
7,7UFZ,proteins/original_pdb_files/7UFZ.pdb,proteins/original_fasta_files/7UFZ.fasta,7UFZ:A,proteins/inferred_fasta_files/7UFZ.fasta,proteins/omegafold_predicted_pdb_files/7UFZ:A.pdb
8,8D20,proteins/original_pdb_files/8D20.pdb,proteins/original_fasta_files/8D20.fasta,8D20:A,proteins/inferred_fasta_files/8D20.fasta,proteins/omegafold_predicted_pdb_files/8D20:A.pdb


## Evaluation of predicted structures

### pLDDT

Confidence values are stored as b-factors in pdb files with predictions.
What do we want to know - average, minimum pLDDT?

In [11]:
from Bio.PDB import PDBParser
from statistics import mean

def extract_average_pLDDT(pdb_path):
    if pdb_path is None:
        return None
    structure = PDBParser(QUIET = True).get_structure("X", pdb_path)
    b_factors = [residue["CA"].get_bfactor() for residue in structure.get_residues()]
    return mean(b_factors)

chains["OF_average_pLDDT"] = chains.apply(lambda row: extract_average_pLDDT(row["OF_prediction_pdb_path"]), axis = 1)
chains

Unnamed: 0,pdb_id,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,OF_prediction_pdb_path,OF_average_pLDDT
0,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:B,proteins/inferred_fasta_files/8EIO.fasta,,
1,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:A,proteins/inferred_fasta_files/8EIO.fasta,,
2,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:A,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:A.pdb,77.656474
3,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:B,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:B.pdb,93.432239
4,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:C,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:C.pdb,91.49619
5,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:A,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:A.pdb,93.640537
6,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:B,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:B.pdb,23.856888
7,7UFZ,proteins/original_pdb_files/7UFZ.pdb,proteins/original_fasta_files/7UFZ.fasta,7UFZ:A,proteins/inferred_fasta_files/7UFZ.fasta,proteins/omegafold_predicted_pdb_files/7UFZ:A.pdb,93.066236
8,8D20,proteins/original_pdb_files/8D20.pdb,proteins/original_fasta_files/8D20.fasta,8D20:A,proteins/inferred_fasta_files/8D20.fasta,proteins/omegafold_predicted_pdb_files/8D20:A.pdb,95.077135


### TM-score

In [12]:
from tmtools.io import get_structure, get_residue_data
from tmtools import tm_align

In [13]:
def get_coords_and_sequence(pdb_file, chain_id = "x:A"):
    try:
        chain_letter = chain_id.split(":")[1]
        structure = get_structure(pdb_file)
        for chain in structure.get_chains():
            if chain.id == chain_letter:
                return get_residue_data(chain)
    except:
        return None


def compute_tm_score(chain_id, original_pdb_file, predicted_pdb_file):
    if original_pdb_file is None or predicted_pdb_file is None:
        return None
    original_coords_and_sequence = get_coords_and_sequence(original_pdb_file, chain_id)
    predicted_coords_and_sequence = get_coords_and_sequence(predicted_pdb_file)
    if original_coords_and_sequence is None or predicted_coords_and_sequence is None:
        return None
    original_coords, original_sequence = original_coords_and_sequence
    predicted_coords, predicted_sequence = predicted_coords_and_sequence
    result = tm_align(original_coords, predicted_coords, original_sequence, predicted_sequence)
    return result.tm_norm_chain1


chains["OF_TM_score"] = chains.apply(lambda row: compute_tm_score(row["chain_id"], row["original_pdb_path"], row["OF_prediction_pdb_path"]), axis = 1)
chains

Unnamed: 0,pdb_id,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,OF_prediction_pdb_path,OF_average_pLDDT,OF_TM_score
0,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:B,proteins/inferred_fasta_files/8EIO.fasta,,,
1,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:A,proteins/inferred_fasta_files/8EIO.fasta,,,
2,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:A,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:A.pdb,77.656474,
3,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:B,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:B.pdb,93.432239,0.73545
4,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:C,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:C.pdb,91.49619,0.793632
5,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:A,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:A.pdb,93.640537,0.896852
6,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:B,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:B.pdb,23.856888,0.280689
7,7UFZ,proteins/original_pdb_files/7UFZ.pdb,proteins/original_fasta_files/7UFZ.fasta,7UFZ:A,proteins/inferred_fasta_files/7UFZ.fasta,proteins/omegafold_predicted_pdb_files/7UFZ:A.pdb,93.066236,0.969861
8,8D20,proteins/original_pdb_files/8D20.pdb,proteins/original_fasta_files/8D20.fasta,8D20:A,proteins/inferred_fasta_files/8D20.fasta,proteins/omegafold_predicted_pdb_files/8D20:A.pdb,95.077135,0.972306


### RMSD
Only on Calpha?

In [14]:
from prody import parsePDB, matchChains, calcRMSD, calcTransformation, confProDy, matchAlign
confProDy(verbosity = 'none')



In [20]:
def compute_RMSD(chain_id, original_pdb_path, prediction_pdb_path):
    if original_pdb_path is None or prediction_pdb_path is None:
        return None
    
    chain_letter = chain_id.split(":")[1]
    original_structure = parsePDB(original_pdb_path, chain=chain_letter)
    prediction_structure = parsePDB(prediction_pdb_path)
    result = matchChains(prediction_structure, original_structure, sequid = 1, overlap = 1)
    if result is None:
        return None
    
    original, prediction, seqid, overlap = result[0]
    print(chain_id, seqid, overlap)
    calcTransformation(prediction, original).apply(original_structure)
    return calcRMSD(prediction, original)

print("inferred fasta:")
chains["OF_RMSD"] = chains.apply(lambda row: compute_RMSD(row["chain_id"], row["original_pdb_path"], row["OF_prediction_pdb_path"]), axis = 1)
chains

inferred fasta:
8FMN:A 100.0 98.11320754716981
8FMN:B 100.0 100.0
8FMN:C 100.0 100.0
7XBG:A 100.0 99.00332225913621
7XBG:B 100.0 99.49238578680203
7UFZ:A 100.0 47.77898158179848
8D20:A 100.0 53.627311522048366


Unnamed: 0,pdb_id,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,OF_prediction_pdb_path,OF_average_pLDDT,OF_TM_score,OF_RMSD
0,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:B,proteins/inferred_fasta_files/8EIO.fasta,,,,
1,8EIO,proteins/original_pdb_files/8EIO.pdb,proteins/original_fasta_files/8EIO.fasta,8EIO:A,proteins/inferred_fasta_files/8EIO.fasta,,,,
2,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:A,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:A.pdb,77.656474,,15.751002
3,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:B,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:B.pdb,93.432239,0.73545,5.287995
4,8FMN,proteins/original_pdb_files/8FMN.pdb,proteins/original_fasta_files/8FMN.fasta,8FMN:C,proteins/inferred_fasta_files/8FMN.fasta,proteins/omegafold_predicted_pdb_files/8FMN:C.pdb,91.49619,0.793632,9.46401
5,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:A,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:A.pdb,93.640537,0.896852,3.137993
6,7XBG,proteins/original_pdb_files/7XBG.pdb,proteins/original_fasta_files/7XBG.fasta,7XBG:B,proteins/inferred_fasta_files/7XBG.fasta,proteins/omegafold_predicted_pdb_files/7XBG:B.pdb,23.856888,0.280689,14.199551
7,7UFZ,proteins/original_pdb_files/7UFZ.pdb,proteins/original_fasta_files/7UFZ.fasta,7UFZ:A,proteins/inferred_fasta_files/7UFZ.fasta,proteins/omegafold_predicted_pdb_files/7UFZ:A.pdb,93.066236,0.969861,1.779818
8,8D20,proteins/original_pdb_files/8D20.pdb,proteins/original_fasta_files/8D20.fasta,8D20:A,proteins/inferred_fasta_files/8D20.fasta,proteins/omegafold_predicted_pdb_files/8D20:A.pdb,95.077135,0.972306,1.224994


## Save chain dataframe as csv file

In [17]:
chains.to_csv("proteins/chains_omegafold.csv", sep = ",", index = False)

In [4]:
import py3Dmol

# https://william-dawson.github.io/using-py3dmol.html
def display_structure(pdb_path):
    view = py3Dmol.view()
    view.addModel(open(pdb_path, 'r').read(),'pdb')

    # view.setStyle({'cartoon':{'color': 'gray'}})
    # view.setStyle({'cartoon':{'color': 'spectrum'}})

    # colour by pLDDT:
    view.setStyle({'cartoon': {'colorscheme': {'prop': 'b', 'gradient': 'roygb', 'min': 50, 'max': 90}}})

    view.zoomTo()
    view.show()
    
display_structure("proteins/omegafold_predicted_pdb_files/7UFZ:A.pdb")