# Evaluation of predicted structures

## pLDDT

In [None]:
from Bio.PDB import PDBParser
from statistics import mean

def extract_average_pLDDT(pdb_path):
    """Return the mean C-alpha B-factor of the structure stored at ``pdb_path``.

    The B-factor column is assumed to hold the per-residue pLDDT written by
    the structure predictor (as the function name implies) -- confirm for
    any new predictor.

    Args:
        pdb_path: Path to a PDB file, or ``None`` when no prediction exists.

    Returns:
        The arithmetic mean of the B-factors of all ``CA`` atoms, or ``None``
        when ``pdb_path`` is ``None`` or the structure contains no ``CA``
        atom at all.
    """
    if pdb_path is None:
        return None
    structure = PDBParser(QUIET = True).get_structure("X", pdb_path)
    # Residues without a C-alpha atom (waters, ligands, incomplete residues)
    # are skipped instead of aborting the whole computation with a KeyError.
    b_factors = [
        residue["CA"].get_bfactor()
        for residue in structure.get_residues()
        if "CA" in residue
    ]
    if not b_factors:
        # mean() raises StatisticsError on an empty list; report "no value"
        # the same way as for a missing path.
        return None
    return mean(b_factors)

# Add a column with the mean pLDDT of each chain's OmegaFold prediction file,
# then display the table as the cell result.
chains["OF_average_pLDDT"] = chains.apply(
    lambda chain_row: extract_average_pLDDT(chain_row["OF_prediction_pdb_path"]),
    axis = 1,
)
chains

## TM score

In [50]:
import os, sys, re, torch, requests, json, Bio
import pandas as pd

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Notebook shell escapes: run the OmegaFold command-line script once per FASTA
# file. The first path is the input FASTA file, the second the directory given
# for the predicted PDB files.
!python OmegaFold/main.py --device={device} "proteins/inferred_fasta_files/5SSZ.fasta" "proteins/omegafold_predicted_pdb_files"
!python OmegaFold/main.py --device={device} "proteins/inferred_fasta_files/8HI2.fasta" "proteins/omegafold_predicted_pdb_files"

INFO:root:Loading weights from /home/jovyan/.cache/omegafold_ckpt/model.pt
INFO:root:Constructing OmegaFold
INFO:root:Reading proteins/inferred_fasta_files/5SSZ.fasta
INFO:root:Predicting 1th chain in proteins/inferred_fasta_files/5SSZ.fasta
INFO:root:323 residues in this chain.
INFO:root:Finished prediction in 77.12 seconds.
INFO:root:Saving prediction to proteins/omegafold_predicted_pdb_files/5SSZ:A.pdb
INFO:root:Saved
INFO:root:Done!
INFO:root:Loading weights from /home/jovyan/.cache/omegafold_ckpt/model.pt
INFO:root:Constructing OmegaFold
INFO:root:Reading proteins/inferred_fasta_files/8HI2.fasta
INFO:root:Predicting 1th chain in proteins/inferred_fasta_files/8HI2.fasta
INFO:root:225 residues in this chain.
INFO:root:Finished prediction in 32.35 seconds.
INFO:root:Saving prediction to proteins/omegafold_predicted_pdb_files/8HI2:A.pdb
INFO:root:Saved
INFO:root:Predicting 2th chain in proteins/inferred_fasta_files/8HI2.fasta
INFO:root:237 residues in this chain.
INFO:root:Finished pr

In [51]:
from tmtools import tm_align
from prody import parsePDB, AtomGroup
from Bio.SCOP.Raf import protein_letters_3to1
from numpy import array
from typing import List, Dict, Optional, Set, Tuple

In [54]:
def parse_mask_file(mask_path: str, chain_id: str) -> Optional[str]:
    """Return the mask string recorded for ``chain_id`` in a FASTA-style file.

    A record starts at a header line (beginning with ``>``) that contains
    ``chain_id`` as a substring. Every following line up to the next header
    is stripped and concatenated, so a mask wrapped over several lines is
    returned in full rather than cut off after its first line.

    Args:
        mask_path: Path to the mask file.
        chain_id: Identifier searched for in the header lines.

    Returns:
        The concatenated mask of the first matching record that has at least
        one line after its header, or ``None`` when there is no such record.
    """
    mask_parts = []
    in_record = False
    with open(mask_path, "r") as file:
        for line in file:
            if line.startswith(">"):
                if in_record and mask_parts:
                    # The next header ends the record being collected.
                    break
                # A header is never part of a mask: a matching header that is
                # directly followed by another header yields nothing and the
                # search simply continues with the new header.
                in_record = chain_id in line
            elif in_record:
                mask_parts.append(line.strip())
    return "".join(mask_parts) if mask_parts else None


def get_coords_and_sequence_with_mask(pdb_path: str, chain_letter: str, mask: str):
    """Return coordinates and sequence of the masked-in C-alpha atoms of a chain.

    The i-th C-alpha atom of the chain is kept only when ``mask[i]`` is the
    character ``"1"``.

    Args:
        pdb_path: Path of the PDB file to parse.
        chain_letter: Chain selected from the file.
        mask: One character per C-alpha atom of the chain.

    Returns:
        A tuple of the array of kept coordinates and the one-letter sequence
        of the kept residues.
    """
    calpha_atoms = parsePDB(pdb_path, chain = chain_letter, subset = 'calpha')
    kept = []
    for position, calpha in enumerate(calpha_atoms):
        if mask[position] != "1":
            continue
        # Coordinates first, then the three-to-one letter lookup.
        kept.append((calpha.getCoords(), protein_letters_3to1[calpha.getResname()]))
    kept_coords = [xyz for xyz, _ in kept]
    kept_letters = [letter for _, letter in kept]
    return array(kept_coords), "".join(kept_letters)


def get_coords_and_sequence_without_mask(pdb_path: str, chain_letter: str):
    """Return coordinates and sequence of all C-alpha atoms of one chain.

    Args:
        pdb_path: Path of the PDB file to parse.
        chain_letter: Chain selected from the file.

    Returns:
        A tuple of the coordinates and the sequence reported by the parsed
        C-alpha selection.
    """
    calpha_atoms = parsePDB(pdb_path, chain = chain_letter, subset = 'calpha')
    all_coords = calpha_atoms.getCoords()
    full_sequence = calpha_atoms.getSequence()
    return all_coords, full_sequence


def get_coords_and_sequence(pdb_path: str, chain_id = "x:A", mask: Optional[str] = None):
    """Return C-alpha coordinates and sequence of a chain, optionally masked.

    Args:
        pdb_path: Path of the PDB file to parse.
        chain_id: Identifier whose second ``:``-separated field is the chain
            letter; the default selects chain ``A``.
        mask: Optional per-residue mask. When given, only the positions it
            marks are returned; when ``None`` the whole chain is returned.

    Returns:
        A ``(coordinates, sequence)`` tuple.
    """
    chain_letter = chain_id.split(":")[1]
    if mask is not None:
        return get_coords_and_sequence_with_mask(pdb_path, chain_letter, mask)
    return get_coords_and_sequence_without_mask(pdb_path, chain_letter)


def compute_tm_score(chain_id: str, original_pdb_path: str, predicted_pdb_path: str, mask_path: str):
    """Align a predicted chain onto the original one and return the TM-score.

    The original chain is read unmasked using ``chain_id``. The prediction is
    read with the default chain id and filtered by the mask found for
    ``chain_id`` in ``mask_path``. Both sequences must be identical.

    Args:
        chain_id: Identifier used for the mask lookup and the original chain.
        original_pdb_path: PDB file of the original structure.
        predicted_pdb_path: PDB file of the predicted structure.
        mask_path: Mask file passed to ``parse_mask_file``.

    Returns:
        The ``tm_norm_chain1`` value of the ``tm_align`` result.

    Raises:
        AssertionError: If the two sequences differ.
    """
    residue_mask = parse_mask_file(mask_path, chain_id)
    reference_coords, reference_sequence = get_coords_and_sequence(original_pdb_path, chain_id = chain_id)
    model_coords, model_sequence = get_coords_and_sequence(predicted_pdb_path, mask = residue_mask)
    # Both structures must describe exactly the same residues.
    assert len(reference_sequence) == len(model_sequence)
    assert reference_sequence == model_sequence
    alignment = tm_align(reference_coords, model_coords, reference_sequence, model_sequence)
    return alignment.tm_norm_chain1


# (chain id, original PDB, predicted PDB, mask file) for every evaluated chain.
tm_score_jobs = [
    ("5SSZ:A", "proteins/original_pdb_files/5SSZ.pdb", "proteins/omegafold_predicted_pdb_files/5SSZ:A.pdb", "proteins/mask_files/5SSZ_mask.fasta"),
    ("8HI2:A", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:A.pdb", "proteins/mask_files/8HI2_mask.fasta"),
    ("8HI2:B", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:B.pdb", "proteins/mask_files/8HI2_mask.fasta"),
    ("8HI2:C", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:C.pdb", "proteins/mask_files/8HI2_mask.fasta"),
]
# Print each chain id followed by its TM-score.
for tm_score_job in tm_score_jobs:
    print(tm_score_job[0], compute_tm_score(*tm_score_job))



5SSZ:A 0.9763108724463694
8HI2:A 0.7655109186150614
8HI2:B 0.7908140832921055
8HI2:C 0.7199127587354959


## RMSD

In [53]:
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB.QCPSuperimposer import QCPSuperimposer

def compute_RMSD(chain_id: str, original_pdb_path: str, predicted_pdb_path: str, mask_path: str) -> float:
    """Return the RMSD after superimposing the prediction with ``SVDSuperimposer``.

    The original chain is read unmasked using ``chain_id``. The prediction is
    read with the default chain id and filtered by the mask found for
    ``chain_id`` in ``mask_path``.

    Args:
        chain_id: Identifier used for the mask lookup and the original chain.
        original_pdb_path: PDB file of the original structure.
        predicted_pdb_path: PDB file of the predicted structure.
        mask_path: Mask file passed to ``parse_mask_file``.

    Returns:
        The value reported by the superimposer's ``get_rms()``.
    """
    residue_mask = parse_mask_file(mask_path, chain_id)
    reference_coords = get_coords_and_sequence(original_pdb_path, chain_id = chain_id)[0]
    model_coords = get_coords_and_sequence(predicted_pdb_path, mask = residue_mask)[0]
    superimposer = SVDSuperimposer()
    # The original coordinates are the reference onto which the model is fitted.
    superimposer.set(reference_coords, model_coords)
    superimposer.run()
    rmsd = superimposer.get_rms()
    return rmsd

def compute_RMSD2(chain_id: str, original_pdb_path: str, predicted_pdb_path: str, mask_path: str) -> float:
    """Return the RMSD after superimposing the prediction with ``QCPSuperimposer``.

    The original chain is read unmasked using ``chain_id``. The prediction is
    read with the default chain id and filtered by the mask found for
    ``chain_id`` in ``mask_path``.

    Args:
        chain_id: Identifier used for the mask lookup and the original chain.
        original_pdb_path: PDB file of the original structure.
        predicted_pdb_path: PDB file of the predicted structure.
        mask_path: Mask file passed to ``parse_mask_file``.

    Returns:
        The value reported by the superimposer's ``get_rms()``.
    """
    residue_mask = parse_mask_file(mask_path, chain_id)
    reference_coords = get_coords_and_sequence(original_pdb_path, chain_id = chain_id)[0]
    model_coords = get_coords_and_sequence(predicted_pdb_path, mask = residue_mask)[0]
    superimposer = QCPSuperimposer()
    # The original coordinates are the reference onto which the model is fitted.
    superimposer.set(reference_coords, model_coords)
    superimposer.run()
    rmsd = superimposer.get_rms()
    return rmsd


# (chain id, original PDB, predicted PDB, mask file) for every evaluated chain.
rmsd_jobs = [
    ("5SSZ:A", "proteins/original_pdb_files/5SSZ.pdb", "proteins/omegafold_predicted_pdb_files/5SSZ:A.pdb", "proteins/mask_files/5SSZ_mask.fasta"),
    ("8HI2:A", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:A.pdb", "proteins/mask_files/8HI2_mask.fasta"),
    ("8HI2:B", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:B.pdb", "proteins/mask_files/8HI2_mask.fasta"),
    ("8HI2:C", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:C.pdb", "proteins/mask_files/8HI2_mask.fasta"),
]

# All chains with the SVD-based RMSD first, then all chains with the QCP-based one.
for rmsd_function in (compute_RMSD, compute_RMSD2):
    for rmsd_job in rmsd_jobs:
        print(rmsd_function(*rmsd_job))







1.1466584479264936
8.176420097982414
5.333953886598329
16.486893944433685
1.146658447926571
8.176420097982414
5.333953886598332
16.48689394443369
