# Testing used functions 

The queries used to retrieve data from the PDB dataset were tested by comparing their results with those obtained by manually setting the same filters in the search on the PDB web page, https://www.rcsb.org/.

In [2]:
import pandas as pd
from typing import List, Dict, Optional, Set, Tuple

In [8]:
def extract_chain_id(line: str) -> str:
    """
    Returns letter indicating the name of the protein chain used in the corresponding pdb file.

    Only the first chain listed in the header is considered. The chain list is the
    second "|"-separated field of the header, e.g. "Chain C", "Chains A, B" or
    "Chains A[auth B], C". If the first chain carries an "[auth X]" annotation, the
    first character of X is returned; otherwise the first character of the chain name.

    Parameters:
        line (str): the line containing the header of a sequence in a fasta file

    Raises:
        IndexError: if the header has no second "|"-separated field or that field names no chain
    """
    chain_field = line.split("|")[1]
    # Strip only the leading keyword. Removing every "s" from the field would also
    # erase chains that are literally named "s" (lowercase chain names do occur).
    chain_list = chain_field.removeprefix("Chains ").removeprefix("Chain ")
    first_chain = chain_list.split(",")[0]
    if "auth" in first_chain:
        # "A[auth B]" splits into ["A[auth", "B]"]; the auth name starts the second part.
        return first_chain.split()[1][0]
    return first_chain[0]


def identify_unique_chains(pdb_id: str, fasta_path: str) -> Set[str]:
    """
    Returns set of "<pdb_id>:<chain letter>" labels, one for every header line of the fasta file.

    A header line is any line starting with ">"; its chain letter is obtained with
    extract_chain_id. Identical labels collapse into a single set entry.

    Parameters:
        pdb_id (str): PDB ID of the given protein
        fasta_path (str): path to the fasta file
    """
    with open(fasta_path, "r") as fasta:
        header_lines = (row for row in fasta if row.startswith(">"))
        return {f"{pdb_id}:{extract_chain_id(header)}" for header in header_lines}


# Header parsing: the first listed chain is reported, using its "[auth X]" name when present.
assert extract_chain_id(
    ">8HEV_1|Chains A, C[auth B], E[auth C], G, I[auth H], K[auth I], M[auth J], O[auth K], Q[auth L], S[auth M], U, W[auth V]|Portal protein|Human betaherpesvirus 5 (10359)"
) == "A"
assert extract_chain_id(
    ">8EB7_1|Chains AA[auth A], A[auth 0], B, BA[auth C], CA[auth F], C[auth D], DA[auth Y], D[auth X], EA[auth a], E[auth Z], FA[auth b], GA[auth d], G[auth c], HA[auth e], IA[auth f], JA[auth g], Y[auth h], Z[auth i]|Tail spike protein|Salmonella phage P22 (2908168)"
) == "A"
assert extract_chain_id(
    ">5SSZ_1|Chain C|Formylglycine-generating enzyme|Homo sapiens (9606)"
) == "C"
assert extract_chain_id(
    ">7G1X_1|Chains A, B|Fatty acid-binding protein, liver|Homo sapiens (9606)"
) == "A"
assert extract_chain_id(
    ">8H61_1|Chain A[auth B]|Mutant M2 of ketoreductase CpKR|Candida parapsilosis (5480)"
) == "B"
assert extract_chain_id(
    ">8DWV_1|Chains A, B, C[auth E], D[auth F], E[auth G], F[auth H]|Speckle-type POZ protein|Homo sapiens (9606)"
) == "A"

# Whole-file parsing: one label per distinct first chain found in the downloaded fasta files.
assert identify_unique_chains(
    "5SSZ", "proteins/original_fasta_files/5SSZ.fasta"
) == {"5SSZ:A"}
assert identify_unique_chains(
    "7G1X", "proteins/original_fasta_files/7G1X.fasta"
) == {"7G1X:A"}
assert identify_unique_chains(
    "8EB0", "proteins/original_fasta_files/8EB0.fasta"
) == {"8EB0:A", "8EB0:B", "8EB0:C"}
assert identify_unique_chains(
    "8DWV", "proteins/original_fasta_files/8DWV.fasta"
) == {"8DWV:A"}
assert identify_unique_chains(
    "8PJN", "proteins/original_fasta_files/8PJN.fasta"
) == {"8PJN:b", "8PJN:2", "8PJN:i", "8PJN:u"}

In [7]:
from prody import parsePDB, AtomGroup
from Bio.PDB import parse_pdb_header
from Bio.SCOP.Raf import protein_letters_3to1


def extract_sequence_from_dict(residue_dict: Dict[int, str], lower: int, upper: int) -> str:
    """
    Returns the one-letter amino acid sequence for the positions lower..upper (both inclusive).

    Positions that are absent from residue_dict (or mapped to None) are skipped, so the
    result may be shorter than the range. Each residue name is translated through
    protein_letters_3to1; a name missing from that table makes the lookup fail.

    Parameters:
        residue_dict (Dict[int, str]): residue name for each known position
        lower (int): first position to include
        upper (int): last position to include
    """
    names_in_order = (residue_dict.get(index) for index in range(lower, upper + 1))
    return "".join(
        protein_letters_3to1[name] for name in names_in_order if name is not None
    )


def extract_mask_from_sets(residue_positions: Set[int], missing_residue_positions: Set[int], lower: int, upper: int) -> str:
    """
    Returns a string of "1"/"0" flags for the positions lower..upper (both inclusive).

    A position found in residue_positions contributes "1"; otherwise, a position found in
    missing_residue_positions contributes "0". Positions present in neither collection
    contribute nothing, so the mask may be shorter than the range.

    Parameters:
        residue_positions (Set[int]): positions of residues present in the structure
        missing_residue_positions (Set[int]): positions of residues reported as missing
        lower (int): first position to include
        upper (int): last position to include
    """
    return "".join(
        "1" if index in residue_positions else "0"
        for index in range(lower, upper + 1)
        if index in residue_positions or index in missing_residue_positions
    )


def obtain_residue_dict_and_borders(chain: AtomGroup) -> Dict[int, str]:
    """
    Returns a dict mapping each residue number of the chain to its residue name.

    Despite the function name, only the dict is returned (no borders). When a residue
    number occurs more than once, the name paired with its last occurrence is kept.

    Parameters:
        chain (AtomGroup): atoms providing getResnames() and getResnums()
    """
    names = chain.getResnames()
    numbers = chain.getResnums()
    return {number: name for number, name in zip(numbers, names)}


def obtain_sequence_and_mask(chain: AtomGroup, missing_residue_dict: Dict[int, str]) -> Tuple[str, str]:
    """
    Returns the sequence of a chain including its missing residues, and the matching mask.

    Observed residues (from the chain) and missing residues are merged into one
    position -> residue name dict; on a shared position the missing-residue entry wins.
    Sequence and mask both span the smallest to the largest merged position. In the mask,
    "1" marks an observed position and "0" a position that is only listed as missing.

    Parameters:
        chain (AtomGroup): atoms of the chain
        missing_residue_dict (Dict[int, str]): residue name for each missing position

    Raises:
        ValueError: if neither the chain nor missing_residue_dict provides any position
    """
    observed = obtain_residue_dict_and_borders(chain)
    merged = observed | missing_residue_dict
    first, last = min(merged), max(merged)
    return (
        extract_sequence_from_dict(merged, first, last),
        extract_mask_from_sets(observed.keys(), missing_residue_dict.keys(), first, last),
    )
    

def obtain_missing_residues(pdb_path: str, wanted_chains_letters: Set[str]) -> Dict[str, Dict[int, str]]:
    """
    Returns, for every wanted chain, a dict mapping missing residue positions to residue names.

    The data comes from the "missing_residues" entry of the parsed PDB header. Only
    entries belonging to the same model as the first listed entry are used. Chains
    without missing residues map to an empty dict.

    Parameters:
        pdb_path (str): path to the pdb file
        wanted_chains_letters (Set[str]): letters of the chains to collect
    """
    entries = parse_pdb_header(pdb_path)["missing_residues"]
    per_chain = {letter: {} for letter in wanted_chains_letters}
    if entries:
        first_model = entries[0]["model"]
        for entry in entries:
            if entry["model"] != first_model:
                continue
            if entry["chain"] not in wanted_chains_letters:
                continue
            per_chain[entry["chain"]][entry["ssseq"]] = entry["res_name"]
    return per_chain


def write_sequences_into_file(file_path: str, sequences: List[Tuple[str, str]]) -> str:
    """
    Writes (header, sequence) pairs into a file in fasta layout and returns the file path.

    Every pair becomes two lines: ">" followed by the header, then the sequence.
    An existing file is overwritten.

    Parameters:
        file_path (str): path of the file to write
        sequences (List[Tuple[str, str]]): pairs of header text and sequence text
    """
    with open(file_path, 'w') as out:
        out.writelines(f'>{label}\n{content}\n' for label, content in sequences)
    return file_path


def extract_aa_seq_from_pdb(pdb_path: str, fasta_path: str, masks_path: str, wanted_chains_ids: List[str]) -> Tuple[Optional[str], Optional[str]]:
    """
    Writes the sequences and masks of the wanted chains of a pdb file into two fasta-like files.

    Returns the pair (fasta_path, masks_path) on success. If any wanted chain cannot be
    parsed from the pdb file, (None, None) is returned and nothing is written.

    Parameters:
        pdb_path (str): path to the pdb file
        fasta_path (str): path of the file receiving the sequences
        masks_path (str): path of the file receiving the masks
        wanted_chains_ids (List[str]): chain IDs in the form "<pdb_id>:<chain letter>"
    """
    letters = [chain_id.split(":")[1] for chain_id in wanted_chains_ids]
    missing_by_chain = obtain_missing_residues(pdb_path, set(letters))

    sequence_records, mask_records = [], []
    for chain_id, letter in zip(wanted_chains_ids, letters):
        ca_atoms = parsePDB(pdb_path, chain=letter, subset='calpha')
        if ca_atoms is None:
            return None, None
        chain_sequence, chain_mask = obtain_sequence_and_mask(ca_atoms, missing_by_chain[letter])
        sequence_records.append((chain_id, chain_sequence))
        mask_records.append((chain_id, chain_mask))

    written_fasta = write_sequences_into_file(fasta_path, sequence_records)
    written_masks = write_sequences_into_file(masks_path, mask_records)
    return written_fasta, written_masks



# Extraction for chain A of 5SSZ must succeed and hand back both output paths.
assert extract_aa_seq_from_pdb(
    "proteins/original_pdb_files/5SSZ.pdb",
    "proteins/inferred_fasta_files/5SSZ.fasta",
    "proteins/mask_files/5SSZ_mask.fasta",
    ["5SSZ:A"],
) == (
    "proteins/inferred_fasta_files/5SSZ.fasta",
    "proteins/mask_files/5SSZ_mask.fasta",
)
with open("proteins/inferred_fasta_files/5SSZ.fasta", "r") as f:
    assert(f.read() == ">5SSZ:A\nADLGSSMEFEANAPGPVPGERQLAHSKMVPIPAGVFTMGTDDPQIKQDGEAPARRVTIDAFYMDAYEVSNTEFEKFVNSTGYLTEAEKFGDSFVFEGMLSEQVKTNIQQAVAAAPWWLPVKGANWRHPEGPDSTILHRPDHPVLHVSWNDAVAYCTWAGKRLPTEAEWEYSCRGGLHNRLFPWGNKLQPKGQHYANIWQGEFPVTNTGEDGFQGTAPVDAFPPNGYGLYNIVGNAWEWTSDWWTVHHSVEETLNPKGPPSGKDRVKKGGSYMCHRSYCYRYRCAARSQNTPDSSASNLGFRCAADRLPTMDSGRGSHHHHHHH\n")
with open ("proteins/mask_files/5SSZ_mask.fasta", "r") as f:
    assert(f.read() == ">5SSZ:A\n00000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111100000000000011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100000000000\n")

# Extraction for chains A, B and C of 8HI2 must succeed and hand back both output paths.
assert extract_aa_seq_from_pdb(
    "proteins/original_pdb_files/8HI2.pdb",
    "proteins/inferred_fasta_files/8HI2.fasta",
    "proteins/mask_files/8HI2_mask.fasta",
    ["8HI2:A", "8HI2:B", "8HI2:C"],
) == (
    "proteins/inferred_fasta_files/8HI2.fasta",
    "proteins/mask_files/8HI2_mask.fasta",
)
with open("proteins/inferred_fasta_files/8HI2.fasta", "r") as f:
    assert(f.read() == """>8HI2:A\nHSTAETTLDSFFSRAGLVGEIDLPLKGTTNPNGYANWDIDITGYAQMRRKVELFTYMRFDAEFTFVACTPTGEVVPQLLQYMFVPPGAPKPDSRESLAWQTATNPSVFVKLSDPPAQVSVPFMSPASAYQWFYDGYPTFGEHKQEKDLEYGAMPNNMMGTFSVRTVGTSKSKYPLVVRIYMRMKHVRAWIPRPMRNQNYLFKANPNYAGNSIKPTGASRTAITTL\n>8HI2:B\nVAQLTIGNSTITTQEAANIIVGYGEWPSYCSDSDATAVDKPTRPDVSVNRFYTLDTKLWEKSSKGWYWKFPDVLTETGVFGQNAQFHYLYRSGFCIHVQCNASKFHQGALLVAVLPEYVIGTVAGGTGTEDTHPPYKQTQPGADGFELQHPYVLDAGIPISQLTVCPHQWINLRTNNCATIIVPYINALPFDSALNHCNFGLLVVPISPLDYDQGATPVIPITITLAPMCSEFAGLR\n>8HI2:C\nGFPTELKPGTNQFLTTDDGVSAPILPNFHPTPCIHIPGEVRNLLELCQVETILEVNNVPTNATSLMERLRFPVSAQAGKGELCAVFRADPGRNGPWQSTLLGQLCGYYTQWSGSLEVTFMFTGSFMATGKMLIAYTPPGGPLPKDRATAMLGTHVIWDFGLQSSVTLVIPWISNTHYRAHARDGVFDYYTTGLVSIWYQTNYVVPIGAPNTAYIIALAAAQKNFTMKLCKDASDILQTG\n""")
with open("proteins/mask_files/8HI2_mask.fasta", "r") as f:
    assert(f.read() == """>8HI2:A\n001111111111111111111111000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110000000000011111111111111111111111111111111111111111111111111111111111111111111111110000\n>8HI2:B\n000000000000000000111111111110000000000000000000111111111111111111111111111111111111111111111111111111111111111111111111111000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111\n>8HI2:C\n11111111111111111111111111111111111111111111111111111111111111111111111111100011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100000000000000011111111111111111111111111111111111111111100000000\n""")

    
    

In [9]:
from tmtools import tm_align
from prody import parsePDB, AtomGroup
from Bio.SCOP.Raf import protein_letters_3to1
from numpy import array

In [11]:
def parse_mask_file(mask_path: str, chain_id: str) -> Optional[str]:
    """
    Returns the stripped line that follows the header of the given chain in a mask file.

    A header is a line that starts with ">" and contains chain_id anywhere in it.
    Once such a header has been seen, the first later line that is not itself such a
    header is returned. None is returned when there is no such header or no line after it.

    Parameters:
        mask_path (str): path to the mask file
        chain_id (str): chain ID to look for in the header lines
    """
    def is_wanted_header(text: str) -> bool:
        return text.startswith(">") and chain_id in text

    with open(mask_path, "r") as handle:
        rows = iter(handle)
        for row in rows:
            if not is_wanted_header(row):
                continue
            # Header found: keep consuming the same iterator until a non-header row shows up.
            for follower in rows:
                if not is_wanted_header(follower):
                    return follower.strip()
    return None


def get_coords_and_sequence_with_mask(pdb_path: str, chain_letter: str, mask: str):
    """
    Returns coordinates and one-letter sequence of the masked-in atoms of a chain.

    The chain is parsed with subset 'calpha'. The i-th parsed atom is kept only when
    mask[i] is "1"; its residue name is translated through protein_letters_3to1.
    The mask must cover every parsed atom, otherwise indexing it fails.

    Parameters:
        pdb_path (str): path to the pdb file
        chain_letter (str): letter of the chain to parse
        mask (str): string of "1"/"0" flags, one per parsed atom
    """
    ca_atoms = parsePDB(pdb_path, chain=chain_letter, subset='calpha')
    kept_atoms = [atom for index, atom in enumerate(ca_atoms) if mask[index] == "1"]
    positions = [atom.getCoords() for atom in kept_atoms]
    letters = [protein_letters_3to1[atom.getResname()] for atom in kept_atoms]
    return array(positions), "".join(letters)


def get_coords_and_sequence_without_mask(pdb_path: str, chain_letter: str):
    """
    Returns coordinates and sequence of a whole chain, as reported by the parsed structure.

    The chain is parsed with subset 'calpha'; the result is the pair
    (getCoords(), getSequence()) of the parsed atoms.

    Parameters:
        pdb_path (str): path to the pdb file
        chain_letter (str): letter of the chain to parse
    """
    ca_atoms = parsePDB(pdb_path, chain=chain_letter, subset='calpha')
    positions = ca_atoms.getCoords()
    letters = ca_atoms.getSequence()
    return positions, letters


def get_coords_and_sequence(pdb_path: str, chain_id = "x:A", mask: Optional[str] = None):
    """
    Returns coordinates and sequence of a chain, optionally restricted by a mask.

    The chain letter is the part of chain_id after the first ":". Without a mask the
    whole chain is returned; with a mask only the atoms flagged "1" are returned.

    Parameters:
        pdb_path (str): path to the pdb file
        chain_id (str): chain ID in the form "<pdb_id>:<chain letter>"; defaults to chain "A"
        mask (Optional[str]): string of "1"/"0" flags, or None to keep every atom
    """
    chain_letter = chain_id.split(":")[1]
    if mask is not None:
        return get_coords_and_sequence_with_mask(pdb_path, chain_letter, mask)
    return get_coords_and_sequence_without_mask(pdb_path, chain_letter)


def compute_tm_score(chain_id: str, original_pdb_path: str, predicted_pdb_path: str, mask_path: str):
    """
    Returns the TM-score (tm_norm_chain1 of the tm_align result) between an original chain and its prediction.

    The original structure is read in full for the given chain. The predicted structure
    is read with the default chain ID (chain "A") and reduced with the mask stored for
    chain_id, so that both sides should describe the same residues. This is checked with
    assertions on the two sequences before aligning.

    Parameters:
        chain_id (str): chain ID in the form "<pdb_id>:<chain letter>"
        original_pdb_path (str): path to the pdb file of the original structure
        predicted_pdb_path (str): path to the pdb file of the predicted structure
        mask_path (str): path to the mask file containing the mask of chain_id
    """
    residue_mask = parse_mask_file(mask_path, chain_id)
    reference_xyz, reference_seq = get_coords_and_sequence(original_pdb_path, chain_id=chain_id)
    model_xyz, model_seq = get_coords_and_sequence(predicted_pdb_path, mask=residue_mask)
    assert len(reference_seq) == len(model_seq)
    assert reference_seq == model_seq
    alignment = tm_align(reference_xyz, model_xyz, reference_seq, model_seq)
    return alignment.tm_norm_chain1


# Report the TM-score of each OmegaFold prediction against its original structure:
# one output line per chain, holding the chain ID followed by the score.
print("5SSZ:A", compute_tm_score("5SSZ:A", "proteins/original_pdb_files/5SSZ.pdb", "proteins/omegafold_predicted_pdb_files/5SSZ:A.pdb", "proteins/mask_files/5SSZ_mask.fasta"))
print("8HI2:A", compute_tm_score("8HI2:A", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:A.pdb", "proteins/mask_files/8HI2_mask.fasta"))
print("8HI2:B", compute_tm_score("8HI2:B", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:B.pdb", "proteins/mask_files/8HI2_mask.fasta"))
print("8HI2:C", compute_tm_score("8HI2:C", "proteins/original_pdb_files/8HI2.pdb", "proteins/omegafold_predicted_pdb_files/8HI2:C.pdb", "proteins/mask_files/8HI2_mask.fasta"))

5SSZ:A 0.9763108724463694
8HI2:A 0.7435318889813948
8HI2:B 0.6747099260851717
8HI2:C 0.7158776700148659


In [19]:
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB.QCPSuperimposer import QCPSuperimposer

def compute_RMSD(chain_id: str, original_pdb_path: str, predicted_pdb_path: str, mask_path: str) -> float:
    """
    Returns the RMSD between an original chain and its prediction, using SVDSuperimposer.

    The original structure is read in full for the given chain. The predicted structure
    is read with the default chain ID (chain "A") and reduced with the mask stored for
    chain_id. The original coordinates are passed to the superimposer first, the
    predicted ones second.

    Parameters:
        chain_id (str): chain ID in the form "<pdb_id>:<chain letter>"
        original_pdb_path (str): path to the pdb file of the original structure
        predicted_pdb_path (str): path to the pdb file of the predicted structure
        mask_path (str): path to the mask file containing the mask of chain_id
    """
    residue_mask = parse_mask_file(mask_path, chain_id)
    reference_xyz = get_coords_and_sequence(original_pdb_path, chain_id=chain_id)[0]
    model_xyz = get_coords_and_sequence(predicted_pdb_path, mask=residue_mask)[0]
    superimposer = SVDSuperimposer()
    superimposer.set(reference_xyz, model_xyz)
    superimposer.run()
    return superimposer.get_rms()

def compute_RMSD2(chain_id: str, original_pdb_path: str, predicted_pdb_path: str, mask_path: str) -> float:
    """
    Returns the RMSD between an original chain and its prediction, using QCPSuperimposer.

    The original structure is read in full for the given chain. The predicted structure
    is read with the default chain ID (chain "A") and reduced with the mask stored for
    chain_id. The original coordinates are passed to the superimposer first, the
    predicted ones second.

    Parameters:
        chain_id (str): chain ID in the form "<pdb_id>:<chain letter>"
        original_pdb_path (str): path to the pdb file of the original structure
        predicted_pdb_path (str): path to the pdb file of the predicted structure
        mask_path (str): path to the mask file containing the mask of chain_id
    """
    residue_mask = parse_mask_file(mask_path, chain_id)
    reference_xyz = get_coords_and_sequence(original_pdb_path, chain_id=chain_id)[0]
    model_xyz = get_coords_and_sequence(predicted_pdb_path, mask=residue_mask)[0]
    superimposer = QCPSuperimposer()
    superimposer.set(reference_xyz, model_xyz)
    superimposer.run()
    return superimposer.get_rms()


# Cross-check the two RMSD implementations: the SVD-based compute_RMSD and the
# QCP-based compute_RMSD2 must agree to 3 decimal places for every tested chain.
# Each assertion compares compute_RMSD against compute_RMSD2; comparing compute_RMSD
# with itself would pass unconditionally and verify nothing.
assert (
    round(compute_RMSD("5SSZ:A",
                       "proteins/original_pdb_files/5SSZ.pdb",
                       "proteins/omegafold_predicted_pdb_files/5SSZ:A.pdb",
                       "proteins/mask_files/5SSZ_mask.fasta"), 3)
    ==
    round(compute_RMSD2("5SSZ:A",
                        "proteins/original_pdb_files/5SSZ.pdb",
                        "proteins/omegafold_predicted_pdb_files/5SSZ:A.pdb",
                        "proteins/mask_files/5SSZ_mask.fasta"), 3)
)

assert (
    round(compute_RMSD("8HI2:A",
                       "proteins/original_pdb_files/8HI2.pdb",
                       "proteins/omegafold_predicted_pdb_files/8HI2:A.pdb",
                       "proteins/mask_files/8HI2_mask.fasta"), 3)
    ==
    round(compute_RMSD2("8HI2:A",
                        "proteins/original_pdb_files/8HI2.pdb",
                        "proteins/omegafold_predicted_pdb_files/8HI2:A.pdb",
                        "proteins/mask_files/8HI2_mask.fasta"), 3)
)

assert (
    round(compute_RMSD("8HI2:C",
                       "proteins/original_pdb_files/8HI2.pdb",
                       "proteins/omegafold_predicted_pdb_files/8HI2:C.pdb",
                       "proteins/mask_files/8HI2_mask.fasta"), 3)
    ==
    round(compute_RMSD2("8HI2:C",
                        "proteins/original_pdb_files/8HI2.pdb",
                        "proteins/omegafold_predicted_pdb_files/8HI2:C.pdb",
                        "proteins/mask_files/8HI2_mask.fasta"), 3)
)

something 10
