# Downloading the structures of selected proteins

In [1]:
import os
from typing import List, Dict, Optional, Set, Tuple

import pandas as pd
import requests

## Prepare filesystem

In [2]:
# Root folder that holds the directories created and filled by this notebook.
PROTEINS_DIRECTORY = "proteins"

# PDB files as downloaded from RCSB.
ORIGINAL_PDB_FILES_DIRECTORY = f"{PROTEINS_DIRECTORY}/original_pdb_files"
# FASTA files as downloaded from RCSB.
ORIGINAL_FASTA_FILES_DIRECTORY = f"{PROTEINS_DIRECTORY}/original_fasta_files"
# FASTA files with sequences inferred from the PDB files.
INFERRED_FASTA_FILES_DIRECTORY = f"{PROTEINS_DIRECTORY}/inferred_fasta_files"
# Mask files marking which residues have coordinates in the PDB files.
MASKS_DIRECTORY = f"{PROTEINS_DIRECTORY}/mask_files"

In [3]:
# Create the output directories. `exist_ok=True` makes the cell safe to re-run,
# and `os.makedirs` also creates a missing parent folder, which a plain
# `mkdir` would refuse to do.
for output_directory in (ORIGINAL_PDB_FILES_DIRECTORY,
                         ORIGINAL_FASTA_FILES_DIRECTORY,
                         INFERRED_FASTA_FILES_DIRECTORY,
                         MASKS_DIRECTORY):
    os.makedirs(output_directory, exist_ok=True)

## Load `proteins` dataframe from csv file

In [4]:
# Load the protein list; the displayed output shows the columns `pdb_id` and `label`.
# NOTE(review): the output also shows an "Unnamed: 0" column, so the CSV was
# presumably saved together with its index - confirm whether it is still needed.
proteins = pd.read_csv("proteins/original_protein_dataset.csv")
proteins

Unnamed: 0,pdb_id,label
0,8XPV,monomer
1,8GQ4,monomer
2,8TIF,monomer
3,8H3Z,monomer
4,8ALL,monomer
...,...,...
745,8HNE,synthetic
746,8FIN,synthetic
747,8J0A,synthetic
748,8HDU,synthetic


## Download PDB files according to the PDB IDs

In [5]:
def download_structure(pdb_id: str, directory: str, timeout: float = 30.0) -> Optional[str]:
    """
    Downloads the pdb file with structure corresponding to the given PDB ID.

    The file is requested from files.rcsb.org using the lower-cased ID and is
    stored as `<directory>/<pdb_id>.pdb`, with the ID spelled as it was passed in.

    Parameters:
        pdb_id (str): PDB ID of the structure
        directory (str): path to the directory where the downloaded file should be stored
        timeout (float): number of seconds to wait for the server before giving up

    Returns:
        the path to the downloaded file or None if the process fails
    """
    try:
        # download 3D structure based on PDB ID; the timeout keeps one stalled
        # connection from blocking the whole batch of downloads:
        pdb_id_lower = pdb_id.lower()
        response = requests.get(f'https://files.rcsb.org/download/{pdb_id_lower}.pdb',
                                timeout=timeout)
        if not response.ok or response.text == 'N/A':
            return None
        structure = response.text

        # save the downloaded structure to a pdb file:
        path_file = f'{directory}/{pdb_id}.pdb'
        with open(path_file, 'w') as f:
            f.write(structure)
        return path_file

    except Exception:
        # Best effort: any network, file-system or input problem marks this
        # entry as failed. KeyboardInterrupt and SystemExit are not caught, so
        # a long-running download loop can still be interrupted.
        return None


# Download one PDB file per protein; failed downloads are recorded as None.
proteins["original_pdb_path"] = [
    download_structure(pdb_id, ORIGINAL_PDB_FILES_DIRECTORY)
    for pdb_id in proteins["pdb_id"]
]
proteins

Unnamed: 0,pdb_id,label,original_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb
...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb


Filter out proteins whose pdb files cannot be downloaded:

In [6]:
# Drop only the proteins whose PDB download failed; a missing value in an
# unrelated column must not remove a row at this step.
proteins.dropna(subset = ["original_pdb_path"], inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb
...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb


## Identify unique chains using downloaded fasta files
Download fasta files:

In [7]:
def download_fasta(pdb_id: str, directory: str, timeout: float = 30.0) -> Optional[str]:
    """
    Downloads the fasta file with amino acid sequences corresponding to the given PDB ID.

    The file is requested from www.rcsb.org using the lower-cased ID and is
    stored as `<directory>/<pdb_id>.fasta`, with the ID spelled as it was passed in.

    Parameters:
        pdb_id (str): PDB ID of the structure
        directory (str): path to the directory where the downloaded file should be stored
        timeout (float): number of seconds to wait for the server before giving up

    Returns:
        the path to the downloaded file or None if the process fails
    """
    try:
        # download fasta file based on PDB ID; the timeout keeps one stalled
        # connection from blocking the whole batch of downloads:
        pdb_id_lower = pdb_id.lower()
        response = requests.get(f'https://www.rcsb.org/fasta/entry/{pdb_id_lower}',
                                timeout=timeout)
        if not response.ok or response.text == 'N/A':
            return None
        sequence = response.text

        # save the downloaded sequence as a fasta file:
        path_file = f'{directory}/{pdb_id}.fasta'
        with open(path_file, 'w') as f:
            f.write(sequence)
        return path_file

    except Exception:
        # Best effort: any network, file-system or input problem marks this
        # entry as failed. KeyboardInterrupt and SystemExit are not caught, so
        # a long-running download loop can still be interrupted.
        return None


# Download one fasta file per protein; failed downloads are recorded as None.
proteins["original_fasta_path"] = [
    download_fasta(pdb_id, ORIGINAL_FASTA_FILES_DIRECTORY)
    for pdb_id in proteins["pdb_id"]
]
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta
...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta


Filter out proteins whose fasta files cannot be downloaded:

In [8]:
# Drop only the proteins whose fasta download failed; a missing value in an
# unrelated column must not remove a row at this step.
proteins.dropna(subset = ["original_fasta_path"], inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta
...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta


Identify unique chains:

In [9]:
def extract_chain_id(line: str) -> str:
    """
    Returns letter indicating the name of the protein chain used in the corresponding pdb file.

    The second "|"-separated field of the header is expected to look like
    "Chain A", "Chains A, B" or "Chains A[auth H], B[auth L]". Only the first
    listed chain is used, and when an "auth" name is given, that name is returned.
    Only the first character of the chosen name is returned.

    Parameters:
        line (str): the line containing the header of a sequence in a fasta file

    Returns:
        single-character chain name

    Raises:
        IndexError: if the header has no second field or the field holds no chain name
    """
    chain_field = line.split("|")[1]
    # Strip the literal prefix only. Removing every "s" from the field would
    # also delete chain names that are, or contain, a lowercase "s".
    id_substring = chain_field.removeprefix("Chains ").removeprefix("Chain ").split(",")[0]
    if "auth" in id_substring:
        # "A[auth H]" -> ["A[auth", "H]"] -> "H"
        return id_substring.split()[1][0]
    return id_substring[0]


def identify_unique_chains(pdb_id: str, fasta_path: str) -> List[str]:
    """
    Builds one chain identifier per header line of the fasta file.

    Every line starting with ">" contributes an entry of the form
    "<pdb_id>:<chain letter>", in the order the headers appear in the file.

    Parameters:
        pdb_id (str): PDB ID of the given protein
        fasta_path (str): path to the fasta file

    Returns:
        list of chain identifiers
    """
    with open(fasta_path, "r") as fasta:
        header_lines = [record for record in fasta if record.startswith(">")]
    return [f"{pdb_id}:{extract_chain_id(header)}" for header in header_lines]


# One list of chain identifiers per protein, read from its downloaded fasta file.
proteins["unique_chains"] = [
    identify_unique_chains(pdb_id, fasta_path)
    for pdb_id, fasta_path in zip(proteins["pdb_id"], proteins["original_fasta_path"])
]
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,[8XPV:A]
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,[8GQ4:A]
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,[8TIF:A]
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,[8H3Z:A]
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,[8ALL:A]
...,...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,[8HNE:A]
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,[8FIN:A]
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,[8J0A:A]
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,[8HDU:A]


## Obtain sequences of identified chains from pdb files

In [17]:
from prody import parsePDB, AtomGroup
from Bio.PDB import parse_pdb_header
from Bio.SCOP.Raf import protein_letters_3to1


def extract_sequence_from_dict(residue_dict: Dict[int, str], lower: int, upper: int) -> str:
    """
    Builds a one-letter amino acid sequence from residues stored by position.

    Positions from `lower` to `upper` (both inclusive) are visited in ascending
    order, and positions absent from `residue_dict` are skipped. A residue name
    that `protein_letters_3to1` does not know is written as "X" instead of
    raising KeyError. The position still contributes one character, so the
    sequence stays aligned with the mask built by `extract_mask_from_sets`.

    Parameters:
        residue_dict (Dict[int, str]): three-letter residue names keyed by residue number
        lower (int): first residue number to visit
        upper (int): last residue number to visit

    Returns:
        the sequence as a string of one-letter codes
    """
    sequence = []
    for position in range(lower, upper + 1):
        amino_acid = residue_dict.get(position)
        if amino_acid is not None:
            sequence.append(protein_letters_3to1.get(amino_acid, "X"))
    return "".join(sequence)


def extract_mask_from_sets(residue_positions: Set[int], missing_residue_positions: Set[int], lower: int, upper: int) -> str:
    """
    Builds the mask string for the positions from `lower` to `upper` (inclusive).

    A position found in `residue_positions` yields "1"; otherwise a position
    found in `missing_residue_positions` yields "0". Positions found in neither
    collection add nothing to the mask.

    Parameters:
        residue_positions (Set[int]): residue numbers present in the structure
        missing_residue_positions (Set[int]): residue numbers reported as missing
        lower (int): first residue number to visit
        upper (int): last residue number to visit

    Returns:
        the mask as a string of "1" and "0" characters
    """
    return "".join(
        "1" if position in residue_positions else "0"
        for position in range(lower, upper + 1)
        if position in residue_positions or position in missing_residue_positions
    )


def obtain_residue_dict_and_borders(chain: AtomGroup) -> Dict[int, str]:
    """
    Maps the residue numbers of the given atoms to their residue names.

    Despite its name, the function returns the dictionary only. When several
    atoms share a residue number, the name of the last one is kept.

    Parameters:
        chain (AtomGroup): atoms of one chain

    Returns:
        dictionary with residue names keyed by residue number
    """
    return {number: name for number, name in zip(chain.getResnums(), chain.getResnames())}


def obtain_sequence_and_mask(chain: AtomGroup, missing_residue_dict: Dict[int, str]) -> Tuple[str, str]:
    """
    Derives the amino acid sequence of a chain together with its mask.

    Residues present in `chain` and residues listed in `missing_residue_dict`
    are merged into one numbering range. On a shared residue number the entry
    from `missing_residue_dict` supplies the residue name.

    Parameters:
        chain (AtomGroup): atoms of one chain
        missing_residue_dict (Dict[int, str]): names of missing residues keyed by residue number

    Returns:
        tuple of the sequence and the mask, which have the same length
    """
    observed = obtain_residue_dict_and_borders(chain)
    merged = {**observed, **missing_residue_dict}
    first_position, last_position = min(merged), max(merged)
    sequence = extract_sequence_from_dict(merged, first_position, last_position)
    mask = extract_mask_from_sets(observed.keys(), missing_residue_dict.keys(),
                                  first_position, last_position)
    assert len(sequence) == len(mask)
    return sequence, mask
    

def obtain_missing_residues(pdb_path: str, wanted_chains_letters: Set[str]) -> Dict[str, Dict[int, str]]:
    """
    Collects the missing residues listed in the header of a pdb file.

    Only entries that belong to the same model as the first listed entry, to
    one of the wanted chains, and whose residue name is a key of
    `protein_letters_3to1` are kept.

    Parameters:
        pdb_path (str): path to the pdb file
        wanted_chains_letters (Set[str]): letters of the chains of interest

    Returns:
        for every wanted chain letter, a dictionary with the names of its
        missing residues keyed by residue number (empty if none were listed)
    """
    listed_residues = parse_pdb_header(pdb_path)["missing_residues"]
    missing_by_chain = {letter: {} for letter in wanted_chains_letters}
    if listed_residues:
        reference_model = listed_residues[0]["model"]
        for entry in listed_residues:
            if entry["model"] != reference_model:
                continue
            if entry["chain"] not in wanted_chains_letters:
                continue
            if entry["res_name"] not in protein_letters_3to1:
                continue
            missing_by_chain[entry["chain"]][entry["ssseq"]] = entry["res_name"]
    return missing_by_chain


def write_sequences_into_file(file_path: str, sequences: List[Tuple[str, str]]) -> None:
    """
    Writes the given records into a file in fasta layout.

    Each record takes two lines: ">" followed by the chain identifier, then
    the sequence. An existing file is overwritten.

    Parameters:
        file_path (str): path to the file to be written
        sequences (List[Tuple[str, str]]): pairs of chain identifier and sequence
    """
    with open(file_path, 'w') as output:
        output.writelines(f'>{chain}\n{sequence}\n' for chain, sequence in sequences)


def extract_aa_seq_from_pdb(pdb_path: str, fasta_path: str, masks_path: str, wanted_chains_ids: List[str]) -> bool:
    """
    Infers sequences and masks of the wanted chains from a pdb file and stores
    them in two files:
     - fasta file with inferred sequences for all chains
     - file with corresponding mask indicating whether a particular residue has known coordinates
       in the original pdb file (1) or not (0)

    Nothing is written when any of the wanted chains cannot be parsed.

    Parameters:
        pdb_path (str): path to the pdb file with original structure
        fasta_path (str): path to the new fasta file with inferred sequences
        masks_path (str): path to the new file with corresponding masks
        wanted_chains_ids (List[str]): list with ids of chains whose sequences and masks should be inferred

    Returns:
        boolean indicating whether the process of retrieving data from pdb file was successful or not
    """
    # chain ids have the form "<pdb id>:<chain letter>"
    letters = [identifier.split(":")[1] for identifier in wanted_chains_ids]
    missing_by_chain = obtain_missing_residues(pdb_path, set(letters))

    sequence_records = []
    mask_records = []
    for identifier, letter in zip(wanted_chains_ids, letters):
        # only the C-alpha atoms of the chain are loaded
        atoms = parsePDB(pdb_path, chain = letter, subset = 'calpha')
        if atoms is None:
            return False
        inferred_sequence, inferred_mask = obtain_sequence_and_mask(atoms, missing_by_chain[letter])
        sequence_records.append((identifier, inferred_sequence))
        mask_records.append((identifier, inferred_mask))

    write_sequences_into_file(fasta_path, sequence_records)
    write_sequences_into_file(masks_path, mask_records)
    return True



# Target locations of the two files generated for every protein.
proteins["inferred_fasta_path"] = [
    f"{INFERRED_FASTA_FILES_DIRECTORY}/{pdb_id}.fasta" for pdb_id in proteins["pdb_id"]
]
proteins["masks_path"] = [
    f"{MASKS_DIRECTORY}/{pdb_id}_mask.fasta" for pdb_id in proteins["pdb_id"]
]

# Generate the files; where extraction fails, blank both paths so the entry
# can be filtered out afterwards.
for index, row in proteins.iterrows():
    success = extract_aa_seq_from_pdb(row["original_pdb_path"], row["inferred_fasta_path"],
                                      row["masks_path"], row["unique_chains"])
    if success:
        continue
    proteins.loc[proteins["pdb_id"] == row["pdb_id"], "inferred_fasta_path"] = None
    proteins.loc[proteins["pdb_id"] == row["pdb_id"], "masks_path"] = None

proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path,masks_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,[8XPV:A],proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,[8GQ4:A],proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,[8TIF:A],proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,[8H3Z:A],proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,[8ALL:A],proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta
...,...,...,...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,[8HNE:A],proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,[8FIN:A],proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,[8J0A:A],proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,[8HDU:A],proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta


Filter out proteins whose sequence cannot be determined:

In [18]:
# Drop only the proteins whose sequence extraction failed (both paths were set
# to None); a missing value in an unrelated column must not remove a row here.
proteins.dropna(subset = ["inferred_fasta_path", "masks_path"], inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path,masks_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,[8XPV:A],proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,[8GQ4:A],proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,[8TIF:A],proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,[8H3Z:A],proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,[8ALL:A],proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta
...,...,...,...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,[8HNE:A],proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,[8FIN:A],proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,[8J0A:A],proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,[8HDU:A],proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta


## Create pandas dataframe for chains

In [19]:
# One row per chain: every element of `unique_chains` becomes its own row
# (with a fresh 0..n-1 index) and the column is renamed to `chain_id`.
chains = (
    proteins
    .explode("unique_chains", ignore_index = True)
    .rename(columns = {"unique_chains": "chain_id"})
)
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path,masks_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta,proteins/mask_files/8XPV_mask.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta,proteins/mask_files/8GQ4_mask.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta,proteins/mask_files/8TIF_mask.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta,proteins/mask_files/8H3Z_mask.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta,proteins/mask_files/8ALL_mask.fasta
...,...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta,proteins/mask_files/8HNE_mask.fasta
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta,proteins/mask_files/8FIN_mask.fasta
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta,proteins/mask_files/8J0A_mask.fasta
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta,proteins/mask_files/8HDU_mask.fasta


## Save both dataframes as csv files

In [20]:
# Persist both tables without their index so that no extra index column is written.
# NOTE(review): `unique_chains` in `proteins` holds Python lists, which will be
# stored in the CSV as their text form - confirm that readers of proteins.csv expect this.
proteins.to_csv("proteins/proteins.csv", sep = ",", index = False)
chains.to_csv("proteins/chains.csv", sep = ",", index = False)