# Downloading the structures of selected proteins

In [13]:
import os

import pandas as pd
import requests

## Prepare filesystem

In [9]:
# Every artefact of this notebook is stored in its own sub-folder of one root folder.
_PROTEINS_ROOT = "proteins"

# PDB files downloaded from RCSB:
ORIGINAL_PDB_FILES_DIRECTORY = f"{_PROTEINS_ROOT}/original_pdb_files"

# FASTA files downloaded from RCSB:
ORIGINAL_FASTA_FILES_DIRECTORY = f"{_PROTEINS_ROOT}/original_fasta_files"

# FASTA files written from the sequences read out of the PDB files:
INFERRED_FASTA_FILES_DIRECTORY = f"{_PROTEINS_ROOT}/inferred_fasta_files"

In [11]:
# Create the output folders in pure Python instead of through a shell escape.
# `exist_ok = True` keeps the cell re-runnable: a folder left over from an earlier
# run is accepted as is, and missing parent folders are created as well.
for output_directory in (ORIGINAL_PDB_FILES_DIRECTORY,
                         ORIGINAL_FASTA_FILES_DIRECTORY,
                         INFERRED_FASTA_FILES_DIRECTORY):
    os.makedirs(output_directory, exist_ok = True)

## Load `proteins` dataframe from csv file

In [12]:
# Table of the selected entries; the displayed frame has one `pdb_id` and one `label` per row.
PROTEIN_DATASET_CSV = "proteins/original_protein_dataset.csv"
proteins = pd.read_csv(PROTEIN_DATASET_CSV)
proteins

Unnamed: 0,pdb_id,label
0,8XPV,monomer
1,8GQ4,monomer
2,8TIF,monomer
3,8H3Z,monomer
4,8ALL,monomer
...,...,...
745,8HNE,synthetic
746,8FIN,synthetic
747,8J0A,synthetic
748,8HDU,synthetic


## Download PDB files according to the PDB IDs

In [14]:
def download_structure(pdb_id, directory, timeout = 30):
    """Download the PDB-format structure of one entry from RCSB and save it to disk.

    Parameters
    ----------
    pdb_id : str
        PDB identifier. It is lower-cased for the request URL and used verbatim
        in the name of the saved file.
    directory : str
        Existing directory in which `<pdb_id>.pdb` is written.
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout, so an unresponsive
        server cannot block the whole run.

    Returns
    -------
    str or None
        Path of the written file, or None when the ID is not a string, the
        request fails, the response is not OK or its body is 'N/A', or the
        file cannot be written.
    """
    # A non-string ID (e.g. a missing value in the table) cannot be requested:
    if not isinstance(pdb_id, str):
        return None

    # download 3D structure based on PDB ID:
    url = f'https://files.rcsb.org/download/{pdb_id.lower()}.pdb'
    try:
        response = requests.get(url, timeout = timeout)
    except requests.RequestException:
        # Only network/HTTP-level failures count as "could not download";
        # interrupts and programming errors are allowed to propagate.
        return None
    if not response.ok or response.text == 'N/A':
        return None

    # save the downloaded structure to a pdb file:
    path_file = f'{directory}/{pdb_id}.pdb'
    try:
        with open(path_file, 'w') as f:
            f.write(response.text)
    except OSError:
        return None
    return path_file


# Download every structure and record where it was saved (None marks a failed download).
# Only the `pdb_id` column is needed, so map over that column directly instead of
# building a row Series for every record with a row-wise `DataFrame.apply`.
proteins["original_pdb_path"] = proteins["pdb_id"].map(
    lambda pdb_id: download_structure(pdb_id, ORIGINAL_PDB_FILES_DIRECTORY)
)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb
...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb


Filter out proteins whose pdb files cannot be downloaded:

In [15]:
# Remove, in place, every row that contains a missing value. `download_structure`
# returns None on failure, so proteins without a saved PDB file are dropped here.
# NOTE(review): no `subset` is passed, so a missing value in any other column
# removes the row as well - confirm that this is intended.
proteins.dropna(inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb
...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb


## Identify unique chains using downloaded fasta files
Download fasta files:

In [16]:
def download_fasta(pdb_id, directory, timeout = 30):
    """Download the FASTA file of one entry from RCSB and save it to disk.

    Parameters
    ----------
    pdb_id : str
        PDB identifier. It is lower-cased for the request URL and used verbatim
        in the name of the saved file.
    directory : str
        Existing directory in which `<pdb_id>.fasta` is written.
    timeout : float, optional
        Seconds passed to `requests.get` as its timeout, so an unresponsive
        server cannot block the whole run.

    Returns
    -------
    str or None
        Path of the written file, or None when the ID is not a string, the
        request fails, the response is not OK or its body is 'N/A', or the
        file cannot be written.
    """
    # A non-string ID (e.g. a missing value in the table) cannot be requested:
    if not isinstance(pdb_id, str):
        return None

    # download fasta file based on PDB ID:
    url = f'https://www.rcsb.org/fasta/entry/{pdb_id.lower()}'
    try:
        response = requests.get(url, timeout = timeout)
    except requests.RequestException:
        # Only network/HTTP-level failures count as "could not download";
        # interrupts and programming errors are allowed to propagate.
        return None
    if not response.ok or response.text == 'N/A':
        return None

    # save the downloaded sequence as a fasta file:
    path_file = f'{directory}/{pdb_id}.fasta'
    try:
        with open(path_file, 'w') as f:
            f.write(response.text)
    except OSError:
        return None
    return path_file


# Download every FASTA file and record where it was saved (None marks a failed download).
# Only the `pdb_id` column is needed, so map over that column directly instead of
# building a row Series for every record with a row-wise `DataFrame.apply`.
proteins["original_fasta_path"] = proteins["pdb_id"].map(
    lambda pdb_id: download_fasta(pdb_id, ORIGINAL_FASTA_FILES_DIRECTORY)
)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta
...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta


Filter out proteins whose fasta files cannot be downloaded:

In [17]:
# Remove, in place, every row that contains a missing value. `download_fasta`
# returns None on failure, so proteins without a saved FASTA file are dropped here.
# NOTE(review): no `subset` is passed, so a missing value in any other column
# removes the row as well - confirm that this is intended.
proteins.dropna(inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta
...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta


Identify unique chains:

In [18]:
def extract_chain_id(line):
    """Return the one-character ID of the first chain named in an RCSB FASTA header.

    The second `|`-separated field of the header lists the chains, e.g.
    `Chain C`, `Chains A, B` or `Chains A, C[auth B]`. For an entry written as
    `X[auth Y]` the author-assigned ID `Y` is returned, otherwise the ID itself.
    Only the first character of the ID is kept.

    Parameters
    ----------
    line : str
        FASTA header line containing at least two `|`-separated fields.

    Returns
    -------
    str
        Single-character chain ID.
    """
    chains_field = line.split("|")[1]
    # Strip only the leading "Chains "/"Chain " label. Deleting every "s" from the
    # field would also delete a lowercase chain ID "s".
    first_entry = chains_field.removeprefix("Chains ").removeprefix("Chain ").split(",")[0]
    if "auth" in first_entry:
        # "X[auth Y]" -> second whitespace-separated token is "Y]"
        return first_entry.split()[1][0]
    return first_entry[0]


def identify_unique_chains(pdb_id, fasta_path):
    """Collect one `<pdb_id>:<chain>` identifier per header line of a FASTA file.

    Every line starting with ">" is passed to `extract_chain_id`; the returned
    chain letter is combined with `pdb_id`. The identifiers are returned as a set.
    """
    with open(fasta_path, "r") as fasta_file:
        header_lines = [record for record in fasta_file if record.startswith(">")]
    return {f"{pdb_id}:{extract_chain_id(header)}" for header in header_lines}

# Disabled sanity checks for `extract_chain_id`: the string literal below is never executed.
"""
print(extract_chain_id(">8HEV_1|Chains A, C[auth B], E[auth C], G, I[auth H], K[auth I], M[auth J], O[auth K], Q[auth L], S[auth M], U, W[auth V]|Portal protein|Human betaherpesvirus 5 (10359)") == "A")
print(extract_chain_id(">8EB7_1|Chains AA[auth A], A[auth 0], B, BA[auth C], CA[auth F], C[auth D], DA[auth Y], D[auth X], EA[auth a], E[auth Z], FA[auth b], GA[auth d], G[auth c], HA[auth e], IA[auth f], JA[auth g], Y[auth h], Z[auth i]|Tail spike protein|Salmonella phage P22 (2908168)") == "A")
print(extract_chain_id(">5SSZ_1|Chain C|Formylglycine-generating enzyme|Homo sapiens (9606)") == "C")
print(extract_chain_id(">7G1X_1|Chains A, B|Fatty acid-binding protein, liver|Homo sapiens (9606)") == "A")
print(extract_chain_id(">8H61_1|Chain A[auth B]|Mutant M2 of ketoreductase CpKR|Candida parapsilosis (5480)") == "B")
print(extract_chain_id(">8DWV_1|Chains A, B, C[auth E], D[auth F], E[auth G], F[auth H]|Speckle-type POZ protein|Homo sapiens (9606)") == "A")
"""

def _unique_chains_of(row):
    """Return the set of chain identifiers found in the FASTA file of one `proteins` row."""
    return identify_unique_chains(row["pdb_id"], row["original_fasta_path"])


proteins["unique_chains"] = proteins.apply(_unique_chains_of, axis = 1)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,{8XPV:A}
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,{8GQ4:A}
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,{8TIF:A}
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,{8H3Z:A}
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,{8ALL:A}
...,...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,{8HNE:A}
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,{8FIN:A}
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,{8J0A:A}
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,{8HDU:A}


## Obtain sequences of identified chains from pdb files

In [19]:
from prody import parsePDB
from Bio.PDB import parse_pdb_header
from Bio.SCOP.Raf import protein_letters_3to1


def extract_sequence_from_dict(residue_dict, lower, upper):
    """Build a one-letter sequence from a position -> three-letter-name mapping.

    Positions from `lower` to `upper` (both inclusive) are visited in ascending
    order; positions absent from `residue_dict` are skipped. Each residue name is
    translated through `protein_letters_3to1`, so a name missing from that table
    raises KeyError.
    """
    return "".join(
        protein_letters_3to1[residue_dict[position]]
        for position in range(lower, upper + 1)
        if position in residue_dict
    )


def obtain_chain_sequence(chain, missing_residues):
    """Return the one-letter sequence of a chain, including its listed missing residues.

    `chain` must offer `getResnames()` and `getResnums()`; `missing_residues` is an
    iterable of `(position, residue_name)` pairs that are added to (or override) the
    observed residues. Only positions between the first and the last observed residue
    number are emitted, so missing residues outside that range are not part of the result.
    """
    observed_names = chain.getResnames()
    observed_numbers = chain.getResnums()
    name_by_position = {number: name for number, name in zip(observed_numbers, observed_names)}
    name_by_position.update((position, amino_acid) for position, amino_acid in missing_residues)
    first_position, last_position = observed_numbers[0], observed_numbers[-1]
    return extract_sequence_from_dict(name_by_position, first_position, last_position)


def obtain_missing_residues(pdb_path, wanted_chain_letters):
    """Read the missing residues of the wanted chains from a PDB file header.

    Returns a dict with one key per wanted chain letter; each value is a list of
    `(ssseq, res_name)` tuples in header order. Only entries whose "model" equals
    that of the first listed entry are used, and entries of other chains are ignored.
    Chains without missing residues keep an empty list.
    """
    header = parse_pdb_header(pdb_path)
    per_chain = {letter: [] for letter in wanted_chain_letters}

    # The list is only looked up when the header flags missing residues.
    entries = header["missing_residues"] if header["has_missing_residues"] else []
    if entries:
        reference_model = entries[0]["model"]
        for entry in entries:
            if entry["model"] == reference_model and entry["chain"] in wanted_chain_letters:
                per_chain[entry["chain"]].append((entry["ssseq"], entry["res_name"]))
    return per_chain


def write_sequences_into_fasta(fasta_path, sequences):
    """Write `(chain, sequence)` pairs as FASTA records and return `fasta_path`.

    Each pair becomes a `>chain` header line followed by one sequence line.
    An existing file at `fasta_path` is overwritten.
    """
    with open(fasta_path, 'w') as fasta_file:
        fasta_file.writelines(f'>{chain}\n{sequence}\n' for chain, sequence in sequences)
    return fasta_path


def extract_aa_seq_from_pdb(pdb_path, fasta_path, wanted_chains):
    """Write the sequences of the wanted chains, as read from a PDB file, to a FASTA file.

    Parameters
    ----------
    pdb_path : str
        Path of the PDB file to read.
    fasta_path : str
        Path of the FASTA file to write.
    wanted_chains : iterable of str
        Chain identifiers of the form `<pdb_id>:<chain_letter>`.

    Returns
    -------
    str or None
        `fasta_path` once all sequences are written, or None (nothing written)
        as soon as `parsePDB` returns None for one of the chains.
    """
    # Sort the identifiers so the records are written in the same order on every run;
    # the iteration order of a set of strings can differ between interpreter sessions.
    wanted_chains_dict = {chain_id.split(":")[1]: chain_id for chain_id in sorted(wanted_chains)}
    missing_residues = obtain_missing_residues(pdb_path, wanted_chains_dict.keys())
    sequences = []
    for chain_letter, chain_id in wanted_chains_dict.items():
        # Only the C-alpha atoms of the requested chain are loaded.
        chain = parsePDB(pdb_path, chain = chain_letter, subset = 'calpha')
        if chain is None:
            # Presumably no matching atoms were found for this chain (confirm against
            # the ProDy documentation); the whole protein is then reported as failed.
            return None
        sequence = obtain_chain_sequence(chain, missing_residues[chain_letter])
        sequences.append((chain_id, sequence))
    return write_sequences_into_fasta(fasta_path, sequences)


def _infer_fasta_for_row(row):
    """Write the PDB-derived FASTA file for one `proteins` row; return its path, or None on failure."""
    inferred_fasta_path = f"{INFERRED_FASTA_FILES_DIRECTORY}/{row['pdb_id']}.fasta"
    return extract_aa_seq_from_pdb(row["original_pdb_path"], inferred_fasta_path, row["unique_chains"])


proteins["inferred_fasta_path"] = proteins.apply(_infer_fasta_for_row, axis = 1)
proteins



Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,{8XPV:A},proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,{8GQ4:A},proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,{8TIF:A},proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,{8H3Z:A},proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,{8ALL:A},proteins/inferred_fasta_files/8ALL.fasta
...,...,...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,{8HNE:A},proteins/inferred_fasta_files/8HNE.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,{8FIN:A},proteins/inferred_fasta_files/8FIN.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,{8J0A:A},proteins/inferred_fasta_files/8J0A.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,{8HDU:A},proteins/inferred_fasta_files/8HDU.fasta


Filter out proteins whose sequence cannot be determined:

In [20]:
# Remove, in place, every row that contains a missing value. `extract_aa_seq_from_pdb`
# returns None when a chain cannot be parsed, so those proteins are dropped here.
# NOTE(review): no `subset` is passed, so a missing value in any other column
# removes the row as well - confirm that this is intended.
proteins.dropna(inplace = True)
proteins

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,unique_chains,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,{8XPV:A},proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,{8GQ4:A},proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,{8TIF:A},proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,{8H3Z:A},proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,{8ALL:A},proteins/inferred_fasta_files/8ALL.fasta
...,...,...,...,...,...,...
745,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,{8HNE:A},proteins/inferred_fasta_files/8HNE.fasta
746,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,{8FIN:A},proteins/inferred_fasta_files/8FIN.fasta
747,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,{8J0A:A},proteins/inferred_fasta_files/8J0A.fasta
748,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,{8HDU:A},proteins/inferred_fasta_files/8HDU.fasta


## Create pandas dataframe for chains

In [21]:
# One row per chain: the set of chain identifiers of every protein is spread over
# separate rows (with a fresh 0..n-1 index) in a column named `chain_id`.
chains = (
    proteins
    .rename(columns = {"unique_chains": "chain_id"})
    .explode("chain_id", ignore_index = True)
)
chains

Unnamed: 0,pdb_id,label,original_pdb_path,original_fasta_path,chain_id,inferred_fasta_path
0,8XPV,monomer,proteins/original_pdb_files/8XPV.pdb,proteins/original_fasta_files/8XPV.fasta,8XPV:A,proteins/inferred_fasta_files/8XPV.fasta
1,8GQ4,monomer,proteins/original_pdb_files/8GQ4.pdb,proteins/original_fasta_files/8GQ4.fasta,8GQ4:A,proteins/inferred_fasta_files/8GQ4.fasta
2,8TIF,monomer,proteins/original_pdb_files/8TIF.pdb,proteins/original_fasta_files/8TIF.fasta,8TIF:A,proteins/inferred_fasta_files/8TIF.fasta
3,8H3Z,monomer,proteins/original_pdb_files/8H3Z.pdb,proteins/original_fasta_files/8H3Z.fasta,8H3Z:A,proteins/inferred_fasta_files/8H3Z.fasta
4,8ALL,monomer,proteins/original_pdb_files/8ALL.pdb,proteins/original_fasta_files/8ALL.fasta,8ALL:A,proteins/inferred_fasta_files/8ALL.fasta
...,...,...,...,...,...,...
1042,8HNE,synthetic,proteins/original_pdb_files/8HNE.pdb,proteins/original_fasta_files/8HNE.fasta,8HNE:A,proteins/inferred_fasta_files/8HNE.fasta
1043,8FIN,synthetic,proteins/original_pdb_files/8FIN.pdb,proteins/original_fasta_files/8FIN.fasta,8FIN:A,proteins/inferred_fasta_files/8FIN.fasta
1044,8J0A,synthetic,proteins/original_pdb_files/8J0A.pdb,proteins/original_fasta_files/8J0A.fasta,8J0A:A,proteins/inferred_fasta_files/8J0A.fasta
1045,8HDU,synthetic,proteins/original_pdb_files/8HDU.pdb,proteins/original_fasta_files/8HDU.fasta,8HDU:A,proteins/inferred_fasta_files/8HDU.fasta


## Save both dataframes as csv files

In [22]:
# Write both tables as comma-separated files, leaving out the index column.
for frame, csv_path in ((proteins, "proteins/proteins.csv"), (chains, "proteins/chains.csv")):
    frame.to_csv(csv_path, sep = ",", index = False)