# Downloading the structures of selected proteins

In [1]:
import pandas as pd
import numpy as np
import requests, os, pickle
from typing import List, Dict, Set, Tuple

In [2]:
# Root folder for everything this notebook reads and writes.
DATA_DIRECTORY = "data"
# exist_ok=True replaces the separate os.path.exists() check: the cell stays
# re-runnable, there is no window between "check" and "create", and a regular
# file sitting at this path is reported instead of being silently accepted.
os.makedirs(DATA_DIRECTORY, exist_ok=True)

# Folder that holds one sub-folder per protein.
PROTEIN_DIRECTORY = "data/proteins"
os.makedirs(PROTEIN_DIRECTORY, exist_ok=True)

## Load `proteins` dataframe from csv file

In [3]:
# Read the dataset of selected proteins; the "pdb_id" column drives every step below.
proteins = pd.read_csv("data/proteins_dataset.csv")

# Each protein gets its own folder for the files downloaded and derived later.
# exist_ok=True makes the call a no-op for folders left over from a previous run,
# without a separate (racy) existence check.
for pdb_id in proteins["pdb_id"]:
    os.makedirs(f"{PROTEIN_DIRECTORY}/{pdb_id}", exist_ok=True)

proteins

Unnamed: 0,pdb_id,label
0,8P0E,monomer
1,8PX8,monomer
2,8B2E,monomer
3,8HOE,monomer
4,8TCE,monomer
...,...,...
1348,8G9J,synthetic
1349,8OYV,synthetic
1350,8TNO,synthetic
1351,8FJE,synthetic


## Download PDB files according to the PDB IDs

In [4]:
def download_structure(pdb_id: str, directory: str, timeout: float = 30.0) -> bool:
    """
    Downloads the pdb file with structure corresponding to the given PDB ID.

    The file is written to `<directory>/<pdb_id>/<pdb_id>_original.pdb`; the folder
    `<directory>/<pdb_id>` must already exist.

    Parameters:
        pdb_id (str): PDB ID of the structure
        directory (str): path to the directory where the downloaded file should be stored
        timeout (float): number of seconds to wait for the server before the request
            is abandoned and the download counts as failed

    Returns:
        boolean indicating whether downloading was successful or not
    """
    try:
        # download 3D structure based on PDB ID:
        pdb_id_lower = pdb_id.lower()
        # Without a timeout a single stalled connection would block the whole batch forever.
        response = requests.get(
            f'https://files.rcsb.org/download/{pdb_id_lower}.pdb',
            timeout=timeout,
        )
        if not response.ok or response.text == 'N/A':
            return False
        structure = response.text

        # save the downloaded structure to a pdb file:
        path = f'{directory}/{pdb_id}/{pdb_id}_original.pdb'
        with open(path, 'w') as f:
            f.write(structure)
        return True

    except Exception:
        # Best effort: any failure (network, HTTP layer, file system) means
        # "this protein could not be downloaded". Catching Exception rather than
        # using a bare except lets KeyboardInterrupt / SystemExit propagate, so a
        # long-running batch can still be interrupted.
        return False


# Try to download the pdb file of every protein; True marks a saved file.
success = proteins.apply(
    lambda row: download_structure(row["pdb_id"], PROTEIN_DIRECTORY),
    axis=1,
)

# Set the failures aside together with the reason they were dropped,
# and continue only with the proteins whose pdb file is available.
discarded_proteins = (
    proteins.loc[~success]
    .reset_index(drop=True)
    .assign(reason="cannot download pdb file")
)
proteins = proteins.loc[success].reset_index(drop=True)

proteins

Unnamed: 0,pdb_id,label
0,8P0E,monomer
1,8PX8,monomer
2,8B2E,monomer
3,8HOE,monomer
4,8TCE,monomer
...,...,...
1348,8G9J,synthetic
1349,8OYV,synthetic
1350,8TNO,synthetic
1351,8FJE,synthetic


## Download fasta files

In [5]:
def download_fasta(pdb_id: str, directory: str, timeout: float = 30.0) -> bool:
    """
    Downloads the fasta file with amino acid sequences corresponding to the given PDB ID.

    The file is written to `<directory>/<pdb_id>/<pdb_id>_original.fasta`; the folder
    `<directory>/<pdb_id>` must already exist.

    Parameters:
        pdb_id (str): PDB ID of the structure
        directory (str): path to the directory where the downloaded file should be stored
        timeout (float): number of seconds to wait for the server before the request
            is abandoned and the download counts as failed

    Returns:
        boolean indicating whether downloading was successful or not
    """
    try:
        # download fasta file based on PDB ID:
        pdb_id_lower = pdb_id.lower()
        # Without a timeout a single stalled connection would block the whole batch forever.
        response = requests.get(
            f'https://www.rcsb.org/fasta/entry/{pdb_id_lower}',
            timeout=timeout,
        )
        if not response.ok or response.text == 'N/A':
            return False
        sequence = response.text

        # save the downloaded sequence as a fasta file:
        path = f'{directory}/{pdb_id}/{pdb_id}_original.fasta'
        with open(path, 'w') as f:
            f.write(sequence)
        return True

    except Exception:
        # Best effort: any failure (network, HTTP layer, file system) means
        # "this fasta file could not be downloaded". Catching Exception rather than
        # using a bare except lets KeyboardInterrupt / SystemExit propagate, so a
        # long-running batch can still be interrupted.
        return False


# Try to download the fasta file of every remaining protein; True marks a saved file.
success = proteins.apply(
    lambda row: download_fasta(row["pdb_id"], PROTEIN_DIRECTORY),
    axis=1,
)

# Append the new failures (with their reason) to the running list of discarded
# proteins and continue only with the proteins that have a fasta file.
discarded_proteins_fasta = (
    proteins.loc[~success]
    .reset_index(drop=True)
    .assign(reason="cannot download fasta file")
)
discarded_proteins = pd.concat(
    [discarded_proteins, discarded_proteins_fasta],
    ignore_index=True,
)
proteins = proteins.loc[success].reset_index(drop=True)
proteins

Unnamed: 0,pdb_id,label
0,8P0E,monomer
1,8PX8,monomer
2,8B2E,monomer
3,8HOE,monomer
4,8TCE,monomer
...,...,...
1348,8G9J,synthetic
1349,8OYV,synthetic
1350,8TNO,synthetic
1351,8FJE,synthetic


## Obtain sequences of identified chains from pdb files

Identify unique chains:

In [6]:
def extract_chain_id(line: str) -> str:
    """
    Returns letter indicating the name of the protein chain used in the corresponding pdb file.

    The second `|`-separated field of the header lists the chains sharing the sequence,
    e.g. `Chain A`, `Chains A, B` or `Chains C[auth H], D[auth L]`. Only the first listed
    chain is considered. When it carries an `[auth X]` annotation the letter after `auth`
    is returned, otherwise the first character of the chain name.

    Parameters:
        line (str): the line containing the header of a sequence in a fasta file

    Raises:
        IndexError: when the header has no second `|`-separated field or that field
            holds no chain name
    """
    chain_field = line.split("|")[1]
    # Strip only the leading label. Deleting every "s" from the field (to turn
    # "Chains" into "Chain") would also delete a chain that is itself named "s".
    chain_list = chain_field.removeprefix("Chains ").removeprefix("Chain ")
    first_chain = chain_list.split(",")[0]
    if "auth" in first_chain:
        # "C[auth H]" splits into ["C[auth", "H]"]; the letter follows "auth".
        return first_chain.split()[1][0]
    return first_chain[0]


def identify_unique_chains(fasta_path: str) -> Dict[str, str]:
    """
    Returns dictionary, keys are indicating the names of all unique chains in the fasta file,
    the values are corresponding sequences.

    Records are recognised by their `>` header line, so blank lines (including a
    trailing one) are skipped and a sequence wrapped over several lines is joined.
    When two records resolve to the same chain letter, the later record wins.

    Parameters:
        fasta_path (str): path to the fasta file
    """
    unique_chains = {}
    chain_letter = None
    with open(fasta_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                chain_letter = extract_chain_id(line)
                unique_chains[chain_letter] = ""
            elif chain_letter is not None:
                # sequence text belonging to the most recent header
                unique_chains[chain_letter] += line
    return unique_chains

In [7]:
from Bio.Data.PDBData import protein_letters_3to1_extended

class Atom:
    """Coordinates (x, y, z) of a single atom."""

    def __init__(self, x: float, y: float, z: float):
        self.x, self.y, self.z = x, y, z


class Residue:
    """
    One residue of a chain.

    Attributes:
        amino_acid: one-letter code of the residue
        position: numeric part of the residue position
        alpha: letter part of the residue position ("" when there is none)
        order: value of the owning chain's residue counter when the residue was created
        ca: Atom with the alpha-carbon coordinates, None until add_alpha_carbon is called
        is_hetatm: flag, False on creation
        is_terminal: flag, False on creation
    """

    def __init__(self, amino_acid: str, position: int, alpha: str, order: int):
        # what the residue is and where it sits in the chain
        self.amino_acid = amino_acid
        self.position, self.alpha = position, alpha
        self.order = order
        # filled in later while the coordinate records are read
        self.ca = None
        self.is_hetatm = self.is_terminal = False

    def add_alpha_carbon(self, x: float, y: float, z: float) -> None:
        """Stores the coordinates of the alpha carbon, replacing any previous ones."""
        alpha_carbon = Atom(x, y, z)
        self.ca = alpha_carbon
        

class Chain:
    """
    Residues collected for one chain, plus the sequence the chain is expected to have.

    Attributes:
        letter: name of the chain
        expected_sequence: sequence the collected residues are compared against
        residues: maps the position string of a residue to its Residue
        residue_counter: number of residues created so far; used as their `order`
        sequence: one-letter sequence, None until save_sequence_and_mask is called
        mask: string with "1" where the residue has an alpha carbon and "0" where it
            does not, None until save_sequence_and_mask is called
    """

    def __init__(self, letter: str, expected_sequence: str):
        self.letter = letter
        self.expected_sequence = expected_sequence
        self.residues = {}
        self.residue_counter = 0
        self.sequence = None
        self.mask = None

    def add_residue(self, amino_acid: str, position_string: str) -> Residue:
        """Creates a residue and stores it under `position_string`, replacing any residue already there."""
        position, alpha = process_position(position_string)
        new_residue = Residue(amino_acid, position, alpha, self.residue_counter)
        self.residue_counter += 1
        self.residues[position_string] = new_residue
        return new_residue

    def get_residue(self, amino_acid: str, position_string: str) -> Residue:
        """Returns the residue stored under `position_string`, creating it first when it is missing."""
        known = self.residues.get(position_string)
        if known is not None:
            return known
        return self.add_residue(amino_acid, position_string)

    def save_sequence_and_mask(self) -> bool:
        """
        Builds `sequence` and `mask` from the collected residues.

        Residues are ordered by (position, alpha, order). Once a residue flagged
        `is_terminal` has been passed, residues flagged `is_hetatm` are left out.

        Returns:
            True when the built sequence equals the expected one, ignoring leading
            and trailing "X" on both sides
        """
        ordered = sorted(
            self.residues.values(),
            key=lambda res: (res.position, res.alpha, res.order),
        )
        letters, flags = [], []
        past_terminal = False
        for res in ordered:
            if past_terminal and res.is_hetatm:
                continue
            letters.append(res.amino_acid)
            flags.append("0" if res.ca is None else "1")
            if res.is_terminal:
                past_terminal = True
        self.sequence = "".join(letters)
        self.mask = "".join(flags)
        return self.sequence.strip("X") == self.expected_sequence.strip("X")


class Structure:
    """
    Residues of the expected chains of one pdb entry, collected from lines of its pdb file.

    Attributes:
        pdb_id: PDB ID of the entry
        chains: maps chain letter to Chain, one per expected chain; records that
            name any other chain are ignored by the parse_* methods
    """

    def __init__(self, pdb_id: str, expected_chains: Dict[str, str]):
        self.pdb_id = pdb_id
        self.chains = {chain_letter: Chain(chain_letter, expected_sequence) for chain_letter, expected_sequence in expected_chains.items()}

    def parse_ATOM_and_HETATM(self, line: str) -> None:
        """
        Registers the residue named on an ATOM/HETATM line and, for a CA atom, its coordinates.

        Fixed columns read from the line: atom name [12:16], residue name [17:20],
        chain [21], residue position [22:27], x/y/z [30:38], [38:46], [46:54].
        Lines whose residue name has no one-letter code or whose chain is not
        expected are ignored.
        """
        is_hetatm = line.startswith("HETATM")
        atom_name = line[12:16].strip()
        amino_acid = protein_letters_3to1_extended.get(line[17:20])
        chain = self.chains.get(line[21])
        position_string = line[22:27].strip()
        if amino_acid is not None and chain is not None:
            residue = chain.get_residue(amino_acid, position_string)
            if is_hetatm:
                residue.is_hetatm = True
            if atom_name == "CA":
                residue.add_alpha_carbon(float(line[30:38]), float(line[38:46]), float(line[46:54]))

    def parse_TER(self, line: str) -> None:
        """Flags the residue named on a TER line as terminal; unknown residue names become "X"."""
        amino_acid = protein_letters_3to1_extended.get(line[17:20], "X")
        chain = self.chains.get(line[21])
        position_string = line[22:27].strip()
        if chain is not None:
            chain.get_residue(amino_acid, position_string).is_terminal = True

    def parse_REMARK_465_and_MODRES(self, line: str, is_REMARK_465_line: bool) -> None:
        """
        Adds the residue described by a REMARK 465 or MODRES line to its chain.

        The line is split on whitespace: token 3 is the chain and token 4 the
        residue position for both record types; the residue name is token 2 for
        REMARK 465 and token 5 for MODRES. Names without a one-letter code become
        "X". Lines with too few tokens or an unexpected chain are ignored.
        """
        amino_acid_position = 2 if is_REMARK_465_line else 5
        attributes = line.split()
        # Tokens 3 and 4 are always read and MODRES also reads token 5, so the
        # line must be long enough for the highest index actually used.
        if len(attributes) <= max(4, amino_acid_position):
            return
        amino_acid = protein_letters_3to1_extended.get(attributes[amino_acid_position], "X")
        chain = self.chains.get(attributes[3])
        if chain is not None:
            chain.add_residue(amino_acid, attributes[4])

    def save_sequences_and_masks(self) -> bool:
        """
        Builds sequence and mask of the chains one by one.

        Returns:
            False as soon as one chain does not match its expected sequence
            (chains after it are not processed), True otherwise
        """
        for chain in self.chains.values():
            if not chain.save_sequence_and_mask():
                return False
        return True

    def write_to_files(self, directory: str, wanted_chain_letters: Set[str]) -> None:
        """
        Writes the sequences of the wanted chains as fasta files into `directory`.

        `<pdb_id>_inferred.fasta` receives all wanted chains; in addition every
        wanted chain is written to its own `<pdb_id>:<chain_letter>.fasta`.
        """
        # NOTE: the per-chain file names contain ":", which Windows file systems reject.
        with open(f"{directory}/{self.pdb_id}_inferred.fasta", "w") as fasta_file:
            for chain_letter, chain in self.chains.items():
                if chain_letter not in wanted_chain_letters:
                    continue
                record = f">{self.pdb_id}:{chain_letter}\n{chain.sequence}\n"
                fasta_file.write(record)
                with open(f"{directory}/{self.pdb_id}:{chain_letter}.fasta", "w") as chain_fasta_file:
                    chain_fasta_file.write(record)
    

def process_lines(pdb_id: str, wanted_chains: Dict[str, str], lines: List[str]) -> Structure:
    """
    Builds a Structure for the wanted chains from the lines of a pdb file.

    REMARK 465 residue listings (the lines following the "RES C SSSEQI" header)
    and MODRES lines are read wherever they are met. Scanning stops at the
    coordinate section, which is then read in one go: either from the line after
    the first MODEL line containing "1" up to its ENDMDL, or from the first
    ATOM/HETATM line to the end of the file.

    Parameters:
        pdb_id (str): PDB ID of the structure
        wanted_chains (Dict[str, str]): chain letter -> expected sequence
        lines (List[str]): lines of the pdb file

    Returns:
        the populated Structure
    """
    structure = Structure(pdb_id, wanted_chains)
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith("REMARK 465") and "RES C SSSEQI" in line:
            # the residue listing starts on the line after the column header
            i = process_REMARK_465_and_MODRES(lines, i + 1, structure, True)
        elif line.startswith("MODRES"):
            i = process_REMARK_465_and_MODRES(lines, i, structure, False)
        elif line.startswith("MODEL") and "1" in line:
            process_MODEL(lines, i + 1, structure)
            break
        elif line.startswith(("ATOM", "HETATM")):
            # The coordinate section may open with HETATM records (for example a
            # modified residue at the start of the chain); starting only at the
            # first ATOM would silently skip them.
            process_structure(lines, i, structure)
            break
        else:
            i += 1
    return structure
            

def process_REMARK_465_and_MODRES(lines: List[str], i: int, structure: Structure, is_REMARK_465_line: bool) -> int:
    """
    Feeds a run of consecutive REMARK 465 (or MODRES) lines, starting at index `i`,
    to the structure.

    Returns:
        index of the first line that does not belong to the run
    """
    prefix = "REMARK 465" if is_REMARK_465_line else "MODRES"
    cursor = i
    while cursor < len(lines):
        current = lines[cursor]
        if not current.startswith(prefix):
            break
        structure.parse_REMARK_465_and_MODRES(current, is_REMARK_465_line)
        cursor += 1
    return cursor


def process_MODEL(lines: List[str], i: int, structure: Structure) -> None:
    """
    Reads the ATOM, HETATM and TER lines of one model into the structure,
    starting at index `i` and stopping at the first ENDMDL line (or at the end of `lines`).
    """
    for index in range(i, len(lines)):
        record = lines[index]
        if record.startswith("ENDMDL"):
            break
        if record.startswith(("ATOM", "HETATM")):
            structure.parse_ATOM_and_HETATM(record)
        elif record.startswith("TER"):
            structure.parse_TER(record)
        

def process_structure(lines: List[str], i: int, structure: Structure) -> None:
    """
    Reads every ATOM, HETATM and TER line from index `i` to the end of `lines`
    into the structure.
    """
    for index in range(i, len(lines)):
        record = lines[index]
        if record.startswith(("ATOM", "HETATM")):
            structure.parse_ATOM_and_HETATM(record)
        elif record.startswith("TER"):
            structure.parse_TER(record)

def process_position(str_pos: str) -> Tuple[int, str]:
    """
    Splits a residue position such as "100A" or "-5" into its number and its letter.

    Parameters:
        str_pos (str): position as written in the pdb file

    Returns:
        (number, letter); the letter is "" when the position contains none
    """
    int_pos = 0
    digits_seen = 0
    alpha = ""
    # Walk from the last character to the first, so that the n-th digit met
    # carries the weight 10**n.
    for char in reversed(str_pos):
        if char.isdigit():
            int_pos += int(char) * 10 ** digits_seen
            digits_seen += 1
        elif char == "-":
            int_pos = -int_pos
        elif char.isalpha():
            alpha = char
    return int_pos, alpha


def process_pdb(pdb_id: str, directory: str) -> bool:
    """
    Reads the downloaded pdb file of one protein, derives the sequence of every
    chain listed in its fasta file and, when all of them match the expected
    sequences, writes the inferred fasta files and pickles the parsed structure.

    Parameters:
        pdb_id (str): PDB ID of the structure
        directory (str): path to the directory containing information about the proteins

    Returns:
        boolean indicating whether the process was successful or not; False when
        the pdb file cannot be read or a derived sequence differs from the expected one
    """
    try:
        with open(f"{directory}/{pdb_id}/{pdb_id}_original.pdb") as pdb_file:
            lines = pdb_file.readlines()
    except IOError as e:
        print(f"I/O error({e.errno}): {e.strerror}")
        return False

    wanted_chains = identify_unique_chains(f"{directory}/{pdb_id}/{pdb_id}_original.fasta")
    structure = process_lines(pdb_id, wanted_chains, lines)

    sequences_match = structure.save_sequences_and_masks()
    if sequences_match:
        # Results are only persisted for structures whose sequences were confirmed.
        structure.write_to_files(f"{directory}/{pdb_id}", wanted_chains.keys())
        with open(f"{directory}/{pdb_id}/{pdb_id}.pkl", "wb") as pickle_file:
            pickle.dump(structure, pickle_file)
    return sequences_match


# Parse the pdb file of every remaining protein; True marks a protein whose
# derived chain sequences matched the expected ones.
success = proteins.apply(
    lambda row: process_pdb(row["pdb_id"], PROTEIN_DIRECTORY),
    axis=1,
)

# Append the new failures (with their reason) to the running list of discarded
# proteins and continue only with the proteins that were processed successfully.
discarded_proteins_sequence = (
    proteins.loc[~success]
    .reset_index(drop=True)
    .assign(reason="cannot determine correct sequence")
)
discarded_proteins = pd.concat(
    [discarded_proteins, discarded_proteins_sequence],
    ignore_index=True,
)
proteins = proteins.loc[success].reset_index(drop=True)

proteins

Unnamed: 0,pdb_id,label
0,8P0E,monomer
1,8PX8,monomer
2,8B2E,monomer
3,8HOE,monomer
4,8TCE,monomer
...,...,...
1324,8G9J,synthetic
1325,8OYV,synthetic
1326,8TNO,synthetic
1327,8FJE,synthetic


## Create pandas dataframe for chains and separate fasta files

In [8]:
def get_chain_ids(directory: str, pdb_id: str) -> List[str]:
    """
    Returns the identifiers `<pdb_id>:<chain_letter>` of all chains of the pickled structure.

    Parameters:
        directory (str): path to the directory containing information about the proteins
        pdb_id (str): PDB ID of the structure
    """
    pickle_path = f"{directory}/{pdb_id}/{pdb_id}.pkl"
    # pickle.load executes code stored in the file; only use it on pickles you trust.
    with open(pickle_path, "rb") as file:
        structure = pickle.load(file)
    chain_ids = []
    for chain_letter in structure.chains:
        chain_ids.append(f"{pdb_id}:{chain_letter}")
    return chain_ids

# Attach to every protein the list of its chain identifiers ...
proteins["unique_chains"] = proteins.apply(
    lambda row: get_chain_ids(PROTEIN_DIRECTORY, row["pdb_id"]),
    axis=1,
)
# ... and unfold that list into one row per chain.
chains = (
    proteins
    .explode("unique_chains", ignore_index=True)
    .rename(columns={"unique_chains": "chain_id"})
)
chains

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A
...,...,...,...
1460,8G9J,synthetic,8G9J:A
1461,8OYV,synthetic,8OYV:A
1462,8TNO,synthetic,8TNO:A
1463,8FJE,synthetic,8FJE:A


The fasta files containing only the sequence of a single chain (`<pdb_id>:<chain>.fasta`) were already written by `process_pdb` in the previous section. They will be useful when running AlphaFold and ESMFold.

## Save both dataframes as csv files

In [9]:
# index=False: the row index is not written to the files.
proteins.to_csv("data/proteins.csv", index=False)
chains.to_csv("data/chains.csv", index=False)

## Discarded proteins

In [10]:
# Proteins dropped in the steps above, each with the reason it was discarded.
discarded_proteins

Unnamed: 0,pdb_id,label,reason
0,8EF4,monomer,cannot determine correct sequence
1,8CGM,monomer,cannot determine correct sequence
2,8JT8,monomer,cannot determine correct sequence
3,8BI9,monomer,cannot determine correct sequence
4,8ZBO,monomer,cannot determine correct sequence
5,8U24,monomer,cannot determine correct sequence
6,8SXC,monomer,cannot determine correct sequence
7,8YQ4,monomer,cannot determine correct sequence
8,8ILC,monomer,cannot determine correct sequence
9,8TYI,monomer,cannot determine correct sequence
