# Preprocessing the PDBs

In this notebook, I preprocess the PDB structures to prepare them for further processing.
I use the `clean_pdb` function from the `src/preprocess.py` module to perform these steps.

## 0. Imports

In [1]:
import os
from src.preprocess import clean_pdb
from rich.progress import track

import warnings
warnings.filterwarnings("ignore")

## 1. Preprocessing

Open all the PDB files in the raw data folder and save them to the output folder after preprocessing.

In [2]:
data_path = "data/raw_pdbs"
out_folder = "data/processed_pdbs/"

# clean_pdb is defined in src.preprocess and is not shown here; create the
# destination up front so a fresh checkout does not depend on the folder
# already existing. exist_ok=True makes re-running the cell harmless.
os.makedirs(out_folder, exist_ok=True)

for pdb in track(os.listdir(data_path)):
    # Skip anything in the raw folder that is not a .pdb file.
    if not pdb.endswith(".pdb"):
        continue

    # The processed file keeps the same file name as the raw one.
    pdb_path = os.path.join(data_path, pdb)
    out_path = os.path.join(out_folder, pdb)

    clean_pdb(pdb_path, out_path)

print("Done !!!")


Output()

Done !!!


### Testing a new preprocessing function built on Biopython

In [11]:
from Bio.PDB.Residue import Residue
from Bio.PDB.Polypeptide import is_aa, is_nucleic
from Bio.PDB.PDBIO import Select, PDBIO
from Bio.PDB import PDBParser

In [15]:
class ResSelect(Select):
    """Selection filter that keeps only residues for which ``is_aa`` is true."""

    def accept_residue(self, residue):
        """Return 1 if ``is_aa(residue)`` is truthy, otherwise 0."""
        return 1 if is_aa(residue) else 0

class NuSelect(Select):
    """Selection filter that keeps only residues for which ``is_nucleic`` is true."""

    def accept_residue(self, residue):
        """Return 1 if ``is_nucleic(residue)`` is truthy, otherwise 0."""
        return 1 if is_nucleic(residue) else 0

class ComplexSelect(Select):
    """Selection filter that keeps residues passing ``is_aa`` or ``is_nucleic``."""

    def accept_residue(self, residue):
        """Return 1 if the residue passes either check, otherwise 0."""
        # `or` short-circuits: is_nucleic is only consulted when is_aa is falsy.
        keep = is_aa(residue) or is_nucleic(residue)
        return 1 if keep else 0

# Select residues with positive index
class PositiveSelect(Select):
    """Selection filter that keeps residues whose ``id[1]`` is strictly positive."""

    def accept_residue(self, residue):
        """Return 1 if ``residue.id[1] > 0``, otherwise 0."""
        position = residue.id[1]
        return 1 if position > 0 else 0

# Combine ComplexSelect and PositiveSelect, and also remove hydrogen atoms
class ComplexPositiveSelect(Select):
    """Selection filter combining the residue-type and positive-index checks.

    A residue is kept when it passes ``is_aa`` or ``is_nucleic`` and its
    ``id[1]`` is strictly positive. Atoms whose ``element`` is ``"H"`` are
    rejected.
    """

    def accept_residue(self, residue):
        """Return 1 if the residue passes both checks, otherwise 0."""
        # Type check first, exactly as in a single `and` expression: the index
        # is only inspected for residues that pass is_aa / is_nucleic.
        if not (is_aa(residue) or is_nucleic(residue)):
            return 0
        return 1 if residue.id[1] > 0 else 0

    def accept_atom(self, atom):
        """Return 0 for atoms whose element is ``"H"``, otherwise 1."""
        return 0 if atom.element == "H" else 1

In [16]:
def preprocess_pdb(pdb_path, out_path, select):
    """Parse a PDB file and write it back out filtered through ``select``.

    Args:
        pdb_path: Path of the PDB file to read (``str`` or ``os.PathLike``).
        out_path: Destination handed unchanged to ``PDBIO.save``.
        select: Selection object handed to ``PDBIO.save`` to decide what is
            written (e.g. an instance of one of the ``Select`` subclasses).
    """
    # Derive the structure id from the file name: everything before the first
    # dot. os.path.basename is used instead of splitting on "/" so that the
    # platform's own separators are handled, and os.fspath lets path-like
    # objects through as well as plain strings.
    file_name = os.path.basename(os.fspath(pdb_path))
    pdb_id = file_name.split(".")[0]

    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure(pdb_id, pdb_path)

    io = PDBIO()
    io.set_structure(structure)
    io.save(out_path, select)

In [17]:
# Run the Biopython-based preprocessing on a single structure, using the
# selector defined above that combines the residue filters and rejects "H" atoms.
pdb_path, out_path = "data/raw_pdbs/2CCZ.pdb", "data/2CCZ_pro.pdb"
preprocess_pdb(pdb_path=pdb_path, out_path=out_path, select=ComplexPositiveSelect())



In [None]:
# `structure` only exists as a local variable inside preprocess_pdb, so on a
# fresh kernel it is undefined here; parse it explicitly before saving.
# NOTE(review): the input path is assumed to follow the raw -> processed file
# naming used by the batch preprocessing cell — confirm it is the intended file.
structure = PDBParser().get_structure("1a1e", "data/raw_pdbs/1a1e.pdb")

io = PDBIO()
io.set_structure(structure)
# ComplexSelect decides which residues are written to the output file.
io.save("data/processed_pdbs/1a1e.pdb", ComplexSelect())