In [2]:
import os, glob, sys
import pandas as pd

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import PDBIO,Select

In [21]:
def get_name(path):
    """Return the file name of ``path`` without its directory or extension.

    Example: ``'/some/dir/3MRE_A_P_1_1.pdb'`` -> ``'3MRE_A_P_1_1'``.

    Args:
        path: Path to a structure file.

    Returns:
        The base name of ``path`` with its final extension (if any) removed.
    """
    # splitext only removes a real extension, so a name without a
    # four-character suffix is no longer truncated as it was with [:-4].
    return os.path.splitext(os.path.basename(path))[0]

def get_chains(pdb_name):
    """Extract the protein and peptide chain IDs encoded in a structure name.

    The name is split on underscores. The second field lists the protein
    chain IDs, one character per chain, and the third field is the single
    peptide chain ID, e.g. ``'3E8U_LH_P_4_39'`` -> ``({'L', 'H'}, 'P')``.

    Args:
        pdb_name: Structure name with at least three underscore-separated fields.

    Returns:
        Tuple ``(protein_chain_ids, peptide_chain_id)``: a set of
        one-character strings and a one-character string.

    Raises:
        AssertionError: If there are fewer than three fields or the peptide
            field is not exactly one character long.
    """
    fields = pdb_name.split('_')
    assert len(fields) >= 3
    peptide_id = fields[2]
    assert len(peptide_id) == 1
    # Iterating a string yields its characters, i.e. one entry per chain ID.
    return set(fields[1]), peptide_id

# For writing specific sections of the PDB to the file
class selector(Select):
    """``Select`` subclass that limits which chains and residues PDBIO writes.

    Args:
        chain_ids: Collection of chain IDs to write. Empty or ``None`` (the
            default) means every chain is written.
        res_num_range: ``(first, last)`` residue-number window, inclusive on
            both ends. Empty (the default) disables residue filtering.
        res_range_chains: Collection of chain IDs that ``res_num_range``
            applies to. Residues of chains NOT listed here are always
            accepted, so a window given without any chains filters nothing.
    """

    def __init__(self, chain_ids=None, res_num_range=(), res_range_chains=None):
        # None stands in for "empty": a literal set() default would be a
        # single object shared by every instance created without that argument.
        self.chain_ids = set() if chain_ids is None else chain_ids
        self.res_range_chains = set() if res_range_chains is None else res_range_chains
        self.res_num_range = res_num_range

    def accept_chain(self, chain):
        """Return True if ``chain`` should be written."""
        # No chain filter configured -> accept everything.
        return not self.chain_ids or chain.get_id() in self.chain_ids

    def accept_residue(self, residue):
        """Return True if ``residue`` should be written."""
        # if no range provided, or residue not in res_range_chain, accept
        if not self.res_num_range:
            return True
        if residue.parent.id not in self.res_range_chains:
            return True

        # residue.id is (hetero flag, sequence number, insertion code)
        res_num = residue.id[1]
        return self.res_num_range[0] <= res_num <= self.res_num_range[1]
    
def writeBinderFile(inputPDBPath,outputDir,peptide_segment_length):
    """Write the target protein and every peptide window into a single file.

    The output file
    ``<outputDir>/<name>_nativePeptideBinders_<peptide_segment_length>.pdb``
    starts with a ``HEADER    <name>_target`` line followed by the protein
    chains. After that comes one ``HEADER    <name>_<first>-<last>`` line plus
    the matching peptide residues for each sliding window along the peptide.

    Protein and peptide chain IDs are parsed from the file name by
    ``get_chains``. Only the first model of the structure is inspected to
    find the peptide residues.

    NOTE(review): windows are enumerated by residue number starting from the
    first residue of the peptide chain, which assumes the chain is numbered
    contiguously (no gaps or extra hetero residues) -- confirm for the inputs.

    Args:
        inputPDBPath: Path to the input PDB file.
        outputDir: Existing directory that receives the output file.
        peptide_segment_length: Number of residues per peptide window.
    """
    pdb_name = get_name(inputPDBPath)

    # load PDB
    parser = PDBParser(PERMISSIVE=1,QUIET=1)
    structure = parser.get_structure(pdb_name,inputPDBPath)

    # get the protein/peptide chains
    protein_chains,peptide_chain = get_chains(pdb_name)

    # get first residue in peptide and the number of full-length windows
    peptide_residues = structure[0][peptide_chain].child_list
    first_res_num = int(peptide_residues[0].id[1])
    n_windows = len(peptide_residues)-peptide_segment_length+1

    io = PDBIO()
    io.set_structure(structure)
    out_path = f"{os.path.join(outputDir,pdb_name)}_nativePeptideBinders_{peptide_segment_length}.pdb"
    with open(out_path,'w') as out_file:
        out_file.write(f"HEADER    {pdb_name}_target"+'\n')
        io.save(out_file,selector(protein_chains))
        for start in range(first_res_num,first_res_num+n_windows):
            end = start+peptide_segment_length-1
            out_file.write(f"HEADER    {pdb_name}_{start}-{end}"+'\n')
            # selector only applies the residue window to chains named in its
            # third argument; without it every peptide residue is accepted
            # and each record would hold the whole peptide, not the segment.
            io.save(out_file,selector({peptide_chain},(start,end),{peptide_chain}))
            
def writeComplexFiles(inputPDBPath,outputDir,peptide_segment_length):
    """Write one PDB file per peptide window, each with the full protein.

    For every sliding window of ``peptide_segment_length`` residue numbers
    along the peptide chain, a file ``<outputDir>/<name>_<first>-<last>.pdb``
    is written. It contains all protein chains plus only those peptide
    residues whose number lies in ``[first, last]``. The name of each file
    is printed as it is written.

    Chain IDs are parsed from the file name by ``get_chains``; only the first
    model of the structure is inspected to find the peptide residues.

    Args:
        inputPDBPath: Path to the input PDB file.
        outputDir: Existing directory that receives the output files.
        peptide_segment_length: Number of residues per peptide window.
    """
    pdb_name = get_name(inputPDBPath)

    # Parse the structure, using the file-derived name as its identifier.
    structure = PDBParser(PERMISSIVE=1,QUIET=1).get_structure(pdb_name,inputPDBPath)

    # Chains to emit: every protein chain plus the peptide chain.
    protein_chains,peptide_chain = get_chains(pdb_name)
    chains_to_write = protein_chains | {peptide_chain}

    # Residue number of the first peptide residue and the window count.
    peptide_residues = structure[0][peptide_chain].child_list
    first_res_num = int(peptide_residues[0].get_full_id()[3][1])
    last_start = first_res_num+len(peptide_residues)-peptide_segment_length

    writer = PDBIO()
    writer.set_structure(structure)
    start = first_res_num
    while start <= last_start:
        end = start+peptide_segment_length-1
        pdb_complex_name = f"{pdb_name}_{start}-{end}"
        print('writing',pdb_complex_name)
        with open(f"{os.path.join(outputDir,pdb_complex_name)}.pdb",'w') as out_file:
            # The residue window is applied to the peptide chain only.
            writer.save(out_file,selector(chains_to_write,(start,end),{peptide_chain}))
        start += 1

# load pixelDB728

In [12]:
# Text file listing one PDB path per line for the pixelDB728 set.
path = '/home/gridsan/sswanson/jupyter_notebooks/pixelDB_dataset/pixelDB728_uniqueBindingModes_list.txt'
with open(path,'r') as file:
    # Skip blank lines (e.g. a stray empty line at the end of the list) so an
    # empty string is never handed to the writer functions as a PDB path.
    pixelDB_paths = [line.rstrip() for line in file if line.strip()]
print(len(pixelDB_paths),pixelDB_paths[0:5])

728 ['/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/1/3MRE_A_P_1_1.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/1/3SPV_A_C_1_2.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/1/4F7T_D_F_1_3.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/1/1I4F_A_C_1_4.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/1/1XR9_A_C_1_5.pdb']


# write pixelDB728 5/7 segments

In [35]:
# Window length in residues; it is also recorded in the output directory name.
peptide_segment_length = 7
outputDir = f'pixelDB728_peptideSegments_{peptide_segment_length}'
os.makedirs(outputDir,exist_ok=True)

# One binder file (target + all peptide windows) per listed structure.
for path in pixelDB_paths:
    writeBinderFile(path,outputDir,peptide_segment_length)

In [36]:
# Window length in residues; it is also recorded in the output directory name.
peptide_segment_length = 5
outputDir = f'pixelDB728_peptideSegments_{peptide_segment_length}'
os.makedirs(outputDir,exist_ok=True)

# One binder file (target + all peptide windows) per listed structure.
for path in pixelDB_paths:
    writeBinderFile(path,outputDir,peptide_segment_length)

# Load pixelDB30

In [14]:
# Text file listing one PDB path per line for the pixelDB30 set.
path = '/home/gridsan/sswanson/jupyter_notebooks/pixelDB_dataset/pixelDB30_list.txt'
with open(path,'r') as file:
    # Skip blank lines (e.g. a stray empty line at the end of the list) so an
    # empty string is never handed to the writer functions as a PDB path.
    pixelDB30_paths = [line.rstrip() for line in file if line.strip()]
print(len(pixelDB30_paths),pixelDB30_paths[0:5])

29 ['/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/1/1S7V_D_F_1_2.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/2/4JYI_A_G_2_1.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/3/1BBR_N_I_3_4.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/4/3E8U_LH_P_4_39.pdb', '/data1/groups/keatinglab/MST_workspace/PixelDB/clusters/5/3QAM_E_I_5_1.pdb']


# Write PixelDB30 5/7 segments

In [38]:
# Window length in residues; it is also recorded in the output directory name.
peptide_segment_length = 7
outputDir = f'pixelDB30_peptideSegments_{peptide_segment_length}'
os.makedirs(outputDir,exist_ok=True)

# One binder file (target + all peptide windows) per listed structure.
for path in pixelDB30_paths:
    writeBinderFile(path,outputDir,peptide_segment_length)

In [39]:
# Window length in residues; it is also recorded in the output directory name.
peptide_segment_length = 5
outputDir = f'pixelDB30_peptideSegments_{peptide_segment_length}'
os.makedirs(outputDir,exist_ok=True)

# One binder file (target + all peptide windows) per listed structure.
for path in pixelDB30_paths:
    writeBinderFile(path,outputDir,peptide_segment_length)

# write complex PDBs (each peptide fragment + protein pair is its own PDB)

In [19]:
# Window length in residues; it is also recorded in the output directory name.
peptide_segment_length = 7
outputDir = f'pixelDB30_peptideSegments_complexPDBs_{peptide_segment_length}'
os.makedirs(outputDir,exist_ok=True)

# One PDB file per peptide window for every listed structure.
for path in pixelDB30_paths:
    writeComplexFiles(path,outputDir,peptide_segment_length)

writing 1S7V_D_F_1_2_1-7
writing 1S7V_D_F_1_2_2-8
writing 1S7V_D_F_1_2_3-9
writing 4JYI_A_G_2_1_1-7
writing 4JYI_A_G_2_1_2-8
writing 4JYI_A_G_2_1_3-9
writing 4JYI_A_G_2_1_4-10
writing 1BBR_N_I_3_4_1-7
writing 1BBR_N_I_3_4_2-8
writing 1BBR_N_I_3_4_3-9
writing 1BBR_N_I_3_4_4-10
writing 3E8U_LH_P_4_39_1-7
writing 3E8U_LH_P_4_39_2-8
writing 3E8U_LH_P_4_39_3-9
writing 3QAM_E_I_5_1_1-7
writing 3QAM_E_I_5_1_2-8
writing 3QAM_E_I_5_1_3-9
writing 3QAM_E_I_5_1_4-10
writing 3QAM_E_I_5_1_5-11
writing 3QAM_E_I_5_1_6-12
writing 3QAM_E_I_5_1_7-13
writing 3QAM_E_I_5_1_8-14
writing 3QAM_E_I_5_1_9-15
writing 3QAM_E_I_5_1_10-16
writing 3QAM_E_I_5_1_11-17
writing 3QAM_E_I_5_1_12-18
writing 3QAM_E_I_5_1_13-19
writing 3UKW_B_C_8_1_1-7
writing 3UKW_B_C_8_1_2-8
writing 3UKW_B_C_8_1_3-9
writing 3UKW_B_C_8_1_4-10
writing 3UKW_B_C_8_1_5-11
writing 3UKW_B_C_8_1_6-12
writing 3UKW_B_C_8_1_7-13
writing 3UKW_B_C_8_1_8-14
writing 3UKW_B_C_8_1_9-15
writing 3UKW_B_C_8_1_10-16
writing 3UKW_B_C_8_1_11-17
writing 3UKW_B_C_8

In [22]:
# write out relB peptide fragments, since we care about those
path = '/data1/groups/keatinglab/swans/score_binders/221003_relEseeds_4FXE_E_B/files/4FXE_E_B.pdb'
# Window length in residues; it is also recorded in the output directory name.
peptide_segment_length = 7
outputDir = f'relB_peptideSegments_complexPDBs_{peptide_segment_length}'
os.makedirs(outputDir,exist_ok=True)
# Single input structure: one PDB file per peptide window.
writeComplexFiles(path,outputDir,peptide_segment_length)

writing 4FXE_E_B_48-54
writing 4FXE_E_B_49-55
writing 4FXE_E_B_50-56
writing 4FXE_E_B_51-57
writing 4FXE_E_B_52-58
writing 4FXE_E_B_53-59
writing 4FXE_E_B_54-60
writing 4FXE_E_B_55-61
writing 4FXE_E_B_56-62
writing 4FXE_E_B_57-63
writing 4FXE_E_B_58-64
writing 4FXE_E_B_59-65
writing 4FXE_E_B_60-66
writing 4FXE_E_B_61-67
writing 4FXE_E_B_62-68
writing 4FXE_E_B_63-69
writing 4FXE_E_B_64-70
writing 4FXE_E_B_65-71
writing 4FXE_E_B_66-72
writing 4FXE_E_B_67-73
writing 4FXE_E_B_68-74
writing 4FXE_E_B_69-75
writing 4FXE_E_B_70-76
writing 4FXE_E_B_71-77
writing 4FXE_E_B_72-78
writing 4FXE_E_B_73-79
