In [1]:
import os
import pickle

from Bio.PDB import *
import numpy as np
import pandas as pd
from tqdm import tqdm

In [22]:
# Input tables and download directory for the notebook.
# NOTE(review): all three paths are empty strings in this copy of the notebook and
# must be filled in before this cell can run.
pdb_list = pd.read_csv("", header=None)  # read without a header row; column 0 is iterated later as the PDB IDs
compound_list = pd.read_csv("")  # its "id" column is read when sm_list is built
DATA_DIR = ""  # passed as `pdir` when the PDB files are retrieved

In [23]:
# rna_list is a second name for the same DataFrame object as pdb_list (no copy is made).
rna_list = pdb_list
# Values of the "id" column as a plain list; is_het tests upper-cased residue codes against it.
sm_list = list(compound_list["id"].values)

In [16]:
# Retrieve every structure listed in column 0 of rna_list, in PDB format, into DATA_DIR.
# One PDBList client is created up front and reused for all IDs instead of being
# rebuilt on each iteration.
pdbl = PDBList()
for i in rna_list[0]:
    pdbl.retrieve_pdb_file(i, file_format="pdb", pdir=DATA_DIR)

Downloading PDB structure '3ges'...
Downloading PDB structure '3f4h'...
Downloading PDB structure '7e9i'...
Downloading PDB structure '6tfg'...
Downloading PDB structure '1q8n'...
Downloading PDB structure '4nfo'...
Downloading PDB structure '2m4q'...
Downloading PDB structure '5zej'...
Downloading PDB structure '3gx6'...
Downloading PDB structure '2o3x'...
Downloading PDB structure '6e1w'...
Downloading PDB structure '5fkg'...
Downloading PDB structure '6dn2'...
Downloading PDB structure '1aju'...
Downloading PDB structure '5kx9'...
Downloading PDB structure '6va4'...
Downloading PDB structure '3sd3'...
Downloading PDB structure '2l94'...
Downloading PDB structure '2juk'...
Downloading PDB structure '2tra'...
Downloading PDB structure '6g7z'...
Downloading PDB structure '6qiv'...
Downloading PDB structure '6va3'...
Downloading PDB structure '4kqy'...
Downloading PDB structure '3fo6'...
Downloading PDB structure '2kxm'...
Downloading PDB structure '3tzr'...
Downloading PDB structure '5

In [17]:
# Shared structure parser with QUIET enabled, set through the constructor argument.
parser = PDBParser(QUIET=True)

In [18]:
def is_het(residue):
    """Tell whether ``residue`` is a ligand of interest.

    The first element of ``residue.id`` is inspected. A single space or ``"W"``
    is rejected outright. Any other value is split on ``"_"`` and the second
    piece, upper-cased, must be present in the module-level ``sm_list``.

    Args:
        residue: object exposing an ``id`` sequence whose first item is a string.

    Returns:
        bool: True only when the extracted code is found in ``sm_list``.
    """
    hetero_flag = residue.id[0]
    if hetero_flag == " " or hetero_flag == "W":
        return False
    ligand_code = hetero_flag.split("_")[1].upper()
    return ligand_code in sm_list

class ResidueSelect(Select):
    """Selection filter that keeps one given residue of one given chain.

    The residue is only accepted when ``is_het`` also approves it.
    """

    def __init__(self, chain, residue):
        """Remember the chain and residue that should pass the filter."""
        self.residue = residue
        self.chain = chain

    def accept_chain(self, chain):
        """Accept a chain only when its id equals the stored chain's id."""
        target_id = self.chain.id
        return chain.id == target_id

    def accept_residue(self, residue):
        """Accept only the stored residue, provided ``is_het`` returns true for it."""
        same_residue = residue == self.residue
        if not same_residue:
            return same_residue
        return is_het(residue)

class NonHetSelect(Select):
    """Selection filter that accepts only residues whose ``id[0]`` is a single space."""

    def accept_residue(self, residue):
        """Return 1 when the first element of ``residue.id`` is ``" "``, otherwise 0."""
        hetero_flag = residue.id[0]
        if hetero_flag == " ":
            return 1
        return 0
    
# class ChainSelect(Select):
#     def __init__(self, chain, residue):
#         self.chain = chain
#         self.residue = residue

#     def accept_chain(self, chain):
#         return chain.id == self.chain.id

#     def accept_residue(self, residue):
#         """ Recognition of heteroatoms - Remove water molecules """
#         return residue == self.residue and not is_het(residue)

In [28]:
def check_ligand(structure, file_name, output_dir):
    """Collect the distinct ligand codes present in the first model of ``structure``.

    Every residue of every chain in ``structure[0]`` is tested with ``is_het``.
    For each residue that passes, the part of ``residue.id[0]`` after the first
    ``"_"`` is upper-cased and recorded once.

    Args:
        structure: indexable container whose item ``0`` iterates over chains,
            each of which iterates over residues.
        file_name: accepted for interface compatibility; not used by the body.
        output_dir: accepted for interface compatibility; not used by the body.

    Returns:
        list[str]: unique ligand codes in the order they are first encountered.
    """
    model = structure[0]
    res_list = []
    for chain in model:
        for residue in chain:
            if not is_het(residue):
                continue
            code = residue.id[0].split("_")[1].upper()
            # Keep first-seen order while skipping codes already recorded.
            if code not in res_list:
                res_list.append(code)
    return res_list

            
        # break


In [29]:
# Build a mapping from each PDB ID in column 0 of rna_list to the ligand codes
# that check_ligand returns for its parsed structure.
map_dict = {}
for i in tqdm(rna_list[0]):
    # NOTE(review): the structure file path is an empty f-string in this copy of the
    # notebook; it has to name the file for ID `i` before this cell can run.
    structure = parser.get_structure(i, f"")
    # The output_dir argument is passed as an empty string.
    ligand_code_list = check_ligand(structure, i, "")
    map_dict[i] = ligand_code_list

100%|██████████| 49/49 [00:01<00:00, 31.03it/s]


In [30]:
# Display the whole PDB ID -> ligand codes mapping as the cell output.
map_dict

{'1ykv': ['DAI'],
 '7eog': ['GTP', 'J8F'],
 '7eom': ['GTP', 'J8R'],
 '5btp': ['AMZ'],
 '6hbt': ['CBV', 'FXQ'],
 '7kum': ['GP3'],
 '4f8u': ['SIS'],
 '6dme': ['GTP', 'G4P'],
 '6tfh': ['CBV', 'NAD'],
 '6cb3': ['GTP'],
 '4jf2': ['GTP', 'PRF'],
 '3f30': ['GTP', 'FMN'],
 '5v3f': ['74G'],
 '3d2g': ['TPP'],
 '3f2w': ['GTP', 'FMN'],
 '4k32': ['GET'],
 '5hbw': ['PZG'],
 '6xko': ['GTP', 'PRF'],
 '4gpy': ['6HS'],
 '7edl': ['GET'],
 '1n7a': ['SPM'],
 '7eon': ['GTP', 'J8U'],
 '7eop': ['GTP', 'J93'],
 '7kvv': ['GTP', '747'],
 '2be0': ['JS5'],
 '2yie': ['GTP', 'FMN'],
 '3npq': ['SAH'],
 '6n5k': ['GTP', '2BA'],
 '6c65': ['EKJ'],
 '3dil': ['1PE'],
 '3ski': ['GTP', 'GNG'],
 '6v9b': ['QSA'],
 '2esi': ['KAN'],
 '6n5q': ['GTP', '2BA'],
 '6n5o': ['GTP', '2BA'],
 '6wtl': ['GTP', '2BA'],
 '6n5s': ['GTP', '2BA'],
 '6n5t': ['GTP', '2BA'],
 '4qk8': ['GTP', '2BA'],
 '6c63': ['EKJ'],
 '6v9d': ['QW4'],
 '6c8m': ['EQ4', 'EQ1'],
 '2qwy': ['SAM'],
 '2pwt': ['LHA'],
 '3fu2': ['PRF'],
 '6n5p': ['GTP', '2BA'],
 '6n5n': ['

In [31]:
# Persist the PDB ID -> ligand codes mapping with pickle.
# pickle is imported together with the other modules in the notebook's first cell.
# NOTE(review): save_path is an empty string in this copy of the notebook; open("")
# fails, so set a real file path before running this cell.
save_path = ""
with open(save_path, 'wb') as f:
    pickle.dump(map_dict, f)