In [1]:
import pandas as pd
import os
from tqdm import tqdm
from rdkit import Chem
import pickle
#import functions from PLIG utilities script
from PLIG_utils import *

In [2]:
#load in raw data PDB files
raw_data = "example_PDBbind_data/"
# One entry per complex: keep only sub-directories of `raw_data` and skip hidden ones
# (e.g. .ipynb_checkpoints) as well as stray files (e.g. .DS_Store, index files), which
# would otherwise be treated as PDB codes. os.listdir returns entries in arbitrary order,
# so sort to make the processing order (and the resulting dicts) reproducible.
list_of_pdbcodes = sorted(
    entry
    for entry in os.listdir(raw_data)
    if not entry.startswith(".") and os.path.isdir(os.path.join(raw_data, entry))
)

#load in all atom types for the 20 proteinogenic amino acids
Atom_Keys=pd.read_csv("PDB_Atom_Keys.csv", sep=",")


In [3]:
#FOR PLIG
# One dictionary per distance cutoff, each mapping PDB code -> graph returned by mol_to_graph.
mol_graphs_crystal_8A = {}
mol_graphs_crystal_7A = {}
mol_graphs_crystal_6A = {}
mol_graphs_crystal_5A = {}
mol_graphs_crystal_4A = {}

# Distance cutoff (in Angstrom) -> dictionary that collects the graphs built with that cutoff.
# Driving the loop from this table replaces five copy-pasted blocks that differed only in the cutoff.
graphs_by_cutoff = {
    8.0: mol_graphs_crystal_8A,
    7.0: mol_graphs_crystal_7A,
    6.0: mol_graphs_crystal_6A,
    5.0: mol_graphs_crystal_5A,
    4.0: mol_graphs_crystal_4A,
}

#make the crystal PLIGs
for pdb in tqdm(list_of_pdbcodes):

    # load the ligand
    c_file = os.path.join(raw_data, pdb, f'{pdb}_ligand.mol2')
    ligand = Chem.MolFromMol2File(c_file)
    # RDKit signals a missing/unparsable file by returning None instead of raising;
    # fail here with the offending path rather than with an opaque error inside AddHs.
    if ligand is None:
        raise ValueError(f"RDKit could not read the ligand for '{pdb}' from {c_file}")
    c_mol = Chem.AddHs(ligand, addCoords=True)

    # load the protein file -> WE HAVE ISOLATED THE PROTEIN POLYMER FROM SOLVENT AND WATER BEFORE!
    pdb_file_cleaned = os.path.join(raw_data, pdb, f'{pdb}_pol_protein.pdb')

    # build one graph per cutoff from the same ligand / protein pair
    for distance_cutoff, graphs in graphs_by_cutoff.items():
        contacts = GetAtomContacts(pdb_file_cleaned, c_mol, Atom_Keys, distance_cutoff=distance_cutoff)
        graphs[pdb] = mol_to_graph(c_mol, contacts, Atom_Keys)

    #---------------------------------------------------------------------------------------------

100%|█████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  6.87it/s]


In [4]:
#save the graphs to use as input for the GNN models

output_dir = "PLIG_output/"
# open(..., 'wb') does not create missing directories, so make sure the target exists first.
os.makedirs(output_dir, exist_ok=True)

# (file name, graph dictionary) pairs, written in the same order as before.
# NOTE(review): the 8A file name has no "_std" suffix while the others do; the names are kept
# exactly as they were because downstream code may load them by name -- confirm whether
# the inconsistency is intended.
graphs_to_save = [
    ("PLIG_test_run_8A.pickle", mol_graphs_crystal_8A),
    ("PLIG_test_run_7A_std.pickle", mol_graphs_crystal_7A),
    ("PLIG_test_run_6A_std.pickle", mol_graphs_crystal_6A),
    ("PLIG_test_run_5A_std.pickle", mol_graphs_crystal_5A),
    ("PLIG_test_run_4A_std.pickle", mol_graphs_crystal_4A),
]

for file_name, graphs in graphs_to_save:
    output_file_graphs = os.path.join(output_dir, file_name)
    with open(output_file_graphs, 'wb') as handle:
        pickle.dump(graphs, handle, protocol=pickle.HIGHEST_PROTOCOL)