In [1]:
!pip install rdkit-pypi



In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
import os

def prepare_protein(pdb_path, output_path):
    """Add explicit hydrogens to a PDB structure and write the result as PDB.

    Args:
        pdb_path: Path of the PDB file to read.
        output_path: Path the hydrogenated structure is written to.

    Returns:
        None. A status line is printed on success; if RDKit cannot parse
        ``pdb_path`` a failure message is printed and nothing is written.
    """
    # removeHs=False keeps any hydrogens that are already present in the file.
    protein = Chem.MolFromPDBFile(pdb_path, removeHs=False)
    if protein is None:
        print(f"Failed to load PDB file: {pdb_path}")
        return
    # addCoords=True makes RDKit compute 3D positions for the new hydrogens;
    # by default they get no coordinates, which would produce a PDB whose added
    # H atoms are not placed on the structure. addResidueInfo=True gives the
    # new atoms residue information so their PDB records belong to a residue.
    protein = Chem.AddHs(protein, addCoords=True, addResidueInfo=True)
    Chem.MolToPDBFile(protein, output_path)
    print(f"Prepared and saved: {output_path}")

def prepare_all_pdb_files(directory, output_directory):
    """Run ``prepare_protein`` on every ``.pdb`` file found directly in ``directory``.

    Args:
        directory: Folder scanned (non-recursively) for names ending in ``.pdb``.
        output_directory: Folder that receives one ``<name>_prepared.pdb`` file
            per input; it is created if it does not exist yet.
    """
    # Create the destination up front so the first write does not hit a
    # missing folder.
    os.makedirs(output_directory, exist_ok=True)
    # os.listdir order is arbitrary; sorting makes the processing order (and
    # the printed log) the same on every run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".pdb"):
            continue
        # Strip only the trailing extension: str.replace would rewrite every
        # ".pdb" occurrence inside the name, not just the suffix.
        stem = filename[:-len(".pdb")]
        pdb_path = os.path.join(directory, filename)
        output_path = os.path.join(output_directory, f"{stem}_prepared.pdb")
        prepare_protein(pdb_path, output_path)

# Prepare every structure in the input folder; results go to a sibling folder.
prepare_all_pdb_files(
    directory="human_p2rx7",
    output_directory="prepared_human_p2rx7",
)

Prepared and saved: prepared_human_p2rx7/AF-Q99572-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-F5H2X6-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-F5H237-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-Q15G98-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-J3KN30-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-C6KE32-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-A0A7G3W903-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-A0A7G3W907-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-Q0IJ51-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-A0A7G3W904-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-A0A7G3W905-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-A0A4Y5R1Y6-F1-model_v4_prepared.pdb
Prepared and saved: prepared_human_p2rx7/AF-A0A4Y5R233-F1-model_v4_p

In [4]:
import pandas as pd

# Load the activity table and keep only the rows whose measurement type is IC50.
df = pd.read_csv('5534_human_Chembyl.csv')
is_ic50 = df['Standard Type'].eq('IC50')
df = df.loc[is_ic50]

In [5]:
# (rows, columns) remaining after the IC50 filter.
table_shape = df.shape
print(table_shape)

(4008, 11)


In [6]:
# Keep only measurements reported in nM, then show the new (rows, columns).
in_nanomolar = df['Standard Units'].eq('nM')
df = df.loc[in_nanomolar]
print(df.shape)

(3936, 11)


In [7]:
# The two filter columns now hold a single value each, so drop them and carry
# the unit in the name of the value column instead.
df = (
    df
    .drop(columns=['Standard Type', 'Standard Units'])
    .rename(columns={'Standard Value': 'IC50_nM'})
)

In [8]:
# Show the table size followed by its first rows.
for summary in (df.shape, df.head()):
    print(summary)

(3936, 9)
   Molecular Weight  AlogP                                             Smiles  \
0            338.28   4.94        O=C(NCC12CC3CC(CC(C3)C1)C2)c1cc(Cl)cc(Cl)c1   
1            423.54   2.46  O=C(CNS(=O)(=O)c1cccc2ccccc12)N1CCN(Cc2ccccc2)CC1   
2            438.55   2.25  O=C(CNS(=O)(=O)c1cccc2cnccc12)N1CCCN(Cc2ccccc2...   
3            459.57   3.62  O=C(CNS(=O)(=O)c1cccc2ccccc12)N1CCN(c2cccc3ccc...   
4            425.56   2.58  CC(=O)N1CCN(C[C@@H](C)NC(=O)c2cc3c(-c4ccccc4)n...   

    IC50_nM  pChEMBL Value  Ligand Efficiency BEI  Ligand Efficiency LE  \
0   3981.07           5.40                    NaN                   NaN   
1   1000.00            NaN                    NaN                   NaN   
2    900.00           6.05                    NaN                   NaN   
3   1000.00            NaN                    NaN                   NaN   
4  10000.00            NaN                    NaN                   NaN   

   Ligand Efficiency LLE  Ligand Efficiency SEI  
0 

In [9]:
# Count missing values in the four ligand-efficiency columns in one pass,
# then report them one metric per line.
missing_by_column = df[
    [
        'Ligand Efficiency BEI',
        'Ligand Efficiency LE',
        'Ligand Efficiency LLE',
        'Ligand Efficiency SEI',
    ]
].isna().sum()

bei_count = missing_by_column['Ligand Efficiency BEI']
le_count = missing_by_column['Ligand Efficiency LE']
lle_count = missing_by_column['Ligand Efficiency LLE']
sei_count = missing_by_column['Ligand Efficiency SEI']

print("Number of rows with NaN for BEI:", bei_count)
print("Number of rows with NaN for LE:", le_count)
print("Number of rows with NaN for LLE:", lle_count)
print("Number of rows with NaN for SEI:", sei_count)

Number of rows with NaN for BEI: 1008
Number of rows with NaN for LE: 1014
Number of rows with NaN for LLE: 1014
Number of rows with NaN for SEI: 1014


In [10]:
# Keep only the rows that have a value in the LE column.
has_le = df['Ligand Efficiency LE'].notna()
df = df[has_le]

In [11]:
# Re-check the ligand-efficiency columns for missing values after the drop.
nan_counts = df.isna().sum()
bei_count = nan_counts['Ligand Efficiency BEI']
le_count = nan_counts['Ligand Efficiency LE']
lle_count = nan_counts['Ligand Efficiency LLE']
sei_count = nan_counts['Ligand Efficiency SEI']

for message, count in (
    ("Number of rows with NaN for BEI:", bei_count),
    ("Number of rows with NaN for LE:", le_count),
    ("Number of rows with NaN for LLE:", lle_count),
    ("Number of rows with NaN for SEI:", sei_count),
):
    print(message, count)

Number of rows with NaN for BEI: 0
Number of rows with NaN for LE: 0
Number of rows with NaN for LLE: 0
Number of rows with NaN for SEI: 0


In [12]:
# Preview the first five rows of the cleaned table.
preview = df.head(n=5)
print(preview)

    Molecular Weight  AlogP  \
7             380.38   4.55   
9             436.83   3.78   
10            427.84   4.12   
13            360.82   3.47   
14            298.75   2.11   

                                               Smiles   IC50_nM  \
7      Cc1cccc(C(=O)NCC2(c3ccc(F)nc3)CCC(F)(F)CC2)c1F    78.000   
9   O=C1c2nnc(-c3cc[nH]n3)n2CC(C2CC2)N1Cc1cccc(C(F...     1.100   
10  C[C@H]1Cn2c(nnc2-c2cscn2)C(=O)N1Cc1cccc(C(F)(F...     0.200   
13       Cc1ccc(C)c(N2CCN(Cc3ccc(F)cc3Cl)C(=O)C2=O)c1     7.943   
14              CCN1C(=O)CC[C@H]1C(=O)NCc1c(F)cccc1Cl  1000.000   

    pChEMBL Value  Ligand Efficiency BEI  Ligand Efficiency LE  \
7            7.11                  18.69                  0.36   
9            8.96                  20.51                  0.41   
10           9.70                  22.67                  0.47   
13           8.10                  22.45                  0.44   
14           6.00                  20.08                  0.41   

    Ligand Ef

In [13]:
# Persist the cleaned table without its row index.
# NOTE(review): "Smules" in the file name looks like a typo for "Smiles"; the
# path is left unchanged because other code may read this exact file.
processed_csv_path = 'Processed_Human_Smules_Data.csv'
df.to_csv(processed_csv_path, index=False)

In [14]:
def smiles_to_3d_structures(df, column_name='Smiles', output_dir='./ligand_3d_structures'):
    """Write one 3D-embedded, UFF-optimised SDF file per SMILES entry in ``df``.

    NOTE(review): a function with this same name is defined again later in the
    notebook; once that cell runs, the later definition replaces this one.

    Args:
        df: Table holding the SMILES strings.
        column_name: Name of the column containing the SMILES strings.
        output_dir: Folder receiving ``ligand_<index + 1>.sdf`` files, where
            ``index`` is the row's index label; created if missing.

    Returns:
        None. One status line is printed per row. Rows that cannot be parsed,
        embedded or optimised are reported and skipped instead of aborting
        the whole run.
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    for index, row in df.iterrows():
        smiles = row[column_name]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit returns None for SMILES it cannot parse; AddHs would
            # raise on it and stop the loop.
            print(f"Skipping invalid SMILES: {smiles}")
            continue
        mol = Chem.AddHs(mol)  # Add hydrogens
        if AllChem.EmbedMolecule(mol, AllChem.ETKDG()) != 0:
            # A non-zero result means no conformer was generated, so there is
            # nothing to optimise or write.
            print(f"Embedding failed for: {smiles}")
            continue
        try:
            AllChem.UFFOptimizeMolecule(mol)  # Perform a UFF optimization
        except ValueError as e:
            print(f"Optimization failed for: {smiles}, Error: {str(e)}")
            continue
        output_file = os.path.join(output_dir, f"ligand_{index+1}.sdf")
        writer = Chem.SDWriter(output_file)
        try:
            writer.write(mol)
        finally:
            # Always release the file handle, even if the write fails.
            writer.close()
        print(f"Saved: {output_file}")

In [16]:
# One ligand is attempted per remaining row.
ligand_count = len(df)
print(f"Number of ligands to process: {ligand_count}")

Number of ligands to process: 2922


In [21]:
import shutil

# Remove the output folder of a previous run, if there is one, so the next
# generation pass starts from an empty directory.
folder_path = 'ligand_3d_structures'
if not os.path.exists(folder_path):
    print(f"No folder found at {folder_path}, nothing to delete.")
else:
    shutil.rmtree(folder_path)
    print(f"Deleted folder: {folder_path}")

Deleted folder: ligand_3d_structures


In [22]:
def smiles_to_3d_structures(df, column_name='Smiles', output_dir='./ligand_3d_structures',
                            random_seed=42):
    """Write one 3D-embedded, UFF-optimised SDF file per SMILES entry in ``df``.

    Args:
        df: Table holding the SMILES strings.
        column_name: Name of the column containing the SMILES strings.
        output_dir: Folder receiving ``ligand_<index + 1>.sdf`` files, where
            ``index`` is the row's index label; created if missing.
        random_seed: Seed set on the ETKDG embedding parameters so repeated
            runs produce the same conformers. Pass ``None`` to leave the
            parameters at RDKit's defaults.

    Returns:
        None. One status line is printed per row. Rows whose SMILES is missing
        or unparsable, or that fail embedding or optimisation, are reported
        and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Only the SMILES column is needed, so iterate over it directly instead of
    # materialising a full row object for every record.
    for index, smiles in df[column_name].items():
        if not isinstance(smiles, str):
            # Missing values (e.g. NaN) are not strings; RDKit would raise on
            # them instead of returning None.
            print(f"Skipping invalid SMILES: {smiles}")
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"Skipping invalid SMILES: {smiles}")
            continue
        mol = Chem.AddHs(mol)  # Add hydrogens
        embed_params = AllChem.ETKDG()
        if random_seed is not None:
            embed_params.randomSeed = random_seed
        if AllChem.EmbedMolecule(mol, embed_params) != 0:  # Check if embedding is successful
            print(f"Embedding failed for: {smiles}")
            continue
        try:
            AllChem.UFFOptimizeMolecule(mol)  # Perform a UFF optimization
        except ValueError as e:
            print(f"Optimization failed for: {smiles}, Error: {str(e)}")
            continue
        output_file = os.path.join(output_dir, f"ligand_{index+1}.sdf")
        writer = Chem.SDWriter(output_file)
        try:
            writer.write(mol)
        finally:
            # Always release the file handle, even if the write fails.
            writer.close()
        print(f"Saved: {output_file}")

In [23]:
# Generate the ligand files, spelling out the column and folder being used.
smiles_to_3d_structures(
    df,
    column_name='Smiles',
    output_dir='./ligand_3d_structures',
)

Saved: ./ligand_3d_structures/ligand_8.sdf
Saved: ./ligand_3d_structures/ligand_10.sdf
Saved: ./ligand_3d_structures/ligand_11.sdf
Saved: ./ligand_3d_structures/ligand_14.sdf
Saved: ./ligand_3d_structures/ligand_15.sdf
Saved: ./ligand_3d_structures/ligand_16.sdf
Saved: ./ligand_3d_structures/ligand_17.sdf
Saved: ./ligand_3d_structures/ligand_18.sdf
Saved: ./ligand_3d_structures/ligand_19.sdf
Saved: ./ligand_3d_structures/ligand_20.sdf
Saved: ./ligand_3d_structures/ligand_21.sdf
Saved: ./ligand_3d_structures/ligand_22.sdf
Saved: ./ligand_3d_structures/ligand_23.sdf
Saved: ./ligand_3d_structures/ligand_24.sdf
Saved: ./ligand_3d_structures/ligand_25.sdf
Saved: ./ligand_3d_structures/ligand_39.sdf
Saved: ./ligand_3d_structures/ligand_40.sdf
Saved: ./ligand_3d_structures/ligand_46.sdf
Saved: ./ligand_3d_structures/ligand_47.sdf
Saved: ./ligand_3d_structures/ligand_58.sdf
Saved: ./ligand_3d_structures/ligand_59.sdf
Saved: ./ligand_3d_structures/ligand_60.sdf
Saved: ./ligand_3d_structures/lig

[18:34:02] UFFTYPER: Unrecognized charge state for atom: 1
[18:34:02] UFFTYPER: Unrecognized charge state for atom: 1


Saved: ./ligand_3d_structures/ligand_3084.sdf
Saved: ./ligand_3d_structures/ligand_3085.sdf
Saved: ./ligand_3d_structures/ligand_3099.sdf
Saved: ./ligand_3d_structures/ligand_3100.sdf
Saved: ./ligand_3d_structures/ligand_3101.sdf
Saved: ./ligand_3d_structures/ligand_3102.sdf
Saved: ./ligand_3d_structures/ligand_3103.sdf
Saved: ./ligand_3d_structures/ligand_3104.sdf
Saved: ./ligand_3d_structures/ligand_3108.sdf
Saved: ./ligand_3d_structures/ligand_3109.sdf
Saved: ./ligand_3d_structures/ligand_3110.sdf
Saved: ./ligand_3d_structures/ligand_3111.sdf
Saved: ./ligand_3d_structures/ligand_3112.sdf
Saved: ./ligand_3d_structures/ligand_3113.sdf
Saved: ./ligand_3d_structures/ligand_3125.sdf
Saved: ./ligand_3d_structures/ligand_3126.sdf
Saved: ./ligand_3d_structures/ligand_3128.sdf
Saved: ./ligand_3d_structures/ligand_3129.sdf
Saved: ./ligand_3d_structures/ligand_3147.sdf
Saved: ./ligand_3d_structures/ligand_3153.sdf
Saved: ./ligand_3d_structures/ligand_3155.sdf
Saved: ./ligand_3d_structures/liga

In [24]:
import os
# Count the entries in the output folder to see how many ligands were written.
created_entries = os.listdir('./ligand_3d_structures')
print(f"Files created: {len(created_entries)}")

Files created: 2921
