In [3]:
!git clone https://github.com/zhouxiangxin1998/DualDiff.git

Cloning into 'DualDiff'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 125 (delta 21), reused 119 (delta 21), pack-reused 0 (from 0)[K
Receiving objects: 100% (125/125), 37.02 MiB | 20.83 MiB/s, done.
Resolving deltas: 100% (21/21), done.


In [1]:
!pip install rdkit



In [9]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors
import pandas as pd
import glob

In [8]:
sdf_files = glob.glob('/content/DualDiff/outputs/composition_score/8/*/gt_ligand*.sdf')

print(f"Found {len(sdf_files)} SDF files.")

def compute_admet(mol):
    return {
        "MolWt": Descriptors.MolWt(mol),
        "LogP": Crippen.MolLogP(mol),
        "TPSA": rdMolDescriptors.CalcTPSA(mol),
        "NumHDonors": Descriptors.NumHDonors(mol),
        "NumHAcceptors": Descriptors.NumHAcceptors(mol),
        "NumRotatableBonds": Descriptors.NumRotatableBonds(mol),
        "NumRings": Descriptors.RingCount(mol)
    }

all_data = []

for sdf in sdf_files:
    mols = Chem.SDMolSupplier(sdf)
    mols = [m for m in mols if m is not None]
    sdf_name = sdf.split('8')[-1]

    for m in mols:
        desc = compute_admet(m)
        desc['SourceFile'] = sdf_name
        all_data.append(desc)

admet_df = pd.DataFrame(all_data)

def lipinski_filter(row):
    conditions = [
        row['MolWt'] <= 500,
        row['LogP'] <= 5,
        row['NumHDonors'] <= 5,
        row['NumHAcceptors'] <= 10
    ]
    return int(all(conditions))

def solubility_class(logp, tpsa):
    if logp < 5 and tpsa > 75:
        return 'Good'
    elif logp > 5:
        return 'Poor'
    else:
        return 'Moderate'

admet_df['Lipinski_Pass'] = admet_df.apply(lipinski_filter, axis=1)
admet_df['Solubility'] = [solubility_class(l, t) for l, t in zip(admet_df['LogP'], admet_df['TPSA'])]
columns_order = ['SourceFile'] + [col for col in admet_df.columns if col != 'SourceFile']
admet_df = admet_df[columns_order]

admet_df

Found 4 SDF files.


Unnamed: 0,SourceFile,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,NumRings,Lipinski_Pass,Solubility
0,/226/gt_ligand_2.sdf,298.379,2.8408,46.15,0,5,1,5,1,Moderate
1,/226/gt_ligand_1.sdf,277.496,5.2954,12.03,1,1,4,3,0,Poor
2,/104/gt_ligand_2.sdf,284.352,2.1867,57.15,1,5,0,5,1,Moderate
3,/104/gt_ligand_1.sdf,277.496,5.2954,12.03,1,1,4,3,0,Poor
