In [8]:
import deepchem as dc

# Load HIV dataset with a smaller fraction of the data
tasks, datasets, transformers = dc.molnet.load_hiv(
    featurizer='GraphConv', splitter='index', frac_train=0.1, frac_valid=0.1, frac_test=0.1)
train_dataset, valid_dataset, test_dataset = datasets

# print statistics about the dataset
print('Number of tasks in dataset:', len(tasks))
print('Number of samples in training set:', len(train_dataset))
print('Number of samples in validation set:', len(valid_dataset))
print('Number of samples in test set:', len(test_dataset))


Number of tasks in dataset: 1
Number of samples in training set: 32896
Number of samples in validation set: 4112
Number of samples in test set: 4112


In [12]:
# Check one sample
X, y, w, ids = train_dataset.X, train_dataset.y, train_dataset.w, train_dataset.ids
print(f"Features (Graph Representation): {X[0]}")
print(f"Label: {y[0]}")


Features (Graph Representation): <deepchem.feat.mol_graphs.ConvMol object at 0x794aa60aee90>
Label: [0.]


In [13]:
# Define model
model = dc.models.GraphConvModel(
    n_tasks=1, 
    mode='classification', 
    batch_size=32, 
    learning_rate=0.001
)

# Train model
model.fit(train_dataset, nb_epoch=20)


2025-03-17 13:11:29.484507: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


0.6974327723185222

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Get fingerprint features
tasks, datasets, transformers = dc.molnet.load_hiv(featurizer='ECFP')
train_dataset, valid_dataset, test_dataset = datasets

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(train_dataset.X, train_dataset.y.ravel())

# Predict
y_pred = rf.predict(valid_dataset.X)
print(f"Predicted Labels: {y_pred[:5]}")


[13:26:43] Explicit valence for atom # 3 Al, 6, is greater than permitted
Failed to featurize datapoint 137, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[13:26:44] Explicit valence for atom # 5 B, 5, is greater than permitted
Failed to featurize datapoint 987, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
Exception message: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1

Predicted Labels: [0. 0. 0. 0. 0.]


In [15]:
# Evaluate GraphConvModel
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
score = model.evaluate(valid_dataset, [metric], transformers)
print(f"GraphConv Model AUC: {score['roc_auc_score']}")


AttributeError: 'numpy.ndarray' object has no attribute 'atom_features'

In [None]:
from sklearn.metrics import roc_auc_score

rf_auc = roc_auc_score(valid_dataset.y, rf.predict_proba(valid_dataset.X)[:, 1])
print(f"Random Forest AUC: {rf_auc}")


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Example molecules
smiles_list = ["CCO", "CCN", "CCC", "C1CCCCC1", "CCOCC"]
mols = [Chem.MolFromSmiles(s) for s in smiles_list]

# Convert to features
featurizer = dc.feat.ConvMolFeaturizer()
features = featurizer.featurize(mols)


In [None]:
predictions = model.predict_on_batch(features)
print(f"Predicted Activity: {predictions}")


In [None]:
from rdkit.Chem import Descriptors

def lipinski_filter(mol):
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)
    
    return mw < 500 and logp < 5 and h_donors <= 5 and h_acceptors <= 10

filtered_mols = [mol for mol in mols if lipinski_filter(mol)]
print(f"Number of drug-like molecules: {len(filtered_mols)}")
