# Molecular Validation Framework

In [1]:
import matplotlib.pyplot as plt
import torch  # used directly by later cells; do not rely on the star import below to provide it

from diffusion_gnn.models import MolecularPropertyPredictor, create_molecular_gnn
from diffusion_gnn.data.deepchem import DeepChemMolecularDataset
from diffusion_gnn.utils.validation import *


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/jantinebroek/miniconda3/envs/diff_gnn/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/jantinebroek/miniconda3/envs/diff_gnn/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/jantinebroek/miniconda3/envs/diff_gnn/lib/python3.11/site-packages/ipykernel/kernelapp.py

In [2]:
# Upper bound on how many SMILES strings are pulled from the dataset for this demo.
MAX_TRAINING_SMILES = 500

# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset and extract real SMILES
dataset = DeepChemMolecularDataset('tox21', max_atoms=30)

# Read `ids` exactly once and slice it, instead of re-evaluating
# `dataset.dataset.ids` for every element.
# NOTE(review): for disk-backed DeepChem datasets `ids` is presumably rebuilt
# on each access, which would make the per-element form needlessly slow -- confirm.
all_ids = dataset.dataset.ids
n_smiles = min(MAX_TRAINING_SMILES, len(dataset.dataset))
training_smiles = list(all_ids[:n_smiles])

print(f"Loaded {len(training_smiles)} training SMILES")

# Extract real molecular properties
property_df = extract_molecular_properties(training_smiles)
print(f"Extracted properties for {len(property_df)} valid molecules")
print(property_df.head())

INFO:diffusion_gnn.data.deepchem:Loading DeepChem dataset: tox21
INFO:deepchem.data.datasets:Loading dataset from disk.
INFO:deepchem.data.datasets:Loading dataset from disk.
INFO:deepchem.data.datasets:Loading dataset from disk.
INFO:diffusion_gnn.data.deepchem:Loaded tox21 dataset with 6258 molecules


Loaded 500 training SMILES
Extracted properties for 500 valid molecules
                                     smiles  molecular_weight    logp    tpsa  \
0                 CC(O)(P(=O)(O)O)P(=O)(O)O           206.027 -0.9922  135.29   
1      CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C           290.444  4.8172   36.92   
2             OC[C@H](O)[C@@H](O)[C@H](O)CO           152.146 -2.9463  101.15   
3  CCCCCCCC(=O)[O-].CCCCCCCC(=O)[O-].[Zn+2]           351.802  2.1911   80.26   
4                         CC(C)COC(=O)C(C)C           144.214  1.8416   26.30   

   num_atoms  num_bonds  
0         11         10  
1         20         19  
2         10          9  
3         21         18  
4         10          9  




In [None]:
# Training configuration for the demo property predictor.
SEED = 42               # fixed seed so weight init is repeatable across runs
N_EPOCHS = 50
N_TRAIN_EXAMPLES = 100  # Use first 100 for speed
LOG_EVERY = 10          # print the mean loss every LOG_EVERY epochs

# Seed before the model is built so parameter initialisation is reproducible.
torch.manual_seed(SEED)

# Create property predictor
property_gnn = create_molecular_gnn(
    atom_dim=dataset._get_atom_feature_dim(),
    bond_dim=dataset._get_bond_feature_dim(),
    hidden_dim=64,
    num_layers=2,
    gnn_type='gat'
)

property_predictor = MolecularPropertyPredictor(
    gnn=property_gnn,
    num_tasks=3  # MW, LogP, TPSA
).to(device)

# Prepare training data: one (graph, [MW, LogP, TPSA]) pair per molecule that
# converts to a graph; molecules for which smiles_to_graph returns None are skipped.
# NOTE(review): the three targets are regressed on their raw scales, so the MSE
# is likely dominated by molecular weight -- consider standardising the targets.
training_data = []
for _, row in property_df.iterrows():
    graph = dataset.smiles_to_graph(row['smiles'])
    if graph is not None:
        targets = [row['molecular_weight'], row['logp'], row['tpsa']]
        training_data.append((graph, targets))

print(f"Created {len(training_data)} training examples")

# Slice once instead of on every epoch and again in the log line.
train_subset = training_data[:N_TRAIN_EXAMPLES]
if not train_subset:
    # Without this guard the first log line fails with a bare ZeroDivisionError.
    raise ValueError(
        "No training examples could be built from property_df; "
        "cannot train the property predictor."
    )

# Train property predictor
optimizer = torch.optim.Adam(property_predictor.parameters(), lr=1e-3)
property_predictor.train()

for epoch in range(N_EPOCHS):
    total_loss = 0.0
    for graph, targets in train_subset:
        graph = graph.to(device)
        # Wrap targets in an outer list so the tensor has a leading dimension of 1.
        target_tensor = torch.tensor([targets], dtype=torch.float32, device=device)
        # All-zero batch vector with one entry per row of graph.x
        # (presumably marks every node as belonging to graph 0 -- confirm against the model).
        batch = torch.zeros(graph.x.shape[0], dtype=torch.long, device=device)

        pred = property_predictor(graph.x, graph.edge_index, graph.edge_attr, batch)
        loss = torch.nn.functional.mse_loss(pred, target_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % LOG_EVERY == 0:
        print(f"Epoch {epoch}: Loss = {total_loss/len(train_subset):.4f}")

print("Property predictor trained!")



In [None]:
# Demo stand-in for a real generation pipeline: hold out a slice of the real
# SMILES and treat it as if it had been generated. The first 400 entries act
# as the reference ("training") set, entries 400-449 as the "generated" set.
training_subset = training_smiles[:400]
generated_smiles = training_smiles[400:450]

print(f"Testing validation on {len(generated_smiles)} molecules")

# Compute validity, novelty/uniqueness, and per-molecule properties
validity_results = calculate_validity_metrics(generated_smiles)
novelty_results = calculate_novelty_uniqueness(generated_smiles, training_subset)
generated_properties = extract_molecular_properties(generated_smiles)

# Report the three headline fractions, three decimals each
print("\nValidation Results:")
for metric_label, metric_value in (
    ("Validity", validity_results['validity']),
    ("Uniqueness", novelty_results['uniqueness']),
    ("Novelty", novelty_results['novelty']),
):
    print(f"{metric_label}: {metric_value:.3f}")

In [None]:
def create_validation_dashboard(generated_props, training_props, validity, novelty):
    """Draw a 2x3 dashboard summarising generation quality and show it.

    Panels: a validity pie chart, a uniqueness/novelty bar chart, and one
    density histogram per property (molecular weight, logP, TPSA) overlaying
    the training and generated distributions. The sixth grid cell is unused
    and hidden.

    Args:
        generated_props: Table of generated-molecule properties, indexable by
            the column names 'molecular_weight', 'logp' and 'tpsa'. Skipped
            when its length is 0.
        training_props: Same layout for the reference molecules. Skipped when
            its length is 0.
        validity: Mapping with keys 'valid_count', 'total_count' and 'validity'.
        novelty: Mapping with keys 'uniqueness' and 'novelty'.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))

    # Validity pie chart
    validity_ax = axes[0, 0]
    valid_count = validity['valid_count']
    invalid_count = validity['total_count'] - valid_count

    if valid_count + invalid_count <= 0:
        # A pie chart cannot be normalised when there is nothing to count;
        # show a placeholder instead of letting matplotlib fail.
        validity_ax.text(0.5, 0.5, 'No molecules', ha='center', va='center',
                         transform=validity_ax.transAxes)
        validity_ax.axis('off')
    elif invalid_count > 0:
        validity_ax.pie([valid_count, invalid_count], labels=['Valid', 'Invalid'], autopct='%1.1f%%')
    else:
        validity_ax.pie([valid_count], labels=['Valid'], autopct='%1.1f%%')
    validity_ax.set_title(f"Validity: {validity['validity']:.2f}")

    # Novelty/Uniqueness bars
    metrics_ax = axes[0, 1]
    metrics = ['Uniqueness', 'Novelty']
    values = [novelty['uniqueness'], novelty['novelty']]
    bars = metrics_ax.bar(metrics, values, color=['skyblue', 'lightcoral'])
    # Leave headroom above 1.0 so the value label of a full-height bar stays
    # inside the axes instead of running into the title.
    metrics_ax.set_ylim(0, 1.1)
    metrics_ax.set_title('Uniqueness & Novelty')
    metrics_ax.set_ylabel('Fraction')

    # Add value labels on bars
    for bar, value in zip(bars, values):
        metrics_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                        f'{value:.3f}', ha='center', va='bottom')

    # Property distributions comparison: one panel per property, filling the
    # remaining cells in row-major order.
    properties = ['molecular_weight', 'logp', 'tpsa']
    property_axes = [axes[0, 2], axes[1, 0], axes[1, 1]]
    for prop, ax in zip(properties, property_axes):
        has_data = False

        if len(training_props) > 0:
            ax.hist(training_props[prop], alpha=0.6, label='Training', bins=20, density=True)
            has_data = True
        if len(generated_props) > 0:
            ax.hist(generated_props[prop], alpha=0.6, label='Generated', bins=20, density=True)
            has_data = True

        ax.set_title(f'{prop.replace("_", " ").title()}')
        ax.set_ylabel('Density')
        if has_data:
            # Calling legend() with no labelled artists only produces a warning.
            ax.legend()

    # Only five of the six grid cells are used; hide the empty frame.
    axes[1, 2].axis('off')

    plt.tight_layout()
    plt.show()

# Render the dashboard comparing the "generated" molecules with the reference set
create_validation_dashboard(
    generated_props=generated_properties,
    training_props=property_df,
    validity=validity_results,
    novelty=novelty_results,
)

# Plain-text summary of the same metrics
rule = "=" * 60
print("\n" + rule)
print("MOLECULAR GENERATION VALIDATION SUMMARY")
print(rule)
print(f"Total molecules analyzed: {validity_results['total_count']}")
for heading, score in (
    ("Validity:     ", validity_results['validity']),
    ("Uniqueness:   ", novelty_results['uniqueness']),
    ("Novelty:      ", novelty_results['novelty']),
):
    print(f"{heading}{score:.3f}")
print(f"Novel count:  {novelty_results['novel_count']}")

# Property averages are only meaningful when at least one molecule was parsed
if len(generated_properties) > 0:
    print("\nProperty Statistics (Generated):")
    for heading, column, spec in (
        ("Avg MW:       ", 'molecular_weight', ".1f"),
        ("Avg LogP:     ", 'logp', ".2f"),
        ("Avg TPSA:     ", 'tpsa', ".1f"),
    ):
        print(heading + format(generated_properties[column].mean(), spec))