# Getting Started with OChem Helper

This notebook demonstrates basic usage of the OChem Helper molecular discovery platform.

In [None]:
# Setup
import sys
sys.path.append('../src')

# Import necessary modules
from models.generative import MoleculeGenerator
from models.predictive import PropertyPredictor
from core.validators import MoleculeValidator
from core.descriptors import MolecularDescriptors

# For visualization
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Molecule Generation

Generate novel molecules using the trained VAE model.

In [None]:
# Build the generator (no constructor arguments are passed)
generator = MoleculeGenerator()

# Unconstrained sampling: request ten candidates
molecules = generator.generate(n_molecules=10)
print(f"Generated {len(molecules)} molecules:")

# Preview at most the first five entries, numbered from 1
for rank, smiles in enumerate(molecules[:5], start=1):
    print(f"{rank}. {smiles}")

In [None]:
# Visualize generated molecules
# Parse each of the first six SMILES exactly once. MolFromSmiles returns None
# for a string RDKit cannot parse; those entries are left out of the grid.
# (Parsing inside the filter AND for the value would double both the work and
# any parse-error messages RDKit emits.)
candidates = [Chem.MolFromSmiles(smi) for smi in molecules[:6]]
mols = [candidate for candidate in candidates if candidate is not None]

# Three structures per row, each drawn in a 300x300 tile
img = Draw.MolsToGridImage(mols, molsPerRow=3, subImgSize=(300, 300))
img

## 2. Targeted Generation

Generate molecules with specific property constraints.

In [None]:
# Property ranges for drug-like candidates
target_properties = dict(
    logP=[2.0, 4.0],   # lipophilicity range
    MW=[250, 450],     # molecular weight range
    TPSA=[40, 100],    # polar surface area range
)

# Reuse the existing generator, this time passing the ranges as constraints
targeted_molecules = generator.generate(n_molecules=10, target_properties=target_properties)

print(f"Generated {len(targeted_molecules)} molecules with constraints")

## 3. Molecule Validation

Validate generated molecules and check drug-likeness.

In [None]:
# Initialize validator
validator = MoleculeValidator()

# One row per inspected molecule. Molecules that fail SMILES validation are
# kept in the table (valid=False, drug-likeness fields left empty) rather than
# being silently dropped, so the `valid` column actually reports rejections
# and downstream `if result['valid']` checks remain meaningful.
validation_results = []

for mol in molecules[:10]:
    # Validate SMILES
    validation = validator.validate_smiles(mol)

    if not validation['valid']:
        validation_results.append({
            'smiles': mol,
            'valid': False,
            'canonical': None,
            'lipinski': None,
            'veber': None,
            'violations': ''
        })
        continue

    # Drug-likeness rules are only evaluated for molecules that validated
    drug_like = validator.check_drug_likeness(mol)

    validation_results.append({
        'smiles': mol,
        'valid': validation['valid'],
        'canonical': validation['canonical_smiles'],
        'lipinski': drug_like['lipinski_compliant'],
        'veber': drug_like['veber_compliant'],
        'violations': ', '.join(drug_like['violations'])
    })

# Display results
df = pd.DataFrame(validation_results)
df

## 4. Property Calculation

Calculate molecular descriptors for the generated molecules.

In [None]:
# Descriptor calculator shared by the rest of the notebook
descriptor_calc = MolecularDescriptors()

# Collect one record per validated molecule whose descriptors were computed
descriptor_results = []

for entry in validation_results:
    if not entry['valid']:
        continue

    # Descriptors are computed from the canonical form, not the raw string
    canonical = entry['canonical']
    computed = descriptor_calc.calculate(canonical)

    # A falsy result means nothing usable came back, so the molecule is skipped
    if computed:
        record = {'smiles': canonical, **computed}
        descriptor_results.append(record)

# Tabulate and show values rounded to two decimals
df_desc = pd.DataFrame(descriptor_results)
df_desc.round(2)

## 5. Property Distribution Analysis

Analyze the distribution of molecular properties.

In [None]:
# Histogram grid of the computed descriptors (skipped when there are no rows)
if len(df_desc) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # One entry per panel, in row-major order:
    # (column, bar colour, x-axis label, title, (threshold, legend label) or None)
    panel_specs = [
        ('molecular_weight', 'blue', 'Molecular Weight',
         'Molecular Weight Distribution', (500, 'Lipinski limit')),
        ('logP', 'green', 'LogP',
         'LogP Distribution', (5, 'Lipinski limit')),
        ('tpsa', 'orange', 'TPSA',
         'TPSA Distribution', (140, 'Veber limit')),
        ('qed', 'purple', 'QED Score',
         'QED Score Distribution', None),
    ]

    for ax, (column, colour, xlabel, title, limit) in zip(axes.flat, panel_specs):
        ax.hist(df_desc[column], bins=10, alpha=0.7, color=colour)

        # Dashed red reference line only for panels that define a threshold
        if limit is not None:
            threshold, limit_label = limit
            ax.axvline(x=threshold, color='red', linestyle='--', label=limit_label)

        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')
        ax.set_title(title)

        # A legend is only drawn where a labelled reference line exists
        if limit is not None:
            ax.legend()

    plt.tight_layout()
    plt.show()

## 6. Property Prediction

Use the predictive model to estimate properties.

In [None]:
# Predictor built with no constructor arguments
predictor = PropertyPredictor()

# A handful of small, well-known molecules given as SMILES
test_molecules = ['CCO', 'CC(=O)O', 'c1ccccc1', 'CC(C)CC(=O)O']

predictions = predictor.predict(test_molecules)

# Tabulate only when the result carries a 'predictions' entry
if 'predictions' in predictions:
    # (output column name, key in the predictor's result)
    column_sources = (
        ('molecule', 'molecules'),
        ('prediction', 'predictions'),
        ('uncertainty', 'uncertainties'),
    )
    pred_df = pd.DataFrame(
        {column: predictions[key] for column, key in column_sources}
    )
    print("Property Predictions:")
    print(pred_df)

## 7. Molecular Filtering

Filter molecules based on multiple criteria.

In [None]:
# Multi-stage molecule filter
def filter_molecules(molecules, validator, descriptor_calc):
    """Keep molecules that pass validation, Lipinski and descriptor cut-offs.

    Each input is checked in order: SMILES validation, Lipinski compliance,
    descriptor calculation, then the thresholds molecular weight < 500,
    logP < 5 and QED > 0.5. A molecule failing any stage is skipped.

    Args:
        molecules: Iterable of SMILES strings to screen.
        validator: Object providing ``validate_smiles`` and
            ``check_drug_likeness``.
        descriptor_calc: Object providing ``calculate``.

    Returns:
        List of dicts with keys ``smiles`` (canonical form), ``MW``,
        ``logP`` and ``QED`` for every molecule that passed all stages.
    """
    survivors = []

    for smiles in molecules:
        check = validator.validate_smiles(smiles)
        if check['valid']:
            rules = validator.check_drug_likeness(smiles)
            if rules['lipinski_compliant']:
                # Descriptors are computed on the string as given
                props = descriptor_calc.calculate(smiles)
                if props:
                    within_limits = (
                        props['molecular_weight'] < 500
                        and props['logP'] < 5
                        and props['qed'] > 0.5
                    )
                    if within_limits:
                        survivors.append({
                            'smiles': check['canonical_smiles'],
                            'MW': props['molecular_weight'],
                            'logP': props['logP'],
                            'QED': props['qed'],
                        })

    return survivors

# Apply filtering
filtered_molecules = filter_molecules(molecules, validator, descriptor_calc)
print(f"Filtered {len(filtered_molecules)} molecules from {len(molecules)} generated")

# Display filtered molecules.
# Jupyter only renders the value of a cell's final top-level expression; an
# expression nested inside an `if` body is evaluated but never shown. The
# table is therefore built unconditionally and left as the last statement.
# Explicit columns keep the headers visible when no molecule passes.
filtered_df = pd.DataFrame(filtered_molecules, columns=['smiles', 'MW', 'logP', 'QED'])
filtered_df.round(2)

## Summary

In this notebook, we demonstrated:

1. **Molecule Generation**: Creating novel molecules using the VAE model
2. **Targeted Generation**: Generating molecules with specific property constraints
3. **Validation**: Checking SMILES validity and drug-likeness rules
4. **Property Calculation**: Computing molecular descriptors
5. **Property Analysis**: Visualizing property distributions
6. **Property Prediction**: Using ML models to predict properties
7. **Filtering**: Selecting molecules based on multiple criteria

Next steps:
- Try different property constraints for targeted generation
- Explore structure optimization (see notebook 02)
- Learn about synthesis planning (see notebook 03)
- Use the MCP server for AI integration (see notebook 04)