# Data Loading for AnyMol-MoleculeSTM

This notebook demonstrates how to load and preprocess molecular datasets for AnyMol-MoleculeSTM.

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw

# Make the project root importable. The root is taken to be two levels above
# the current working directory (normally the folder containing this notebook).
module_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import AnyMol-MoleculeSTM modules
from anymol.data.utils import (
    load_smiles_from_file,
    calculate_molecular_properties,
    visualize_property_distribution,
    visualize_molecule_grid,
    preprocess_dataset
)
from anymol.data.dataset import MoleculeTokenizer, MoleculeDataset, MoleculeCollator
from anymol.initialization import AnyMolSTMInitializer

## 1. Creating Datasets

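First, let's see how to create datasets using the provided script, which supports three modes: `custom`, `chembl`, and `zinc`. Each cell below prints the command for one mode; uncomment the last line of a cell to actually run it.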

In [None]:
# Create output directory
os.makedirs('../../data/examples', exist_ok=True)

# Build the command once so the printed text and the runnable line cannot drift apart
custom_dataset_cmd = (
    "python ../../examples/create_dataset.py --dataset_type custom "
    "--num_molecules 100 --output_dir ../../data/examples"
)

# Display the command to run
print("Command to create a custom dataset:")
print(custom_dataset_cmd)

# Uncomment the following line to actually run the command
# !{custom_dataset_cmd}

In [None]:
# Using ChEMBL (requires internet connection and chembl_webresource_client)
# Build the command once so the printed text and the runnable line cannot drift apart
chembl_dataset_cmd = (
    "python ../../examples/create_dataset.py --dataset_type chembl "
    "--num_molecules 100 --output_dir ../../data/examples"
)
print("Command to create a ChEMBL dataset:")
print(chembl_dataset_cmd)

# Uncomment the following line to actually run the command
# !{chembl_dataset_cmd}

In [None]:
# Using ZINC (requires internet connection)
# Build the command once so the printed text and the runnable line cannot drift apart
zinc_dataset_cmd = (
    "python ../../examples/create_dataset.py --dataset_type zinc "
    "--num_molecules 100 --output_dir ../../data/examples"
)
print("Command to create a ZINC dataset:")
print(zinc_dataset_cmd)

# Uncomment the following line to actually run the command
# !{zinc_dataset_cmd}

## 2. Creating a Custom Dataset Programmatically

Instead of using the script, we can create a dataset programmatically using the utility functions.

In [None]:
# Define some example molecules (common drug molecules).
# Each entry is a SMILES string (no stereochemistry); the trailing comment
# names the compound that the string encodes. Every entry is distinct.
# Keep the order in sync with any legend list used when plotting them.
smiles_list = [
    "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
    "CC1(C)SC2C(NC(=O)CC3=CC=CC=C3)C(=O)N2C1C(=O)O",  # Penicillin G (benzylpenicillin)
    "CNC(C)CC1=CC=CC=C1",  # Methamphetamine
    "CN1C2=C(C(=O)N(C1=O)C)NC=N2",  # Theophylline (1,3-dimethylxanthine)
    "CC(C)NCC(O)COC1=CC=CC2=CC=CC=C21",  # Propranolol
    "CC1=CC(OC)=C(CC(C)N)C=C1OC",  # 2,5-Dimethoxy-4-methylamphetamine (DOM)
    "COC1=C(C=CC(=C1)CNC(=O)CCCCCCCC)O",  # Nonivamide (a capsaicin analog)
    "COC1=CC=C(C=C1)CC=C"  # Estragole
]

# Compute the four requested descriptors for the molecules in `smiles_list`
df = calculate_molecular_properties(
    smiles_list,
    property_names=["logP", "TPSA", "MolWt", "QED"],
)

# Preview the first rows of the resulting table
df.head()

In [None]:
# Plot the distribution of each computed property in `df`
visualize_property_distribution(
    df,
    property_names=["logP", "TPSA", "MolWt", "QED"],
)

In [None]:
# Visualize molecules in a 3-column grid.
# Legends are listed in the same order as the entries of `smiles_list`;
# the eighth entry is 2,5-dimethoxy-4-methylamphetamine, abbreviated DOM.
visualize_molecule_grid(df["SMILES"].tolist(), n_cols=3, legends=[
    "Aspirin", "Ibuprofen", "Caffeine", "Penicillin", "Methamphetamine",
    "Theophylline", "Propranolol", "DOM", "Capsaicin analog", "Estragole"
])

In [None]:
# Write the property table to CSV, omitting the index column
output_file = '../../data/examples/custom_drugs.csv'
df.to_csv(output_file, index=False)
print("Saved dataset to {}".format(output_file))

## 3. Loading Custom Datasets from Local Files

You can also load your own datasets from local files.

In [None]:
# Function to load molecules from a local file
def load_local_dataset(input_file, output_file=None, smiles_column="SMILES"):
    """Load SMILES from a local file, compute their properties, optionally save them.

    Args:
        input_file: Path of the file to read. It is passed unchanged to
            ``load_smiles_from_file``; no extension check is performed here.
        output_file: Optional CSV path. When truthy, the resulting table is
            written there without the index column and a confirmation is printed.
        smiles_column: Name of the column holding the SMILES strings
            (forwarded to ``load_smiles_from_file``).

    Returns:
        The object returned by ``calculate_molecular_properties`` when called
        with only the SMILES list (i.e. its default property set). It is
        presumably a pandas DataFrame, since it is saved via ``to_csv``.
    """
    # Read the raw SMILES strings from the requested column
    smiles = load_smiles_from_file(input_file, smiles_column=smiles_column)

    # Compute the properties for every loaded molecule
    properties = calculate_molecular_properties(smiles)

    # Persist the table only when the caller asked for it
    if output_file:
        properties.to_csv(output_file, index=False)
        print(f"Saved processed dataset to {output_file}")

    return properties

In [None]:
# Example: read back the CSV written earlier in this notebook and preview it
df_loaded = load_local_dataset(input_file='../../data/examples/custom_drugs.csv')
df_loaded.head()

## 4. Creating a MoleculeDataset for Training

Now let's create a MoleculeDataset that can be used for training the model.

In [None]:
# Create a tokenizer with a maximum length of 100
tokenizer = MoleculeTokenizer(max_length=100)

# Derive the vocabulary from the SMILES column of `df`
tokenizer.build_vocab_from_smiles(df["SMILES"].tolist())

# Report the number of entries in the token-to-index mapping
print("Vocabulary size: {}".format(len(tokenizer.token2idx)))

In [None]:
# Build a MoleculeDataset from the CSV saved earlier in this notebook
dataset = MoleculeDataset(
    data_file='../../data/examples/custom_drugs.csv',
    smiles_column="SMILES",
    property_names=["logP", "TPSA", "MolWt", "QED"],
    calculate_properties=False,  # Properties already calculated
    tokenizer=tokenizer,
    max_length=100,
)

# Report how many items the dataset exposes
print("Dataset size: {}".format(len(dataset)))

In [None]:
# Take the first item of the dataset and print each of its fields
example = dataset[0]
print("SMILES: {}".format(example['smiles']))
print("Input IDs shape: {}".format(example['input_ids'].shape))
print("Attention mask shape: {}".format(example['attention_mask'].shape))
print("Properties: {}".format(example['properties']))

# Turn the token IDs back into text with the tokenizer
decoded = tokenizer.decode(example['input_ids'])
print("Decoded SMILES: {}".format(decoded))

## 5. Using the AnyMolSTMInitializer

Finally, let's use the AnyMolSTMInitializer to load data and initialize the model.

In [None]:
# Configuration for the initializer: one section for the model, one for the data
config = {
    "model": dict(
        embedding_dim=256,  # Smaller for this example
        hidden_dim=128,
        num_layers=2,
        num_heads=4,
        property_names=["logP", "TPSA", "MolWt", "QED"],
    ),
    "data": dict(
        train_file='../../data/examples/custom_drugs.csv',
        batch_size=2,
        num_workers=0,  # For Jupyter notebook
    ),
}

# Build the initializer from the in-memory configuration dictionary
initializer = AnyMolSTMInitializer(
    config_dict=config,
)

# Ask the initializer to prepare its datasets and keep the result
datasets = initializer.prepare_datasets()

In [None]:
# Build the dataloaders through the initializer
dataloaders = initializer.create_dataloaders()

# Report the length of the 'train' dataloader
print("Number of batches in training dataloader: {}".format(len(dataloaders['train'])))

In [None]:
# Initialize the model on the CPU (quick here because the configured model is small)
# Note: In a real application, you would use the default model size
model = initializer.initialize_model(device="cpu")

# Report the total number of elements across all model parameters
print("Model initialized with {} parameters".format(
    sum(map(lambda p: p.numel(), model.parameters()))
))

## 6. Visualizing the Dataset

Let's visualize some sample molecules from the training dataset, along with the distributions of its properties.

In [None]:
# Draw six sample molecules from the "train" dataset
initializer.visualize_sample_molecules(
    dataset_key="train",
    num_samples=6,
)

In [None]:
# Plot the property distributions of the "train" dataset
initializer.visualize_dataset_properties(
    dataset_key="train",
)

## Summary

In this notebook, we've demonstrated how to:

1. Create datasets using the provided script
2. Create custom datasets programmatically
3. Load datasets from local files
4. Create a MoleculeDataset and tokenizer
5. Use the AnyMolSTMInitializer to streamline the process
6. Visualize molecules and property distributions

These methods can be extended to work with larger and more complex datasets as needed for your specific application.