1. Simple Example: Loading and Tokenizing a Single Molecule


In [None]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from moltoken.tokenizer import MoleculeTokenizer

# Initialize the tokenizer
tokenizer = MoleculeTokenizer(
    max_atoms=50,
    spatial_resolution=0.1,
    consider_bonds=True
)

# Create a simple molecule (ethanol in this case)
mol = Chem.MolFromSmiles('CCO')
mol = Chem.AddHs(mol)  # Add explicit hydrogen atoms before embedding

# Generate 3D coordinates.
# A fixed seed keeps the embedded geometry reproducible across kernel
# restarts; without it RDKit picks a random seed on every run.
# EmbedMolecule returns the new conformer ID, or -1 when embedding fails,
# so fail here with a clear message instead of later at GetConformer().
conformer_id = AllChem.EmbedMolecule(mol, randomSeed=42)
if conformer_id < 0:
    raise RuntimeError("RDKit could not generate 3D coordinates for 'CCO'")

# Get coordinates and atom types
conf = mol.GetConformer()
coordinates = conf.GetPositions()
atom_types = [atom.GetSymbol() for atom in mol.GetAtoms()]

# Get bonds as (begin atom index, end atom index, bond order) tuples
bonds = [
    (
        bond.GetBeginAtomIdx(),
        bond.GetEndAtomIdx(),
        bond.GetBondTypeAsDouble()
    )
    for bond in mol.GetBonds()
]

# Tokenize the molecule
tokens = tokenizer.encode_molecule(
    coordinates=coordinates,
    atom_types=atom_types,
    bonds=bonds
)

print(f"Generated tokens: {tokens}")

# Decode back to molecular representation
reconstructed_coords, reconstructed_atoms, reconstructed_bonds = tokenizer.decode_to_molecule(tokens)

# Print the same summary for input and round-tripped output so they can be
# compared by eye.
print("\nOriginal structure:")
print(f"Atoms: {atom_types}")
print(f"Coordinates shape: {coordinates.shape}")
print(f"Number of bonds: {len(bonds)}")

print("\nReconstructed structure:")
print(f"Atoms: {reconstructed_atoms}")
print(f"Coordinates shape: {reconstructed_coords.shape}")
print(f"Number of bonds: {len(reconstructed_bonds)}")

2. Visualizing the Tokenization


In [None]:
from moltoken.visualization import MoleculeVisualizer

# Build a visualizer (constructed with no arguments)
visualizer = MoleculeVisualizer()

# Plot token distribution
# (`tokens` is whatever the most recently executed tokenization cell left bound)
visualizer.plot_token_distribution(tokens)

# Draw the molecule through visualize_attention, passing no attention weights.
# NOTE(review): the intent appears to be showing token assignments; confirm
# how MoleculeVisualizer handles attention_weights=None.
visualizer.visualize_attention(mol, attention_weights=None)

3. Batch Processing Multiple Molecules


In [None]:
# Process a list of SMILES strings
smiles_list = ['CCO', 'CC(=O)O', 'c1ccccc1']
all_tokens = []

for smiles in smiles_list:
    # Convert SMILES to 3D structure.
    # MolFromSmiles returns None for unparsable input; stop with a clear
    # message rather than failing inside AddHs.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    mol = Chem.AddHs(mol)

    # Fixed seed -> the same geometry on every run; -1 signals that RDKit
    # could not embed the molecule.
    if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
        raise RuntimeError(f"RDKit could not generate 3D coordinates for {smiles!r}")

    # Extract molecular information.
    # These names are intentionally left bound at notebook level: the
    # rotation-invariance cell reuses `coordinates`, `atom_types` and `bonds`.
    conf = mol.GetConformer()
    coordinates = conf.GetPositions()
    atom_types = [atom.GetSymbol() for atom in mol.GetAtoms()]

    # (begin atom index, end atom index, bond order) for every bond
    bonds = [
        (
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx(),
            bond.GetBondTypeAsDouble()
        )
        for bond in mol.GetBonds()
    ]

    # Tokenize
    tokens = tokenizer.encode_molecule(
        coordinates=coordinates,
        atom_types=atom_types,
        bonds=bonds
    )
    all_tokens.append(tokens)

print("Tokens for each molecule:")
for smiles, tokens in zip(smiles_list, all_tokens):
    print(f"{smiles}: {tokens}")

4. Working with Different File Formats


In [None]:
# Reading from different file formats
from moltoken.data import MoleculeProcessor

# Helper object that provides the file readers used below
processor = MoleculeProcessor()

# From PDB file
# NOTE(review): the path is relative to the working directory; the file must
# be present before this cell is run.
pdb_file = "example.pdb"
# read_pdb's result is unpacked into three values, passed straight on to the tokenizer
coords_pdb, atoms_pdb, bonds_pdb = processor.read_pdb(pdb_file)
# Arguments are positional here; assumes the order matches the
# coordinates / atom_types / bonds keywords used elsewhere — TODO confirm
tokens_pdb = tokenizer.encode_molecule(coords_pdb, atoms_pdb, bonds_pdb)

# From MOL file (same flow as the PDB case, using read_mol)
mol_file = "example.mol"
coords_mol, atoms_mol, bonds_mol = processor.read_mol(mol_file)
tokens_mol = tokenizer.encode_molecule(coords_mol, atoms_mol, bonds_mol)

5. Advanced Usage: Rotation Invariance


In [None]:
# Generate rotated versions of a molecule.
# NOTE: `coordinates`, `atom_types` and `bonds` are whatever the most recently
# executed cell left bound, and `processor` comes from the file-format cell,
# so run the notebook top to bottom before this cell.
rotated_coords = processor.generate_rotations(coordinates, n_rotations=4)

# Tokenize every rotated copy with the same atoms and bonds
rotation_tokens = []
for coords in rotated_coords:
    tokens = tokenizer.encode_molecule(coords, atom_types, bonds)
    rotation_tokens.append(tokens)

# All rotations should give similar tokens
for i, tokens in enumerate(rotation_tokens):
    print(f"Rotation {i}: {tokens}")

# Actually perform the comparison instead of leaving it to the reader.
# This only reports (no assert): exact equality may be stricter than the
# tokenizer guarantees. Assumes each token sequence can be converted to a
# NumPy array — TODO confirm against the tokenizer's return type.
if rotation_tokens:
    reference_tokens = np.asarray(rotation_tokens[0])
    n_identical = sum(
        np.array_equal(reference_tokens, np.asarray(candidate))
        for candidate in rotation_tokens
    )
    print(f"\n{n_identical}/{len(rotation_tokens)} rotations produce tokens identical to rotation 0")