In [5]:
from rdkit import Chem
from rdkit.Chem import Draw
import random

def detailed_smiles_parse(original_smiles, augmented_smiles):
    """
    Provide a detailed comparison of SMILES strings
    
    Args:
    - original_smiles (str): Original molecule SMILES
    - augmented_smiles (str): Augmented molecule SMILES
    
    Returns:
    - Detailed difference description
    """
    def highlight_differences(orig, aug):
        # Identify differences character by character
        diffs = []
        for i, (o, a) in enumerate(zip(orig, aug)):
            if o != a:
                diffs.append(f"Position {i}: '{o}' → '{a}'")
        return diffs
    
    differences = highlight_differences(original_smiles, augmented_smiles)
    return differences

def safe_mol_from_smiles(smiles):
    """
    Safely convert SMILES to RDKit molecule with advanced parsing
    """
    # List of common SMILES replacements
    replacements = [
        ('[si]', 'Si'),   # Silicon notation
        ('[Si]', 'Si'),
        ('[Se]', 'Se'),   # Selenium notation
        ('=[Se]', '=Se')  # Alternative selenium notation
    ]
    
    # Try original SMILES first
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return mol
    
    # Try replacements
    for old, new in replacements:
        modified_smiles = smiles.replace(old, new)
        mol = Chem.MolFromSmiles(modified_smiles)
        if mol is not None:
            return mol
    
    print(f"Could not parse SMILES: {smiles}")
    return None

def visualize_augmentations(original_smiles, augmented_smiles_list):
    """
    Visualize augmentations with detailed analysis
    """
    print(f"Original SMILES: {original_smiles}")
    
    # Process each augmented molecule
    for i, aug_smiles in enumerate(augmented_smiles_list, 1):
        print(f"\nAugmentation {i}:")
        print(f"Augmented SMILES: {aug_smiles}")
        
        # Detailed SMILES comparison
        differences = detailed_smiles_parse(original_smiles, aug_smiles)
        if differences:
            print("SMILES Differences:")
            for diff in differences:
                print(f"  - {diff}")
        
        # Attempt to convert to molecule
        mol = safe_mol_from_smiles(aug_smiles)
        if mol is None:
            print("  Could not parse augmented molecule!")

def main():
    # Example molecules (Acetaminophen and its augmentations)
    original_smiles = 'CC(=O)Nc1ccc(N)cc1'
    
    # Manual Augmentations
    manual_augmentations = [
        'NC(=O)Nc1ccc(N)cc1',     # Nitrogen addition
        'CC(=[Se])Nc1ccc(N)cc1',  # Selenium substitution
        'CC(=O)Pc1ccc(N)cc1',     # Phosphorus substitution
        'CC(=O)N[si]1ccc(N)cc1',  # Silicon substitution
        'CC(=O)Nc1ccc(N)c[siH]1'  # Silicon in ring
    ]
    
    # GAN-like Augmentations
    gan_augmentations = [
        'CC(=O)N[si]1ccc(N)cc1',     # Silicon substitution
        'CC(=[Se])Nc1ccc(N)cc1'      # Selenium substitution
    ]
    
    print("--- Manual Augmentations ---")
    visualize_augmentations(original_smiles, manual_augmentations)
    
    print("\n--- GAN-like Augmentations ---")
    visualize_augmentations(original_smiles, gan_augmentations)

if __name__ == '__main__':
    main()

--- Manual Augmentations ---
Original SMILES: CC(=O)Nc1ccc(N)cc1

Augmentation 1:
Augmented SMILES: NC(=O)Nc1ccc(N)cc1
SMILES Differences:
  - Position 0: 'C' → 'N'

Augmentation 2:
Augmented SMILES: CC(=[Se])Nc1ccc(N)cc1
SMILES Differences:
  - Position 4: 'O' → '['
  - Position 5: ')' → 'S'
  - Position 6: 'N' → 'e'
  - Position 7: 'c' → ']'
  - Position 8: '1' → ')'
  - Position 9: 'c' → 'N'
  - Position 11: 'c' → '1'
  - Position 12: '(' → 'c'
  - Position 13: 'N' → 'c'
  - Position 14: ')' → 'c'
  - Position 15: 'c' → '('
  - Position 16: 'c' → 'N'
  - Position 17: '1' → ')'

Augmentation 3:
Augmented SMILES: CC(=O)Pc1ccc(N)cc1
SMILES Differences:
  - Position 6: 'N' → 'P'

Augmentation 4:
Augmented SMILES: CC(=O)N[si]1ccc(N)cc1
SMILES Differences:
  - Position 7: 'c' → '['
  - Position 8: '1' → 's'
  - Position 9: 'c' → 'i'
  - Position 10: 'c' → ']'
  - Position 11: 'c' → '1'
  - Position 12: '(' → 'c'
  - Position 13: 'N' → 'c'
  - Position 14: ')' → 'c'
  - Position 15: 'c' → 