In [2]:
from ord_schema import message_helpers
from ord_schema.proto import dataset_pb2, reaction_pb2

# Read the Chan-Lam dataset from the local ord-data checkout and report its size
chan_lam_path = "ord-data/data/5c/ord_dataset-5c9a10329a8a48968d18879a48bb8ab2.pb.gz"
chan_lam = message_helpers.load_message(chan_lam_path, dataset_pb2.Dataset)
print(f"Dataset: {chan_lam.name}")
print(f"Reactions: {len(chan_lam.reactions)}")

# The first reaction serves as the worked example for the following cells
rxn = chan_lam.reactions[0]

# Enum number -> enum name lookups, bound once instead of spelled out per use
role_name = reaction_pb2.ReactionRole.ReactionRoleType.Name
identifier_type_name = reaction_pb2.CompoundIdentifier.CompoundIdentifierType.Name

print("\n=== INPUTS (Reactants) ===")
for input_key, reaction_input in rxn.inputs.items():
    print(f"\n{input_key}:")
    for component in reaction_input.components:
        role = role_name(component.reaction_role)
        for identifier in component.identifiers:
            # Only SMILES identifiers are shown; every one of them is printed
            if identifier_type_name(identifier.type) != "SMILES":
                continue
            print(f"  {role}: {identifier.value}")

print("\n=== OUTCOMES (Products) ===")
for outcome in rxn.outcomes:
    for product in outcome.products:
        for identifier in product.identifiers:
            if identifier_type_name(identifier.type) != "SMILES":
                continue
            print(f"  PRODUCT: {identifier.value}")

Dataset: Chan-Lam coupling of primary sulfonamides with boronic acids
Reactions: 9632

=== INPUTS (Reactants) ===

Boronic Acid:
  REACTANT: OB(O)C1=CC=C(C(F)(F)F)C=C1
  SOLVENT: ClCCCl

Sulfonamide:
  REACTANT: BrC1=CC(S(=O)(N)=O)=CC(C(OC)=O)=C1
  SOLVENT: ClCCCl

Base_Solid:
  REAGENT: [F-].[Cs+]

Catalyst:
  CATALYST: CC(=O)[O-].CC(=O)[O-].[Cu+2]

=== OUTCOMES (Products) ===
  PRODUCT: BrC1=CC(S(=O)(NC2=CC=C(C=C2)C(F)(F)F)=O)=CC(C(OC)=O)=C1
  PRODUCT: BrC1=CC(S(=O)(N(C2=CC=C(C(F)(F)F)C=C2)C3=CC=C(C=C3)C(F)(F)F)=O)=CC(C(OC)=O)=C1


In [3]:
def build_reaction_smiles(reaction):
    """
    Assemble a Reaction SMILES string of the form ``reactants>agents>products``.

    For each component of each reaction input, the first identifier whose type
    is SMILES is taken; components without a SMILES identifier are skipped.
    Components whose role is REACTANT go to the reactant list, and every other
    role (CATALYST, SOLVENT, REAGENT, etc.) goes to the agent list. Products
    are collected the same way from every product of every outcome.

    Args:
        reaction: Reaction message exposing ``inputs`` and ``outcomes``.

    Returns:
        Tuple ``(reaction_smiles, reactants, agents, products)`` where the last
        three are lists of SMILES strings and ``reaction_smiles`` joins each
        list with "." and the three groups with ">".
    """
    role_name = reaction_pb2.ReactionRole.ReactionRoleType.Name
    identifier_type_name = reaction_pb2.CompoundIdentifier.CompoundIdentifierType.Name

    def first_smiles(compound):
        """Return the value of the first SMILES identifier, or None if absent."""
        for identifier in compound.identifiers:
            if identifier_type_name(identifier.type) == "SMILES":
                return identifier.value
        return None

    reactants, agents, products = [], [], []

    # Inputs: split by role into reactants and agents
    for _input_key, reaction_input in reaction.inputs.items():
        for component in reaction_input.components:
            role = role_name(component.reaction_role)
            smiles = first_smiles(component)
            if smiles is None:
                continue
            bucket = reactants if role == "REACTANT" else agents
            bucket.append(smiles)

    # Outcomes: every product with a SMILES identifier is kept
    for outcome in reaction.outcomes:
        for product in outcome.products:
            smiles = first_smiles(product)
            if smiles is not None:
                products.append(smiles)

    # "." separates molecules within a group, ">" separates the three groups
    reaction_smiles = ">".join(
        ".".join(group) for group in (reactants, agents, products)
    )

    return reaction_smiles, reactants, agents, products

# Try the builder on the example Chan-Lam reaction
rxn_smiles, reactants, agents, products = build_reaction_smiles(rxn)

print("=== REACTION SMILES ===")
for label, smiles_list in (
    ("Reactants", reactants),
    ("Agents", agents),
    ("Products", products),
):
    print(f"\n{label}: {smiles_list}")
print(f"\n\nFull Reaction SMILES:\n{rxn_smiles}")

=== REACTION SMILES ===

Reactants: ['OB(O)C1=CC=C(C(F)(F)F)C=C1', 'BrC1=CC(S(=O)(N)=O)=CC(C(OC)=O)=C1']

Agents: ['ClCCCl', 'ClCCCl', '[F-].[Cs+]', 'CC(=O)[O-].CC(=O)[O-].[Cu+2]']

Products: ['BrC1=CC(S(=O)(NC2=CC=C(C=C2)C(F)(F)F)=O)=CC(C(OC)=O)=C1', 'BrC1=CC(S(=O)(N(C2=CC=C(C(F)(F)F)C=C2)C3=CC=C(C=C3)C(F)(F)F)=O)=CC(C(OC)=O)=C1']


Full Reaction SMILES:
OB(O)C1=CC=C(C(F)(F)F)C=C1.BrC1=CC(S(=O)(N)=O)=CC(C(OC)=O)=C1>ClCCCl.ClCCCl.[F-].[Cs+].CC(=O)[O-].CC(=O)[O-].[Cu+2]>BrC1=CC(S(=O)(NC2=CC=C(C=C2)C(F)(F)F)=O)=CC(C(OC)=O)=C1.BrC1=CC(S(=O)(N(C2=CC=C(C(F)(F)F)C=C2)C3=CC=C(C=C3)C(F)(F)F)=O)=CC(C(OC)=O)=C1


In [4]:
# Number of reactions summarised from each dataset.
PREVIEW_COUNT = 5

# Build Reaction SMILES for the first few Chan-Lam reactions.
# Slicing (instead of indexing over a fixed range) means a dataset holding
# fewer than PREVIEW_COUNT reactions is summarised as far as it goes rather
# than raising IndexError.
print("Chan-Lam reactions:\n")
for i, reaction in enumerate(chan_lam.reactions[:PREVIEW_COUNT]):
    rxn_smiles, reactants, agents, products = build_reaction_smiles(reaction)
    print(f"Reaction {i+1}:")
    print(f"  Reactants: {len(reactants)}, Agents: {len(agents)}, Products: {len(products)}")
    print(f"  Has product: {'Yes' if products else 'NO'}")
    print()

# Now load a C-N coupling dataset and compare
cn_path = "ord-data/data/00/ord_dataset-00005539a1e04c809a9a78647bea649c.pb.gz"
cn_dataset = message_helpers.load_message(cn_path, dataset_pb2.Dataset)

print("\nAstraZeneca C-N coupling reactions:\n")
for i, reaction in enumerate(cn_dataset.reactions[:PREVIEW_COUNT]):
    rxn_smiles, reactants, agents, products = build_reaction_smiles(reaction)
    print(f"Reaction {i+1}:")
    print(f"  Reactants: {len(reactants)}, Agents: {len(agents)}, Products: {len(products)}")
    print(f"  Has product: {'Yes' if products else 'NO'}")
    # Only the first 80 characters are shown to keep the summary compact
    print(f"  Reaction SMILES: {rxn_smiles[:80]}...")
    print()

Chan-Lam reactions:

Reaction 1:
  Reactants: 2, Agents: 4, Products: 2
  Has product: Yes

Reaction 2:
  Reactants: 2, Agents: 4, Products: 2
  Has product: Yes

Reaction 3:
  Reactants: 2, Agents: 4, Products: 2
  Has product: Yes

Reaction 4:
  Reactants: 2, Agents: 4, Products: 2
  Has product: Yes

Reaction 5:
  Reactants: 2, Agents: 4, Products: 2
  Has product: Yes


AstraZeneca C-N coupling reactions:

Reaction 1:
  Reactants: 2, Agents: 3, Products: 1
  Has product: Yes
  Reaction SMILES: CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C(=O)OCC)Br.CC(C)N1CCNCC1>C(=O)([O-...

Reaction 2:
  Reactants: 2, Agents: 4, Products: 1
  Has product: Yes
  Reaction SMILES: C1=CC=C(C=C1)I.CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC>C(=O)([O-])[O-].[Cs+].[Cs+].COC1=...

Reaction 3:
  Reactants: 2, Agents: 4, Products: 1
  Has product: Yes
  Reaction SMILES: C1=CC=C(C=C1)I.CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC>C(=O)([O-])[O-].[Cs+].[Cs+].COC1=...

Reaction 4:
  Reactants: 2, Agents: 4, Products: 1
  Has product

In [5]:
# Compare transformations side by side
from rdkit import Chem

def _preview(text, limit):
    """Return ``text`` cut to ``limit`` characters, adding "..." only when something was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


def _print_molecule_elements(heading, smiles_list, limit=60):
    """Print each SMILES (shortened to ``limit`` characters) with its set of element symbols."""
    print(f"\n{heading}:")
    for smiles in smiles_list:
        print(f"  {_preview(smiles, limit)}")
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Report the failure instead of silently dropping the molecule
            print("  Elements: (SMILES could not be parsed)")
            continue
        elements = {atom.GetSymbol() for atom in mol.GetAtoms()}
        print(f"  Elements: {elements}")


def analyze_transformation(reaction, name):
    """Print the reactants and products of a reaction with their element sets.

    The output lists, for each reactant and each product returned by
    ``build_reaction_smiles``, the SMILES string (shortened to 60 characters)
    and the set of element symbols it contains, followed by the full Reaction
    SMILES (shortened to 100 characters). An ellipsis is appended only when a
    string was actually shortened. Agents are not listed individually.

    Args:
        reaction: Reaction message accepted by ``build_reaction_smiles``.
        name: Title printed in the section header.

    Returns:
        None. All results are written to stdout.
    """
    rxn_smiles, reactants, _agents, products = build_reaction_smiles(reaction)

    print(f"=== {name} ===")
    _print_molecule_elements("Reactants", reactants)
    _print_molecule_elements("Products", products)

    print(f"\nFull Reaction SMILES:")
    print(f"  {_preview(rxn_smiles, 100)}")
    print()

# Print the same side-by-side summary for one example from each dataset.
# NOTE(review): the "Suzuki-type" tag on the Chan-Lam example presumably refers
# only to the boronic acid coupling partner - confirm the intended label.
for example_reaction, title in (
    (chan_lam.reactions[0], "Chan-Lam (Suzuki-type)"),
    (cn_dataset.reactions[0], "AstraZeneca C-N Coupling"),
):
    analyze_transformation(example_reaction, title)

=== Chan-Lam (Suzuki-type) ===

Reactants:
  OB(O)C1=CC=C(C(F)(F)F)C=C1...
  Elements: {'C', 'F', 'O', 'B'}
  BrC1=CC(S(=O)(N)=O)=CC(C(OC)=O)=C1...
  Elements: {'N', 'Br', 'O', 'S', 'C'}

Products:
  BrC1=CC(S(=O)(NC2=CC=C(C=C2)C(F)(F)F)=O)=CC(C(OC)=O)=C1...
  Elements: {'N', 'Br', 'O', 'S', 'C', 'F'}
  BrC1=CC(S(=O)(N(C2=CC=C(C(F)(F)F)C=C2)C3=CC=C(C=C3)C(F)(F)F)...
  Elements: {'N', 'Br', 'O', 'S', 'C', 'F'}

Full Reaction SMILES:
  OB(O)C1=CC=C(C(F)(F)F)C=C1.BrC1=CC(S(=O)(N)=O)=CC(C(OC)=O)=C1>ClCCCl.ClCCCl.[F-].[Cs+].CC(=O)[O-].CC...

=== AstraZeneca C-N Coupling ===

Reactants:
  CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C(=O)OCC)Br...
  Elements: {'N', 'Br', 'O', 'C', 'F'}
  CC(C)N1CCNCC1...
  Elements: {'N', 'C'}

Products:
  CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C(=O)OCC)N4CCN(...
  Elements: {'N', 'C', 'F', 'O'}

Full Reaction SMILES:
  CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C(=O)OCC)Br.CC(C)N1CCNCC1>C(=O)([O-])[O-].[Cs+].[Cs+].C...



In [6]:
def get_elements(smiles):
    """Return the set of element symbols in ``smiles``.

    An empty set is returned when RDKit cannot parse the string.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return {atom.GetSymbol() for atom in parsed.GetAtoms()}
    return set()

def analyze_element_changes(reaction):
    """Compare the element sets of a reaction's reactants and products.

    Element symbols are pooled over all reactants and, separately, over all
    products; agents are ignored. The comparison is by presence only, so an
    element counts as lost only when no product contains it at all.

    Returns:
        Dict with keys 'reactant_elements', 'product_elements',
        'elements_lost', 'elements_gained' (all sets of symbols), plus the
        booleans 'boron_lost' and 'halide_lost' (any of Br, Cl, I, F lost).
    """
    _, reactants, _, products = build_reaction_smiles(reaction)

    # Pool the element symbols on each side of the reaction
    reactant_elements = set().union(*(get_elements(smi) for smi in reactants))
    product_elements = set().union(*(get_elements(smi) for smi in products))

    only_in_reactants = reactant_elements.difference(product_elements)
    only_in_products = product_elements.difference(reactant_elements)

    halogens = {'Br', 'Cl', 'I', 'F'}

    return {
        'reactant_elements': reactant_elements,
        'product_elements': product_elements,
        'elements_lost': only_in_reactants,
        'elements_gained': only_in_products,
        'boron_lost': 'B' in only_in_reactants,
        'halide_lost': not only_in_reactants.isdisjoint(halogens),
    }

# Run the element comparison on the first reaction of each dataset
report_fields = (
    ("Reactant elements", 'reactant_elements'),
    ("Product elements", 'product_elements'),
    ("Lost", 'elements_lost'),
    ("Gained", 'elements_gained'),
    ("Boron lost", 'boron_lost'),
)

for heading, dataset in (
    ("=== Chan-Lam (Suzuki-type) ===", chan_lam),
    ("\n=== AstraZeneca C-N ===", cn_dataset),
):
    print(heading)
    result = analyze_element_changes(dataset.reactions[0])
    for label, field in report_fields:
        print(f"  {label}: {result[field]}")

=== Chan-Lam (Suzuki-type) ===
  Reactant elements: {'N', 'Br', 'O', 'S', 'C', 'F', 'B'}
  Product elements: {'N', 'Br', 'O', 'S', 'C', 'F'}
  Lost: {'B'}
  Gained: set()
  Boron lost: True

=== AstraZeneca C-N ===
  Reactant elements: {'N', 'C', 'Br', 'O', 'F'}
  Product elements: {'N', 'C', 'F', 'O'}
  Lost: {'Br'}
  Gained: set()
  Boron lost: False
