### Route efficiency tutorial

This tutorial shows how to read routes from RDF files and then compute route efficiency metrics according to this paper:

[Genheden S., Howell G. Measuring the efficiency of synthetic routes and transformations using vectors derived from similarity and complexity, 2025. ChemRxiv; 10.26434/chemrxiv-2025-t3fmt](https://chemrxiv.org/engage/chemrxiv/article-details/681dbeee927d1c2e66bae776)

We will use three experimental routes to atorvastatin as examples.

In [None]:
#@title Installation -- Run this cell to install rxnutils

# Install the reaction-utils package (imported further down as `rxnutils`) and seaborn for plotting
!pip install reaction-utils
!pip install seaborn
# Download the helper module `complexity.py`, from which `calc_cm_star` is imported later
!wget https://raw.githubusercontent.com/MolecularAI/reaction_utils/route-efficiency-notebook/examples/route-scoring/complexity.py -O complexity.py
# Download the three example route files; -O saves each under its name with the URL-encoded spaces restored
!wget https://raw.githubusercontent.com/MolecularAI/reaction_utils/refs/heads/main/examples/route-comparison/Roth%201991%20Chiral%20aux%20via%20Claisen.rdf -O "Roth 1991 Chiral aux via Claisen.rdf"
!wget https://raw.githubusercontent.com/MolecularAI/reaction_utils/refs/heads/main/examples/route-comparison/Roth%201991%20rac.rdf -O  "Roth 1991 rac.rdf"
!wget https://raw.githubusercontent.com/MolecularAI/reaction_utils/refs/heads/main/examples/route-comparison/US5298627%20route.rdf -O "US5298627 route.rdf"

In [None]:
import glob
from collections import defaultdict

import seaborn as sns
import pandas as pd

from rxnutils.routes.readers import read_rdf_file

We will read the RDF files in reverse alphabetical order and convert each one into a `SynthesisRoute` object using `read_rdf_file`.

In [None]:
# Collect every RDF file in the working directory, sorted in reverse order
rdf_filenames = sorted(glob.glob("*.rdf"), reverse=True)
# Read each file into a route object; routes[i] corresponds to rdf_filenames[i]
routes = list(map(read_rdf_file, rdf_filenames))
# Last expression of the cell: shows how many routes were read
len(routes)

Here is the first route

In [None]:
# Depict the first route (the first file in the reverse-sorted list of file names);
# as the last expression of the cell, the returned value is displayed by the notebook
routes[0].image()

Now, we will define some helper functions to calculate the compound similarities and complexities

In [None]:
from rdkit import Chem
from rdkit.Chem import rdFMCS
from complexity import calc_cm_star
import numpy as np

def norm_cm_star(smiles: str, min_val=3.5, max_val=12.0) -> float:
    """ Calculate the normalized CM* of a molecule

    The raw CM* value is mapped linearly from the range [min_val, max_val]
    onto [0, 1]. Raw values outside that range are clipped to the nearest
    end of the normalized scale.

    :param smiles: the SMILES string of the molecule
    :param min_val: the raw CM* value that is mapped to 0
    :param max_val: the raw CM* value that is mapped to 1
    :return: the normalized complexity, a value between 0 and 1
    """
    cstar = calc_cm_star(smiles)
    # Infinite values of either sign are mapped to 0
    if np.isinf(cstar):
        return 0.0

    if cstar < min_val:
        return 0.0

    # Clip to the top of the *normalized* scale; returning the raw max_val
    # here would put the molecule far outside the [0, 1] range
    if cstar > max_val:
        return 1.0

    return (cstar - min_val) / (max_val - min_val)

def calc_mcs_similarity(smiles_list: list[str]) -> np.ndarray:
    """ Calculate the MCS similarity between the target and every molecule in the chain

    The last SMILES in the list is taken as the target. For each molecule, the
    maximum common substructure (MCS) with the target is searched for, and the
    similarity is the size of the MCS divided by the size of the union of the
    two molecules, where size is counted as number of atoms plus number of bonds.
    If the MCS search is cancelled (timeout reached), the similarity is set to 0.

    :param smiles_list: the SMILES of the molecules in the chain, target last
    :return: one similarity per input molecule, in the same order
    """
    # Atoms must match on element, any bond may match any other bond,
    # and no ring-specific restrictions are imposed
    mcs_params = rdFMCS.MCSParameters()
    mcs_params.AtomTyper = rdFMCS.AtomCompare.CompareElements
    mcs_params.BondTyper = rdFMCS.BondCompare.CompareAny
    mcs_params.BondCompareParameters.RingMatchesRingOnly = False
    mcs_params.BondCompareParameters.CompleteRingsOnly = False
    mcs_params.Timeout = 30

    molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
    sizes = [molecule.GetNumAtoms() + molecule.GetNumBonds() for molecule in molecules]

    def _similarity_to_target(index):
        # Compare the molecule at `index` with the target (last molecule)
        result = rdFMCS.FindMCS([molecules[index], molecules[-1]], mcs_params)
        if result.canceled:  # ie timeout reached
            return 0
        common_size = result.numAtoms + result.numBonds
        union_size = sizes[index] + sizes[-1] - common_size
        return common_size / union_size

    return np.asarray([_similarity_to_target(index) for index in range(len(molecules))])

def calc_vmin(complexities, similarities):
    """ Calculate the V_min of a route, see Figure 6 in publication

    V_min is the length of the straight line from the first to the last
    molecule of the chain in the (complexity, similarity) plane.

    :param complexities: list of complexities of all the molecules in the chain
    :param similarities: list of similarities of all the molecules in the chain
    :return: the Euclidean distance between the first and the last point
    """
    # Net change along each axis, from the first to the last molecule
    complexity_change = complexities[-1] - complexities[0]
    similarity_change = similarities[-1] - similarities[0]
    return np.sqrt(complexity_change**2 + similarity_change**2)

def calc_veff(complexities, similarities, min_clip=0.1):
    """ Calculate the η (efficiency) of a route, see Figure 6 in publication

    The efficiency is V_min (the straight-line distance between the first and
    last molecule) divided by the total length of the path actually taken,
    i.e. the sum of the lengths of the steps between consecutive molecules.

    :param complexities: list of complexities of all the molecules in the chain
    :param similarities: list of similarities of all the molecules in the chain
    :param min_clip: minimum value to clip the path length, default is 0.1.
        Each step length below this value is raised to it before summing;
        pass None to disable the clipping
    :return: the ratio between V_min and the (clipped) path length
    """
    # Accept any sequence, not only numpy arrays: the slice arithmetic below
    # is not defined for plain Python lists
    complexities = np.asarray(complexities)
    similarities = np.asarray(similarities)

    vmin = calc_vmin(complexities, similarities)

    # Length of each step between consecutive molecules in the chain
    delta_comp = complexities[1:] - complexities[:-1]
    delta_sim = similarities[1:] - similarities[:-1]
    v_path = np.sqrt(delta_comp**2 + delta_sim**2)

    if min_clip is not None:
        v_path = v_path.clip(min_clip, None)
    return vmin / v_path.sum()

Now we can calculate the similarity and complexity vectors for all three routes.

In [None]:
# Column name -> list of values, accumulated over all routes (long format)
data = defaultdict(list)

for route, filename in zip(routes, rdf_filenames):
    # Only the first chain is analysed
    # NOTE(review): presumably the first chain is the longest linear sequence
    # (hence `lls`) -- confirm against the rxnutils documentation
    chains = route.chains(norm_cm_star)
    lls = chains[0]

    smiles_list = [mol["smiles"] for mol in lls]
    similarities = calc_mcs_similarity(smiles_list)
    complexities = np.asarray([mol["complexity"] for mol in lls])

    # Label the route with the file name minus its final extension only;
    # splitting on the first dot would truncate names that contain other dots
    # and could make two routes collide when grouping by name later on
    route_name = filename.rsplit(".", 1)[0]
    data["route"].extend([route_name] * len(complexities))
    data["nC"].extend(complexities)
    data["$S_{MCES}$"].extend(similarities)
# One row per molecule, with the columns "route", "nC" and "$S_{MCES}$"
data = pd.DataFrame(data)

In [None]:
# Plot complexity ("nC") against similarity ("$S_{MCES}$") as a line with
# circle markers, one panel (column) per route; both axes are shared between
# the panels so that the routes can be compared directly
sns.relplot(
    data=data, 
    x="$S_{MCES}$", 
    y="nC", 
    col="route", 
    kind="line", 
    marker="o", 
    dashes=False,
    facet_kws={"sharey": True, "sharex": True}
)

And we can calculate the $V_{min}$ and $\eta$ values of the routes

In [None]:
def calc_route_efficiency(route_df):
    """ Calculate the efficiency metrics of a single route

    :param route_df: the rows of one route, with the columns "nC" and "$S_{MCES}$"
    :return: a series with the entries "Vmin" and "η"
    """
    # Pull out the two coordinate vectors of the route once
    complexity_values = route_df["nC"].values
    similarity_values = route_df["$S_{MCES}$"].values
    metrics = {
        "Vmin": calc_vmin(complexity_values, similarity_values),
        "η": calc_veff(complexity_values, similarity_values),
    }
    return pd.Series(metrics)
# Compute the metrics separately for each route; include_groups=False keeps the
# grouping column ("route") out of the frame handed to calc_route_efficiency
data.groupby("route").apply(calc_route_efficiency, include_groups=False)