# General tutorial

**Introduction.** Welcome to the SynPlanner tutorial, a detailed guide on utilizing a sophisticated retrosynthetic planning tool that combines Monte Carlo Tree Search (MCTS) with neural networks. This tutorial is designed for synthetic chemists and chemoinformaticians looking to deepen their understanding and application of SynPlanner in their work.

**About SynPlanner.** SynPlanner is intended as a one-box solution for building retrosynthesis planners and includes the original modules for reaction data curation, reaction rules extraction, retrosynthetic models training, and retrosynthesis planning. SynPlanner is based on the open-source Python 3 packages for reaction chemoinformatics and deep learning frameworks.

**Tutorial focus.** This tutorial will lead you through the steps of reaction data curation (standardization and filtration), reaction rules extraction, training retrosynthetic models (ranking policy network), and retrosynthesis planning in SynPlanner.

## 1. Set up input and output data locations

In [5]:
import os
import pickle
import shutil
from pathlib import Path
from synplan.utils.loading import download_all_data

# fetch the SynPlanner data bundle into a local folder
data_folder = Path("synplan_data").resolve()
download_all_data(save_to=data_folder)

# folder that collects every file produced by this tutorial
results_folder = Path("general_tutorial_results").resolve()
results_folder.mkdir(exist_ok=True)

# input data: resolve(strict=True) raises if a file is missing
original_data_path = (data_folder / "tutorial/uspto_tutorial.smi").resolve(strict=True)  # replace with custom data if needed
# original_data_path = (data_folder / "uspto/uspto_standardized.smi").resolve(strict=True)  # replace with custom data if needed

building_blocks_path = (data_folder / "building_blocks/building_blocks_em_sa_ln.smi").resolve(strict=True)

# output data: these files are created by the later steps of the tutorial
standardized_data_path = results_folder / "uspto_standardized.smi"
filtered_data_path = results_folder / "uspto_filtered.smi"
reaction_rules_path = results_folder / "uspto_reaction_rules.pickle"
ranking_policy_network_folder = results_folder / "ranking_policy_network"
ranking_policy_dataset_path = ranking_policy_network_folder / "ranking_policy_dataset.pt"  # the generated training set

Fetching 25 files:   0%|          | 0/25 [00:00<?, ?it/s]

## 2. Reaction data standardization

In [6]:
from synplan.chem.data.standardizing import (
    ReactionStandardizationConfig,
    standardize_reactions_from_file,
)

from synplan.chem.data.standardizing import (
    ReactionMappingStandardizer,
    FunctionalGroupsConfig,
    KekuleFormConfig,
    CheckValenceConfig,
    ImplicifyHydrogensConfig,
    CheckIsotopesConfig,
    AromaticFormConfig,
    MappingFixConfig,
    UnchangedPartsConfig,
    DuplicateReactionConfig,
)

### Standardization configuration

In [7]:
# Assemble the settings for the standardization pipeline. Every step receives
# a default-constructed config object (no arguments are passed to any of them).
# NOTE(review): presumably a step is skipped when its config is omitted —
# confirm against ReactionStandardizationConfig.
standardization_config = ReactionStandardizationConfig(
    reaction_mapping_config=ReactionMappingStandardizer(),
    functional_groups_config=FunctionalGroupsConfig(),
    kekule_form_config=KekuleFormConfig(),
    check_valence_config=CheckValenceConfig(),
    implicify_hydrogens_config=ImplicifyHydrogensConfig(),
    check_isotopes_config=CheckIsotopesConfig(),
    aromatic_form_config=AromaticFormConfig(),
    mapping_fix_config=MappingFixConfig(),
    unchanged_parts_config=UnchangedPartsConfig(),
    duplicate_reaction_config=DuplicateReactionConfig(),
)

### Running standardization

In [8]:
# keep a copy of the raw input data alongside the other results
shutil.copy(original_data_path, results_folder.joinpath('uspto_original.smi'))

standardize_reactions_from_file(
    config=standardization_config,  # the standardization settings
    input_reaction_data_path=original_data_path,  # the path to the original reaction data file
    standardized_reaction_data_path=standardized_data_path,  # the path to the standardized reaction data file
    num_cpus=4,
    batch_size=100,
)

Number of reactions processed: 71832 [07:24]


Initial number of parsed reactions: 71832
Standardized number of reactions: 69446


## 3. Reaction data filtration

In [9]:
from synplan.chem.data.filtering import (
    ReactionFilterConfig,  # the main config class
    filter_reactions_from_file,  # reaction filtration function
    # reaction filters:
    CCRingBreakingConfig,
    WrongCHBreakingConfig,
    CCsp3BreakingConfig,
    DynamicBondsConfig,
    MultiCenterConfig,
    NoReactionConfig,
    SmallMoleculesConfig,
)

### Filtration configuration

In [10]:
# Assemble the reaction filters; each filter is given its own config object.
filtration_config = ReactionFilterConfig(
    dynamic_bonds_config=DynamicBondsConfig(  # configuration for the 'dynamic bonds' filter
        min_bonds_number=1,  # minimum number of dynamic bonds for a reaction
        max_bonds_number=6,  # maximum number of dynamic bonds for a reaction
    ),
    no_reaction_config=NoReactionConfig(),  # configuration for the 'no reaction' filter.
    multi_center_config=MultiCenterConfig(),  # configuration for the 'multi-center reaction' filter
    wrong_ch_breaking_config=WrongCHBreakingConfig(),  # configuration for the 'C-H breaking' filter
    cc_sp3_breaking_config=CCsp3BreakingConfig(),  # configuration for the 'Csp3-C breaking' filter
    cc_ring_breaking_config=CCRingBreakingConfig(),  # configuration for the 'C-C ring breaking' filter
)

### Running filtration

In [11]:
# Apply the configured filters to the standardized reactions.
filter_reactions_from_file(
    config=filtration_config,  # the filtration settings
    input_reaction_data_path=standardized_data_path,  # the path to the standardized reaction data file
    filtered_reaction_data_path=filtered_data_path,  # the path to the filtered reaction data file
    num_cpus=4,
    batch_size=100,
)

Number of reactions processed: 69446 [04:17]


Initial number of reactions: 69446
Removed number of reactions: 1


## 4. Reaction rules extraction

In [12]:
from synplan.utils.config import RuleExtractionConfig
from synplan.chem.reaction_rules.extraction import extract_rules_from_reactions

### Rule extraction configuration

In [13]:
# Settings for reaction rule extraction.
# NOTE: the comments inside atom_info_retention describe what each flag
# controls; read them as "when set to True" — several flags are set to False.
extraction_config = RuleExtractionConfig(
    min_popularity=3,
    environment_atom_count=1,
    multicenter_rules=True,
    include_rings=False,
    keep_leaving_groups=True,
    keep_incoming_groups=False,
    keep_reagents=False,
    atom_info_retention={
        "reaction_center": {
            "neighbors": True,  # retains information about neighboring atoms to the reaction center
            "hybridization": True,  # preserves the hybridization state of atoms at the reaction center
            "implicit_hydrogens": False,  # includes data on implicit hydrogen atoms attached to the reaction center
            "ring_sizes": False,  # keeps information about the sizes of rings that reaction center atoms are part of
        },
        "environment": {
            "neighbors": False,  # retains information about neighboring atoms to the atoms in the environment of the reaction center
            "hybridization": False,  # preserves the hybridization state of atoms in the environment
            "implicit_hydrogens": False,  # includes data on implicit hydrogen atoms attached to atoms in the environment
            "ring_sizes": False,  # keeps information about the sizes of rings that environment atoms are part of
        },
    },
)

### Running rule extraction

In [14]:
# Extract reaction rules from the filtered reactions and store them in a pickle file.
extract_rules_from_reactions(
    config=extraction_config,  # the configuration settings for rule extraction
    reaction_data_path=filtered_data_path,  # path to the reaction data file
    reaction_rules_path=reaction_rules_path,  # path to the pickle file where the extracted reaction rules will be stored
    num_cpus=4,
    batch_size=100,
)

Number of reactions processed: 69445 [03:47]


Number of extracted reaction rules: 21881


## 5. Ranking policy training

In [15]:
from synplan.utils.config import PolicyNetworkConfig
from synplan.ml.training.supervised import create_policy_dataset, run_policy_training

### Ranking policy configuration

In [16]:
# Hyperparameters of the ranking policy network and its training.
training_config = PolicyNetworkConfig(
    policy_type="ranking",  # the type of policy network
    num_conv_layers=5,  # the number of graph convolutional layers in the network
    vector_dim=512,  # the dimensionality of the final embedding vector
    learning_rate=0.0008,  # the learning rate for the training process
    dropout=0.4,  # the dropout rate
    num_epoch=100,  # the number of epochs for training
    batch_size=100,  # the size of training batch of input data
)

### Ranking policy training dataset

In [17]:
# Build the training and validation data for the ranking policy network.
datamodule = create_policy_dataset(
    dataset_type="ranking",  # same value as policy_type in training_config
    reaction_rules_path=reaction_rules_path,  # the pickle file with the extracted reaction rules
    molecules_or_reactions_path=filtered_data_path,  # the filtered reaction data file
    output_path=ranking_policy_dataset_path,  # the generated training set
    batch_size=training_config.batch_size,  # reuse the batch size from the training configuration
    num_cpus=4,
)

Number of reactions processed: 69445 [09:42]


Training set size: 52512, validation set size: 13128


### Running ranking policy training

In [None]:
# Train the ranking policy network on the prepared dataset.
run_policy_training(
    datamodule,  # the prepared data module for training
    config=training_config,  # the training configuration
    results_path=ranking_policy_network_folder,  # path to save the training results
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


## 6. Tree search with the ranking policy network and rollout evaluation

In [None]:
from CGRtools import smiles

from IPython.display import SVG, display

from synplan.utils.visualisation import get_route_svg
from synplan.mcts.tree import Tree
from synplan.mcts.expansion import PolicyNetworkFunction
from synplan.utils.config import TreeConfig
from synplan.utils.loading import load_reaction_rules, load_building_blocks

### MCTS configuration

In [None]:
# Settings for the MCTS tree search.
# NOTE(review): the comments below are read off the parameter names —
# confirm the exact semantics and units against the TreeConfig documentation.
tree_config = TreeConfig(
    search_strategy="expansion_first",
    evaluation_type="rollout",  # node evaluation by rollout (see the section title)
    max_iterations=300,  # limit on the number of search iterations
    max_time=120,  # time limit for the search (presumably seconds — confirm)
    max_depth=9,  # limit on the depth of the search tree
    min_mol_size=0,
    init_node_value=0.5,
    ucb_type="uct",
    c_ucb=0.1,
)

### Choose the target molecule

In [None]:
# SMILES string of the molecule to plan a synthesis for
example_molecule = "CC(C)(C)[Si](OCc1ccn[nH]c1=O)(c1ccccc1)c1ccccc1"

# parse the SMILES string with CGRtools
target = smiles(example_molecule)
target.canonicalize()
target.clean2d()  # presumably computes 2D coordinates for depiction — confirm in CGRtools docs
target  # a bare name on the last line makes the notebook display the object

### Run retrosynthetic planning

In [None]:
# initialize the policy function
# NOTE(review): assumes the training step wrote "policy_network.ckpt" into
# ranking_policy_network_folder — confirm against run_policy_training.
ranking_policy_network = ranking_policy_network_folder.joinpath("policy_network.ckpt")
policy_config = PolicyNetworkConfig(weights_path=ranking_policy_network)
policy_function = PolicyNetworkFunction(policy_config=policy_config)

# load the extracted reaction rules and the building blocks used by the tree search
reaction_rules = load_reaction_rules(reaction_rules_path)
building_blocks = load_building_blocks(building_blocks_path)

In [None]:
# Build the search tree for the target molecule.
tree = Tree(
    target=target,  # the target molecule
    config=tree_config,  # the MCTS configuration
    reaction_rules=reaction_rules,  # the loaded reaction rules
    building_blocks=building_blocks,  # the loaded building blocks
    expansion_function=policy_function,  # the ranking policy network function
    evaluation_function=None,  # no evaluation function is passed; tree_config sets evaluation_type="rollout"
)

In [None]:
# Exhaust the tree iterator; every item is a (solved, node_id) pair.
# tree_solved ends up True if at least one item reported solved.
tree_solved = False
for solved, node_id in tree:
    tree_solved = tree_solved or bool(solved)
tree  # a bare name on the last line makes the notebook display the tree

### Retrosynthetic routes visualisation

In [None]:
# Display the routes of the winning nodes, stopping after the fourth one.
for n, node_id in enumerate(tree.winning_nodes):
    route_score = tree.route_score(node_id)
    header = f"-------- Path starts from node #{node_id} with total route score {route_score} --------"
    print(header)
    route_svg = get_route_svg(tree, node_id)
    display(SVG(route_svg))
    if n >= 3:  # n counts from 0, so this stops after four routes
        break