In [None]:
import yaml
from datetime import datetime
import os
from phantoms.models.denovo.parser import train_decoder
from phantoms.utils.parser import train_model, extract_and_save_embeddings, validate_config

In [None]:
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_decoder_local.yml"
# Load the config
with open(config_path, "r") as f:
    config = yaml.safe_load(f) 

# Set up an experiment folder with timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run",
                                 f"{timestamp}_{config['experiment_base_name']}")


In [None]:
train_decoder(config, experiment_folder, config_path)

# Train DeNovo

In [None]:
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_denovo_local.yml"

# Load configuration.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

validate_config(config)

In [None]:
# Create a unique experiment folder (for logs, checkpoints, configs, etc.)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run", f"{timestamp}_{config['experiment_base_name']}")

# In denovo training we do not need a cut_tree_level (or you may set it to None)
cut_tree_level = None

# Train the model (this will also initialize the data module, loggers, callbacks, etc.)
train_model(config, experiment_folder, config_path, cut_tree_level)

In [None]:
extract_and_save_embeddings(config, cut_tree_level, experiment_folder)

# Bonus De Novo

In [None]:
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_denovo_bonus_local.yml"

# Load configuration.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

validate_config(config)

In [None]:
# Create a unique experiment folder (for logs, checkpoints, configs, etc.)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run", f"{timestamp}_{config['experiment_base_name']}")

# In denovo training we do not need a cut_tree_level (or you may set it to None)
cut_tree_level = None

# Train the model (this will also initialize the data module, loggers, callbacks, etc.)
train_model(config, experiment_folder, config_path, cut_tree_level)

In [None]:
extract_and_save_embeddings(config, cut_tree_level, experiment_folder)

# Run multi experiment

In [None]:
import wandb
import yaml
import os
from datetime import datetime
import time
from typing import List, Optional

from phantoms.utils.parser import validate_config, train_model, extract_and_save_embeddings
from phantoms.optimizations.training import set_global_seeds

def run_all_experiments(config_dir: str,
                        experiment_parent_dir: str,
                        config_files: List[str],
                        cut_tree_levels: Optional[List[int]],
                        wandb_project_name: str):
    """
    Iterate over all configuration files and tree levels to run experiments.

    Args:
        config_dir (str): Directory containing configuration YAML files.
        experiment_parent_dir (str): Parent directory to store all experiments.
        config_files (List[str]): List of configuration YAML filenames.
        cut_tree_levels (Optional[List[int]]): List of cut_tree_at_level values.
        wandb_project_name (str): Name of the wandb project.
    """
    # Set global seeds for reproducibility
    set_global_seeds(42)

    # Create the parent experiment directory if it doesn't exist
    os.makedirs(experiment_parent_dir, exist_ok=True)

    # Iterate over each configuration file
    for config_file in config_files:
        config_path = os.path.join(config_dir, config_file)

        if not os.path.exists(config_path):
            print(f"Configuration file {config_path} does not exist. Skipping.")
            continue

        # Load the configuration
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        # Validate the configuration
        try:
            validate_config(config)
        except ValueError as ve:
            print(f"Configuration validation error in {config_file}: {ve}")
            continue

        # Iterate over each tree level
        for level in cut_tree_levels:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            # Adding a slight sleep to ensure different timestamps
            time.sleep(1.0)

            # Define a unique experiment folder name
            config_name = os.path.splitext(os.path.basename(config_file))[0]
            experiment_folder_name = f"{config_name}_cut_tree_{level}_{timestamp}"
            experiment_folder = os.path.join(experiment_parent_dir, experiment_folder_name)

            # Print experiment details for debugging
            print(f"\nRunning Experiment: {experiment_folder_name}")
            print(f"W&B Project: {wandb_project_name}")
            print(f"Cut Tree Level: {level}")

            # Create the experiment folder
            os.makedirs(experiment_folder, exist_ok=True)

            # Modify the config dict's 'trainer.checkpoint_dir' to point to experiment_folder subdirectories
            config['trainer']['checkpoint_dir'] = os.path.join(experiment_folder, 'checkpoints')

            # Optionally, modify 'experiment_base_name' to include the experiment_folder name or set to a unified value
            config['experiment_base_name'] = 'experiment_trial'  # Or any desired base name

            # Update W&B project name
            config['wandb']['project'] = wandb_project_name

            # Train the model
            train_model(config, experiment_folder, config_path, level)

            # Extract and save embeddings
            extract_and_save_embeddings(config, level, experiment_folder)

            # Finish the W&B run to ensure it's properly logged
            wandb.finish()

    print("\nAll experiments completed successfully.")

In [None]:
# Define parameters
config_directory = '/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs'
experiment_parent_directory = '/Users/macbook/CODE/PhantoMS/experiments_run/cut_trees_denovo'

configuration_files = [
    'config_denovo_local.yml',
]
tree_levels = [0, 1, 2, 3]

wandb_project_name = 'cut_trees_denovo'

# Authenticate with W&B
print("Logging into Weights & Biases...")
wandb.login()

# Run all experiments
run_all_experiments(config_dir=config_directory,
                    experiment_parent_dir=experiment_parent_directory,
                    config_files=configuration_files,
                    cut_tree_levels=tree_levels,
                    wandb_project_name=wandb_project_name)

# UPDATED DE NOVO

In [1]:
import os
import yaml
import time
from datetime import datetime
from phantoms.utils.parser import validate_config, train_model, extract_and_save_embeddings

In [2]:
# Path to your local config file.
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_denovo_spectra_local.yml"

# Load configuration.
with open(config_path, "r") as f:
    config = yaml.safe_load(f)
validate_config(config)

In [10]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run", f"{timestamp}_{config['experiment_base_name']}")
os.makedirs(experiment_folder, exist_ok=True)
print(f"Experiment folder: {experiment_folder}")

Experiment folder: /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-08-47_denovo_bonus_test


In [11]:
cut_tree_level = None

# Train the final de novo model.
train_model(config, experiment_folder, config_path, cut_tree_level)


Starting training for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-08-47_denovo_bonus_test
Using de novo task/model.
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.


  pretrained_state = torch.load(config['model'].get('decoder_pretrained_path'), map_location="cpu")
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Pretrained de_novo_scripts weights loaded into GATDeNovoTransformer.
Loaded pretrained de_novo_scripts into de novo model.



  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | gat_layers          | ModuleList         | 6.2 M  | train
1 | formula_encoder     | Sequential         | 4.9 K  | train
2 | encoder_fc          | Linear             | 1.1 M  | train
3 | transformer_decoder | TransformerDecoder | 67.2 M | train
4 | pos_encoder         | PositionalEncoding | 0      | train
5 | decoder_embed       | Embedding          | 3.1 M  | train
6 | decoder_fc          | Linear             | 3.1 M  | train
7 | criterion           | CrossEntropyLoss   | 0      | train
-------------------------------------------------------------------
80.7 M    Trainable params
0         Non-trainable params
80.7 M    Total params
322.624   Total estimated model params size (MB)
84        Modules in train mode
0         Modules in eval mode


Train dataset size: 82
Val dataset size: 6


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.

Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [5]:
extract_and_save_embeddings(config, cut_tree_level, experiment_folder)


Extracting embeddings for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-34-27_denovo_test
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.
Test dataset size: 17
Embeddings saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-34-27_denovo_test/embeddings


# DeNovo Bonus

In [6]:
# Path to your local config file.
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_denovo_spectra_bonus_local.yml"

# Load configuration.
with open(config_path, "r") as f:
    config = yaml.safe_load(f)
validate_config(config)

In [7]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run", f"{timestamp}_{config['experiment_base_name']}")
os.makedirs(experiment_folder, exist_ok=True)
print(f"Experiment folder: {experiment_folder}")

Experiment folder: /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-59-57_denovo_bonus_test


In [8]:
cut_tree_level = None

# Train the final de novo model.
train_model(config, experiment_folder, config_path, cut_tree_level)


Starting training for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-59-57_denovo_bonus_test
Using de novo task/model.
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.


  pretrained_state = torch.load(config['model'].get('decoder_pretrained_path'), map_location="cpu")
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Pretrained de_novo_scripts weights loaded into GATDeNovoTransformer.
Loaded pretrained de_novo_scripts into de novo model.



  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | gat_layers          | ModuleList         | 6.2 M  | train
1 | formula_encoder     | Sequential         | 4.9 K  | train
2 | encoder_fc          | Linear             | 1.1 M  | train
3 | transformer_decoder | TransformerDecoder | 67.2 M | train
4 | pos_encoder         | PositionalEncoding | 0      | train
5 | decoder_embed       | Embedding          | 3.1 M  | train
6 | decoder_fc          | Linear             | 3.1 M  | train
7 | criterion           | CrossEntropyLoss   | 0      | train
-------------------------------------------------------------------
80.7 M    Trainable params
0         Non-trainable params
80.7 M    Total params
322.624   Total estimated model params size (MB)
84        Modules in train mode
0         Modules in eval mode


Train dataset size: 82
Val dataset size: 6


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
[19:00:14] SMILES Parse Error: syntax error while parsing: CCCCCCCCCCCCOSCSCS CCCCCCCCCCCCOSCOCBr CCCCCCCCCCCCOSCOCBrCCCCNCSCS CCCCSSCCCNCCCNC CCCCCCCCCCCCOSCOCBrCCCCN CCCCCCCCCCCCSSCCCNCCCNCCCOCCOCCOCCOCCOCCOCCOCCOCCCCOCCOCCOCCOCCOCCOCCOCCOCCCNNCCSCCBrCCNCCCNCCCOCCOCCOCCOCCOCCOCCOCCOCCCCCCCCCCCCCCCCCCCCCCCOCCCC CCCCCCCCCCCCOS CCCCCCCCCOCCCCCNNCCSCCBrCCNCCCCNCCCCN CNCCCOC CCCCCCCCCOCCCC CNCCCOC CCCCCCCCCOCCCC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC CNCCCOC�COCCOCCCCCCC CNCCCOC� CCCCCCCCCCCOCCCCCOCC CCCNCC CNCCCOC�COCCNCCNS CCCCCCCCCCCCOSSSCSSC CNCCCOC CNCCCOC C

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[19:00:31] SMILES Parse Error: syntax error while parsing: C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(
[19:00:31] SMILES Parse Error: Failed parsing SMILES 'C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(' for input: 'C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C('
[19:00:31] SMILES Parse Error: syntax error while parsing: C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(
[19:00:31]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


Test dataset size: 17


/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[19:02:45] SMILES Parse Error: syntax error while parsing: )C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C
[19:02:45] SMILES Parse Error: Failed parsing SMILES ')C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C' for input: ')C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C'
[19:02:45] SMILES Parse Error: extra open parentheses for input: 'C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(CO)C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C'
[1

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_loss               4.844666004180908
    test_num_valid_mols                 0.0
    test_top_10_accuracy                0.0
test_top_10_max_tanimoto_sim            0.0
   test_top_10_mces_dist               100.0
    test_top_1_accuracy                 0.0
test_top_1_max_tanimoto_sim             0.0
    test_top_1_mces_dist               100.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Model saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-59-57_denovo_bonus_test/checkpoints/final_model.ckpt


0,1
epoch,▁▅█
test_loss,▁
test_num_valid_mols,▁
test_top_10_accuracy,▁
test_top_10_max_tanimoto_sim,▁
test_top_10_mces_dist,▁
test_top_1_accuracy,▁
test_top_1_max_tanimoto_sim,▁
test_top_1_mces_dist,▁
trainer/global_step,▁▆█

0,1
epoch,2.0
test_loss,4.84467
test_num_valid_mols,0.0
test_top_10_accuracy,0.0
test_top_10_max_tanimoto_sim,0.0
test_top_10_mces_dist,100.0
test_top_1_accuracy,0.0
test_top_1_max_tanimoto_sim,0.0
test_top_1_mces_dist,100.0
trainer/global_step,4.0


Configuration saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-59-57_denovo_bonus_test/configs/config_denovo_spectra_bonus_local.yml


In [9]:
extract_and_save_embeddings(config, cut_tree_level, experiment_folder)


Extracting embeddings for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-59-57_denovo_bonus_test
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.
Test dataset size: 17
Embeddings saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_18-59-57_denovo_bonus_test/embeddings


# Train DreaMS

In [12]:
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_denovo_dreams_local.yml"

# Load configuration.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

validate_config(config)

In [13]:
# Create a unique experiment folder (for logs, checkpoints, configs, etc.)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run", f"{timestamp}_{config['experiment_base_name']}")

# In denovo training we do not need a cut_tree_level (or you may set it to None)
cut_tree_level = None

# Train the model (this will also initialize the data module, loggers, callbacks, etc.)
train_model(config, experiment_folder, config_path, cut_tree_level)


Starting training for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-32-38_denovo_dreams_test
Using de novo task/model.
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.


  pretrained_state = torch.load(config['model'].get('decoder_pretrained_path'), map_location="cpu")
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | gat_layers          | ModuleList         | 3.2 M  | train
1 | encoder_fc          | Linear             | 1.0 M  | train
2 | transformer_decode

Pretrained de_novo_scripts weights loaded into GATDeNovoTransformer.
Loaded pretrained de_novo_scripts into de novo model.
Train dataset size: 82
Val dataset size: 6


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
[19:33:03] non-ring atom 0 marked aromatic
[19:33:03] non-ring atom 0 marked aromatic
[19:33:12] non-ring atom 0 marked aromatic
[19:33:12] Explicit valence for atom # 51 Cl, 2, is greater than permitted
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: T

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[19:33:39] SMILES Parse Error: syntax error while parsing: ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
[19:33:39] SMILES Parse Error: Failed parsing SMILES '))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))' for input: '))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))'
[19:33:39] SMILES Parse Error: syntax error while parsing: ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
[19:33:39]

Test dataset size: 17


/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[19:33:55] SMILES Parse Error: syntax error while parsing: ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
[19:33:55] SMILES Parse Error: Failed parsing SMILES '))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))' for input: '))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))'
[19:33:55] SMILES Parse Error: syntax error while parsing: ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
[19:33:55]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_loss               4.304252624511719
    test_num_valid_mols                 0.0
    test_top_10_accuracy                0.0
test_top_10_max_tanimoto_sim            0.0
   test_top_10_mces_dist               100.0
    test_top_1_accuracy                 0.0
test_top_1_max_tanimoto_sim             0.0
    test_top_1_mces_dist               100.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Model saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-32-38_denovo_dreams_test/checkpoints/final_model.ckpt


0,1
epoch,▁▅█
test_loss,▁
test_num_valid_mols,▁
test_top_10_accuracy,▁
test_top_10_max_tanimoto_sim,▁
test_top_10_mces_dist,▁
test_top_1_accuracy,▁
test_top_1_max_tanimoto_sim,▁
test_top_1_mces_dist,▁
trainer/global_step,▁▆█

0,1
epoch,2.0
test_loss,4.30425
test_num_valid_mols,0.0
test_top_10_accuracy,0.0
test_top_10_max_tanimoto_sim,0.0
test_top_10_mces_dist,100.0
test_top_1_accuracy,0.0
test_top_1_max_tanimoto_sim,0.0
test_top_1_mces_dist,100.0
trainer/global_step,4.0


Configuration saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-32-38_denovo_dreams_test/configs/config_denovo_dreams_local.yml


In [14]:
extract_and_save_embeddings(config, cut_tree_level, experiment_folder)


Extracting embeddings for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-32-38_denovo_dreams_test
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.
Test dataset size: 17
Embeddings saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-32-38_denovo_dreams_test/embeddings


# Train DreaMS Bonus

In [17]:
config_path = "/Users/macbook/CODE/PhantoMS/phantoms/models/denovo/configs/config_denovo_dreams_bonus_local.yml"

# Load configuration.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

validate_config(config)

In [18]:
# Create a unique experiment folder (for logs, checkpoints, configs, etc.)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_folder = os.path.join("/Users/macbook/CODE/PhantoMS/experiments_run", f"{timestamp}_{config['experiment_base_name']}")

# In denovo training we do not need a cut_tree_level (or you may set it to None)
cut_tree_level = None

# Train the model (this will also initialize the data module, loggers, callbacks, etc.)
train_model(config, experiment_folder, config_path, cut_tree_level)


Starting training for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-35-26_denovo_dreams_bonus_test
Using de novo task/model.
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.


  pretrained_state = torch.load(config['model'].get('decoder_pretrained_path'), map_location="cpu")
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | gat_layers          | ModuleList         | 3.2 M  | train
1 | formula_encoder     | Sequential         | 4.9 K  | train
2 | encoder_fc        

Pretrained de_novo_scripts weights loaded into GATDeNovoTransformer.
Loaded pretrained de_novo_scripts into de novo model.
Train dataset size: 82
Val dataset size: 6


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=10). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[19:36:05] SMILES Parse Error: extra close parentheses while parsing: CCCCCCCCCCCCCCCCCCCCCCC)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)CCCCCCCCCCC)C)C)CCCC)C)C)C)CCCCCC)C)CC)C)C)C)C)CCCCCCCCCCCCCCC)C)CCCCC)CCCCC)C)C)C)C)CCCC)C)CCC)C)CCCCCCCCCCC)C)CCCCCCCCCCCCCCCCCCCC
[19:36:05] SMILES Parse Error: Failed parsing SMILES 'CCCCCCCCCCCCCCCCCCCCCCC)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)CCCCCCCCCCC)C)C)CCCC)C)C)C)CCCCCC)C)CC)C)C)C)C)CCCCCCCCCCCCCCC)C)CCCCC)CCCCC)C)C)C)C)CCCC)C)CCC)C)CCCCCCCCCCC)C)CCCCCCCCCCCCCCCCCCCC' for input: 'CCCCCCCCCCCCCCCCCCCCCCC)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)CCCCCCCCCCC)C)C)CCCC)C)C)C)CCCCCC)C)CC)C)C)C)C)CCCCCCCCCCCCCCC)C)CCCCC)CCCCC)C)C)C)C)CCCC)C)CCC)C)CCCCCCCCCCC)C)CCCCCCCCCCCCCCCCCCCC'
[19:36:05] SMILES Parse Error: extra close parentheses while parsing: CCCCCCCCCCCCCCCCCCCCCCC)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)C)CCCCCCCCCCC)C)C)CCCC)C)C)C)CCCCCC)CCCC)C)C)C)C)CCCCCCCCCCCCCCC)C)CCCCC)CCCCC)C)C)C)C)CCCC)C)CCC)C)CCCCCCCCCCCCCCCCCCCCCCC

Validation: |          | 0/? [00:00<?, ?it/s]

[19:36:21] SMILES Parse Error: syntax error while parsing: =C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C
[19:36:21] SMILES Parse Error: Failed parsing SMILES '=C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C' for input: '=C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C'
[19:36:21] SMILES Parse Error: syntax error while parsing: =C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C
[19:36:21]

Test dataset size: 17


/Users/macbook/UTILS/anaconda3/envs/phantoms_env/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[19:38:12] SMILES Parse Error: syntax error while parsing: =C(C(C(C=C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C
[19:38:12] SMILES Parse Error: Failed parsing SMILES '=C(C(C(C=C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C' for input: '=C(C(C(C=C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C'
[19:38:12] SMILES Parse Error: syntax error while parsing: =C(C(C(C=C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C(C
[19:38:12]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_loss               4.326637268066406
    test_num_valid_mols                 0.0
    test_top_10_accuracy                0.0
test_top_10_max_tanimoto_sim            0.0
   test_top_10_mces_dist               100.0
    test_top_1_accuracy                 0.0
test_top_1_max_tanimoto_sim             0.0
    test_top_1_mces_dist               100.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Model saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-35-26_denovo_dreams_bonus_test/checkpoints/final_model.ckpt


0,1
epoch,▁▅█
test_loss,▁
test_num_valid_mols,▁
test_top_10_accuracy,▁
test_top_10_max_tanimoto_sim,▁
test_top_10_mces_dist,▁
test_top_1_accuracy,▁
test_top_1_max_tanimoto_sim,▁
test_top_1_mces_dist,▁
trainer/global_step,▁▆█

0,1
epoch,2.0
test_loss,4.32664
test_num_valid_mols,0.0
test_top_10_accuracy,0.0
test_top_10_max_tanimoto_sim,0.0
test_top_10_mces_dist,100.0
test_top_1_accuracy,0.0
test_top_1_max_tanimoto_sim,0.0
test_top_1_mces_dist,100.0
trainer/global_step,4.0


Configuration saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-35-26_denovo_dreams_bonus_test/configs/config_denovo_dreams_bonus_local.yml


In [19]:
extract_and_save_embeddings(config, cut_tree_level, experiment_folder)


Extracting embeddings for /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-35-26_denovo_dreams_bonus_test
Loaded tokenizer from /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_09-07-15_tokenizer_large/smiles_tokenizer.json.
Test dataset size: 17
Embeddings saved to /Users/macbook/CODE/PhantoMS/experiments_run/2025-02-21_19-35-26_denovo_dreams_bonus_test/embeddings
