In [22]:
from datasets import load_dataset, Dataset
from rdkit import RDLogger
import random
import selfies as sf
from presto.chemistry_tools.reaction import multicomponent_smiles_to_list, list_to_multicomponent_smiles
from presto.chemistry_tools.smiles import convert_to_canonical_smiles

# Turn off every RDKit log channel matching 'rdApp.*' — presumably to keep RDKit
# parser warnings out of the cell output while molecules are converted below.
RDLogger.DisableLog('rdApp.*')

[nltk_data] Downloading package wordnet to /home/ys792/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Instruction-tuning (SFT) variants of the MolInst 125K data; judging by the prompt
# templates used later, "FS" is product prediction and "RS" is retrosynthesis.
mol_fs_dataset = load_dataset(
    path="OpenMol/MolInst_125K_FS_SMILES-SFT",
)
mol_rs_dataset = load_dataset(
    path="OpenMol/MolInst_125K_RS_SMILES-SFT",
)

# Multimodal chat (MMChat) variants of the same two tasks.
mol_fs_mm_dataset = load_dataset(
    path="OpenMol/MolInst_FS_125K_SMILES-MMChat",
)
mol_rs_mm_dataset = load_dataset(
    path="OpenMol/MolInst_RS_125K_SMILES-MMChat",
)

# Filtered SMol SFT datasets for the same two tasks.
smol_fs_dataset = load_dataset(
    path="OpenMol/SMol_FS_Filtered_875K_SMILES-SFT",
)
smol_rs_dataset = load_dataset(
    path="OpenMol/SMol_RS_Filtered_825K_SMILES-SFT",
)

Downloading readme: 100%|██████████| 640/640 [00:00<00:00, 2.10MB/s]
Downloading data: 100%|██████████| 18.0M/18.0M [00:00<00:00, 25.3MB/s]
Downloading data: 100%|██████████| 141k/141k [00:00<00:00, 873kB/s]
Generating train split: 100%|██████████| 124384/124384 [00:00<00:00, 173291.97 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 71638.72 examples/s]
Downloading readme: 100%|██████████| 640/640 [00:00<00:00, 2.15MB/s]
Downloading data: 100%|██████████| 15.7M/15.7M [00:00<00:00, 48.6MB/s]
Downloading data: 100%|██████████| 125k/125k [00:00<00:00, 614kB/s]
Generating train split: 100%|██████████| 128684/128684 [00:00<00:00, 203939.68 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 83336.06 examples/s]


In [23]:
# System message used for every multimodal chat sample. "<REP_1>" and "<REP_2>" are
# placeholders filled in per sample by generate_mmchat_dataset. The four leading
# spaces on the continuation lines are part of the prompt text and must be kept.
SYSTEM_PROMPT = "\n".join(
    (
        "You are a chemist. Now you are given a reaction equation. Please predict the product of the reaction.",
        "    The reaction equation has the following format:",
        "    ```",
        "    reactant1.reactant2. ... .reactantN>>product",
        "    ```",
        "    Your task is to predict the <REP_1> representation of the product molecule. We provide the <REP_2> of the reactants.",
    )
)

# Placeholder emitted once per molecule when the prompt should carry tokens
# instead of the textual molecule representation.
MOLECULE_TOKEN = "<molecule_2d>"

# Retrosynthesis prompt templates. Every entry pairs an instruction containing the
# "<INPUT>" placeholder with the fixed answer template "<OUTPUT>".
RS_PROMPT_TEMPLATES = [
    {"input": instruction, "output": "<OUTPUT>"}
    for instruction in (
        "Based on the given product, provide some plausible reactants that might have been utilized to prepare it. <INPUT>",
        "Can you identify the reactant(s) that might result in the given product <INPUT> ?",
        "Given the following product, please provide possible reactants. <INPUT>",
        "Do retrosynthesis with the product <INPUT> .",
        "<INPUT> Given the product provided, propose some possible reactants that could have been employed in its formation.",
        "To synthesis <INPUT>, what are the possible reactants? Write in the SMILES representation.",
        "Provide the potential reactants that may be used to produce the product <INPUT> .",
        "What reactants could lead to the production of the following product? <INPUT>",
        "With the given product <INPUT>, suggest some likely reactants that were used in its synthesis.",
        "Identify possible reactants that could have been used to create the specified product. <INPUT>",
        "Could you tell which reactants might have been used to generate the following product? <INPUT>",
        "Suggest possible substances that may have been involved in the synthesis of the presented compound. <INPUT>",
        "Can you list the reactants that might result in the chemical product <INPUT> ?",
    )
]

# Product-prediction prompt templates. Every entry pairs an instruction containing
# the "<INPUT>" placeholder with the fixed answer template "<OUTPUT>".
FS_PROMPT_TEMPLATES = [
    {"input": instruction, "output": "<OUTPUT>"}
    for instruction in (
        "<INPUT> Based on the reactants and reagents given above, suggest a possible product.",
        "Based on the given reactants and reagents: <INPUT>, what product could potentially be produced?",
        "Given the following reactants and reagents, please provide a possible product. <INPUT>",
        "<INPUT> Given the above reactants and reagents, what could be a probable product of their reaction?",
        "Please provide a feasible product that could be formed using these reactants and reagents: <INPUT> .",
        "Consider that for a chemical reaction, if <INPUT> is/are the reactants and reagents, what can be the product?",
        "Propose a potential product given these reactants and reagents. <INPUT>",
        "Predict the product of a chemical reaction with <INPUT> as the reactants and reagents.",
        "Can you tell me the potential product of a chemical reaction that uses <INPUT> as the reactants and reagents?",
        "Using <INPUT> as the reactants and reagents, tell me the potential product.",
        "Predict a possible product from the listed reactants and reagents. <INPUT>",
        "<INPUT> Considering the given starting materials, what might be the resulting product in a chemical reaction?",
        "A chemical reaction has started with the substance(s) <INPUT> as the reactants and reagents, what could be a probable product?",
    )
]

# use regex to parse <INPUT> from instruction
import re

# One regex per known instruction wording, tried in this order. Each pattern has a
# single capture group around the molecule text. Punctuation that belongs to the
# wording ("." and "?") is escaped so it is matched literally. The patterns are
# compiled once here instead of being rebuilt on every call.
_INSTRUCTION_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"Given the following reactants and reagents: (.+), please provide a possible product\.",
        r"Given the following reactants and reagents, please provide a possible product\. (.+)",
        r"Based on the given reactants and reagents: (.+), what product could potentially be produced\?",
        r"Please provide a feasible product that could be formed using these reactants and reagents: (.+)",
        r"Consider that for a chemical reaction, if (.+) is/are the reactants and reagents, what can be the product\?",
        r"Propose a potential product given these reactants and reagents\. (.+)",
        r"Predict the product of a chemical reaction with (.+) as the reactants and reagents\.",
        r"Can you tell me the potential product of a chemical reaction that uses (.+) as the reactants and reagents\?",
        r"Using (.+) as the reactants and reagents, tell me the potential product\.",
        r"Predict a possible product from the listed reactants and reagents\. (.+)",
        r"(.+) Based on the reactants and reagents given above, suggest a possible product\.",
        r"(.+) Considering the given starting materials, what might be the resulting product in a chemical reaction\?",
        r"A chemical reaction has started with the substance\(s\) (.+) as the reactants and reagents, what could be a probable product\?",
        r"Based on the given product, provide some plausible reactants that might have been utilized to prepare it\. (.+)",
        r"Can you identify the reactant\(s\) that might result in the given product (.+) \?",
        r"Given the following product, please provide possible reactants\. (.+)",
        r"Do retrosynthesis with the product (.+)\.",
        r"(.+) Given the above reactants and reagents, what could be a probable product of their reaction\?",
        r"(.+) Given the product provided, propose some possible reactants that could have been employed in its formation\.",
        r"To synthesis (.+), what are the possible reactants\? Write in the SMILES representation\.",
        r"Provide the potential reactants that may be used to produce the product (.+)\.",
        r"What reactants could lead to the production of the following product\? (.+)",
        r"With the given product (.+), suggest some likely reactants that were used in its synthesis\.",
        r"Identify possible reactants that could have been used to create the specified product\. (.+)",
        r"Could you tell which reactants might have been used to generate the following product\? (.+)",
        r"Suggest possible substances that may have been involved in the synthesis of the presented compound\. (.+)",
        r"Can you list the reactants that might result in the chemical product (.+) \?",
    )
)


def get_molecule(instruction):
    """Extract the molecule text embedded in a templated instruction.

    The instruction is searched against each known prompt wording in turn; the
    first wording that matches wins.

    Args:
        instruction: Full instruction string built from one of the prompt templates.

    Returns:
        The captured molecule text with surrounding spaces and periods removed,
        or ``None`` when no known wording matches.
    """
    for pattern in _INSTRUCTION_PATTERNS:
        match = pattern.search(instruction)
        if match:
            # Templates such as "... product <INPUT> ." leave a trailing space or
            # period inside the greedy capture; strip them off.
            return match.group(1).strip(" .")
    return None

In [None]:
def write_dataset_files(dataset, prefix):
    """Dump every split of ``dataset`` into a pair of line-aligned text files.

    For each split name returned by ``dataset.keys()``:

    * ``{prefix}_{split}_input.txt`` receives the molecule text that
      ``get_molecule`` extracts from each record's ``"instruction"``;
    * ``{prefix}_{split}_output.txt`` receives each record's ``"output"``.

    Both files are written in a single pass over the split, one record per line,
    so line *i* of the input file always corresponds to line *i* of the output
    file.

    Args:
        dataset: Mapping from split name to an iterable of records that expose
            ``"instruction"`` and ``"output"`` keys.
        prefix: Path prefix shared by all files that are written.

    Raises:
        ValueError: Carrying the offending instruction when no molecule text
            could be extracted from it.
    """
    for split in dataset.keys():
        input_path = f"{prefix}_{split}_input.txt"
        output_path = f"{prefix}_{split}_output.txt"
        # Explicit encoding so the files do not depend on the platform default.
        with open(input_path, "w", encoding="utf-8") as input_file:
            with open(output_path, "w", encoding="utf-8") as output_file:
                for item in dataset[split]:
                    molecule = get_molecule(item["instruction"])
                    if not molecule:
                        raise ValueError(item["instruction"])
                    input_file.write(molecule + "\n")
                    output_file.write(item["output"] + "\n")

# Export the four SFT datasets to "<prefix>_<split>_{input,output}.txt" files.
write_dataset_files(dataset=mol_fs_dataset, prefix="mol_fs")
write_dataset_files(dataset=mol_rs_dataset, prefix="mol_rs")
write_dataset_files(dataset=smol_fs_dataset, prefix="smol_fs")
write_dataset_files(dataset=smol_rs_dataset, prefix="smol_rs")

In [4]:
def read_files(file_names):
    """Load and concatenate paired ``*_input.txt`` / ``*_output.txt`` files.

    Args:
        file_names: Iterable of path prefixes. For each prefix ``p`` the files
            ``p_input.txt`` and ``p_output.txt`` are read.

    Returns:
        A tuple ``(inputs, outputs)`` of two lists holding the whitespace-stripped
        lines of all input files and all output files, in the order given.

    Raises:
        ValueError: If an input/output pair does not have the same number of
            lines. Without this check every later pair would be silently
            shifted against its partner after concatenation.
    """
    inputs = []
    outputs = []

    for file_name in file_names:
        with open(f"{file_name}_input.txt", "r", encoding="utf-8") as f:
            file_inputs = [line.strip() for line in f]

        with open(f"{file_name}_output.txt", "r", encoding="utf-8") as f:
            file_outputs = [line.strip() for line in f]

        if len(file_inputs) != len(file_outputs):
            raise ValueError(
                f"{file_name}: {len(file_inputs)} input lines but "
                f"{len(file_outputs)} output lines"
            )

        inputs.extend(file_inputs)
        outputs.extend(file_outputs)

    return inputs, outputs

# Product-prediction data: the test pool concatenates three exported files,
# the training pool uses one.
mol_fs_test_input, mol_fs_test_output = read_files(
    file_names=["mol_fs_test", "smol_fs_test", "smol_fs_dev"],
)
mol_fs_train_input, mol_fs_train_output = read_files(
    file_names=["mol_fs_train"],
)

# Retrosynthesis data, assembled the same way.
mol_rs_test_input, mol_rs_test_output = read_files(
    file_names=["mol_rs_test", "smol_rs_test", "smol_rs_dev"],
)
mol_rs_train_input, mol_rs_train_output = read_files(
    file_names=["mol_rs_train"],
)

In [28]:
# Scaffold-selected test pairs. These files are not written anywhere in the
# visible cells, so they must already exist on disk.
mol_fs_scaffold_selected_test_input, mol_fs_scaffold_selected_test_output = read_files(
    file_names=["mol_fs_scaffold_selected_test"],
)
mol_rs_scaffold_selected_test_input, mol_rs_scaffold_selected_test_output = read_files(
    file_names=["mol_rs_scaffold_selected_test"],
)


def process_reaction_equation(reaction, format="smiles", token=True):
    """Split a multi-component string and render it for use in a prompt.

    The string is split with ``multicomponent_smiles_to_list``, each component is
    passed through ``convert_to_canonical_smiles`` and then encoded with
    ``selfies.encoder``.

    Args:
        reaction: Multi-component molecule string.
        format: ``"smiles"`` or ``"selfies"``; only consulted when ``token`` is
            false.
        token: When true, the rendered string holds one ``MOLECULE_TOKEN`` per
            component instead of the molecule text.

    Returns:
        A tuple ``(selfies, smiles, molecules)``: the per-component SELFIES list,
        the per-component converted SMILES list, and the "."-joined rendering.

    Raises:
        ValueError: If ``token`` is false and ``format`` is neither ``"smiles"``
            nor ``"selfies"``.
    """
    components = multicomponent_smiles_to_list(reaction)
    canonical = [convert_to_canonical_smiles(component) for component in components]
    encoded = [sf.encoder(smi) for smi in canonical]

    if token:
        # One placeholder per component; the molecule text itself is not shown.
        return encoded, canonical, ".".join([MOLECULE_TOKEN] * len(canonical))
    if format == "smiles":
        return encoded, canonical, ".".join(canonical)
    if format == "selfies":
        return encoded, canonical, ".".join(encoded)
    raise ValueError(f"Unsupported molecule format: {format}")


def generate_alpaca_dataset(inputs, outputs, prompt_templates):
    """Yield Alpaca-style records built from paired inputs and outputs.

    For every ``(input, output)`` pair a template is drawn with
    ``random.choice``; its ``"<INPUT>"`` and ``"<OUTPUT>"`` placeholders are then
    replaced by the pair's values. Pairing uses ``zip``, so iteration stops at the
    shorter of the two sequences.

    Args:
        inputs: Sequence of strings substituted for ``"<INPUT>"``.
        outputs: Sequence of strings substituted for ``"<OUTPUT>"``.
        prompt_templates: Sequence of dicts with ``"input"`` and ``"output"``
            template strings.

    Yields:
        Dicts with the keys ``"instruction"``, ``"input"`` (always empty) and
        ``"output"``.
    """
    for source_text, target_text in zip(inputs, outputs):
        template = random.choice(prompt_templates)
        instruction = template["input"].replace("<INPUT>", source_text)
        answer = template["output"].replace("<OUTPUT>", target_text)
        yield {"instruction": instruction, "input": "", "output": answer}


def generate_mmchat_dataset(inputs, outputs, prompt_templates, format="smiles", token=True):
    """Yield multimodal chat records built from paired inputs and outputs.

    Each input is processed with ``process_reaction_equation`` using the given
    ``format`` and ``token``. Each output is processed with ``token=False``, so
    the ground truth is always the textual representation. A prompt template is
    drawn with ``random.choice`` per record. Pairing uses ``zip``, so iteration
    stops at the shorter sequence.

    Args:
        inputs: Sequence of multi-component molecule strings shown to the model.
        outputs: Sequence of multi-component molecule strings used as targets.
        prompt_templates: Sequence of dicts whose ``"input"`` string contains
            ``"<INPUT>"``.
        format: Molecule text format forwarded to ``process_reaction_equation``.
        token: Whether the user message holds molecule tokens instead of text.

    Yields:
        Dicts with the keys ``"id"`` (running index), ``"molecules"`` (SELFIES and
        SMILES lists of the input), ``"ground_truth"`` and ``"messages"`` (system
        message followed by user message).
    """
    for index, (source_text, target_text) in enumerate(zip(inputs, outputs)):
        selfies_list, smiles_list, rendered_input = process_reaction_equation(source_text, format, token)
        ground_truth = process_reaction_equation(target_text, format, False)[2]

        template = random.choice(prompt_templates)
        user_message = template["input"].replace("<INPUT>", rendered_input)

        # NOTE(review): "<REP_1>" sits in the "predict the ... representation"
        # sentence and "<REP_2>" in the "We provide the ..." sentence, yet it is
        # the *input* that becomes tokens when ``token`` is true. The two
        # substitutions look swapped — confirm against the prompts already in the
        # published train split before changing anything.
        predicted_rep = "structure" if token else format.upper()
        system_message = SYSTEM_PROMPT.replace("<REP_1>", predicted_rep).replace("<REP_2>", format.upper())

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        yield {
            "id": index,
            "molecules": {"selfies": selfies_list, "smiles": smiles_list},
            "ground_truth": ground_truth,
            "messages": messages,
        }

# Replace the "test" split of each loaded dataset with one regenerated from the
# scaffold-selected files. NOTE(review): templates are drawn with the `random`
# module and no seed is set in the visible cells, so the wording of each record
# may differ between runs.
mol_fs_dataset["test"] = Dataset.from_generator(
    generate_alpaca_dataset,
    gen_kwargs=dict(
        inputs=mol_fs_scaffold_selected_test_input,
        outputs=mol_fs_scaffold_selected_test_output,
        prompt_templates=FS_PROMPT_TEMPLATES,
    ),
)
mol_rs_dataset["test"] = Dataset.from_generator(
    generate_alpaca_dataset,
    gen_kwargs=dict(
        inputs=mol_rs_scaffold_selected_test_input,
        outputs=mol_rs_scaffold_selected_test_output,
        prompt_templates=RS_PROMPT_TEMPLATES,
    ),
)
mol_fs_mm_dataset["test"] = Dataset.from_generator(
    generate_mmchat_dataset,
    gen_kwargs=dict(
        inputs=mol_fs_scaffold_selected_test_input,
        outputs=mol_fs_scaffold_selected_test_output,
        prompt_templates=FS_PROMPT_TEMPLATES,
    ),
)
mol_rs_mm_dataset["test"] = Dataset.from_generator(
    generate_mmchat_dataset,
    gen_kwargs=dict(
        inputs=mol_rs_scaffold_selected_test_input,
        outputs=mol_rs_scaffold_selected_test_output,
        prompt_templates=RS_PROMPT_TEMPLATES,
    ),
)

# Report the size of every regenerated test split, one "<label> <count>" line each.
print(
    "\n".join(
        f"{label} {len(dataset_dict['test'])}"
        for label, dataset_dict in (
            ("mol_fs_test", mol_fs_dataset),
            ("mol_rs_test", mol_rs_dataset),
            ("mol_fs_mm_test", mol_fs_mm_dataset),
            ("mol_rs_mm_test", mol_rs_mm_dataset),
        )
    )
)

# Publish the datasets under new "Scaffold" repository names as private repos.
# The final call is left as the cell's last expression so its return value is
# still displayed.
mol_fs_dataset.push_to_hub(
    repo_id="OpenMol/MolInst_125K_Scaffold_FS_SMILES-SFT",
    private=True,
)
mol_rs_dataset.push_to_hub(
    repo_id="OpenMol/MolInst_125K_Scaffold_RS_SMILES-SFT",
    private=True,
)
mol_fs_mm_dataset.push_to_hub(
    repo_id="OpenMol/MolInst_FS_125K_Scaffold_SMILES-MMChat",
    private=True,
)
mol_rs_mm_dataset.push_to_hub(
    repo_id="OpenMol/MolInst_RS_125K_Scaffold_SMILES-MMChat",
    private=True,
)

Generating train split: 265 examples [00:00, 617.32 examples/s]

Generating train split: 1004 examples [00:01, 514.54 examples/s]
Generating train split: 1000 examples [00:02, 487.04 examples/s]


mol_fs_test 1004
mol_rs_test 1000
mol_fs_mm_test 1004
mol_rs_mm_test 1000


Creating parquet from Arrow format: 100%|██████████| 125/125 [00:00<00:00, 1134.01ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  4.48it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 1628.54ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00, 14.24it/s]
Creating parquet from Arrow format: 100%|██████████| 129/129 [00:00<00:00, 665.45ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  3.23it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 703.86ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00, 14.84it/s]
Creating parquet from Arrow format: 100%|██████████| 125/125 [00:00<00:00, 249.49ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 494.44ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
Creating parquet from Arrow form

CommitInfo(commit_url='https://huggingface.co/datasets/OpenMol/MolInst_RS_125K_Scaffold_SMILES-MMChat/commit/c75881dcfc86eb467d5712cc95550779fb330f2d', commit_message='Upload dataset', commit_description='', oid='c75881dcfc86eb467d5712cc95550779fb330f2d', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
# Spot-check: show the first regenerated test record of each dataset.
print(f"mol_fs_dataset {mol_fs_dataset['test'][0]}")
print(f"mol_rs_dataset {mol_rs_dataset['test'][0]}")
print(f"mol_fs_mm_dataset {mol_fs_mm_dataset['test'][0]}")
print(f"mol_rs_mm_dataset {mol_rs_mm_dataset['test'][0]}")

mol_fs_dataset {'instruction': 'Can you tell me the potential product of a chemical reaction that uses O=C(Cl)C(=O)Cl.O=C(O)CCC(F)(F)F.CN(C)C=O.ClCCl as the reactants and reagents?', 'input': '', 'output': 'O=C(Cl)CCC(F)(F)F'}
mol_rs_dataset {'instruction': 'Do retrosynthesis with the product CCCn1c(COC)nc2c(N)nc3ccc(OCCN4CCCCC4)cc3c21 .', 'input': '', 'output': 'CCCN1C(COC)=NC2=C(N)N=C3C=CC(O)=CC3=C21.ClCCN4CCCCC4'}
mol_fs_mm_dataset {'id': 0, 'molecules': {'selfies': ['[O][=C][Branch1][C][Cl][C][=Branch1][C][=O][Cl]', '[O][=C][Branch1][C][O][C][C][C][Branch1][C][F][Branch1][C][F][F]', '[C][N][Branch1][C][C][C][=O]', '[Cl][C][Cl]'], 'smiles': ['O=C(Cl)C(=O)Cl', 'O=C(O)CCC(F)(F)F', 'CN(C)C=O', 'ClCCl']}, 'ground_truth': 'O=C(Cl)CCC(F)(F)F', 'messages': [{'content': 'You are a chemist. Now you are given a reaction equation. Please predict the product of the reaction.\n    The reaction equation has the following format:\n    ```\n    reactant1.reactant2. ... .reactantN>>product\n    ```\