# Compute the Storage Capacity of Matched Molecules

In [1]:
from io import StringIO
from typing import Optional

from mongoengine import connect
from xtb.ase.calculator import XTB
from ase.optimize import QuasiNewton
from ase.io import read
from ase import units
from rdkit.Chem import AllChem
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np

from cfree.store import MoleculeRecord
from cfree.descriptors import compute_wth2, count_h2_difference, saturate_molecule

Configuration

In [2]:
# Number of highest-similarity matches kept for each known Pareto molecule
#  when the match tables are loaded
to_run_per_target = 2  # Top matches to keep per known molecule
# Tag of the similarity metric; it is inserted into the match CSV file names
match_type = 'abs-cosine'

Connect to the database

In [3]:
# Connect to the MongoDB server on port 27894 (all other connection settings left at their defaults)
client = connect(port=27894)
# Handle to the `molecule_record` collection of the `cfree` database;
#  the cells below query and update it directly
coll = client['cfree']['molecule_record']

## Load the Matches
They are stored in CSV files produced by Zhi Hong. Get the top molecules out of each.

In [4]:
# NOTE(review): this cell reads the "Random_Valid" match file but stores it as `relevant`,
#  while the next cell reads the "Relevant_ENA" file into `random`. Confirm the names are not swapped.
relevant = pd.read_csv(f'matched/25apr23-partial-PRD/Known_Pareto-Random_Valid-molecules-{match_type}-all.csv')

# Sort ascending by similarity and keep the last `to_run_per_target` rows of each group,
#  i.e. the best matches for every known molecule.
#  Take a copy so columns can be added to the result later without pandas warning
#  about writing to a slice of another frame.
top_relevant = (
    relevant.sort_values('Cosine Similarity')
    .groupby('Known Pareto Molecule')
    .tail(to_run_per_target)
    .copy()
)
print(f'Loaded {len(top_relevant)} molecules')

Loaded 14 molecules


In [5]:
# NOTE(review): this cell reads the "Relevant_ENA" match file but stores it as `random`,
#  while the previous cell reads the "Random_Valid" file into `relevant`. Confirm the names are not swapped.
#  (`random` is also the name of a standard-library module; it is not imported here, so nothing is shadowed.)
random = pd.read_csv(f'matched/25apr23-partial-PRD/Known_Pareto-Relevant_ENA-molecules-{match_type}-all.csv')

# Sort ascending by similarity and keep the last `to_run_per_target` rows of each group,
#  i.e. the best matches for every known molecule.
#  Take a copy so columns can be added to the result later without pandas warning
#  about writing to a slice of another frame.
top_random = (
    random.sort_values('Cosine Similarity')
    .groupby('Known Pareto Molecule')
    .tail(to_run_per_target)
    .copy()
)
print(f'Loaded {len(top_random)} molecules')

Loaded 14 molecules


## Get Some Baseline Structures
Randomly select records from the database

In [6]:
%%time
rng = np.random.RandomState(1)
random_smiles = sorted([x['identifier']['smiles'] for x in coll.find({'subsets': 'random-valid'}, projection=['identifier.smiles'])])
random_smiles = rng.choice(random_smiles, replace=False, size=(len(top_relevant),)).tolist()
print(f'Pulled {len(random_smiles)} random "valid" molecules from the database')

Pulled 14 random "valid" molecules from the database
CPU times: user 659 ms, sys: 406 ms, total: 1.06 s
Wall time: 21.4 s


In [7]:
%%time
rng = np.random.RandomState(1)
relevant_smiles = sorted([x['identifier']['smiles'] for x in coll.find({'subsets': 'relevant-ENA'}, projection=['identifier.smiles'])])
relevant_smiles = rng.choice(relevant_smiles, replace=False, size=(len(top_relevant),)).tolist()
print(f'Pulled {len(relevant_smiles)} random "relevant" molecules from the database')

Pulled 14 random "relevant" molecules from the database
CPU times: user 244 ms, sys: 32.6 ms, total: 276 ms
Wall time: 2.45 s


## Get their Smiles Strings
Look them up in the database

Get them via projection

In [8]:
def find_smiles(key):
    """Get the record matching a certain InChI Key and return the smiles

    Only the ``identifier.smiles`` field is requested from the database,
    so the rest of the record is not transferred.

    Args:
        key: InChI key of the molecule, which is the ``_id`` of its record
    Returns:
        The SMILES string stored under ``identifier.smiles``
    Raises:
        TypeError: If no record has this key (``find_one`` returns ``None``)
    """
    record = coll.find_one({'_id': key}, projection=['identifier.smiles'])
    return record['identifier']['smiles']

In [9]:
# Show a single row to check the column layout of the match table
top_random.head(1)

Unnamed: 0,Known Pareto Molecule,Cosine Similarity,Relevant ENA Molecule
1061,UFWIBTONFRDIAS-UHFFFAOYSA-N,0.69438,ZFIFHAKCBWOSRN-UHFFFAOYSA-N


In [10]:
# Add a "<column>-SMILES" column for columns 0 and 2 of both match tables by
#  looking up each value with `find_smiles`. The columns are picked by position,
#  so this relies on the column order of the CSV files.
#  NOTE: this writes into `top_relevant` and `top_random` in place, and the loop
#  variables `data` and `col` remain defined afterwards (`data` ends up as `top_random`).
for data in [top_relevant, top_random]:
    for col in data.columns[[0, 2]]:
        data[f'{col}-SMILES'] = data[col].apply(find_smiles)

In [11]:
# Display the match table explicitly rather than through the leftover loop variable
#  `data`, which only refers to `top_random` because it was last in the loop above
top_random

Unnamed: 0,Known Pareto Molecule,Cosine Similarity,Relevant ENA Molecule,Known Pareto Molecule-SMILES,Relevant ENA Molecule-SMILES
1061,UFWIBTONFRDIAS-UHFFFAOYSA-N,0.69438,ZFIFHAKCBWOSRN-UHFFFAOYSA-N,c1ccc2ccccc2c1,NS(=O)(=O)c1cccc2ccccc12
1060,UFWIBTONFRDIAS-UHFFFAOYSA-N,0.729625,NQMUGNMMFTYOHK-UHFFFAOYSA-N,c1ccc2ccccc2c1,COc1cccc2ccccc12
2121,PCNDJXKNXGMECE-UHFFFAOYSA-N,0.733671,XWSSUYOEOWLFEI-UHFFFAOYSA-N,c1ccc2nc3ccccc3nc2c1,c1ccc(-c2cccnn2)cc1
2120,PCNDJXKNXGMECE-UHFFFAOYSA-N,0.758909,ZYDGCYWJDWIJCS-UHFFFAOYSA-N,c1ccc2nc3ccccc3nc2c1,COc1cccc2nc3ccccc3nc12
6361,AWJUIBRHMBBTKR-UHFFFAOYSA-N,0.784903,YQZGQXPHGLAEHA-UHFFFAOYSA-N,c1ccc2cnccc2c1,c1ccc(-c2ccc3ccccc3n2)nc1
5301,JIHQDMXYYFUGFV-UHFFFAOYSA-N,0.795952,UIXYUQXWIFEYBN-UHFFFAOYSA-N,c1ncncn1,c1ccc(-c2cn(-c3ccccc3)nn2)cc1
6360,AWJUIBRHMBBTKR-UHFFFAOYSA-N,0.80156,FSEXLNMNADBYJU-UHFFFAOYSA-N,c1ccc2cnccc2c1,c1ccc(-c2ccc3ccccc3n2)cc1
5300,JIHQDMXYYFUGFV-UHFFFAOYSA-N,0.814451,ODKHOKLXMBWVOQ-UHFFFAOYSA-N,c1ncncn1,c1ccc(-c2ncoc2-c2ccccc2)cc1
1,WTKZEGDFNFYCGP-UHFFFAOYSA-N,0.826533,RQCBPOPQTLHDFC-UHFFFAOYSA-N,c1cn[nH]c1,c1ccc(-c2ncco2)cc1
0,WTKZEGDFNFYCGP-UHFFFAOYSA-N,0.829909,WYKHSBAVLOPISI-UHFFFAOYSA-N,c1cn[nH]c1,c1ccc(-c2nccs2)cc1


## Compute the Storage Capacity
This is a simple calculation from the parsed string. Run it, then store the result in the database

Do it for the known molecules

In [12]:
def compute_wth2_if_needed(smiles: str) -> Optional[float]:
    """Compute the wt%H2 of a molecule if we have not already

    Also store the result in the database if it's new.

    Args:
        smiles: SMILES string of the molecule in question
    Returns:
        The storage capacity (wt%H2), or ``None`` if the SMILES string cannot
        be parsed or the molecule has no record in the database
    """
    # RDKit returns None for a string it cannot parse; stop before asking for a key
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Get the document based on the inchi key
    key = Chem.MolToInchiKey(mol)
    record = coll.find_one({'_id': key}, projection=['property'])
    if record is None:
        return None

    # Use the stored value if the property is already set
    properties = record.get('property', {})
    if 'wt%H2' in properties:
        return properties['wt%H2']

    # Otherwise compute it and save it so the next call finds it
    wt = compute_wth2(smiles)
    coll.update_one({'_id': key}, {'$set': {'property.wt%H2': wt}})
    return wt

In [13]:
# Try the function on a single molecule and show the value it returns
compute_wth2_if_needed('c1nc[nH]n1')

5.517491899109785

In [14]:
# The column name is supplied here, so the first line of the file is read as data, not as a header.
#  Presumably the file holds one SMILES string per line - verify against the file.
known_molecules = pd.read_csv('../screen-search-space/to-compare.smi', names=['smiles'])

In [15]:
# The return values are discarded; the calls are made so that any missing
#  values get computed and written to the database
for smiles in known_molecules['smiles']:
    compute_wth2_if_needed(smiles)

Run it for the baselines

In [16]:
# Both baseline lists in one pass; the return values are discarded because
#  the function writes any new result to the database itself
for smiles in tqdm(random_smiles + relevant_smiles):
    compute_wth2_if_needed(smiles)

100%|██████████| 28/28 [00:00<00:00, 500.20it/s]


Run it for everyone

In [17]:
# Column 4 is selected by position; it should be the SMILES column of the matched
#  molecule added earlier - verify if the layout of the CSV files changes
for smiles in tqdm(top_random.iloc[:, 4].values):
    compute_wth2_if_needed(smiles)

100%|██████████| 14/14 [00:00<00:00, 893.80it/s]


In [18]:
# Column 4 is selected by position; it should be the SMILES column of the matched
#  molecule added earlier - verify if the layout of the CSV files changes
for smiles in tqdm(top_relevant.iloc[:, 4].values):
    compute_wth2_if_needed(smiles)

100%|██████████| 14/14 [00:00<00:00, 842.37it/s]


## Compute the Storage Energy
We'll use XTB to make it fast and generally accurate.

Make the functions first

In [21]:
def compute_eng_if_needed(smiles: str) -> Optional[float]:
    """Compute the storage energy of a molecule if we have not already

    Also store the result in the database if it's new.

    Args:
        smiles: SMILES string of the molecule in question
    Returns:
        The storage energy (kJ/mol), or ``None`` if the SMILES string cannot be
        parsed, the molecule has no record in the database, or the computation fails
    """
    # RDKit returns None for a string it cannot parse; stop before asking for a key
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Get the document based on the inchi key
    key = Chem.MolToInchiKey(mol)
    record = coll.find_one({'_id': key}, projection=['property'])
    if record is None:
        return None

    # Use the stored value if the property is already set
    properties = record.get('property', {})
    if 'storage_eng' in properties:
        return properties['storage_eng']

    # NOTE(review): `compute_storage_energy` is not defined or imported in the visible
    #  cells of this notebook - confirm it exists before running on a fresh kernel
    try:
        eng = compute_storage_energy(smiles)
    except Exception:
        # A failed calculation is skipped. `Exception` (rather than a bare except)
        #  lets Ctrl-C / kernel interrupt still stop a long loop
        return None
    coll.update_one({'_id': key}, {'$set': {'property.storage_eng': eng}})
    return eng

Run it for the known molecules

In [22]:
def compute_eng_if_needed(smiles: str) -> Optional[float]:
    """Compute the storage energy of a molecule if we have not already

    Also store the result in the database if it's new.

    NOTE(review): this cell redefines the function from the previous cell and
    replaces it once run; one of the two definitions should be removed.

    Args:
        smiles: SMILES string of the molecule in question
    Returns:
        The storage energy (kJ/mol), or ``None`` if the SMILES string cannot be
        parsed, the molecule has no record in the database, or the computation fails
    """
    # RDKit returns None for a string it cannot parse; stop before asking for a key
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Get the document based on the inchi key
    key = Chem.MolToInchiKey(mol)
    record = coll.find_one({'_id': key}, projection=['property'])
    if record is None:
        return None

    # Use the stored value if the property is already set.
    #  Return the storage energy itself, not the wt%H2 entry of the same record
    properties = record.get('property', {})
    if 'storage_eng' in properties:
        return properties['storage_eng']

    # NOTE(review): `compute_storage_energy` is not defined or imported in the visible
    #  cells of this notebook - confirm it exists before running on a fresh kernel
    try:
        eng = compute_storage_energy(smiles)
    except Exception:
        # A failed calculation is skipped. `Exception` (rather than a bare except)
        #  lets Ctrl-C / kernel interrupt still stop a long loop
        return None

    # Write under `property.` so that the check above finds the value next time
    coll.update_one({'_id': key}, {'$set': {'property.storage_eng': eng}})
    return eng

In [23]:
# The return values are discarded; the calls are made so that any missing
#  energies get computed and written to the database
for smiles in tqdm(known_molecules['smiles']):
    compute_eng_if_needed(smiles)

100%|██████████| 149/149 [00:00<00:00, 910.22it/s]


Run it for the baselines

In [24]:
# Both baseline lists in one pass; the return values are discarded because
#  the function writes any new result to the database itself
for smiles in tqdm(relevant_smiles + random_smiles):
    compute_eng_if_needed(smiles)

100%|██████████| 28/28 [00:52<00:00,  1.86s/it]


Run it for everyone

In [None]:
# Column 4 is selected by position; it should be the SMILES column of the matched
#  molecule added earlier - verify if the layout of the CSV files changes
for smiles in tqdm(top_random.iloc[:, 4].values):
    compute_eng_if_needed(smiles)

In [None]:
# Column 4 is selected by position; it should be the SMILES column of the matched
#  molecule added earlier - verify if the layout of the CSV files changes
for smiles in tqdm(top_relevant.iloc[:, 4].values):
    compute_eng_if_needed(smiles)