# Match Known Materials
Find which materials in the database are known to be similar to

In [1]:
from more_itertools import batched
from mongoengine import connect
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
client = connect('cfree', port=27856)
db = client['cfree']

## Load in the "Known" materials
These are from a list provided by Hassan. We want to load them then get the 

In [3]:
molecules = pd.read_csv('../screen-search-space/to-compare.smi', names=['smiles'])
print(f'Loaded {len(molecules)} molecules')

Loaded 149 molecules


Compute the InChI Key

In [4]:
%%time
molecules['key'] = molecules['smiles'].apply(Chem.MolFromSmiles).apply(Chem.MolToInchiKey)

CPU times: user 50.8 ms, sys: 6.49 ms, total: 57.3 ms
Wall time: 55.2 ms


## Mark them in the Database
Append the 'known' subset to all entries that match the key

In [5]:
coll = db['molecule_record']
mol_count = coll.estimated_document_count()
print(f'Database contains around {mol_count} molecules')

Database contains around 111995381 molecules


Perform the update

In [6]:
%%time
result = coll.update_many({'_id': {'$in': molecules.key.tolist()}}, {'$addToSet': {'subsets': 'known'}})
print(f'Matched {result.matched_count} molecules. Updated {result.modified_count} records.')

Matched 124 molecules. Updated 0 records.
CPU times: user 771 µs, sys: 1.48 ms, total: 2.25 ms
Wall time: 4.79 ms


In [7]:
%%time
count = coll.count_documents({'subsets': 'known'})
print(f'We have {count} molecules in that category now.')

We have 124 molecules in that category now.
CPU times: user 965 µs, sys: 0 ns, total: 965 µs
Wall time: 1.01 ms


## Repeat with the "Relevant" Molecules
Get the molecules with we have found to be relevant in another search

In [8]:
relevant = pd.read_csv('../screen-search-space/runs/ENA-top1000000-515af2/best_molecules.csv.gz')  # Latest version of our ENA molecule set

In [9]:
relevant.drop(columns='similarities', inplace=True)

In [10]:
matched = 0
updated = 0
for chunk in tqdm(np.array_split(relevant, len(relevant) // 5000 + 1)):
    keys = chunk['smiles'].apply(Chem.MolFromSmiles).apply(Chem.MolToInchiKey)
    result = coll.update_many({'_id': {'$in': keys.tolist()}}, {'$addToSet': {'subsets': 'relevant-ENA'}})
    updated += result.modified_count
    matched += result.matched_count
print(f'Matched {matched} molecules. Updated {updated} records.')

100%|██████████| 201/201 [36:13<00:00, 10.81s/it] 

Matched 53701 molecules. Updated 0 records.





In [11]:
%%time
count_relevant = coll.count_documents({'subsets': 'relevant-ENA'})
print(f'We have {count_relevant} molecules in that category now.')

We have 53693 molecules in that category now.
CPU times: user 1.28 ms, sys: 20 ms, total: 21.3 ms
Wall time: 66.5 ms


## Get a Random Subset
A random subset of molecules with at least one carbon-carbon double or aromatic bond.

### Step 1: Find "valid" carbon-storage molecules
We can just do a text match for `C=C` or `cc`

In [12]:
%%time
result = coll.update_many({'identifier.smiles': {'$regex': '(C=C|cc)'}}, {'$addToSet': {'subsets': 'valid'}})
print(f'Matched {result.matched_count} molecules. Updated {result.modified_count} records.')

Matched 97126505 molecules. Updated 0 records.
CPU times: user 119 ms, sys: 15 ms, total: 134 ms
Wall time: 6min 35s


## Step 2: Randomly select a few of them
We are going to assign each record a random number, then pick the first valid ones that are not 

In [13]:
%%time
result = coll.update_many({'subset': 'random-valid'}, {'$pull': {'subsets': ['random-valid']}})
print(f'Matched {result.matched_count} molecules. Updated {result.modified_count} records.')

Matched 0 molecules. Updated 0 records.
CPU times: user 28.2 ms, sys: 8.01 ms, total: 36.2 ms
Wall time: 1min 35s


In [14]:
%%time
matched_keys = [x['_id'] for x in coll.aggregate([
    {'$match': {'subsets': 'valid'}},
    {'$sample': {'size': count_relevant}},
    {'$project': {'_id': 1}}
])]
print(f'Pulled {len(matched_keys)} random "valid" molecules from the database')

Pulled 53693 random "valid" molecules from the database
CPU times: user 147 ms, sys: 19 ms, total: 166 ms
Wall time: 3min 43s


In [15]:
%%time
result = coll.update_many({'_id': {'$in': matched_keys}}, {'$addToSet': {'subsets': 'random-valid'}})
print(f'Matched {result.matched_count} molecules. Updated {result.modified_count} records.')

Matched 53693 molecules. Updated 53664 records.
CPU times: user 48.6 ms, sys: 3.17 ms, total: 51.8 ms
Wall time: 31.2 s
