In [30]:
import glob
import logging
from contextlib import contextmanager
from itertools import islice
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from pymatgen.core import Structure
from tqdm.notebook import tqdm

import tensorflow as tf

# Register tqdm's `progress_apply` on pandas objects; a later cell uses
# `Series.progress_apply` to show a progress bar while computing rewards.
tqdm.pandas()
# No name is passed, so this is the root logger.
logger = logging.getLogger()

In [5]:
import os

# Move to the parent of the directory the kernel was started in.
# The start directory is recorded only the first time this cell runs, so
# re-running the cell always lands in the same place instead of climbing one
# more level on every execution (which a bare relative `os.chdir('../')` does).
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))
os.getcwd()

'/home/jlaw/projects/arpa-e/crystals/rlmolecule/examples/crystal_energy'

In [8]:
import os
import sys

# Make the directory two levels above the current working directory importable
# (the next cell imports `rlmolecule` from it).
# The path is made absolute because a relative sys.path entry is re-resolved
# against whatever the working directory happens to be at import time, and the
# membership check keeps re-runs of this cell from stacking duplicate entries.
repo_root = os.path.abspath("../../")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

In [12]:
from rlmolecule.sql.run_config import RunConfig
from scripts import compute_reward_decors as rew_decors

In [59]:
# Development helper: re-execute the already-imported `rew_decors` module
# (scripts/compute_reward_decors.py) so that edits made to the script after the
# first import take effect in this kernel without a restart.
# NOTE(review): objects created from the module before the reload (e.g. an
# existing rewarder) keep referring to the old class definitions; re-create
# them after reloading.
from importlib import reload
reload(rew_decors)

<module 'scripts.compute_reward_decors' from '/home/jlaw/projects/arpa-e/crystals/rlmolecule/examples/crystal_energy/scripts/compute_reward_decors.py'>

In [24]:
# YAML run configuration, relative to the working directory set earlier.
config_file = "config/20220617_lt15stoich_battclust0_01/r_90.yaml"
# Energy model file handed to `rew_decors.load_model` in later cells.
# NOTE(review): the recorded output of the model-loading cell reports
# "Reading inputs/models/2022_06_07_pruned_outliers/..." rather than this path,
# so that output may come from an earlier value of this variable -- confirm.
energy_model_file = "/projects/rlmolecule/pstjohn/models/20220607_icsd_and_battery/best_model.hdf5"

# Parsed configuration; later cells read `run_config.problem_config`.
run_config = RunConfig(config_file)

24 actions_to_ignore


In [27]:
# Gather the decoration IDs that earlier runs have already visited, so they can
# be excluded from the set that still needs a reward.
strc_ids_files = [
    "/projects/rlmolecule/jlaw/logs/crystal_energy/20220617-batt-icsd-vol-r90-2/states_seen.csv.gz",
    "/projects/rlmolecule/jlaw/logs/crystal_energy/20220617-batt-icsd-vol-r90-no-cond-ion-2/states_seen.csv.gz",
    "/projects/rlmolecule/jlaw/logs/crystal_energy/20220617-batt-icsd-vol-r90-no-halides-2/states_seen.csv.gz",
]

# Union of the `states` column over all runs; per-file counts are reported as
# each file is read, followed by the size of the union.
states_seen = set()
for ids_path in strc_ids_files:
    run_states = set(pd.read_csv(ids_path)["states"])
    states_seen |= run_states
    print(f"{len(run_states)} states read from {ids_path}")
print(f"{len(states_seen)} total")

4197573 states read from /projects/rlmolecule/jlaw/logs/crystal_energy/20220617-batt-icsd-vol-r90-2/states_seen.csv.gz
4177129 states read from /projects/rlmolecule/jlaw/logs/crystal_energy/20220617-batt-icsd-vol-r90-no-cond-ion-2/states_seen.csv.gz
3381483 states read from /projects/rlmolecule/jlaw/logs/crystal_energy/20220617-batt-icsd-vol-r90-no-halides-2/states_seen.csv.gz
6578563 total


In [25]:
from pymatgen.transformations.standard_transformations import OxidationStateRemovalTransformation

# Competing phases table read from the project inputs.
competing_phases = rew_decors.load_competing_phases("inputs/competing_phases.csv")

# Prototype structures; the file location comes from the problem configuration.
prob_config = run_config.problem_config
prototypes_file = prob_config['prototypes_file']
prototype_structures = rew_decors.read_structures_file(prototypes_file)

# Strip oxidation states from every prototype, keeping the same ids.
oxidation_remover = OxidationStateRemovalTransformation()
prototype_structures = dict(
    (proto_id, oxidation_remover.apply_transformation(proto))
    for proto_id, proto in prototype_structures.items()
)

# Remaining pieces needed to build the rewarder in a later cell.
preprocessor = rew_decors.AtomicNumberPreprocessor()
energy_model = rew_decors.load_model(energy_model_file)

	12682 lines
  sortedformula   icsdnum  energyperatom reduced_composition
0    Ag10Br3Te4  173116.0      -1.718985          Ag10Br3Te4
1   Ag11K1O16V4  391344.0      -4.797702         Ag11K1O16V4


INFO:scripts.compute_reward_decors:reading ../../rlmolecule/crystal/inputs/icsd_train_and_proto_max_comp_atoms15/KLiNa_add_clust0_01_min10prototypes.json.gz


	12682 entries


INFO:scripts.compute_reward_decors:	14494 structures read


Reading inputs/models/2022_06_07_pruned_outliers/icsd_and_battery_scaled/best_model.hdf5
Reading inputs/models/2022_06_07_pruned_outliers/icsd_and_battery_scaled/best_model.hdf5


2022-06-21 15:22:09.386137: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /nopt/slurm/current/lib:
2022-06-21 15:22:09.386443: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /nopt/slurm/current/lib:
2022-06-21 15:22:09.386885: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /nopt/slurm/current/lib:
2022-06-21 15:22:09.387348: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared 

### Generate all possible decoration IDs

In [31]:
# Reward calculator built from the inputs loaded in the previous cell.
rewarder = rew_decors.CrystalStateReward(
    competing_phases,
    prototype_structures,
    energy_model,
    preprocessor,
)

# Action graphs and ignored actions all come from the problem configuration.
prob_config = run_config.problem_config
builder_kwargs = dict(
    G=prob_config.get('action_graph1'),
    G2=prob_config.get('action_graph2'),
    actions_to_ignore=prob_config.get('actions_to_ignore'),
)
builder = rew_decors.CrystalBuilder(**builder_kwargs)

# Enumerate every decoration ID and keep them as a set for the set difference
# taken in the next cell.
gen_decors = rew_decors.GenerateDecorations(builder)
decor_ids = set(gen_decors.generate_all_decorations())

  0%|                                                                                              | 139/15000000 [01:30<2712:11:10,  1.54it/s]


Read G1: ../../rlmolecule/crystal/inputs/icsd_train_and_proto_max_comp_atoms15/KLiNa_add_clust0_01_min10eles_to_comps.edgelist.gz (236167 nodes, 250002 edges)
Read G2: ../../rlmolecule/crystal/inputs/icsd_train_and_proto_max_comp_atoms15/KLiNa_add_clust0_01_min10comp_type_to_decors.edgelist.gz (49108 nodes, 48931 edges)
24 and 0 actions to ignore in G and G2, respectively


 95%|██████████████████████████████████████████████████████████████████████████████████████▌    | 14277525/15000000 [01:48<00:07, 99762.82it/s]

In [32]:
# Decorations that none of the earlier runs has visited yet.
new_decors = decor_ids.difference(states_seen)
print(f"{len(new_decors)} new decorations")

7704569 new decorations


 95%|██████████████████████████████████████████████████████████████████████████████████████▋    | 14283132/15000000 [02:03<00:07, 99762.82it/s]

## Compute the reward for each decoration

### Example for a few structures

In [66]:
# Extra reward components requested from `rew_decors.compute_reward`, in the
# order they are expected to follow the (id, reward) pair in each result tuple.
info_to_keep = ['predicted_energy',
                'decomp_energy',
                'cond_ion_frac',
                'reduction',
                'oxidation',
                'stability_window',
                ]

# Smoke test on a handful of decoration ids.
# `islice` takes the first few ids straight from the set; the previous
# `list(decor_ids)[:20]` copied the entire (multi-million element) set into a
# list first just to keep 20 entries.
N_EXAMPLE_DECORS = 20
df_ids = pd.DataFrame(list(islice(decor_ids, N_EXAMPLE_DECORS)), columns=["decor_id"])
print(df_ids.head(2))
decoration_rewards = df_ids.decor_id.progress_apply(
    lambda decor_id: rew_decors.compute_reward(decor_id,
                                               rewarder,
                                               info_to_keep=info_to_keep))
print(decoration_rewards.head(2).values)

# Spread each result tuple over named columns. A tuple may be shorter than the
# column list (the recorded output shows 5 values for 8 columns); the missing
# trailing entries are filled with NaN.
cols = ["id", "reward"] + info_to_keep
for i, col in enumerate(cols):
    df_ids[col] = decoration_rewards.map(
        lambda reward_tuple, idx=i: reward_tuple[idx] if idx < len(reward_tuple) else np.nan)
df_ids.head(2)

                                          decor_id
0  Na2Sc2W1S1P4|_1_1_2_2_4|triclinic|icsd_173455|1
1   Li1Si1Cl1Br4|_1_1_1_4|monoclinic|icsd_183878|4


  0%|          | 0/20 [00:00<?, ?it/s]

[('Na2Sc2W1S1P4|_1_1_2_2_4|triclinic|icsd_173455|1', 0.028, -0.067, 5.777, 0.2)
 ('Li1Si1Cl1Br4|_1_1_1_4|monoclinic|icsd_183878|4', 0.219, -1.934, 1.423, 0.143)]


Unnamed: 0,decor_id,id,reward,predicted_energy,decomp_energy,cond_ion_frac,reduction,oxidation,stability_window
0,Na2Sc2W1S1P4|_1_1_2_2_4|triclinic|icsd_173455|1,Na2Sc2W1S1P4|_1_1_2_2_4|triclinic|icsd_173455|1,0.028,-0.067,5.777,0.2,,,
1,Li1Si1Cl1Br4|_1_1_1_4|monoclinic|icsd_183878|4,Li1Si1Cl1Br4|_1_1_1_4|monoclinic|icsd_183878|4,0.219,-1.934,1.423,0.143,,,


### Use Dask to parallelize the computation

In [70]:
@contextmanager
def dask_cluster(n_nodes=2, n_processes=36, debug=False):
    """Start a SLURM-backed dask cluster and yield a connected client.

    Args:
        n_nodes: number of nodes (SLURM jobs) to request; forced to 1 when
            ``debug`` is true.
        n_processes: number of worker processes to run on each node.
        debug: if true, submit to the 'debug' queue with a 30 minute walltime;
            otherwise no queue is set and the walltime is 180 minutes.

    Yields:
        (client, cluster): the ``Client`` connected to the ``SLURMCluster``,
        and the cluster itself.

    On exit (normal or via an exception) the client is closed first and the
    cluster afterwards.
    """
    memory = 90000  # MB; to fit on a standard node; ask for 184,000 for a bigmem node
    walltime = '30' if debug else '180'  # 30 minutes to fit in the debug queue; 180 to fit in short
    queue = 'debug' if debug else None  # 'debug' is limited to a single job
    nodes_to_request = 1 if debug else n_nodes

    cluster = SLURMCluster(
        project='rlmolecule',
        walltime=walltime,
        job_mem=str(memory),
        job_cpu=36,
        interface='ib0',
        local_directory='/tmp/scratch/dask-worker-space',
        cores=36,
        processes=n_processes,
        memory='{}MB'.format(memory),
        extra=['--lifetime-stagger', '60m'],
        queue=queue,
    )

    client = None
    try:
        print(cluster.job_script())

        # create a client
        client = Client(cluster)

        # scale cluster: the target is expressed in worker processes
        cluster.scale(n_processes * nodes_to_request)

        yield client, cluster

    finally:
        # Disconnect the client before tearing down the cluster. Closing the
        # cluster first takes the scheduler away from a still-connected client;
        # the recorded run of this notebook shows a client `_reconnect`
        # traceback that is consistent with that ordering.
        # The nested try/finally guarantees the cluster is released even when
        # the client was never created or fails to close.
        try:
            if client is not None:
                client.close()
        finally:
            cluster.close()

In [72]:
def load_rewarder():
    """Build a new ``rew_decors.CrystalStateReward`` from the configured input files.

    Reads the notebook-level ``run_config`` (for the prototypes file) and
    ``energy_model_file`` (for the energy model); takes no arguments.

    Returns:
        The ``CrystalStateReward`` constructed from the competing phases, the
        prototype structures with oxidation states removed, the loaded energy
        model and an ``AtomicNumberPreprocessor``.
    """
    from pymatgen.transformations.standard_transformations import OxidationStateRemovalTransformation

    phases = rew_decors.load_competing_phases("inputs/competing_phases.csv")

    # prototype structures; the file location comes from the problem configuration
    structures_path = run_config.problem_config['prototypes_file']
    raw_prototypes = rew_decors.read_structures_file(structures_path)

    # strip oxidation states from every prototype, keeping the same ids
    strip_oxidation = OxidationStateRemovalTransformation()
    clean_prototypes = {}
    for proto_id, proto in raw_prototypes.items():
        clean_prototypes[proto_id] = strip_oxidation.apply_transformation(proto)

    atom_preprocessor = rew_decors.AtomicNumberPreprocessor()
    model = rew_decors.load_model(energy_model_file)

    return rew_decors.CrystalStateReward(phases,
                                         clean_prototypes,
                                         model,
                                         atom_preprocessor)

In [None]:
# import dask
# dask_rewarder = dask.delayed(load_rewarder)()

In [80]:
# convert data to dask format
df_dask = dd.from_pandas(df_ids, chunksize=10)

results = df_dask.decor_id.map_partitions(
    lambda row: row.apply(
        lambda x: rew_decors.compute_reward(x, rewarder, info_to_keep)), 
    meta=pd.Series(tuple(np.zeros(5)))
)

with dask_cluster(debug=True):
    finished = results.compute()



INFO:tensorflow:Assets written to: ram://f9407099-6580-4eb1-b176-b43195d7d0de/assets


INFO:tensorflow:Assets written to: ram://f9407099-6580-4eb1-b176-b43195d7d0de/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)
Perhaps you already have a cluster running?
Hosting the HTTP server on port 44163 instead


#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -p debug
#SBATCH -A rlmolecule
#SBATCH -n 1
#SBATCH --cpus-per-task=36
#SBATCH --mem=90000
#SBATCH -t 30

/home/jlaw/.conda-envs/crystals_nfp0_3/bin/python -m distributed.cli.dask_worker tcp://10.148.8.97:36380 --nthreads 1 --nprocs 36 --memory-limit 2.33GiB --name dummy-name --nanny --death-timeout 60 --local-directory /tmp/scratch/dask-worker-space --lifetime-stagger 60m --interface ib0 --protocol tcp://





INFO:tensorflow:Assets written to: ram://a720d637-2ed5-407e-95d6-1752909f63ba/assets


INFO:tensorflow:Assets written to: ram://a720d637-2ed5-407e-95d6-1752909f63ba/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)
ERROR:asyncio.events:
Traceback (most recent call last):
  File "/home/jlaw/.conda-envs/crystals_nfp0_3/lib/python3.8/site-packages/distributed/utils.py", line 761, in wrapper
    return await func(*args, **kwargs)
  File "/home/jlaw/.conda-envs/crystals_nfp0_3/lib/python3.8/site-packages/distributed/client.py", line 1225, in _reconnect
    await self._ensure_connected(timeout=timeout)
  File "/home/jlaw/.conda-envs/crystals_nfp0_3/lib/python3.8/site-packages/distributed/client.py", line 1255, in _ensure_connected
    comm = await connect(
  File "/home/jlaw/.conda-envs/crystals_nfp0_3/lib/python3.8/site-packages/distributed/comm/core.py", line 313, in connect
    await asyncio.sleep(backoff)
  File "/home/jlaw/.conda-envs/crystals_nfp0_3/lib/python3.8/asyncio/tasks.py", line 659, in sleep
    return await fut

ValueError: Exception encountered when calling layer "edge_update" (type EdgeUpdate).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * [<tf.Tensor 'inputs:0' shape=(None, None, 256) dtype=float32>, <tf.Tensor 'inputs_1:0' shape=(None, None, 256) dtype=float32>, <tf.Tensor 'inputs_2:0' shape=(None, None, 2) dtype=int64>]
    * [<tf.Tensor 'mask:0' shape=(None, None) dtype=bool>, None, None]
  Keyword arguments: {'training': False}

 Expected these arguments to match one of the following 2 option(s):

Option 1:
  Positional arguments (2 total):
    * [TensorSpec(shape=(None, None, 256), dtype=tf.float32, name='inputs/0'), TensorSpec(shape=(None, None, 256), dtype=tf.float32, name='inputs/1'), TensorSpec(shape=(None, None, 2), dtype=tf.int64, name='inputs/2')]
    * [TensorSpec(shape=(None, None), dtype=tf.bool, name='mask/0'), TensorSpec(shape=(None, None), dtype=tf.bool, name='mask/1'), None]
  Keyword arguments: {'training': False}

Option 2:
  Positional arguments (2 total):
    * [TensorSpec(shape=(None, None, 256), dtype=tf.float32, name='inputs/0'), TensorSpec(shape=(None, None, 256), dtype=tf.float32, name='inputs/1'), TensorSpec(shape=(None, None, 2), dtype=tf.int64, name='inputs/2')]
    * [TensorSpec(shape=(None, None), dtype=tf.bool, name='mask/0'), TensorSpec(shape=(None, None), dtype=tf.bool, name='mask/1'), None]
  Keyword arguments: {'training': True}

Call arguments received:
  • args=(['tf.Tensor(shape=(None, None, 256), dtype=float32)', 'tf.Tensor(shape=(None, None, 256), dtype=float32)', 'tf.Tensor(shape=(None, None, 2), dtype=int64)'],)
  • kwargs={'mask': ['tf.Tensor(shape=(None, None), dtype=bool)', 'None', 'None'], 'training': 'None'}