In [1]:
import os

# Remember the directory the kernel started in the first time this cell runs,
# so that re-running the cell does not climb one more level on every execution
# (a bare relative `os.chdir('../')` is not idempotent).
try:
    _NOTEBOOK_START_DIR
except NameError:
    _NOTEBOOK_START_DIR = os.getcwd()

# Work from the parent of the notebook's start directory.
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))
os.getcwd()

'/home/jlaw/projects/arpa-e/crystals/rlmolecule/examples/crystal_energy'

In [2]:
import os
import sys
from collections import defaultdict
from tqdm import tqdm
import numpy as np
# import psycopg2
import sqlalchemy
import pandas as pd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
import dask.dataframe as dd
import nfp
from pymatgen.core import Structure

sys.path.append('../../')
import rlmolecule
from rlmolecule.sql.run_config import RunConfig
from rlmolecule.sql import Base, Session
from rlmolecule.sql.tables import GameStore, RewardStore, StateStore
from rlmolecule.crystal import utils
from scripts import nrelmatdbtaps
from scripts import stability
from scripts import ehull

  from distributed.utils import format_bytes, parse_bytes, tmpfile
  from distributed.utils import format_bytes, parse_bytes, tmpfile
  from distributed.utils import parse_bytes


In [3]:
# Load the relaxed structures' energies; these feed the hull-energy step below.
# NOTE(review): every path in this cell is absolute and user/cluster specific.
relaxed_energies_file = "/projects/rlmolecule/jlaw/crystal-gnn-fork/inputs/structures/battery_relaxed_energies.csv"
print(f"reading {relaxed_energies_file}")
df_rel = pd.read_csv(relaxed_energies_file)
print(df_rel.head(2))
# Lookup table: structure id -> energy per atom.
strc_energies = dict(zip(df_rel['id'], df_rel['energyperatom']))

# NOTE(review): the original note here read "instead, use the predicted values
# for the decomposition energy". The predicted energies are only read in a
# later cell, so this cell still works with the relaxed energies.

# Table of competing phases; passed to convex_hull_stability as `df_competing_phases`.
comp_phases_file = "/home/jlaw/projects/arpa-e/crystals/rlmolecule/examples/crystal_energy/inputs/competing_phases.csv"
print(f"reading {comp_phases_file}")
df_phases = pd.read_csv(comp_phases_file)
print(df_phases.head(2))

# Relaxed structures; a later cell wraps this in pd.Series and attaches it to
# the energy table (presumably keyed by structure id -- confirm against
# utils.read_structures_file).
strcs_file = "/projects/rlmolecule/jlaw/crystal-gnn-fork/inputs/structures/battery_relaxed_structures.json.gz"
rel_structures = utils.read_structures_file(strcs_file)

reading /projects/rlmolecule/jlaw/crystal-gnn-fork/inputs/structures/battery_relaxed_energies.csv
   comp_type composition                           id  energyperatom
0        112    Mg2Cl1P1   Mg2Cl1P1_sg2_icsd_035676_1      -3.527097
1        112    Mg2Cl1P1  Mg2Cl1P1_sg55_icsd_642437_1      -3.365935
reading /home/jlaw/projects/arpa-e/crystals/rlmolecule/examples/crystal_energy/inputs/competing_phases.csv
  sortedformula   icsdnum  energyperatom reduced_composition
0    Ag10Br3Te4  173116.0      -1.718985          Ag10Br3Te4
1   Ag11K1O16V4  391344.0      -4.797702         Ag11K1O16V4
reading /projects/rlmolecule/jlaw/crystal-gnn-fork/inputs/structures/battery_relaxed_structures.json.gz
	67840 structures read


In [4]:
# Rich display of the first two rows of the relaxed-energy table.
df_rel.head(2)

Unnamed: 0,comp_type,composition,id,energyperatom
0,112,Mg2Cl1P1,Mg2Cl1P1_sg2_icsd_035676_1,-3.527097
1,112,Mg2Cl1P1,Mg2Cl1P1_sg55_icsd_642437_1,-3.365935


In [18]:
# Index the energy table by structure id and attach the structures as a new
# column; pandas aligns the Series to the id index.
df2 = (
    df_rel
    .set_index('id')
    .assign(structure=pd.Series(rel_structures))
)
df2.head(1)

Unnamed: 0_level_0,comp_type,composition,energyperatom,structure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mg2Cl1P1_sg2_icsd_035676_1,112,Mg2Cl1P1,-3.527097,"[[2.98539289 7.30026398 0.23120076] Cl, [11.02..."


In [5]:
def setup_dask_client(n_nodes=2, n_processes=36, debug=False):
    """Start a SLURM-backed dask cluster and return a client connected to it.

    Args:
        n_nodes: Number of SLURM jobs (nodes) to scale to. Forced to 1 when
            ``debug`` is True.
        n_processes: Number of dask worker processes to run on each node.
        debug: If True, request the 'debug' queue with a 30 minute walltime;
            otherwise use the default queue with a 180 minute walltime.

    Returns:
        A ``(dask_client, cluster)`` tuple: the ``Client`` connected to the
        cluster and the ``SLURMCluster`` itself.

    Side effects:
        Prints the generated SLURM job script.
    """
    memory = 90000  # to fit on a standard node; ask for 184,000 for a bigmem node
    # 30 minutes to fit in the debug queue; 180 to fit in short
    walltime = '30' if debug else '180'
    # 'debug' is limited to a single job -- leave the queue unset for larger runs
    queue = 'debug' if debug else None

    cluster = SLURMCluster(
        project='rlmolecule',
        walltime=walltime,
        job_mem=str(memory),
        job_cpu=36,
        interface='ib0',
        local_directory='/tmp/scratch/dask-worker-space',
        cores=36,
        processes=n_processes,
        memory='{}MB'.format(memory),
        extra=['--lifetime-stagger', '60m'],
        queue=queue,
    )

    print(cluster.job_script())

    # create a client
    dask_client = Client(cluster)

    # scale cluster: the debug queue only allows a single job, so use one node
    n_nodes = 1 if debug else n_nodes
    cluster.scale(n_processes * n_nodes)
    return dask_client, cluster

In [27]:
# Make the structures and energies into a dask dataframe.
# Only the first 1000 rows of `df2` (id-indexed, with a `structure` column)
# are used, split into partitions of 10 rows each.
# NOTE(review): dd.from_pandas sorts by index by default, so the partition
# order may differ from the row order of `df2` -- confirm before aligning
# results back to ids.
df_dask = dd.from_pandas(df2[:1000], chunksize=10)

In [28]:
### Dask
# Start the SLURM-backed dask cluster used to compute the decomposition energy.
# debug=True requests the 'debug' queue with a 30 minute walltime on one node.
dask_client, cluster = setup_dask_client(debug=True)

#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -p debug
#SBATCH -A rlmolecule
#SBATCH -n 1
#SBATCH --cpus-per-task=36
#SBATCH --mem=90000
#SBATCH -t 30

/home/jlaw/.conda/envs/crystals/bin/python -m distributed.cli.dask_worker tcp://10.148.8.97:46761 --nthreads 1 --nprocs 36 --memory-limit 2.33GiB --name dummy-name --nanny --death-timeout 60 --local-directory /tmp/scratch/dask-worker-space --lifetime-stagger 60m --interface ib0 --protocol tcp://



Perhaps you already have a cluster running?
Hosting the HTTP server on port 35730 instead


In [8]:
def convex_hull_stability(row, df_competing_phases):
    """Run the convex-hull stability analysis for one structure.

    Args:
        row: Object exposing ``structure`` (with a pymatgen-style
            ``composition``) and ``energyperatom`` attributes, e.g. one row of
            a DataFrame passed through ``DataFrame.apply(..., axis=1)``.
        df_competing_phases: DataFrame of competing phases with at least the
            columns ``sortedformula``, ``energyperatom`` and
            ``reduced_composition``. It is not modified.

    Returns:
        The value returned by ``ehull.unstable_nrg`` / ``ehull.stable_nrg``
        (presumably the energy above the hull in eV/atom -- confirm against
        ``ehull``), or ``None`` if ``stability.run_stability`` reports a state
        other than 'UNSTABLE' or 'STABLE'.
    """
    strc = row.structure
    predicted_energy = row.energyperatom
    # Reduced formula with the spaces stripped, e.g. "Mg2Cl1P1"-style strings.
    comp = strc.composition.reduced_composition.alphabetical_formula.replace(' ', '')

    # Add this composition with its energy to the competing phases, unless an
    # entry for the same composition is already present.
    df = df_competing_phases
    if not df.reduced_composition.eq(comp).any():
        new_phase = pd.DataFrame([{
            'sortedformula': comp,
            'energyperatom': predicted_energy,
            'reduced_composition': comp,
        }])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
        # columns missing from `new_phase` are filled with NaN as before.
        df = pd.concat([df, new_phase], ignore_index=True)

    # Create a list of elements in the composition
    ele = strc.composition.chemical_system.split('-')

    # Create input file for stability analysis
    inputs = nrelmatdbtaps.create_input_DFT(ele, df, chempot='ferev2')

    # Run stability function (args: input filename, composition)
    stable_state = stability.run_stability(inputs, comp)
    if stable_state == 'UNSTABLE':
        hull_energy_fn = ehull.unstable_nrg
    elif stable_state == 'STABLE':
        hull_energy_fn = ehull.stable_nrg
    else:
        print(f"ERR: unrecognized stable_state: '{stable_state}'.")
        print(f"\tcomp: {comp}")
        return None

    stoic = ehull.frac_stoic(comp)
    return hull_energy_fn(stoic, comp, inputs)

In [20]:
# Sanity check: `df2` is indexed by id and carries the `structure` column.
df2.head(1)

Unnamed: 0_level_0,comp_type,composition,energyperatom,structure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mg2Cl1P1_sg2_icsd_035676_1,112,Mg2Cl1P1,-3.527097,"[[2.98539289 7.30026398 0.23120076] Cl, [11.02..."


In [21]:
# Preview the two columns that convex_hull_stability reads from each row.
df2[:2][['structure', 'energyperatom']]

Unnamed: 0_level_0,structure,energyperatom
id,Unnamed: 1_level_1,Unnamed: 2_level_1
Mg2Cl1P1_sg2_icsd_035676_1,"[[2.98539289 7.30026398 0.23120076] Cl, [11.02...",-3.527097
Mg2Cl1P1_sg55_icsd_642437_1,"[[5.51341932 2.94139596 3.15754637] Cl, [3.031...",-3.365935


In [57]:
def test(row, b):
    return b

In [59]:
# Smoke test: DataFrame.apply forwards the extra keyword argument `b` to the
# row function, so every one of the 100 rows yields 4.
df2[:100][['structure', 'energyperatom']].apply(test, b=4, axis=1).values

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])

In [22]:
# Run the hull analysis locally (no dask) on the first five structures as a
# quick check before distributing the work.
df2[:5][['structure', 'energyperatom']].apply(
    convex_hull_stability, df_competing_phases=df_phases, axis=1).values

COMPETING PHASES FOUND IN NRELMATDB
COMPETING PHASES FOUND IN NRELMATDB
COMPETING PHASES FOUND IN NRELMATDB
COMPETING PHASES FOUND IN NRELMATDB
COMPETING PHASES FOUND IN NRELMATDB


array([0.179, 0.341, 0.145, 0.473, 0.155])

In [29]:
def _partition_decomp_energies(partition):
    """Apply convex_hull_stability to every row of one dask partition.

    Returns the per-row results as a plain array (the id index is dropped).
    """
    hull_inputs = partition[['structure', 'energyperatom']]
    per_row = hull_inputs.apply(
        convex_hull_stability, df_competing_phases=df_phases, axis=1)
    return per_row.values


# Lazily schedule the hull analysis over all partitions of the dask dataframe.
results = df_dask.map_partitions(_partition_decomp_energies, meta=0)

In [30]:
# Execute the scheduled work on the dask cluster and gather the results.
finished = results.compute()

In [31]:
# Display the gathered values (an array, one entry per submitted row).
finished

array([0.419, 0.419, 0.391, 0.391, 0.473, 0.473, 0.069, 0.097, 0.173,
       0.173, 0.399, 0.041, 0.173, 0.441, 0.329, 0.329, 0.369, 0.113,
       0.369, 0.113, 0.147, 0.089, 0.081, 0.105, 0.105, 0.169, 0.169,
       0.611, 0.089, 0.353, 0.353, 0.161, 0.161, 1.363, 0.417, 1.461,
       0.599, 0.091, 0.903, 0.623, 0.153, 0.153, 0.153, 0.153, 0.999,
       0.375, 0.349, 0.221, 0.121, 0.073, 0.099, 0.099, 0.095, 0.095,
       0.583, 1.845, 0.341, 0.267, 0.509, 0.537, 0.199, 0.273, 0.273,
       0.137, 0.223, 1.227, 1.227, 0.353, 0.233, 0.153, 0.153, 1.463,
       0.557, 0.197, 0.391, 0.391, 0.191, 0.191, 0.199, 0.199, 0.197,
       0.197, 0.123, 0.123, 0.319, 0.169, 0.091, 0.091, 0.095, 0.101,
       0.211, 0.211, 0.197, 0.197, 0.089, 0.089, 0.311, 0.311, 0.363,
       0.363, 0.483, 0.089, 0.089, 0.191, 0.427, 0.641, 0.301, 0.139,
       0.135, 0.475, 0.235, 0.143, 0.405, 0.175, 0.473, 0.213, 0.223,
       0.535, 0.089, 0.089, 0.527, 0.141, 0.495, 0.135, 0.211, 0.211,
       0.261, 1.793,

In [None]:
# Collect the decomposition energies into a table keyed by structure id and
# write it out as CSV.

# `finished` is displayed above as a plain array, which has no `.values`
# attribute; np.asarray accepts both arrays and pandas objects.
decomp_energies = np.asarray(finished)

# Take the ids from the dask dataframe itself so they cover only the rows that
# were actually submitted, in the same partition order as the results. The
# synchronous scheduler is used so this works even if the cluster is gone.
result_ids = df_dask.index.compute(scheduler='synchronous')
if len(result_ids) != len(decomp_energies):
    raise ValueError(
        f"got {len(decomp_energies)} decomposition energies "
        f"for {len(result_ids)} structures")

df_out = pd.DataFrame({'decomp_energy': decomp_energies}, index=result_ids)
# NOTE(review): the original referenced `predicted_energies_file`, which is
# never defined in this notebook. The energies used above come from
# `relaxed_energies_file`, so the output name is derived from it -- confirm
# this is the intended location.
out_file = relaxed_energies_file.replace('.csv', '_decomp_energy.csv')
print(out_file)
df_out.to_csv(out_file)

In [46]:
# NOTE(review): this rebinds `df2` to a plain copy of `df_rel`, replacing the
# id-indexed frame with the `structure` column that was built earlier. The
# execution count (In [46]) shows it was run out of order; cells that need the
# `structure` column must be run before this one.
df2 = df_rel.copy()

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
ERROR:asyncio:_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


In [32]:
# Shut down the dask client created by setup_dask_client once the results have
# been gathered.
dask_client.shutdown()

  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


In [33]:
# now repeat the process for the predicted values for the relaxed batteries
# NOTE(review): the path is absolute and user/cluster specific, and the generic
# name `df` is easy to shadow in later cells.
pred_energy_file = "/projects/rlmolecule/jlaw/crystal-gnn-fork/outputs/icsd_battery_relaxed/hypo_vsad5_icsd_vsad5_seed1/overall_battery_pred_err.csv"
df = pd.read_csv(pred_energy_file)
df.head(2)

Unnamed: 0,comp_type,composition,id,energyperatom,predicted_energyperatom,pred_err,pred_err-0_05
0,112,Mg2Cl1P1,Mg2Cl1P1_sg194_icsd_061381_1,-3.562473,-3.586825,0.024352,0.0
1,112,Mg2Cl1P1,Mg2Cl1P1_sg64_icsd_170268_1,-3.46386,-3.464951,0.001091,0.0


In [37]:
# Rows whose id starts with 'icsd'. The vectorized string accessor replaces the
# row-wise lambda; na=False treats missing ids as non-matching instead of
# failing on them.
df[df['id'].str.startswith('icsd', na=False)]

Unnamed: 0,comp_type,composition,id,energyperatom,predicted_energyperatom,pred_err,pred_err-0_05


In [38]:
# Number of rows in the prediction table.
len(df)

67840

In [40]:
# Peek at the first (index, row) tuple that iterrows() yields, if there is one.
row = next(df.iterrows(), None)
if row is not None:
    print(row)

(0, comp_type                                           112
composition                                    Mg2Cl1P1
id                         Mg2Cl1P1_sg194_icsd_061381_1
energyperatom                                 -3.562473
predicted_energyperatom                       -3.586825
pred_err                                       0.024352
pred_err-0_05                                       0.0
Name: 0, dtype: object)
