In [1]:
# modules for structure decoration
import pandas as pd
import os
import itertools
from tqdm.notebook import tqdm
import networkx as nx
import glob
from glob import iglob
from copy import deepcopy
from collections import defaultdict

from pymatgen.core import Composition, Structure
from pymatgen.analysis import local_env

In [2]:
# print(f"pymatgen version: {pymatgen.__version__}")
import pip
pip.main(["show","pymatgen"])

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Name: pymatgen
Version: 2022.0.8
Summary: Python Materials Genomics is a robust materials analysis code that defines core object representations for structures and molecules with support for many electronic structure codes. It is currently the core analysis code powering the Materials Project (https://www.materialsproject.org).
Home-page: http://www.pymatgen.org
Author: Pymatgen Development Team
Author-email: ongsp@eng.ucsd.edu
License: MIT
Location: /home/jlaw/.conda/envs/crystals/lib/python3.8/site-packages
Requires: numpy, plotly, requests, palettable, tabulate, networkx, matplotlib, pandas, sympy, uncertainties, scipy, spglib, monty, ruamel.yaml
Required-by: pyxtal


0

In [3]:
# want to maximize the volume around only the conducting ions
conducting_ions = set(['Li', 'Na', 'K', 'Mg', 'Zn'])
anions = set(['F', 'Cl', 'Br', 'I', 'O', 'S', 'N', 'P'])
framework_cations = set(['Sc', 'Y', 'La', 'Ti', 'Zr', 'Hf', 'W', 'Zn', 'Cd', 'Hg', 'B', 'Al', 'Si', 'Ge', 'Sn', 'P', 'Sb'])
elements = conducting_ions | anions | framework_cations
# sort by the length of the string, so that multiple letter elements come first
elements = sorted(elements, key=len, reverse=True)
print(elements)

['Ti', 'Hg', 'Br', 'La', 'Na', 'Cd', 'Sc', 'Li', 'Cl', 'Zr', 'Hf', 'Sn', 'Sb', 'Zn', 'Al', 'Ge', 'Mg', 'Si', 'S', 'Y', 'B', 'K', 'W', 'N', 'O', 'P', 'I', 'F']


In [5]:
G = nx.DiGraph()
G2 = nx.DiGraph()

G = nx.read_edgelist(
    "../../../rlmolecule/crystal/inputs/elements_to_compositions.edgelist.gz", delimiter='\t', data=False,
    create_using=G,
)
print(f'{G.number_of_nodes()} nodes, {G.number_of_edges()} edges')

G2 = nx.read_edgelist(
    "../../../rlmolecule/crystal/inputs/comp_type_to_decorations.edgelist.gz", delimiter='\t', data=False,
    create_using=G2,
)
print(f'{G2.number_of_nodes()} nodes, {G2.number_of_edges()} edges')

159870 nodes, 178521 edges
16388 nodes, 16316 edges


In [6]:
G.has_node("root")

True

In [8]:
# find all the compositions with only conducting ions and anions
compositions = [n for n in G.nodes() if G.out_degree(n) == 0]
compositions[:10]

['K3N1',
 'K4I1N1',
 'K5I2N1',
 'K6I3N1',
 'K7I1N2',
 'K1Sb1I3N1',
 'K2Sb1I4N1',
 'K1Sb3I1N5',
 'K3Sb1I2N2',
 'K3Sb2I1N4']

In [14]:
'Zn3N2' in compositions

True

In [15]:
'Zn3N2' in to_skip

True

In [17]:
# we want at least 1 anion and 1 framework cation
import sys
sys.path.append('../../../')
from rlmolecule.crystal.crystal_state import CrystalState
to_skip = set()
for c in compositions:
    eles = CrystalState.get_eles_from_comp(c)
    if len(framework_cations & set(eles)) == 0 or len(eles) == 2:
        to_skip.add(c)
to_skip = sorted(to_skip)
out = []
for i in range(0, len(to_skip)-8, 8):
    print("'" + "', '".join(to_skip[i:i+8]) + "', ")
print(to_skip[i+8:])
# print(to_skip)

'K1Br1', 'K1Cl1', 'K1F1', 'K1I1', 'K2Br1I1', 'K2Cl1Br1', 'K2Cl1I1', 'K2F1Br1', 
'K2F1Cl1', 'K2F1I1', 'K2O1', 'K2S1', 'K3Br1I2', 'K3Br1O1', 'K3Br1S1', 'K3Br2I1', 
'K3Cl1Br2', 'K3Cl1I2', 'K3Cl1O1', 'K3Cl1S1', 'K3Cl2Br1', 'K3Cl2I1', 'K3F1Br2', 'K3F1Cl2', 
'K3F1I2', 'K3F1O1', 'K3F1S1', 'K3F2Br1', 'K3F2Cl1', 'K3F2I1', 'K3I1O1', 'K3I1S1', 
'K3N1', 'K3P1', 'K4Br1I3', 'K4Br1N1', 'K4Br2O1', 'K4Br2S1', 'K4Br3I1', 'K4Cl1Br3', 
'K4Cl1I3', 'K4Cl1N1', 'K4Cl2O1', 'K4Cl2S1', 'K4Cl3Br1', 'K4Cl3I1', 'K4F1Br3', 'K4F1Cl3', 
'K4F1I3', 'K4F1N1', 'K4F2O1', 'K4F2S1', 'K4F3Br1', 'K4F3Cl1', 'K4F3I1', 'K4I1N1', 
'K4I2O1', 'K4I2S1', 'K4O1S1', 'K5Br1I4', 'K5Br1O2', 'K5Br1S2', 'K5Br2I3', 'K5Br2N1', 
'K5Br3I2', 'K5Br3O1', 'K5Br3S1', 'K5Br4I1', 'K5Cl1Br4', 'K5Cl1I4', 'K5Cl1O2', 'K5Cl1S2', 
'K5Cl2Br3', 'K5Cl2I3', 'K5Cl2N1', 'K5Cl3Br2', 'K5Cl3I2', 'K5Cl3O1', 'K5Cl3S1', 'K5Cl4Br1', 
'K5Cl4I1', 'K5F1Br4', 'K5F1Cl4', 'K5F1I4', 'K5F1O2', 'K5F1S2', 'K5F2Br3', 'K5F2Cl3', 
'K5F2I3', 'K5F2N1', 'K5F3Br2', 'K5F3Cl2', 'K5F3I2', '

In [18]:
to_keep = set(compositions) - set(to_skip)
print(len(to_keep))

138306


In [21]:
to_skip = set(to_skip)

In [20]:
vol_stats_file = "/home/jlaw/projects/arpa-e/crystals/rlmolecule/examples/crystal_volume/outputs/2021-07-16-all-decoration-vol-stats.tsv.gz"
df = pd.read_csv(vol_stats_file, sep='\t', names=["state", "cond_ion_vol", "total_vol", "fraction", "time_taken"])
df.head()

Unnamed: 0,state,cond_ion_vol,total_vol,fraction,time_taken
0,K1Cl1|_1_1|orthorhombic|POSCAR_sg38_icsd_183254|1,206.0859,388.1016,0.531,0.14
1,K1Cl1|_1_1|orthorhombic|POSCAR_sg38_icsd_183254|2,182.0157,388.1016,0.469,0.1395
2,K1Cl1|_1_1|orthorhombic|POSCAR_sg25_icsd_043951|1,15.6924,31.3847,0.5,0.0347
3,K1Cl1|_1_1|orthorhombic|POSCAR_sg25_icsd_043951|2,15.6924,31.3847,0.5,0.0347
4,K1Cl1|_1_1|orthorhombic|POSCAR_sg39_icsd_068701|1,259.5733,402.7998,0.6444,0.0541


In [22]:
states_to_keep = set([s for s in df['state'] if s.split('|')[0] not in to_skip])
print(f"{len(states_to_keep)} states_to_keep")

15443011 states_to_keep


In [23]:
df = df[df['state'].isin(states_to_keep)]
print(df['fraction'].max())

0.9349


In [46]:
df_top = df[df['fraction'] > 0.89]
print(len(df_top))
df_top.head()

94


Unnamed: 0,state,cond_ion_vol,total_vol,fraction,time_taken
500478,K2Sb1Cl7|_1_2_7|triclinic|POSCAR_sg1_icsd_1656...,608.8572,677.0664,0.8993,0.4144
507031,K2P1Cl7|_1_2_7|triclinic|POSCAR_sg1_icsd_165648|1,608.8572,677.0664,0.8993,0.4144
884558,K2P1F7|_1_2_7|triclinic|POSCAR_sg1_icsd_165648|1,608.8572,677.0664,0.8993,0.4144
1006822,K6N1P1|_1_1_6|orthorhombic|POSCAR_sg31_icsd_16...,119.4126,131.4577,0.9084,0.2785
1006823,K6N1P1|_1_1_6|orthorhombic|POSCAR_sg31_icsd_16...,119.4126,131.4577,0.9084,0.2785


In [40]:
print(df_top[['cond_ion_vol', 'total_vol', 'fraction']].values)

[[6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [1.1941260e+02 1.3145770e+02 9.0840000e-01]
 [1.1941260e+02 1.3145770e+02 9.0840000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [1.0667877e+03 1.1823781e+03 9.0220000e-01]
 [9.9618570e+02 1.1150266e+03 8.9340000e-01]
 [5.7318000e+02 6.1307190e+02 9.3490000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [6.0885720e+02 6.7706640e+02 8.9930000e-01]
 [1.066787

In [34]:
import numpy as np

In [36]:
icsd_structures = np.asarray([x.split('|')[-2] for x in df_top['state'].values])
for s in icsd_structures:
    print(s)

POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg31_icsd_161309
POSCAR_sg31_icsd_161309
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg57_icsd_022232
POSCAR_sg63_icsd_062029
POSCAR_sg113_icsd_063311
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg57_icsd_022232
POSCAR_sg63_icsd_062029
POSCAR_sg113_icsd_063311
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg57_icsd_022232
POSCAR_sg63_icsd_062029
POSCAR_sg113_icsd_063311
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg1_icsd_165648
POSCAR_sg57_icsd_022232
POSCAR_sg63_icsd_062029
POSCAR_sg113_icsd_063311
POSCAR_sg1_icsd_

In [43]:
os.chdir('../')
sys.path.append('../../')
from examples.crystal_volume import optimize_crystal_volume as ocv
from rlmolecule.crystal.crystal_state import CrystalState
from tqdm.notebook import tqdm

reading ../../rlmolecule/crystal/inputs/icsd_prototypes.json.gz
	4170 structures read


In [44]:
def write_decorated_structure(decorated_str, out_dir):
    # Now create the decoration of this composition onto this prototype structure
    # the 'action_node' string has the following format at this point:
    # comp_type|prototype_structure|decoration_idx
    # we just need 'comp_type|prototype_structure' to get the icsd structure
    composition = decorated_str.split('|')[0]
    structure_key = '|'.join(decorated_str.split('|')[1:-1])
    icsd_prototype = ocv.structures[structure_key]
    decoration_idx = int(decorated_str.split('|')[-1]) - 1
    print(decorated_str, composition, structure_key)
    try:
        decorated_structure, comp = CrystalState.decorate_prototype_structure(
            icsd_prototype, composition, decoration_idx=decoration_idx)
        #decorations[descriptor] = decorated_structure.as_dict()
    except AssertionError as e:
        print(f"AssertionError: {e}")
        return
        #volume_stats[descriptor] = (-1, -1, 0, comp_type)
        #return 0.0, {'terminal': True, 'state_repr': repr(state)}

    # Compute the volume of the conducting ions.
    conducting_ion_vol, total_vol = ocv.compute_structure_vol(decorated_structure)
    frac_conducting_ion_vol = conducting_ion_vol / total_vol if total_vol != 0 else 0
#     print(conducting_ion_vol, total_vol, frac_conducting_ion_vol)
    out_file = f"{out_dir}/POSCAR_{decorated_str.replace('|','-')}"
    print(f"writing {out_file}")
    decorated_structure.to(filename=out_file)
    
    # also write the original poscar file for reference
    out_file2 = f"{out_dir}/icsd_structures/{decorated_str.split('|')[-2]}"
    os.makedirs(os.path.dirname(out_file2), exist_ok=True)
    if not os.path.isfile(out_file2):
        icsd_prototype.to(filename=out_file2)
        
    return [conducting_ion_vol, total_vol, frac_conducting_ion_vol]

In [48]:
# now write each of these to a file
out_dir = f"outputs/all_cvol-0_89"
os.makedirs(out_dir, exist_ok=True)
volume_stats = {}
for decorated_str in tqdm(df_top['state']):
    vol_stats = write_decorated_structure(decorated_str, out_dir)
    volume_stats[decorated_str] = [decorated_str.split('|')[-2]] + vol_stats

  0%|          | 0/94 [00:00<?, ?it/s]

K2Sb1Cl7|_1_2_7|triclinic|POSCAR_sg1_icsd_165648|1 K2Sb1Cl7 _1_2_7|triclinic|POSCAR_sg1_icsd_165648
writing outputs/all_cvol-0_89/POSCAR_K2Sb1Cl7-_1_2_7-triclinic-POSCAR_sg1_icsd_165648-1
K2P1Cl7|_1_2_7|triclinic|POSCAR_sg1_icsd_165648|1 K2P1Cl7 _1_2_7|triclinic|POSCAR_sg1_icsd_165648
writing outputs/all_cvol-0_89/POSCAR_K2P1Cl7-_1_2_7-triclinic-POSCAR_sg1_icsd_165648-1
K2P1F7|_1_2_7|triclinic|POSCAR_sg1_icsd_165648|1 K2P1F7 _1_2_7|triclinic|POSCAR_sg1_icsd_165648
writing outputs/all_cvol-0_89/POSCAR_K2P1F7-_1_2_7-triclinic-POSCAR_sg1_icsd_165648-1
K6N1P1|_1_1_6|orthorhombic|POSCAR_sg31_icsd_161309|1 K6N1P1 _1_1_6|orthorhombic|POSCAR_sg31_icsd_161309
writing outputs/all_cvol-0_89/POSCAR_K6N1P1-_1_1_6-orthorhombic-POSCAR_sg31_icsd_161309-1
K6N1P1|_1_1_6|orthorhombic|POSCAR_sg31_icsd_161309|2 K6N1P1 _1_1_6|orthorhombic|POSCAR_sg31_icsd_161309
writing outputs/all_cvol-0_89/POSCAR_K6N1P1-_1_1_6-orthorhombic-POSCAR_sg31_icsd_161309-2
K2P1I7|_1_2_7|triclinic|POSCAR_sg1_icsd_165648|1 K2P1I7 _