In [56]:
# modules for structure decoration
import pandas as pd
import os
import itertools
from tqdm.notebook import tqdm
import networkx as nx
import glob
from glob import iglob
from copy import deepcopy
from collections import defaultdict

from pymatgen.core import Composition, Structure
from pymatgen.analysis import local_env

In [2]:
# print(f"pymatgen version: {pymatgen.__version__}")
import pip
pip.main(["show","pymatgen"])

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Name: pymatgen
Version: 2022.0.8
Summary: Python Materials Genomics is a robust materials analysis code that defines core object representations for structures and molecules with support for many electronic structure codes. It is currently the core analysis code powering the Materials Project (https://www.materialsproject.org).
Home-page: http://www.pymatgen.org
Author: Pymatgen Development Team
Author-email: ongsp@eng.ucsd.edu
License: MIT
Location: /home/jlaw/.conda/envs/crystals/lib/python3.8/site-packages
Requires: scipy, monty, networkx, matplotlib, sympy, plotly, tabulate, numpy, spglib, palettable, requests, uncertainties, pandas, ruamel.yaml
Required-by: 


0

## Find the crystal structure with the maximum volume around the conducting ions

Action space:
1. Choose the elements desired for the battery material i.e., conducting ion, framework cation(s), and anion(s)
2. For a given combination of elements, randomly select a composition from one of the valence-balanced compounds available as a lookup table (see 3.1).
3. For the selected composition type, a number of prototype structures are available, which will be classified by their crystal system (cubic, hexagonal, ...). Choose a crystal system randomly.
4. For a chosen crystal system, consider all the prototypes and construct hypothetical decorated structures.
5. Compute the volume around the conducting ions

### Compositions
Elements commonly found in battery materials:
- Conducting ion (C): Li+, Na+, K+, Mg2+, Zn2+
- Anion (A): F-, Cl-, Br-, I-, O2-, S2-, N3-, P3-
- Framework cation (F): Sc3+, Y3+, La3+, Ti4+, Zr4+, Hf4+, W6+, Zn2+, Cd2+, Hg2+, B3+, Al3+, Si4+, Ge4+, Sn4+, P5+, Sb5+

Hypothetical compositions using combinations of C, F, and A are of the following forms:
1. Cx Az
2. Cx A1z1 A2z2
3. Cx Fy Az
4. Cx Fy A1z1 A2z2
5. Cx F1y1 F2y2 Az
6. Cx F1y1 F2y2 A1z1 A2z2 

The following constraints are employed in generating the compositions:
1. A composition may contain only one C ion, up to two (0-2) F ions and at least one and up to two (1-2) A ions.
2. The sum of stoichiometric coefficients of the ions is less than or equal to ten, i.e., x + y1 + y2 + z1 + z2 ≤ 10 .
3. The generated compositions are valence-balanced, i.e., stoichiometric sum of oxidation states of ions equals to 0.

### Build Action space graph
We are going to build a networkx graph of all the possible actions, split into two parts. Actions 1-2, and actions 3-5.

In [8]:
G = nx.DiGraph()

In [3]:
# want to maximize the volume around only the conducting ions
conducting_ions = set(['Li', 'Na', 'K', 'Mg', 'Zn'])
anions = set(['F', 'Cl', 'Br', 'I', 'O', 'S', 'N', 'P'])
framework_cations = set(['Sc', 'Y', 'La', 'Ti', 'Zr', 'Hf', 'W', 'Zn', 'Cd', 'Hg', 'B', 'Al', 'Si', 'Ge', 'Sn', 'P', 'Sb'])
elements = conducting_ions | anions | framework_cations
# sort by the length of the string, so that multiple letter elements come first
elements = sorted(elements, key=len, reverse=True)
print(elements)

['Cl', 'Al', 'Zn', 'Ge', 'Br', 'La', 'Cd', 'Hg', 'Si', 'Sb', 'Hf', 'Sc', 'Li', 'Mg', 'Ti', 'Zr', 'Na', 'Sn', 'K', 'F', 'N', 'S', 'I', 'W', 'P', 'Y', 'O', 'B']


In [4]:
anions & framework_cations

{'P'}

In [5]:
conducting_ions & framework_cations

{'Zn'}

In [62]:
# first, choose the elements for the battery material. Put all possible combinations as the actions
def build_element_combinations():
    element_combinations = set()
    # 1. Cx Az
    for c in conducting_ions:
        for a in anions:
            element_combinations.add((c, a))

    # 2. Cx A1z1 A2z2
        for a1, a2 in itertools.combinations(anions, 2):
            element_combinations.add((c, a1, a2))

    # 3. Cx Fy Az
        for f in framework_cations:
            for a in anions:
                element_combinations.add((c, f, a))
                
    # 4. Cx Fy A1z1 A2z2
        for f in framework_cations:
            for a1, a2 in itertools.combinations(anions, 2):
                element_combinations.add((c, f, a1, a2))

    # 5. Cx F1y1 F2y2 Az
        for f1, f2 in itertools.combinations(framework_cations, 2):
            for a in anions:
                element_combinations.add((c, f1, f2, a))

    # 6. Cx F1y1 F2y2 A1z1 A2z2 
        for f1, f2 in itertools.combinations(framework_cations, 2):
            for a1, a2 in itertools.combinations(anions, 2):
                element_combinations.add((c, f1, f2, a1, a2))
                
    # 'P' can be either in A or F, and Zinc can be either in C or F
    # Make sure there are no duplicates here
    element_combinations = set(tuple(set(e_c)) for e_c in element_combinations)
    
    return element_combinations

element_combinations = build_element_combinations()
print(len(element_combinations))

25939


In [14]:
# G2 = nx.DiGraph()
nx.add_path(G2, [0,1,2,4])
G2.edges()

OutEdgeView([(0, 1), (1, 2), (2, 3), (2, 4)])

For the reinforcement learning to better distinguish between the combination of elements, adding an element will be a specific action

For example:
1. Choose a conducting ion
2. Choose an anion
3. Possibly add a framework cation
4. Possibly add another anion
5. Possibly add another framework cation

In [28]:
def build_element_combination_actions(G):
    """
    *G*: networkx DiGraph of crystal structure actions
    """    

    for c in conducting_ions:
    # 1. Cx Az
        for a1 in anions - {c}:
            # since the ordering of the elements doesn't matter, 
            # only store the sorted version of the elements
            c_a1 = tuple(sorted(set((c, a1))))
            # add edge from c to (c a1)
            G.add_edge(c, c_a1)
                
    # 2. Cx A1z1 A2z2
            for a2 in anions - set(c_a1):
                c_a1_a2 = tuple(sorted(set((c, a1, a2))))
                G.add_edge(c_a1, c_a1_a2)

    # 4. Cx Fy A1z1 A2z2
                for f1 in framework_cations - set(c_a1_a2):
                    c_f1_a1_a2 = tuple(sorted(set((c, f1, a1, a2))))
                    G.add_edge(c_a1_a2, c_f1_a1_a2)

    # 6. Cx F1y1 F2y2 A1z1 A2z2 
                    for f2 in framework_cations - set(c_f1_a1_a2):
                        c_f1_f2_a1_a2 = tuple(sorted(set((c, f1, f2, a1, a2))))
                        G.add_edge(c_f1_a1_a2, c_f1_f2_a1_a2)

    # 3. Cx Fy Az                
            for f1 in framework_cations - set(c_a1):
                c_f1_a1 = tuple(sorted(set((c, f1, a1))))
                G.add_edge(c_a1, c_f1_a1)

    # 5. Cx F1y1 F2y2 Az
                for f2 in framework_cations - set(c_f1_a1):
                    c_f1_f2_a1 = tuple(sorted(set((c, f1, f2, a1))))
                    G.add_edge(c_f1_a1, c_f1_f2_a1)

G = nx.DiGraph()
build_element_combination_actions(G)
print(f'{G.number_of_nodes()} nodes, {G.number_of_edges()} edges')

25853 nodes, 49260 edges


In [29]:
working_dir = "/projects/rlmolecule/shubham/file_transfer/decorations/jupyter_demo"

In [30]:
# read-in the dataframe containing all compositions to decorate
df_comp = pd.read_csv(f'{working_dir}/compositions.csv')
df_comp

Unnamed: 0,composition,stoichiometry,comp_type
0,Li1Sc1F4,114,_1_1_4
1,Li1Sc1Cl4,114,_1_1_4
2,Li1Sc1Br4,114,_1_1_4
3,Li1Sc1I4,114,_1_1_4
4,Li1Sc1O2,112,_1_1_2
...,...,...,...
138771,Zn5O2P2,522,_2_2_5
138772,Zn5S2N2,522,_2_2_5
138773,Zn5S2P2,522,_2_2_5
138774,Zn6N1P3,613,_1_3_6


In [70]:
compositions = df_comp['composition'].to_list()
comp_types = set(df_comp['comp_type'].to_list())
comp_to_comp_type = dict(zip(df_comp['composition'], df_comp['comp_type']))

In [32]:
len(str(df_comp['stoichiometry'][0]))

3

In [33]:
# strip the compositions to just the atoms
comp_elements = defaultdict(set)
for c in compositions:
    orig_c = c
    ele_in_comp = []
    # these elements are sorted such that the double letter elements come after single letter ones
    for e in elements:
        if e in c:
            ele_in_comp.append(e)
            # make sure a single letter element doesn't also match (e.g., Si vs S)
            c = c.replace(e,'')
    comp_elements[tuple(sorted(ele_in_comp))].add(orig_c)
#     print(ele_in_comp)
#     print(orig_c)
print(len(comp_elements))

21088


In [37]:
# not all of the generated compositions are valence-balanced, i.e., stoichiometric sum of oxidation states of ions equals 0.
# Shubham already computed which combinations are valid, so limit those here
ele_combo_with_comp = set()
ele_combo_without_comp = set()
for ele_combo in G.nodes():
    if ele_combo in comp_elements:
        ele_combo_with_comp.add(ele_combo)
    else:
        ele_combo_without_comp.add(ele_combo)
            
print(f"{len(ele_combo_with_comp)} out of {len(G.nodes())} have a corresponding composition")
# delete the non-valid element combinations from the graph
G.remove_nodes_from(ele_combo_without_comp)
print(f"{len(G.nodes())} nodes remaining")

21088 out of 25853 have a corresponding composition
21088 nodes remaining


In [38]:
# Step 2: For a given combination of elements, randomly select a composition from one of the valence-balanced compounds available as a lookup table
# As a sanity check, see how many compositions there are for a given combination of elements (top 20):
print(sorted([len(vals) for vals in comp_elements.values()], reverse=True)[:20])

[29, 29, 29, 29, 29, 29, 29, 29, 29, 22, 22, 22, 22, 18, 18, 18, 18, 18, 18, 18]


In [39]:
for eles, comps in comp_elements.items():
    break
print(eles, comps)

('F', 'Li', 'Sc') {'Li3Sc1F6', 'Li1Sc1F4', 'Li2Sc1F5', 'Li1Sc2F7'}


In [41]:
# add the edges from the element combinations to the compositions
for ele_combo, comps in comp_elements.items():
    for comp in comps:
        G.add_edge(ele_combo, comp)
    
print(f'{G.number_of_nodes()} nodes, {G.number_of_edges()} edges')

159864 nodes, 178476 edges


In [50]:
# write this network as the action tree
out_file = "inputs/elements_to_compositions.edgelist"
print(f"writing graph to {out_file}")
nx.write_edgelist(G, out_file, delimiter='\t', data=False)

writing graph to inputs/elements_to_compositions.edgelist


In [53]:
# lets select the first one as an example:
curr_comp = next(iter(comps))
print(comp)

# get the comp_type for this comp
curr_comp_type = comp_to_comp_type[comp]
print(curr_comp_type)

Zn5S2N2
_2_2_5


### Step 3: 
For the selected composition type, a number of prototype structures are available, which will be classified by their crystal system (e.g., cubic, hexagonal, ...). Select a crystal system.


The crystal systems are determined by "spacegroup numbers", which represents symmetry of a structure. There are 230 spacegroups (1-230).

Following is the classification of the 7 crystal systems by spacegroups (sg):
1. Triclinic: sg 1-2
2. Monoclinic: sg 3-15
3. Orthorhombic: sg 16-74
4. Tetragonal: 75-142
5. Trigonal: sg 143-167
6. Hexagonal: 168-194
7. Cubic: sg 195-230.

The spacegroup of a prototype structure is present in the structure id: sg33, sg225 etc.
For example, spacegroup of prototype structure "POSCAR_sg33_icsd_065132" is 33.


In [54]:
crystal_systems = {'triclinic': set([1,2]),
                   'monoclinic': set(range(3,16)),
                   'orthorhombic': set(range(16,75)),
                   'tetragonal': set(range(75,143)),
                   'trigonal': set(range(143,168)),
                   'hexagonal': set(range(168,195)),  
                   'cubic': set(range(195,231)),
                  }
sg_num_to_crystal_sys = {n: crystal_sys for crystal_sys, nums in crystal_systems.items() for n in nums}

In [57]:
# how do I get the prototype structure?
# Check how many prototype structures there are for each 
prototype_folder = '/projects/rlmolecule/shubham/icsd_prototypes_poscars/DB/'
comp_prototypes = defaultdict(set)
for comp_type in comp_types:
    for poscar_file in glob.glob(f"{prototype_folder}/{comp_type}/*"):
        file_name = os.path.basename(poscar_file)
        comp_prototypes[comp_type].add(file_name)
        
print(list(comp_prototypes.items())[:2])

[('_1_3_6', {'POSCAR_sg167_icsd_423672', 'POSCAR_sg205_icsd_065968', 'POSCAR_sg203_icsd_019068', 'POSCAR_sg2_icsd_411500', 'POSCAR_sg162_icsd_062027', 'POSCAR_sg63_icsd_030712', 'POSCAR_sg182_icsd_632802', 'POSCAR_sg87_icsd_262076', 'POSCAR_sg15_icsd_409384', 'POSCAR_sg2_icsd_300275', 'POSCAR_sg14_icsd_152889', 'POSCAR_sg12_icsd_093226', 'POSCAR_sg163_icsd_154092', 'POSCAR_sg206_icsd_098672', 'POSCAR_sg88_icsd_260574', 'POSCAR_sg62_icsd_040575', 'POSCAR_sg148_icsd_026312', 'POSCAR_sg63_icsd_419133', 'POSCAR_sg1_icsd_173746', 'POSCAR_sg58_icsd_001106', 'POSCAR_sg146_icsd_240377', 'POSCAR_sg61_icsd_402279', 'POSCAR_sg14_icsd_096478', 'POSCAR_sg71_icsd_074211', 'POSCAR_sg71_icsd_089477', 'POSCAR_sg14_icsd_027070', 'POSCAR_sg9_icsd_247337', 'POSCAR_sg84_icsd_421317', 'POSCAR_sg163_icsd_062035', 'POSCAR_sg144_icsd_096753', 'POSCAR_sg163_icsd_154094', 'POSCAR_sg36_icsd_635286', 'POSCAR_sg2_icsd_063582', 'POSCAR_sg2_icsd_421175', 'POSCAR_sg152_icsd_024147', 'POSCAR_sg206_icsd_026990', 'POSCAR

In [58]:
# head and tail of number of prototypes available to choose from per composition type
print(sorted([len(vals) for vals in comp_prototypes.values()], reverse=True)[:20])
print(sorted([len(vals) for vals in comp_prototypes.values()])[:20])

[418, 218, 211, 204, 194, 172, 161, 153, 138, 130, 122, 110, 107, 93, 84, 83, 80, 68, 67, 66]
[1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 12, 15, 16]


In [59]:
# total number of prototypes for these compositions
print(sum([len(vals) for vals in comp_prototypes.values()]))

# total number of unique prototypes
prototype_files = set(p for vals in comp_prototypes.values() for p in vals)
print(f'{len(prototype_files)} prototype files')

4170
4170 prototype files


In [60]:
# map the poscar files to their crystal system
prototype_to_crystal_sys = {}
for poscar_file in prototype_files:
    sg_num = poscar_file.split('_')[1].replace('sg','')
    system = sg_num_to_crystal_sys[int(sg_num)]
    prototype_to_crystal_sys[poscar_file] = system
    
crystal_sys_prototypes = defaultdict(set)
for p, c in prototype_to_crystal_sys.items():
    crystal_sys_prototypes[c].add(p)

for c, p in crystal_sys_prototypes.items():
    print(c, len(p))

cubic 270
monoclinic 820
orthorhombic 1421
tetragonal 741
trigonal 414
hexagonal 351
triclinic 153


In [61]:
print(list(prototype_to_crystal_sys.items())[:3])

[('POSCAR_sg199_icsd_033656', 'cubic'), ('POSCAR_sg14_icsd_262584', 'monoclinic'), ('POSCAR_sg14_icsd_107927', 'monoclinic')]


In [62]:
# how many crystal systems are available for each comp_type?
comp_type_to_crystal_sys = defaultdict(set)
for comp_type, prototypes in comp_prototypes.items():
    for p in prototypes:
        comp_type_to_crystal_sys[comp_type].add(prototype_to_crystal_sys[p])
print(list(comp_type_to_crystal_sys.items())[:3])
    
print(f"histogram of number of crystal structures per comp_type (out of {len(comp_type_to_crystal_sys)} total):")
for i in range(1,8):
    num_matching = len([x for x in comp_type_to_crystal_sys.values() if len(x) == i])
    print(f"{num_matching} comp_types have {i} crystal_systems")

[('_1_3_6', {'orthorhombic', 'hexagonal', 'triclinic', 'trigonal', 'cubic', 'monoclinic', 'tetragonal'}), ('_1_1_1_5', {'orthorhombic', 'hexagonal', 'triclinic', 'trigonal', 'monoclinic', 'tetragonal'}), ('_1_2_2_2', {'orthorhombic', 'hexagonal', 'triclinic', 'trigonal', 'monoclinic', 'tetragonal'})]
histogram of number of crystal structures per comp_type (out of 72 total):
2 comp_types have 1 crystal_systems
1 comp_types have 2 crystal_systems
5 comp_types have 3 crystal_systems
13 comp_types have 4 crystal_systems
9 comp_types have 5 crystal_systems
11 comp_types have 6 crystal_systems
31 comp_types have 7 crystal_systems


In [75]:
G2 = G.copy()

In [76]:
# add the crystal system to the graph
for comp in compositions:
    comp_type = comp_to_comp_type[comp]
    crystal_systems = comp_type_to_crystal_sys[comp_type]
    for crystal_sys in crystal_systems:
        G2.add_edge(comp, comp + '|' + crystal_sys)

print(f'{G2.number_of_nodes()} nodes, {G2.number_of_edges()} edges')

805362 nodes, 823974 edges


In [73]:
# choose a crystal system
print(curr_comp_type, comp_type_to_crystal_sys[curr_comp_type])
curr_crystal_sys = next(iter(comp_type_to_crystal_sys[curr_comp_type]))
print(curr_crystal_sys)

_2_2_5 {'orthorhombic', 'hexagonal', 'triclinic', 'trigonal', 'monoclinic', 'tetragonal'}
orthorhombic


In [74]:
# now choose a prototype structure
# for comp_type, prototypes in comp_prototypes.items():
avail_prototypes = set()
for p in comp_prototypes[curr_comp_type]:
    crystal_sys = prototype_to_crystal_sys[p]
    if crystal_sys == curr_crystal_sys:
        avail_prototypes.add(p)
print(f'{len(avail_prototypes)} prototypes are {curr_crystal_sys} out of {len(comp_prototypes[curr_comp_type])} possible for {curr_comp_type}:')
print(avail_prototypes)
curr_prototype = next(iter(avail_prototypes))
print(curr_prototype)

22 prototypes are orthorhombic out of 51 possible for _2_2_5:
{'POSCAR_sg33_icsd_037173', 'POSCAR_sg58_icsd_090184', 'POSCAR_sg63_icsd_401723', 'POSCAR_sg61_icsd_067185', 'POSCAR_sg63_icsd_150370', 'POSCAR_sg60_icsd_069300', 'POSCAR_sg62_icsd_648455', 'POSCAR_sg65_icsd_023084', 'POSCAR_sg63_icsd_280533', 'POSCAR_sg62_icsd_161513', 'POSCAR_sg52_icsd_018301', 'POSCAR_sg56_icsd_026452', 'POSCAR_sg55_icsd_180494', 'POSCAR_sg63_icsd_025382', 'POSCAR_sg64_icsd_098943', 'POSCAR_sg36_icsd_001478', 'POSCAR_sg29_icsd_037334', 'POSCAR_sg55_icsd_417810', 'POSCAR_sg46_icsd_051319', 'POSCAR_sg55_icsd_414344', 'POSCAR_sg62_icsd_401724', 'POSCAR_sg37_icsd_280481'}
POSCAR_sg33_icsd_037173


In [88]:
# 4. For a chosen crystal system, consider all the prototypes and construct hypothetical decorated structures.
structures = {}
for comp in tqdm(compositions):
    comp_type = comp_to_comp_type[comp]
    crystal_systems = comp_type_to_crystal_sys[comp_type]
    
    composition = Composition(comp)
    # create permutations of order of elements within a composition
    comp_permutations = list(itertools.permutations(composition.formula.split(' ')))

    # second loop over all prototypes for a given composition type
    for proto in comp_prototypes[comp_type]:
        crystal_sys = prototype_to_crystal_sys[proto]
#         G2.add_edge(comp, comp + '|' + crystal_sys)
#         G2.add_edge(comp + '|' + crystal_sys, comp + '|' + proto)
        
        if proto in structures:
            strc = structures[proto]
        else:
            fname = proto.replace('POSCAR_','')
            strc_file = f"{prototype_folder}/{comp_to_comp_type[comp]}/{proto}"
            strc = Structure.from_file(strc_file, primitive=False)
            structures[proto] = strc
        comp_proto = Composition(strc.formula).reduced_composition
        proto_stoic = [int(p) for p in comp_proto.formula if p.isdigit()]
        
        c = 0
        # third loop over all permutations of order of elements for a composition, created in the first loop
        for permu in comp_permutations:
#             strc_subs = deepcopy(strc)
            separator = ' '
            comp_permu = separator.join(list(permu))
            comp_stoic = [int(p) for p in comp_permu if p.isdigit()]

            # check if stoichiometric coefficients of the composition to decorate match with those of prototype structure
            if comp_stoic == proto_stoic:
                c += 1
                # TODO rather than create a new structures file for every possible decoration, 
                # store them all in a single json file 
#                 # define lists of elements for elemental substitution
#                 original_ele = ''.join(i for i in comp_proto.formula if not i.isdigit()).split(' ')
#                 replacement_ele = ''.join(i for i in comp_permu if not i.isdigit()).split(' ')

#                 # dictionary containing original elements as keys and new elements as values
#                 replacement = {original_ele[i]: replacement_ele[i] for i in range(len(original_ele))}
                
#                 # 'replace_species' function from pymatgen to replace original elements with new elements
#                 strc_subs.replace_species(replacement)
#                 new_strc_file = 'POSCAR' + '_' + comp + '_' + fname + '_' + str(c)
# #                 strc_subs.to(filename= dir + '/' + )
                G2.add_edge(comp + '|' + proto, comp + '|' + proto + '|' + str(c))
#                 print(new_strc_file)
        
print(f'{G2.number_of_nodes()} nodes, {G2.number_of_edges()} edges')

  0%|          | 0/138776 [00:00<?, ?it/s]

19081381 nodes, 19099993 edges


In [89]:
# write this network as the action tree
out_file = "inputs/all_actions.edgelist"
print(f"writing graph to {out_file}")
nx.write_edgelist(G2, out_file, delimiter='\t', data=False)

writing graph to inputs/all_actions.edgelist


In [None]:
# write this network as the action tree
out_file = "inputs/compositions_to_decorations.edgelist"
print(f"writing graph to {out_file}")
nx.write_edgelist(G2, out_file, delimiter='\t', data=False)