In [1]:
import os
import pandas as pd
import numpy as np
from data import get_dichalcogenides_innopolis_202105

In [2]:
from pymatgen.core.sites import PeriodicSite
from pymatgen.core import Structure
from pymatgen.core.periodic_table import DummySpecies
from pymatgen.io.cif import CifParser

In [3]:
# Load the structures table through the project helper. Later cells read the
# `descriptor_id` and `initial_structure` columns of this frame.
structures = get_dichalcogenides_innopolis_202105()

  0%|          | 0/3480 [00:00<?, ?it/s]


Issues encountered while parsing CIF: Some fractional co-ordinates rounded to ideal values to avoid issues with finite precision.



In [4]:
# TODO(inner perfectionist) eval is insecure: it executes whatever expression is
# stored in the "cell" and "defects" columns of the CSV. If those columns only
# ever hold plain Python literals, ast.literal_eval would be a safer converter
# -- confirm against the data before switching.
defects = pd.read_csv(
  "datasets/dichalcogenides_innopolis_202105/descriptors.csv", index_col="_id",
  converters={"cell": eval, "defects": eval})

In [5]:
# Distinct base materials referenced by the descriptors table.
materials = defects["base"].unique()

In [6]:
# Parse one primitive cell per base material from its CIF file, keeping the
# first structure the parser returns.
unit_cells = {}
for material in materials:
  cif_path = os.path.join("defects_generation/molecules", f"{material}.cif")
  parsed_structures = CifParser(cif_path).get_structures(primitive=True)
  unit_cells[material] = parsed_structures[0]

In [7]:
def get_frac_coords_set(structure):
  """Return the fractional coordinates of `structure` as a set of tuples.

  Coordinates are rounded to 3 decimals so that they can be compared exactly,
  then wrapped into the [0, 1) interval. Fractional coordinates that differ by
  an integer describe the same periodic site, so without the wrap a value such
  as 0.9999 (rounded to 1.0) would not match a reference site at 0.0.

  Args:
    structure: object exposing `frac_coords`, an (n_sites, 3) array-like.

  Returns:
    A set with one 3-tuple of rounded, wrapped coordinates per distinct site.
  """
  decimals = 3
  rounded = np.round(structure.frac_coords, decimals)
  # The modulo can introduce float noise (e.g. -0.001 % 1.0 is not bit-identical
  # to 0.999), so round once more to land on the canonical representation.
  wrapped = np.round(np.mod(rounded, 1.0), decimals)
  return set(map(tuple, wrapped))

In [8]:
def get_defects(structure, unit_cell, supercell):
  """Build a structure that holds only the defect sites of `structure`.

  A pristine reference is obtained by copying `unit_cell` and expanding the
  copy with `make_supercell(supercell)`. The result contains:

  * every site of `structure` whose species does not occur in `unit_cell`;
  * one hydrogen site (`species=1`) for every reference position, as produced
    by `get_frac_coords_set`, that is missing from `structure`.

  Returns:
    A `Structure` on the lattice of `structure` made of the sites above.
  """
  pristine = unit_cell.copy()
  pristine.make_supercell(supercell)
  missing_coords = get_frac_coords_set(pristine) - get_frac_coords_set(structure)
  host_species = set(unit_cell.species)

  # Sites occupied by a species foreign to the pristine cell.
  defect_sites = [site for site in structure if site.specie not in host_species]
  # TODO(kazeevn) proper vacancies handling downstream, don't conscript hydrogen
  # (e.g. use species=DummySpecies() instead of species=1).
  defect_sites += [
    PeriodicSite(species=1, coords=coords, coords_are_cartesian=False,
                 lattice=structure.lattice)
    for coords in missing_coords]

  return Structure(lattice=structure.lattice,
                   species=[site.specie for site in defect_sites],
                   coords=[site.frac_coords for site in defect_sites],
                   coords_are_cartesian=False)

In [9]:
def get_defecs_from_row(row):
  """Compute the defect-only structure for one row of `structures`.

  Looks up the descriptor identified by `row.descriptor_id` in the `defects`
  table, then calls `get_defects` with the row's initial structure, the unit
  cell stored for the descriptor's base material and the descriptor's cell.
  """
  descriptor = defects.loc[row.descriptor_id]
  base_unit_cell = unit_cells[descriptor["base"]]
  return get_defects(row.initial_structure, base_unit_cell, descriptor["cell"])

In [10]:
# Add a column holding, for every row, the defect-only structure returned by
# get_defecs_from_row.
structures["defect_representation"] = structures.apply(get_defecs_from_row, axis=1)

In [11]:
# Test: every extracted defect structure must contain exactly as many sites as
# there are entries in the "defects" field of the row's descriptor.
assert structures.apply(
  lambda row: len(row.defect_representation) == len(defects.loc[row.descriptor_id, "defects"]), 
  axis=1).all()

In [12]:
# Persist the augmented table, including the defect_representation column.
# NOTE(review): pandas infers compression from the file suffix, and ".gzip"
# does not appear to be one of the recognised suffixes (e.g. ".gz"), so this
# file is probably written uncompressed despite its name -- confirm, and keep
# readers of this path in sync if the suffix or compression is ever changed.
structures.to_pickle("datasets/structures_defects.pickle.gzip")

In [13]:
from ase.visualize import view
from pymatgen.io.ase import AseAtomsAdaptor

In [14]:
# Show the initial structure of one example row (position 1221) in the NGL viewer.
structure_to_plot = structures.iloc[1221]
initial_atoms = AseAtomsAdaptor().get_atoms(structure_to_plot.initial_structure)
view(initial_atoms, viewer='ngl')



HBox(children=(NGLWidget(), VBox(children=(Dropdown(description='Show', options=('All', 'S', 'Mo'), value='All…

In [15]:
# Show the defect-only representation of the same example row in the NGL viewer.
structure_to_plot = structures.iloc[1221]
defect_atoms = AseAtomsAdaptor().get_atoms(structure_to_plot.defect_representation)
view(defect_atoms, viewer='ngl')

HBox(children=(NGLWidget(), VBox(children=(Dropdown(description='Show', options=('All', 'H'), value='All'), Dr…