In [1]:
import os
import pandas as pd
import numpy as np
from data import get_dichalcogenides_innopolis_202105

In [2]:
from pymatgen.core.sites import PeriodicSite
from pymatgen.core import Structure
from pymatgen.core.periodic_table import DummySpecies, Element
from pymatgen.io.cif import CifParser

In [3]:
# Load the dataset through the project helper imported from `data`; the progress
# bar and the CIF-parsing warning shown in this cell's output appear while it runs.
# Later cells use the result like a pandas DataFrame (`apply(axis=1)`, `join`,
# `iloc`, `to_pickle`) and read its `initial_structure`, `descriptor_id` and
# `energy` fields — presumably one row per structure; confirm in `data`.
structures = get_dichalcogenides_innopolis_202105()

  0%|          | 0/3480 [00:00<?, ?it/s]


Issues encountered while parsing CIF: Some fractional co-ordinates rounded to ideal values to avoid issues with finite precision.



In [4]:
# Defect descriptors indexed by `_id`. The "cell" and "defects" columns are stored
# as Python expressions in the CSV and are turned back into objects on load.
# SECURITY: `eval` executes whatever code the CSV cells contain, so only load
# files you trust.
# TODO(inner perfectionist) eval is insecure; switch to a literal-only parser
# once the cell contents are confirmed to be plain literals.
defects = pd.read_csv(
  filepath_or_buffer="datasets/dichalcogenides_innopolis_202105/descriptors.csv",
  converters=dict.fromkeys(("cell", "defects"), eval),
  index_col="_id",
)

In [5]:
# Distinct base materials named in the descriptors; bracket access makes it
# explicit that "base" is a column and not a DataFrame attribute.
materials = defects["base"].unique()

In [6]:
# Primitive unit cell of every base material, keyed by material name.
# Each CIF is expected at defects_generation/molecules/<material>.cif and the
# first structure found in the file is used.
unit_cells = {}
for material in materials:
  cif_path = os.path.join("defects_generation", "molecules", f"{material}.cif")
  unit_cells[material] = CifParser(cif_path).get_structures(primitive=True)[0]

In [7]:
# Properties of the defect-free structures, indexed by (base, cell_length).
# Only CSV columns 1-4 are read; get_defects looks up the `energy` column here.
initial_structure_properties = pd.read_csv(
  os.path.join("datasets", "dichalcogenides_innopolis_202105", "initial_structures.csv"),
  usecols=[1, 2, 3, 4],
  index_col=["base", "cell_length"],
)

In [8]:
# Per-element reference energies. The first CSV column is converted to pymatgen
# `Element` objects and becomes the index, so rows can be looked up by species.
single_atom_energies = pd.read_csv(
  os.path.join("datasets", "single_atom_energies.csv"),
  converters={0: Element},
  index_col=0,
)
# Column of `single_atom_energies` that get_defects reads. The spelling of the
# constant name is kept as is because other cells reference it.
SINGLE_ENENRGY_COLUMN = "energy_symmetric"

In [9]:
def get_frac_coords_set(structure, precision=3):
  """Return the set of rounded fractional coordinates found in `structure`.

  Args:
    structure: object exposing `frac_coords` as an (n_sites, 3) array-like.
    precision: number of decimals kept when rounding; defaults to 3, the
      value that used to be hard-coded, and mirrors `strucure_to_dict`.

  Returns:
    A set of coordinate tuples. Coordinates are wrapped into [0, 1) after
    rounding, so periodic images of one site (e.g. 0.9996 and 0.0, or 1.25
    and 0.25) produce the same tuple.
  """
  rounded = np.round(structure.frac_coords, precision)
  # Fractional coordinates are periodic with period 1. The second rounding
  # removes the floating-point noise the modulo can add for negative inputs.
  wrapped = np.round(rounded % 1.0, precision)
  return set(map(tuple, wrapped))

In [10]:
def strucure_to_dict(structure, precision=3):
  """Index the sites of `structure` by their rounded fractional coordinates.

  Args:
    structure: iterable of sites, each exposing `frac_coords` (length-3
      array-like).
    precision: number of decimals kept when rounding the coordinates.

  Returns:
    dict mapping a coordinate tuple to the site found there. Coordinates are
    wrapped into [0, 1) after rounding, so a site stored at 0.9996 and its
    periodic image at 0.0 share one key and are not mistaken for two
    different lattice positions. If several sites round to the same key,
    the last one iterated wins.
  """
  res = {}
  for site in structure:
    rounded = np.round(site.frac_coords, precision)
    # Fractional coordinates are periodic with period 1. The second rounding
    # removes the floating-point noise the modulo can add for negative inputs.
    wrapped = np.round(rounded % 1.0, precision)
    res[tuple(wrapped)] = site
  return res

In [11]:
# Species given to vacancy sites. The last cell's output shows these sites
# rendered as 'H', i.e. the value is interpreted as an atomic number.
VACANCY_PLACEHOLDER = 1
def get_defects(structure, defect_description):
  """Extract the point defects of `structure` relative to its pristine supercell.

  The reference is the unit cell of `defect_description.base` expanded with
  `defect_description.cell`. Sites are matched through the rounded
  fractional-coordinate keys produced by `strucure_to_dict`.

  Args:
    structure: structure containing the defects; must expose `lattice` and be
      iterable over sites.
    defect_description: descriptor row providing `base` and `cell`.

  Returns:
    A tuple `(defect_structure, energy_part)`:
      * `defect_structure` is a `Structure` on the lattice of `structure`
        holding one site per defect: a `VACANCY_PLACEHOLDER` site for each
        reference position missing from `structure`, and the substituting
        site for each position whose species differs from the reference.
        Its `state` attribute is a one-element list with the sorted atomic
        numbers of the unit-cell species.
      * `energy_part` is (single-atom energies of the removed reference atoms)
        minus (pristine supercell energy plus single-atom energies of the
        substituting atoms). The caller adds the energy of the defective
        structure to it.

  Note:
    Only reference positions are scanned, so sites of `structure` that have
    no counterpart in the reference supercell are ignored.
  """
  unit_cell = unit_cells[defect_description.base]
  reference_species = set(unit_cell.species)
  reference_supercell = unit_cell.copy()
  reference_supercell.make_supercell(defect_description.cell)

  # Explicit (row key tuple, column) lookup on the (base, cell_length) MultiIndex,
  # so pandas does not have to guess whether the second label names a column.
  initial_energy = initial_structure_properties.loc[
    (defect_description.base, defect_description.cell[0]), "energy"]
  defect_energy_correction = 0

  structure_dict = strucure_to_dict(structure)
  reference_structure_dict = strucure_to_dict(reference_supercell)

  # Named `defect_sites` so it does not shadow the module-level `defects` table.
  defect_sites = []
  for coords, reference_site in reference_structure_dict.items():
    actual_site = structure_dict.get(coords)
    # Vacancy
    if actual_site is None:
      defect_sites.append(PeriodicSite(
        # TODO(kazeevn) proper vacancies handling downstream, don't conscript hydrogen
        # species=DummySpecies(),
        species=VACANCY_PLACEHOLDER,
        coords=coords, coords_are_cartesian=False, lattice=structure.lattice))
      defect_energy_correction += single_atom_energies.loc[
        reference_site.specie, SINGLE_ENENRGY_COLUMN]
    # Substitution
    elif actual_site.specie != reference_site.specie:
      defect_sites.append(actual_site)
      # The substituting atom is counted with the reference energy ...
      initial_energy += single_atom_energies.loc[
        actual_site.specie, SINGLE_ENENRGY_COLUMN]
      # ... and the atom it replaced is counted as removed.
      defect_energy_correction += single_atom_energies.loc[
        reference_site.specie, SINGLE_ENENRGY_COLUMN]

  res = Structure(lattice=structure.lattice,
                  species=[site.specie for site in defect_sites],
                  coords=[site.frac_coords for site in defect_sites],
                  coords_are_cartesian=False)
  res.state = [sorted([element.Z for element in reference_species])]
  return res, defect_energy_correction - initial_energy

In [12]:
# TODO(kazeevn) this all is very ugly
def get_defecs_from_row(row):
  """Build the defect representation and formation energy for one `structures` row.

  Looks up the descriptor referenced by `row.descriptor_id` in the global
  `defects` table, delegates to `get_defects`, and adds `row.energy` to the
  energy part that `get_defects` returns.

  Returns:
    Tuple `(defect_structure, formation_energy)`.
  """
  initial_structure = row.initial_structure
  description = defects.loc[row.descriptor_id]
  representation, energy_part = get_defects(initial_structure, description)
  return representation, energy_part + row.energy

In [13]:
# Compute the defect representation and formation energy for every structure.
defect_properties = structures.apply(
  get_defecs_from_row,
  axis=1,
  result_type="expand")
defect_properties.columns = ["defect_representation", "formation_energy"]
# Drop result columns left over from a previous execution of this cell before
# joining; otherwise `join` raises on the overlapping column names and the cell
# cannot be re-run without reloading `structures`. On a first run the drop is a no-op.
structures = structures.drop(
  columns=defect_properties.columns, errors="ignore").join(defect_properties)

In [14]:
# Test: each extracted representation must contain exactly as many sites as
# the matching descriptor lists defects.
defect_counts_match = structures.apply(
  lambda row: len(defects.loc[row.descriptor_id, "defects"]) == len(row.defect_representation),
  axis=1)
assert defect_counts_match.all()

In [15]:
# Persist the table, now including the defect_representation and
# formation_energy columns, as a pickle.
# NOTE(review): pandas infers pickle compression from the file suffix, and ".gzip"
# does not appear to be one of the recognised suffixes (".gz" is), so the file
# is probably written uncompressed despite its name. Confirm, and keep every reader
# in sync if the suffix or an explicit `compression=` is ever changed.
structures.to_pickle("datasets/structures_defects.pickle.gzip")

In [16]:
from ase.visualize import view
from pymatgen.io.ase import AseAtomsAdaptor

In [17]:
# Interactive view of the `initial_structure` of one example row (position 1221).
structure_to_plot = structures.iloc[1221]
initial_atoms = AseAtomsAdaptor().get_atoms(structure_to_plot.initial_structure)
view(initial_atoms, viewer='ngl')



HBox(children=(NGLWidget(), VBox(children=(Dropdown(description='Show', options=('All', 'S', 'Mo'), value='All…

In [18]:
# Same example row as the previous cell, now showing only the extracted defect
# sites; vacancy placeholders are drawn as 'H' (see the widget's species list).
structure_to_plot = structures.iloc[1221]
defect_atoms = AseAtomsAdaptor().get_atoms(structure_to_plot.defect_representation)
view(defect_atoms, viewer='ngl')

HBox(children=(NGLWidget(), VBox(children=(Dropdown(description='Show', options=('All', 'H'), value='All'), Dr…