In [1]:
import os
import pandas as pd
import numpy as np
from data import get_dichalcogenides_innopolis

In [2]:
from pymatgen.core.sites import PeriodicSite
from pymatgen.core import Structure
from pymatgen.core.periodic_table import DummySpecies, Element
from pymatgen.io.cif import CifParser

In [3]:
# Load the 2021-05 dataset through the project helper. Later cells treat the result as a
# pandas DataFrame (they use `.index`, `pd.concat` and row-wise `.apply`).
structures = get_dichalcogenides_innopolis("datasets/dichalcogenides_innopolis_202105/")

  0%|          | 0/3480 [00:00<?, ?it/s]


Issues encountered while parsing CIF: Some fractional co-ordinates rounded to ideal values to avoid issues with finite precision.



In [4]:
# Load the 2021-08 "8x8" dataset with the same helper and discard every row that has a
# missing value, so downstream cells only see complete records.
structures_8x8 = get_dichalcogenides_innopolis("datasets/dichalcogenides8x8_innopolis_202108/").dropna()

  0%|          | 0/517 [00:00<?, ?it/s]

In [5]:
# Sanity check: the two datasets must not share any id, otherwise concatenating them
# later would produce an ambiguous index.
assert structures_8x8.index.intersection(structures.index).empty

In [6]:
# Read the defect descriptors of both datasets and stack them into a single frame indexed
# by `_id`.
#
# SECURITY NOTE / TODO(inner perfectionist): `eval` is insecure — it executes whatever
# Python expression is stored in the "cell" and "defects" CSV fields, so these files must
# come from a trusted source. If the fields only ever contain plain Python literals,
# `ast.literal_eval` would be a safe drop-in converter — TODO confirm and switch.
defects_list = [
  pd.read_csv(
    os.path.join("datasets", dataset, "descriptors.csv"),
    index_col="_id",
    converters={"cell": eval, "defects": eval})
  for dataset in ("dichalcogenides_innopolis_202105", "dichalcogenides8x8_innopolis_202108")
]
defects = pd.concat(defects_list, axis=0)

In [7]:
# Distinct values of the descriptors' "base" column; each one names a unit-cell CIF file.
materials = defects["base"].unique()

In [8]:
# Map every base material name to the first structure parsed (with primitive=True)
# from its CIF file under defects_generation/molecules/.
unit_cells = {
  material: CifParser(
    os.path.join("defects_generation", "molecules", f"{material}.cif")
  ).get_structures(primitive=True)[0]
  for material in materials
}


Issues encountered while parsing CIF: Some fractional co-ordinates rounded to ideal values to avoid issues with finite precision.



In [9]:
# Properties of the initial structures of both datasets, stacked into one frame that is
# indexed by (base, cell_length). Only CSV columns 1-4 are read from each file.
initial_structure_properties = pd.concat(
  [
    pd.read_csv(
      os.path.join("datasets", dataset_name, "initial_structures.csv"),
      index_col=["base", "cell_length"],
      usecols=[1, 2, 3, 4])
    for dataset_name in (
      "dichalcogenides_innopolis_202105",
      "dichalcogenides8x8_innopolis_202108",
    )
  ],
  axis=0)

In [10]:
# Per-element reference energies. The first CSV column is converted to pymatgen `Element`
# objects and used as the index, so rows can be looked up by a site's species.
single_atom_energies = pd.read_csv(
  os.path.join("datasets", "single_atom_energies.csv"),
  index_col=0,
  converters={0: Element})
# Name of the column that get_defects() reads (the identifier's spelling is kept as-is
# because other cells reference it).
SINGLE_ENENRGY_COLUMN = "energy_bulk_estimate"
# Hard-coded values that are written into (or added to) the table for these elements,
# taking precedence over whatever the CSV provided.
temporary_energies = {"Mo": -10.85, "W": -12.96, "S": -4.24, "Se": -3.50, "O": -4.95}
for symbol, value in temporary_energies.items():
  single_atom_energies.loc[Element(symbol), SINGLE_ENENRGY_COLUMN] = value

In [11]:
def get_frac_coords_set(structure, precision=3):
  """Return the set of rounded fractional coordinates of all sites in `structure`.

  Args:
    structure: object exposing `frac_coords`, a 2-D array-like with one row of
      fractional coordinates per site.
    precision: number of decimals the coordinates are rounded to before comparison.
      Defaults to 3, the same default as `strucure_to_dict`, so both helpers produce
      matching keys.

  Returns:
    A set of tuples, one per distinct rounded coordinate row.
  """
  return {tuple(row) for row in np.round(structure.frac_coords, precision)}

In [12]:
def strucure_to_dict(structure, precision=3):
  """Index the sites of `structure` by their rounded fractional coordinates.

  Args:
    structure: iterable of sites, each exposing `frac_coords`.
    precision: number of decimals the coordinates are rounded to.

  Returns:
    A dict mapping the tuple of rounded fractional coordinates to the site. When
    several sites round to the same key, the last one encountered is kept.
  """
  return {
    tuple(np.round(site.frac_coords, precision)): site
    for site in structure
  }

In [13]:
def get_defects(structure, defect_description):
  """Extract the point defects of `structure` relative to its pristine supercell.

  The pristine reference is the unit cell of `defect_description.base` expanded by
  `defect_description.cell`. Sites of the reference and of `structure` are matched by
  their fractional coordinates rounded with the default precision of
  `strucure_to_dict`.

  * A reference site with no counterpart in `structure` is a vacancy; it is recorded as
    a `DummySpecies` site at the reference coordinates.
  * A matched site whose species differs from the reference is a substitution; the site
    from `structure` is recorded.

  Sites of `structure` that have no counterpart in the reference are ignored, because
  only the reference sites are iterated.

  Args:
    structure: the defective structure; must be iterable over sites and expose
      `lattice`.
    defect_description: descriptor row exposing `base` (key into `unit_cells`) and
      `cell` (supercell scaling; its first component is also used as the
      `cell_length` index level of `initial_structure_properties`).

  Returns:
    A tuple `(defect_structure, energy_offset)` where

    * `defect_structure` is a `Structure` on the lattice of `structure` holding only
      the defect sites. Its "was" site property is the atomic number of the species
      the reference had at that site, and its `state` attribute is a one-element list
      with the sorted atomic numbers of the unit-cell species.
    * `energy_offset` is the sum of the single-atom energies of the reference atoms
      that were removed or replaced, minus the pristine-structure energy, minus the
      single-atom energies of the substituting atoms. Adding the energy of the
      defective structure to it is left to the caller.
  """
  unit_cell = unit_cells[defect_description.base]
  reference_species = set(unit_cell.species)
  reference_supercell = unit_cell.copy()
  reference_supercell.make_supercell(defect_description.cell)

  defects = []
  were_species = []
  # Starts as the energy of the pristine supercell; substituting atoms are added below.
  initial_energy = initial_structure_properties.loc[
    defect_description.base, defect_description.cell[0]].energy
  # Accumulates the single-atom energies of every reference atom that is gone.
  defect_energy_correction = 0

  structure_dict = strucure_to_dict(structure)
  reference_structure_dict = strucure_to_dict(reference_supercell)

  for coords, reference_site in reference_structure_dict.items():
    actual_site = structure_dict.get(coords)
    # Vacancy
    if actual_site is None:
      defects.append(PeriodicSite(
        species=DummySpecies(),
        coords=coords,
        coords_are_cartesian=False,
        lattice=structure.lattice,
      ))
      were_species.append(reference_site.specie.Z)
      defect_energy_correction += single_atom_energies.loc[
        reference_site.specie, SINGLE_ENENRGY_COLUMN]
    # Substitution
    elif actual_site.specie != reference_site.specie:
      defects.append(actual_site)
      were_species.append(reference_site.specie.Z)
      initial_energy += single_atom_energies.loc[
        actual_site.specie, SINGLE_ENENRGY_COLUMN]
      defect_energy_correction += single_atom_energies.loc[
        reference_site.specie, SINGLE_ENENRGY_COLUMN]

  res = Structure(lattice=structure.lattice,
                  species=[x.specie for x in defects],
                  coords=[x.frac_coords for x in defects],
                  site_properties={"was": were_species},
                  coords_are_cartesian=False)
  res.state = [sorted([element.Z for element in reference_species])]
  return res, defect_energy_correction - initial_energy

In [14]:
# TODO(kazeevn) this all is very ugly
def get_defecs_from_row(row):
  """Compute the defect representation and formation energy for one dataset row.

  The row's `descriptor_id` selects the defect description from the module-level
  `defects` frame; the row's `energy` is added to the offset returned by
  `get_defects`.

  Returns:
    A tuple `(defect_structure, formation_energy)`.
  """
  description = defects.loc[row.descriptor_id]
  representation, energy_offset = get_defects(row.initial_structure, description)
  return representation, energy_offset + row.energy

In [15]:
# Stack both datasets, then compute the defect representation and formation energy of
# every row and attach them as two new columns.
all_structures = pd.concat([structures, structures_8x8], axis="index")
defect_properties = all_structures.apply(
  get_defecs_from_row, axis="columns", result_type="expand")
# result_type="expand" yields positional column labels; give them meaningful names.
defect_properties.columns = ["defect_representation", "formation_energy"]
all_structures = all_structures.join(defect_properties)

In [16]:
# Formation energy normalised by the number of defect sites.
# NOTE(review): a row with an empty defect representation would divide by zero here —
# confirm that such rows cannot occur.
defect_site_counts = all_structures["defect_representation"].apply(len)
all_structures["formation_energy_per_site"] = (
  all_structures["formation_energy"] / defect_site_counts)
# Band gap taken as the difference between the "lumo" and "homo" columns.
all_structures["band_gap"] = all_structures["lumo"] - all_structures["homo"]

In [17]:
# Test: the number of defect sites we extracted must equal the number of defects listed
# in the corresponding descriptor.
def _defect_count_matches(row):
  """Return True when the row's extracted defects agree in count with its descriptor."""
  return len(row.defect_representation) == len(defects.loc[row.descriptor_id, "defects"])

assert all_structures.apply(_defect_count_matches, axis=1).all()

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Persist the full table, including the defect representations and derived columns.
# NOTE(review): pandas infers pickle compression from extensions such as ".gz"; ".gzip"
# does not appear to be one of them, so this file is probably written uncompressed —
# confirm, and keep readers consistent if the name or compression is ever changed.
all_structures.to_pickle("datasets/all_structures_defects.pickle.gzip")

def is_vacancy_only(defect_structure):
  """Tell whether every site of `defect_structure` is a vacancy placeholder.

  Args:
    defect_structure: object exposing `species`, an iterable of site species.

  Returns:
    True when all species are `DummySpecies` instances (also when there are no
    species at all), False as soon as any other species is found.
  """
  for specie in defect_structure.species:
    if not isinstance(specie, DummySpecies):
      return False
  return True

# Reproducible 75/25 split of the complete table; each part is pickled separately.
train, test = train_test_split(all_structures, test_size=0.25, random_state=2141)
for subset, target in (
    (train, "datasets/train_defects.pickle.gzip"),
    (test, "datasets/test_defects.pickle.gzip"),
):
  subset.to_pickle(target)

In [20]:
# Boolean mask of the rows whose defects are exclusively vacancies, followed by a
# reproducible 75/25 split of just those rows.
vac_only = all_structures["defect_representation"].apply(is_vacancy_only)
vac_train, vac_test = train_test_split(
  all_structures.loc[vac_only], test_size=0.25, random_state=211231)
vac_train.to_pickle("datasets/train_defects_vac_only.pickle.gzip")
vac_test.to_pickle("datasets/test_defects_vac_only.pickle.gzip")

In [21]:
def _is_small_cell(row):
  """Return True when the first component of the row's descriptor cell is below 8."""
  return defects.loc[row.descriptor_id, "cell"][0] < 8

# Vacancy-only rows are divided by supercell size: the smaller cells form the training
# set and the remaining (8 and above) cells form the test set.
is_small = all_structures.apply(_is_small_cell, axis=1)
vac_8x8 = all_structures.loc[vac_only & ~is_small]
vac_no_8x8 = all_structures.loc[vac_only & is_small]
vac_no_8x8.to_pickle("datasets/train_defects_vac_only_no_8x8_in_train.pickle.gzip")
vac_8x8.to_pickle("datasets/test_defects_vac_only_no_8x8_in_train.pickle.gzip")

In [22]:
# Split the large-cell vacancy rows in half; one half joins the small-cell rows for
# training, the other half is the test set.
train_8x8, test_8x8 = train_test_split(vac_8x8, test_size=0.5, random_state=42134114)
# NOTE(review): ignore_index=True replaces the ids of the training rows with a fresh
# 0..n-1 index, while the test set keeps its original index — confirm this asymmetry is
# intended.
train_with_8x8 = pd.concat([vac_no_8x8, train_8x8], ignore_index=True)
train_with_8x8.to_pickle("datasets/train_defects_vac_only_8x8_split.pickle.gzip")
test_8x8.to_pickle("datasets/test_defects_vac_only_8x8_split.pickle.gzip")

In [23]:
from ase.visualize import view
from pymatgen.io.ase import AseAtomsAdaptor

In [24]:
# Pick one row by position (an arbitrary example) and render its initial structure in
# the NGL viewer. The viewer call stays last so the widget is displayed as cell output.
structure_to_plot = all_structures.iloc[1221]
initial_atoms = AseAtomsAdaptor().get_atoms(structure_to_plot.initial_structure)
view(initial_atoms, viewer='ngl')



HBox(children=(NGLWidget(), VBox(children=(Dropdown(description='Show', options=('All', 'Mo', 'S'), value='All…

In [25]:
# Render the defect-only representation of the same example row for comparison.
defect_atoms = AseAtomsAdaptor().get_atoms(structure_to_plot.defect_representation)
view(defect_atoms, viewer='ngl')

HBox(children=(NGLWidget(), VBox(children=(Dropdown(description='Show', options=('All', 'X'), value='All'), Dr…