In [1]:
from pathlib import Path
import pandas as pd
from data import read_csv_cif, read_defects_descriptions, read_flat

## Load and explore a subset of data

The path assumes we are in the project tree. If you just downloaded the archive, chage it.

In [2]:
example_dataset_path = Path("../../datasets/others/rolos/2d-materials-point-defects/2d-materials-point-defects-all/high_density_defects/BP_spin_500")

In [3]:
BP_structures, BP_defects = read_csv_cif(example_dataset_path)
BP_targets = pd.read_csv(example_dataset_path / "targets.csv.gz", index_col="_id")

100%|██████████| 500/500 [00:17<00:00, 29.04it/s]


`BP_structures` contains the initial unrelaxed structures, along with the high-level variables computed with DFT

In [4]:
BP_structures.head()

Unnamed: 0_level_0,descriptor_id,energy,fermi_level,total_mag,homo_lumo_gap_majority,lumo_majority,homo_majority,E_1_majority,homo_lumo_gap_minority,lumo_minority,homo_minority,E_1_minority,initial_structure
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.162674,-2.341205,0.37354,0.997,-1.3324,-2.3294,-20.211,0.9275,-1.3305,-2.258,-20.1964,[[4.35465431e-16 2.70791180e+00 4.18246000e+00...
P_P141N1_2a285334-f6b0-498c-bb4a-788bfd4af4a6,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.791661,-2.265668,0.274071,0.8881,-1.3302,-2.2183,-20.1189,0.793,-1.3286,-2.1216,-20.1178,[[4.35465431e-16 2.70791180e+00 4.18246000e+00...
P_P141_83ddad39-bb28-4896-93f3-a97b8664304f,851b0d44-b9af-41e1-b1ec-925daf419aae,-751.693427,-2.232107,0.973272,0.7287,-1.6675,-2.3962,-17.2629,0.432,-1.6566,-2.0886,-17.2612,[[4.35465431e-16 2.70791180e+00 4.18246000e+00...
P_P141N1_64f2c31f-7207-4897-9877-60187bcb87d4,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.493381,-2.219081,1.862092,0.9955,-1.3313,-2.3268,-20.3546,0.704,-1.3262,-2.0302,-20.3553,[[4.35465431e-16 2.70791180e+00 4.18246000e+00...
P_P141N1_af730d2e-7790-46db-8751-75e46bf49b68,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.140472,-2.262924,0.142128,0.6302,-1.564,-2.1942,-20.2375,0.5673,-1.5634,-2.1307,-20.2376,[[4.35465431e-16 2.70791180e+00 4.18246000e+00...


In [5]:
BP_defects.head()

Unnamed: 0_level_0,description,base,cell,defects
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,P141N1,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."
851b0d44-b9af-41e1-b1ec-925daf419aae,P141,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."
3765801f-e8c7-4e1e-8705-da03ef64f59c,P141N3,P,"(6, 6, 1)","[{'type': 'substitution', 'from': 'P', 'to': '..."
72c75217-369a-483e-88f6-86c3edaaf8af,P141N2,P,"(6, 6, 1)","[{'type': 'substitution', 'from': 'P', 'to': '..."
bfdd9f72-3d2b-435c-b9df-80b283befc68,P137N2,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."


We can merge the defect descriptions and targets into a single DataFrame

In [6]:
BP_flat = BP_structures.combine_first(BP_targets).merge(BP_defects, how="left", left_on="descriptor_id", right_index=True)

In [7]:
BP_flat.head()

Unnamed: 0_level_0,E_1_majority,E_1_minority,descriptor_id,energy,energy_per_atom,fermi_level,formation_energy,formation_energy_per_site,homo_lumo_gap_majority,homo_lumo_gap_max,...,normalized_homo_minority,normalized_lumo_majority,normalized_lumo_max,normalized_lumo_min,normalized_lumo_minority,total_mag,description,base,cell,defects
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f,-20.211,-20.1964,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.162674,-5.353258,-2.341205,4.230926,1.410309,0.997,0.997,...,2.9684,3.9086,3.9086,3.8959,3.8959,0.37354,P141N1,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."
P_P141N1_2a285334-f6b0-498c-bb4a-788bfd4af4a6,-20.1189,-20.1178,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.791661,-5.357688,-2.265668,3.601939,1.200646,0.8881,0.8881,...,3.0262,3.8187,3.8192,3.8187,3.8192,0.274071,P141N1,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."
P_P141_83ddad39-bb28-4896-93f3-a97b8664304f,-17.2629,-17.2612,851b0d44-b9af-41e1-b1ec-925daf419aae,-751.693427,-5.331159,-2.232107,4.376173,1.458724,0.7287,0.7287,...,0.2026,0.6254,0.6346,0.6254,0.6346,0.973272,P141,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."
P_P141N1_64f2c31f-7207-4897-9877-60187bcb87d4,-20.3546,-20.3553,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.493381,-5.355587,-2.219081,3.900219,1.300073,0.9955,0.9955,...,3.3551,4.0533,4.0591,4.0533,4.0591,1.862092,P141N1,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."
P_P141N1_af730d2e-7790-46db-8751-75e46bf49b68,-20.2375,-20.2376,5106743f-06b3-4e57-8ee6-5cdf4ed14dc5,-760.140472,-5.353102,-2.262924,4.253128,1.417709,0.6302,0.6302,...,3.1369,3.7035,3.7042,3.7035,3.7042,0.142128,P141N1,P,"(6, 6, 1)","[{'type': 'vacancy', 'element': 'P'}, {'type':..."


Visualisation

In [8]:
example_structure_id = BP_flat.index[0]

In [9]:
BP_flat.loc[example_structure_id].initial_structure

Structure Summary
Lattice
    abc : 19.799508 27.605556 20.0
 angles : 90.0 90.0 90.0
 volume : 10931.528537328959
      A : 19.799508 0.0 1.2123702048446206e-15
      B : 4.439311986051677e-15 27.605556 1.6903527897041506e-15
      C : 0.0 0.0 20.0
    pbc : True True True
PeriodicSite: P (0.0000, 2.7079, 4.1825) [0.0000, 0.0981, 0.2091]
PeriodicSite: P (0.0000, 7.3088, 4.1825) [0.0000, 0.2648, 0.2091]
PeriodicSite: P (0.0000, 11.9098, 4.1825) [0.0000, 0.4314, 0.2091]
PeriodicSite: P (0.0000, 16.5107, 4.1825) [0.0000, 0.5981, 0.2091]
PeriodicSite: P (0.0000, 21.1116, 4.1825) [0.0000, 0.7648, 0.2091]
PeriodicSite: P (0.0000, 25.7125, 4.1825) [0.0000, 0.9314, 0.2091]
PeriodicSite: P (3.2999, 2.7079, 4.1825) [0.1667, 0.0981, 0.2091]
PeriodicSite: P (3.2999, 7.3088, 4.1825) [0.1667, 0.2648, 0.2091]
PeriodicSite: P (3.2999, 11.9098, 4.1825) [0.1667, 0.4314, 0.2091]
PeriodicSite: P (3.2999, 16.5107, 4.1825) [0.1667, 0.5981, 0.2091]
PeriodicSite: P (3.2999, 21.1116, 4.1825) [0.1667, 0.7648, 

In [10]:
from ase.visualize import view
from pymatgen.io.ase import AseAtomsAdaptor

In [11]:
# Crashes at CRP, but might work for you
# view(AseAtomsAdaptor.get_atoms(BP_structures.loc[example_structure_id].initial_structure), viewer='ngl')
view(AseAtomsAdaptor.get_atoms(BP_structures.loc[example_structure_id].initial_structure), viewer='x3d')

If needed, we can dive into the VASP outputs:

In [12]:
from pathlib import Path
from pymatgen.io.vasp.outputs import Vasprun, Outcar

vasp_folder = Path(f"../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_{example_structure_id}/01_relax")
vasprun = Vasprun(vasp_folder / "vasprun.xml", parse_potcar_file=False, separate_spins=True, parse_dos=True)

If you want to look at VASP output, you probably know what you are doing, but just in case here is pymetgen documentation:
https://pymatgen.org/pymatgen.io.vasp.outputs.html#pymatgen.io.vasp.outputs.Vasprun

In [13]:
print(vasprun.final_energy)

-760.1626739 eV


List of other VASP outputs available

In [14]:
list(vasp_folder.iterdir())

[PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/CHGCAR'),
 PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/OSZICAR'),
 PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/IBZKPT'),
 PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/EIGENVAL'),
 PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/XDATCAR'),
 PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/OUTCAR'),
 PosixPath('../../datasets/raw_vasp/high_density_defects/BP_spin_500/poscar_P_P141N1_dd6a5668-efc9-4f4c-aee8-ac3472eaf09f/01_relax/CHG'),
 PosixPath('

# Load all the structures

In [15]:
extracted_data_root = Path("../../datasets/others/rolos/2d-materials-point-defects/2d-materials-point-defects-all/")
datasets = [extracted_data_root / f"high_density_defects/{name}_500" for name in ("BP_spin", "GaSe_spin", "hBN_spin", "InSe_spin", "MoS2", "WSe2")] + \
           [extracted_data_root / f"low_density_defects/{name}" for name in ("MoS2", "WSe2")]

In [16]:
import os
from multiprocessing import Pool
if 'ROLOS_AVAILABLE_CPU' in os.environ:
    n_cpus = int(os.environ['ROLOS_AVAILABLE_CPU'])
else:
    n_cpus = None
with Pool(processes=n_cpus) as pool:
    data = pool.map(read_flat, datasets)

100%|██████████| 500/500 [00:18<00:00, 26.69it/s]
100%|██████████| 500/500 [00:19<00:00, 26.18it/s]
100%|██████████| 500/500 [00:17<00:00, 28.25it/s]
100%|██████████| 500/500 [00:19<00:00, 25.43it/s]
100%|██████████| 500/500 [00:26<00:00, 18.56it/s]
100%|██████████| 500/500 [00:27<00:00, 17.98it/s]
100%|██████████| 5933/5933 [05:36<00:00, 17.64it/s]
100%|██████████| 5933/5933 [05:33<00:00, 17.80it/s]


In [17]:
flat_2DMMD = pd.concat(data, axis=0)

In [20]:
print(f"Loaded {len(flat_2DMMD)} structures")

Loaded 14866 structures
