# Preprocessing of CIF files into CSE files

We provide five example CIFs from the ICSD in the `cifs/` directory.
This notebook converts them into the input formats used by our workflows.

This notebook only converts CIF files whose ICSD ID is contained in the benchmark dataset.
The CIF filenames must contain 'icsd123456' or 'CollCode123456' (case-insensitive, optionally with '_' or '-' before the number), where '123456' is the ICSD ID of the material.

**Disclaimer:** 

We have removed the structures from the provided databases (the `qe_yambo_database/` and `questaal_database/` directories) and converted them from `ComputedStructureEntries` (CSE) to `ComputedEntries` (CE) because we are not permitted to provide ICSD structures. If you would like to reproduce our results, please download the CIFs from the ICSD, run this notebook, and use the provided workflows. The ICSD IDs of the materials used for this benchmark can be found in the provided spreadsheets or CSV files, or in the original benchmark by Borlido et al. [https://doi.org/10.1021/acs.jctc.9b00322].

In [None]:
# directory where CIF files are located
# (every entry of this directory is scanned, so it should contain only the CIF files to convert)
struct_dir = "cifs"

In [None]:
# external import
import os
import re
import sys
import json
import shutil
import pickle
import warnings
import numpy as np
import pandas as pd
from pymatgen.core import Structure
from importlib.resources import files
from pymatgen.entries.computed_entries import ComputedStructureEntry

# internal imports
import qsgw_workflow.utils.helper as helper

# make sure that 'struct_dir' is an existing directory
# (a regular file at that path would pass a plain existence check and only fail later in os.listdir)
if not os.path.isdir(struct_dir):
    sys.exit("The 'struct_dir' does not exist or is not a directory, make sure that the path is set correctly!")

# locate the benchmark data from Borlido et al. that is shipped inside the qsgw_workflow package
borlido_path = files("qsgw_workflow.files").joinpath("borlido.csv")

# get the icsd ids of the materials in the benchmark dataset
# (kept as the raw column values; they are used for membership tests when the CIFs are loaded)
df = pd.read_csv(borlido_path)
benchmark_ids = df["ICSD-ID"].values

def get_material_id(path):
    """
    Return the numeric material ID encoded in the file name of 'path'.

    Only the base name of the path is inspected. It has to contain 'icsd' or
    'CollCode' (matched case-insensitively), optionally followed by a single
    '_' or '-', and then the digits of the ID. The first such occurrence is
    used and its digits are returned as an int.

    Raises ValueError if the file name contains no such pattern.
    """
    name = os.path.basename(path)
    found = re.search(r"(?:icsd|CollCode)[_-]?(\d+)", name, flags=re.IGNORECASE)
    if found is None:
        raise ValueError(f"Expected 'icsd<digits>'/'CollCode<digits>'/'icsd_<digits>'/'CollCode_<digits>' in '{name:s}'!")
    digits = found.group(1)
    return int(digits)

# get the material composition from a cif file
def get_composition(cif_path):
    """
    Return the value of the '_chemical_name_structure_type' tag of a CIF file.

    The value is only used as a human-readable label when skipped or
    unparsable CIFs are reported, so the lookup is deliberately forgiving:
    the tag and its value may be separated by any whitespace, and bytes that
    cannot be decoded are replaced instead of raising.

    NOTE(review): judging by its name, this tag describes the structure type
    rather than a chemical formula -- confirm that this is the intended label.

    Parameters:
        cif_path: path of the CIF file to read.

    Returns:
        The text that follows the tag on the same line, stripped of
        surrounding whitespace, or None if the tag is missing or has no value
        on its own line. If the tag occurs several times, the last occurrence
        with a value wins.

    Raises:
        OSError: if the file cannot be opened.
    """
    tag = '_chemical_name_structure_type'
    composition = None
    # errors="replace": files that end up here may be malformed, and the result is only a label
    with open(cif_path, "r", errors="replace") as f:
        for line in f:
            if not line.startswith(tag):
                continue
            # split on any whitespace (tabs included); a tag without a value yields a single part
            parts = line.split(None, 1)
            if len(parts) == 2:
                composition = parts[1].strip()
    return composition

# load all materials into pymatgen structures (if included in the benchmark dataset)
# (the paths are sorted because os.listdir returns the entries in arbitrary order,
#  which would make the order of the generated output irreproducible)
paths = sorted(os.path.join(struct_dir, f) for f in os.listdir(struct_dir))
struct_list = []       # entries: [path, ICSD ID, Structure, raw CIF text]
error_list = []        # paths of benchmark CIFs that could not be parsed
not_in_benchmark = []  # paths of CIFs whose ICSD ID is not part of the benchmark
for path in paths:
    # raises ValueError for file names without an ICSD ID
    mat_id = get_material_id(path)
    if mat_id not in benchmark_ids:
        not_in_benchmark.append(path)
        continue
    # ignore the warnings pymatgen emits while parsing CIFs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        try:
            # parse the file only once and keep the raw CIF text next to the structure
            struct = Structure.from_file(path)
            with open(path, "r") as f:
                cif_str = f.read()
        except Exception:
            # 'Exception' instead of a bare except, so that Ctrl-C / SystemExit are not swallowed
            error_list.append(path)
        else:
            struct_list.append([path, mat_id, struct, cif_str])

# report what was parsed
if struct_list:
    print("Parsed the following CIFs:")
    for l in struct_list:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            print(f"    {l[0]:s} ({l[2].composition.reduced_formula})")

# report what was left out; both lists are printed in the same format
for header, skipped_paths in (
    ("Skipped the followings CIFs (they are not in the benchmark dataset):", not_in_benchmark),
    ("Unable to parse following CIFs:", error_list),
):
    if not skipped_paths:
        continue
    print(header)
    for path in skipped_paths:
        # the label is optional: a file that cannot be read must not abort the report
        try:
            composition = get_composition(path)
        except (OSError, ValueError, IndexError):
            composition = None
        if composition is None:
            print(f"    {path:s}")
        else:
            print(f"    {path:s} ({composition:s})")

# standardize all unit cells
std_struct_list = [
    [mat_id, helper.standardize_cell(struct), cif_str]
    for _, mat_id, struct, cif_str in struct_list
]

In [None]:
"""
QUESTAAL WORKFLOW INPUT (./qsgw_benchmark/).
Store the standardized primitive structure in the benchmark directory as ComputedStructureEntries.
"""
# start from an empty output directory (a previous one is deleted)
output_dir = "./qsgw_benchmark/structures/benchmark"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
# write one JSON file per benchmark material
for mat_id, struct, cif_str in std_struct_list:
    # pymatgen warnings are silenced for the whole iteration
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # wrap the structure in a CSE: the energy is only a placeholder,
        # the raw CIF text is stored under the "cif" parameter
        cse = ComputedStructureEntry(struct, energy=0.0, parameters={"cif": cif_str})
        # file name pattern: <formula>_icsd_<id>_nsites_<number of sites>.json,
        # where the formula is reduced to letters, digits, '_', '.' and '-'
        formula = struct.composition.reduced_formula
        sanitized_formula = re.sub(r"[^A-Za-z0-9_\.-]", "", formula)
        filename = f"{sanitized_formula:s}_icsd_{mat_id:d}_nsites_{struct.num_sites:d}.json"
        filepath = os.path.join(output_dir, filename)
        # serialize the entry before the file is opened
        json_dict = cse.as_dict()
        with open(filepath, "w") as f:
            json.dump(json_dict, f)

In [None]:
"""
QUANTUM ESPRESSO AND YAMBO WORKFLOW INPUT (./g0w0_benchmark/).
Store all standardized primitive structures as ComputedStructureEntries in the benchmark directory in one pickle file.
"""
# cutoff for the LDA PseudoDojo pseudopotentials, keyed by element symbol
# (values are presumably in Hartree: where they are used, the maximum is doubled with the note "convert Ha to Ry")
# elements without an entry (e.g. La and Th) cannot be looked up in this table
lda_cutoffs = {
    "H": 31,
    "He": 39,
    "Li": 33,
    "Be": 38,
    "B": 34,
    "C": 37,
    "N": 36,
    "O": 30,
    "F": 34,
    "Ne": 44,
    "Na": 38,
    "Mg": 38,
    "Al": 16,
    "Si": 12,
    "P": 18,
    "S": 21,
    "Cl": 24,
    "Ar": 29,
    "K": 32,
    "Ca": 30,
    "Sc": 35,
    "Ti": 38,
    "V": 38,
    "Cr": 43,
    "Mn": 30,
    "Fe": 39,
    "Co": 30,
    "Ni": 30,
    "Cu": 38,
    "Zn": 30,
    "Ga": 36,
    "Ge": 30,
    "As": 34,
    "Se": 39,
    "Br": 18,
    "Kr": 22,
    "Rb": 17,
    "Sr": 28,
    "Y": 30,
    "Zr": 29,
    "Nb": 37,
    "Mo": 36,
    "Tc": 38,
    "Ru": 40,
    "Rh": 40,
    "Pd": 37,
    "Ag": 39,
    "Cd": 47,
    "In": 31,
    "Sn": 32,
    "Sb": 36,
    "Te": 34,
    "I": 31,
    "Xe": 29,
    "Cs": 30,
    "Ba": 18,
    "Hf": 25,
    "Ta": 25,
    "W": 29,
    "Re": 30,
    "Os": 33,
    "Ir": 24,
    "Pt": 34,
    "Au": 28,
    "Hg": 26,
    "Tl": 27,
    "Pb": 24,
    "Bi": 31,
}

# path and directory setup
# (the whole output tree is removed first, so creating the 'test' subdirectory
#  afterwards also recreates 'output_dir' itself)
output_dir = "./g0w0_benchmark/input/"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
test_output_dir = os.path.join(output_dir, "test")
os.makedirs(test_output_dir, exist_ok=True)
# loop over all benchmark materials
filtered_cse_list = []
for mat_id, struct, cif_str in std_struct_list:
    # skip structures with materials for which we have no pseudopotentials
    # (this has to happen before the cutoff lookup: 'La' and 'Th' have no entry
    #  in 'lda_cutoffs', so looking them up first raises a KeyError instead of skipping)
    elements = np.unique([specie.name for specie in struct.species])
    if "Th" in elements or "La" in elements:
        continue
    # remove structures with more than 12 atoms in the unit cell
    if len(struct) > 12:
        continue
    # find a good cutoff point to start when using LDA pseudopotentials:
    # the largest cutoff among the elements of this material
    # (an element without an entry in 'lda_cutoffs' still raises a KeyError here)
    lda_cutoff = 2 * max(lda_cutoffs[elem.name] for elem in struct.elements) # convert Ha to Ry
    # create a CSE
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cse = ComputedStructureEntry(
            struct,
            energy=0.0, # dummy value
            parameters={
                "id": str(mat_id),
                "lda_pw_cutoff_Ry": lda_cutoff,
                "cif_str": cif_str, # raw CIF text of the original file
            },
        )
    filtered_cse_list.append(cse)
    # setup a sensible file name and save the CSE as a JSON file usable for local test runs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        formula = struct.composition.reduced_formula
        sanitized_formula = re.sub(r"[^A-Za-z0-9_\.-]", "", formula)
        filepath = os.path.join(test_output_dir, f"{sanitized_formula:s}_icsd_{mat_id:d}_nsites_{struct.num_sites:d}.json")
        json_dict = cse.as_dict()
    with open(filepath, "w") as f:
        json.dump(json_dict, f)
# save all CSEs in one pickle
# (opened in a 'with' block so that the file is flushed and closed deterministically)
with open(os.path.join(output_dir, "benchmark_structures.pkl"), "wb") as f:
    pickle.dump(filtered_cse_list, f)