### Run this to reproduce all of the results shown in the main manuscript and the supplementary information

In [None]:
# external imports
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from importlib.resources import files
from openpyxl.styles import Alignment
from openpyxl.utils import get_column_letter

# internal imports
from qsgw_workflow.utils.helper import load_db_entry, replace_greek_and_subscripts, plot_bs

# load my plotstyle
plt.style.use("./figures/plotstyle.mplstyle")

# output locations for paper figures, SI figures, spreadsheets and CSV exports
paper_fig_path = "figures/paper/"
si_fig_path = "figures/supplement/"
sheet_path = "spreadsheets"
csv_path = "csvs"

# create all output directories up front (a no-op for those that already exist)
for out_dir in (paper_fig_path, si_fig_path, sheet_path, csv_path):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# helper function to get the ICSD ID of a material from our naming scheme 
def get_material_id(name):
    """Return the integer ID that follows 'icsd' or 'CollCode' in ``name``.

    The prefix is matched case-insensitively and may be separated from the
    digits by a single '_' or '-'. The first match in the string wins.

    Raises:
        ValueError: if ``name`` contains no such prefix followed by digits.
    """
    found = re.search(r"(?:icsd|CollCode)[_-]?(\d+)", name, re.IGNORECASE)
    if found is None:
        raise ValueError(f"Expected 'icsd<digits>'/'CollCode<digits>'/'icsd_<digits>'/'CollCode_<digits>' in '{name:s}'!")
    digits = found.group(1)
    return int(digits)

# helper function for nice latex tables
def smart_fmt(x: float) -> str:
    """Format a number for a LaTeX table cell.

    Values that are numerically integral (according to ``np.isclose``) are
    printed without decimals, everything else with two decimals.
    Non-finite values cannot be rounded to an integer (``round`` raises for
    NaN/inf), so they are returned as plain text instead: "NaN" for NaN and
    "inf"/"-inf" for infinities.
    """
    if not np.isfinite(x):
        return "NaN" if np.isnan(x) else f"{x}"
    nearest = round(x)
    return f"{int(nearest)}" if np.isclose(x, nearest) else f"{x:.2f}"

# helper function for latex version of composition formulas
def pretty_formula(formula: str) -> str:
    """Wrap every run of digits in ``formula`` as a LaTeX subscript ``$_{...}$``."""

    def _as_subscript(match):
        return "$_{" + match.group(1) + "}$"

    return re.sub(r"(\d+)", _as_subscript, formula)

# helper functions for plots
def annotate_mat(ax, x, y, offset, text):
    """Attach a small arrow label to the data point (x, y) on ``ax``.

    Args:
        ax: object providing a matplotlib-style ``annotate`` method.
        x, y: data coordinates of the annotated point; each may be a plain
            number or an object with ``.iloc`` (e.g. a pandas Series), in
            which case its first element is used.
        offset: (dx, dy) displacement of the label relative to the point,
            in points.
        text: label text.
    """

    def _scalar(val):
        # objects without .iloc (plain floats, numpy scalars) are converted directly
        try:
            return float(val.iloc[0])
        except AttributeError:
            return float(val)

    xy = [_scalar(x), _scalar(y)]
    # textcoords="offset points" interprets xytext as a displacement from xy,
    # so the offset is passed as-is (adding the data coordinates to it would
    # make the label placement depend on the data value)
    xy_offset = (offset[0], offset[1])
    ax.annotate(
        text,
        xy=xy,
        xycoords="data",
        xytext=xy_offset,
        textcoords="offset points",
        arrowprops=dict(arrowstyle="->"),
        fontsize=8,
        ha="center",
        bbox=dict(boxstyle="round, pad=0.0", fc="white", ec="none"),
    )

# Parse results

In [None]:
# parse the benchmark data from Borlido et al (we converted the Excel to a CSV)
borlido_path = files("qsgw_workflow.files").joinpath("borlido.csv")
df = pd.read_csv(borlido_path)


def _insert_after(frame, anchor, names):
    """Insert NaN-filled columns ``names`` right behind column ``anchor``.

    Columns that already exist are left untouched. Returns the position of
    the anchor column (looked up before any insertion).
    """
    anchor_idx = frame.columns.get_loc(anchor)
    for shift, name in enumerate(names, start=1):
        if name not in frame.columns:
            frame.insert(anchor_idx + shift, name, np.nan)
    return anchor_idx


# add columns to the dataframe for our calculations
lda_idx = _insert_after(df, "LDA", ["LDA NC", "LDA AE"])
pbe_idx = _insert_after(df, "PBE", ["PBE NC"])
gw_idx = _insert_after(
    df,
    "PBE0_mix",
    ["G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"],
)

In [None]:
# load all Questaal results (QPG0W0, QPG0W0+SOC, QSGW, QSGW^, QSGW^+SOC)
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# reports below (and the outcome if two entries map to the same ICSD ID)
# reproducible across machines
db_dir = "./questaal_database"
questaal_data = []
for mat in tqdm(sorted(os.listdir(db_dir)), "Going through all database files"):
    db_path = os.path.join(db_dir, mat)
    cse = load_db_entry(db_path)
    questaal_data.append(cse)

In [None]:
# calculate the band gap from a band structure
def bs_gap_estimation(bs, vbm_idx):
    """Estimate the band gap from the bands stored in ``bs["bs_paths"]``.

    For every path, column ``vbm_idx`` of ``path["bands"]`` is taken as the
    highest valence band and column ``vbm_idx + 1`` as the lowest conduction
    band. The result is min(conduction) - max(valence) over all paths,
    clamped to zero from below.
    """
    segments = bs["bs_paths"]
    valence = np.concatenate([seg["bands"][:, vbm_idx] for seg in segments])
    conduction = np.concatenate([seg["bands"][:, vbm_idx + 1] for seg in segments])
    raw_gap = np.min(conduction) - np.max(valence)
    return np.max([0, raw_gap])

In [None]:
# estimate the difference between the band gap obtained from a regular Brillouin
# zone sampling and a band structure calculation using the LDA results (this is part of the SI)
diff_list = []
skipped = []  # entries for which no LDA gap / band structure could be evaluated
print("Please note that the SOC results for the extreme positive outliers identified here have been removed from the dataset below.")
print(f"{'Material':<30s} | DFT k-grid gap (eV) | BS gap (eV) | Difference (eV)")
print("-----------------------------------------------------------------------------")
for cse in questaal_data:
    vbm_idx = cse.parameters["vbm_idx"]
    try:
        gap = max(0, cse.data["gap_lda"])
        gap_from_bs = bs_gap_estimation(cse.data["bs_lda"], vbm_idx)
    except (KeyError, IndexError, TypeError, ValueError):
        # missing or malformed LDA data: keep going, but do not hide it
        skipped.append(cse.entry_id)
        continue
    diff = gap - gap_from_bs
    # only list materials where the two gap estimates differ by more than 50 meV
    if np.abs(diff) > 0.05:
        print(f"{cse.entry_id:<30s} | {gap:<19.3f} | {gap_from_bs:<11.3f} | {diff:<2.3f}")
    diff_list.append(diff)
if skipped:
    print(f"\nSkipped {len(skipped):d} entries without usable LDA gap/band structure data: {', '.join(skipped)}")
if diff_list:
    print(f"\nME  = {np.mean(diff_list):.3f} eV")
    print(f"MAE = {np.mean(np.abs(diff_list)):.3f} eV")

# make a histogram
bins = np.arange(-0.15, 0.20 + 0.01, 0.01)
fig, ax = plt.subplots()
ax.hist(diff_list, bins=bins, color="tab:blue", edgecolor="k", linewidth=0.1)
# the two outliers are labelled at hand-picked positions
for label, anchor, shift in (
    (r"SiC", (0.158, 1), (-10, 20)),
    (r"LiCoO$_2$", (-0.125, 1), (10, 20)),
):
    ax.annotate(
        label,
        xy=anchor,
        xycoords="data",
        xytext=shift,
        textcoords="offset points",
        arrowprops=dict(arrowstyle="->"),
        fontsize=8,
        ha="center",
        bbox=dict(boxstyle="round, pad=0.0", fc="white", ec="none"),
    )
if diff_list:
    # mean error marker (undefined for an empty list)
    ax.axvline(np.mean(diff_list), linestyle="-", color="tab:orange")
ax.set_xlabel(r"$E_\mathrm{gap}^{\mathbf{k}_\mathrm{DFT}} - E_\mathrm{gap}^{\mathbf{k}_\mathrm{sym}}$ (eV)")
ax.set_ylabel(r"Occurrence")
ax.set_xticks([-0.1, 0.0, 0.1, 0.2])
fig.tight_layout()
fig.savefig(os.path.join(si_fig_path, "dft_vs_bs_gap.pdf"))

In [None]:
# plot the band structure of LiCoO2 (this is part of the SI)
# look the entry up explicitly; a plain for/break search would silently fall
# through to the last database entry if the ID were missing
cse = next((entry for entry in questaal_data if get_material_id(entry.entry_id) == 51767), None)
if cse is None:
    raise LookupError("No Questaal database entry with ICSD ID 51767 (LiCoO2) found!")
icsd_id = get_material_id(cse.entry_id)
fig, ax = plt.subplots(figsize=(3, 3))
bs = cse.data["bs_lda"]
plot_bs(ax, bs)
bs_paths = bs["bs_paths"]
vbm_idx = cse.parameters["vbm_idx"]
# valence band maximum / conduction band minimum on the second path segment
vbm_gap_idx = np.argmax(bs_paths[1]["bands"][:, vbm_idx])
cbm_gap_idx = np.argmin(bs_paths[1]["bands"][:, vbm_idx + 1])
# red double arrow between the two band edges (x positions are hand-picked)
ax.annotate(xy=(43, bs_paths[1]["bands"][vbm_gap_idx, vbm_idx]),
            xytext=(51, bs_paths[1]["bands"][cbm_gap_idx, vbm_idx + 1]),
            text="", arrowprops=dict(arrowstyle="<->", color="r"))
ax.set_ylim([-0.70, 1.89])
ax.set_title(f"{cse.composition.reduced_formula:s}~~~(ICSD {icsd_id:d})", pad=3)
fig.tight_layout()
fig.align_xlabels()
fig.savefig(os.path.join(si_fig_path, "licoo2_bandstructure.pdf"))

In [None]:
# add the Questaal band gaps to the dataframe
# (if Questaal does not find a gap the value is set to -1 in the database)
#
# For every database entry the gaps are written into the row of `df` whose
# "ICSD-ID" matches the ID parsed from the entry name. Gaps are clamped to >= 0
# and, where a band structure is used, the smaller of the stored gap and the
# band structure estimate is taken. Entries that are skipped or flagged are
# recorded per category in `audit` (reported in the next cell).
audit = {
    "crashed_during_qpg0w0": [],
    "self_energy_kgrid_reached_dft_kgrid": [],
    "shortcut_failed": [],
    "kpt_conv_problem": [],
    "qpg0w0_metals": [],
    "qsgw_metals": [],
}
for cse in questaal_data:
    icsd_id = get_material_id(cse.entry_id)
    vbm_idx = cse.parameters["vbm_idx"]
    # LDA (written for every entry, including those skipped further below)
    gap_lda = min(max(cse.data["gap_lda"], 0), bs_gap_estimation(cse.data["bs_lda"], vbm_idx))
    df.loc[df["ICSD-ID"] == icsd_id, "LDA AE"] = np.round(gap_lda, 2)
    # materials where the QPG0W0 crashed somewhere are skipped
    # (mostly caused by our computation time limit);
    # a missing convergence flag is used as the indicator
    if "qsgw_kpt_conv_error_flag" not in cse.parameters:
        audit["crashed_during_qpg0w0"].append(cse.entry_id)
        continue
    # material where the self energy k-grid is equal to the DFT k-grid
    # (the QPG0W0 band gap is likely not fully converged...)
    if cse.parameters["qsgw_kpt_conv_error_flag"] == True:
        audit["self_energy_kgrid_reached_dft_kgrid"].append(cse.entry_id)
    # material where we did not use the shortcut because the band structure
    # contains some artifacts, probably caused by a k-point grid that is too coarse
    # -> take the gap of the last convergence step, otherwise of the second to last
    # NOTE(review): indexing [-2] assumes at least two convergence steps are stored — confirm
    if cse.parameters["qsgw_kppa"] != cse.data["qsgw_kpt_conv_data"][-2][1]:
        audit["shortcut_failed"].append(cse.entry_id)
        gap_qpg0w0 = min(max(cse.data["qsgw_kpt_conv_data"][-1][-1], 0), bs_gap_estimation(cse.data["bs_qpg0w0"], vbm_idx))
    else:
        gap_qpg0w0 = min(max(cse.data["qsgw_kpt_conv_data"][-2][-1], 0), bs_gap_estimation(cse.data["bs_qpg0w0"], vbm_idx))
    # materials where the self energy k-point grid has reached the DFT k-point grid
    # and the band gap is not really converged are omitted... (differences greater than 100 meV)
    if np.abs(cse.data["qsgw_kpt_conv_data"][-1][-1] - cse.data["qsgw_kpt_conv_data"][-2][-1]) > 0.1 and cse.parameters["qsgw_kpt_conv_error_flag"] == True:
        audit["kpt_conv_problem"].append(cse.entry_id)
        print(f"Omitting the QPG0W0, QSGW and QSGW^ results for {cse.entry_id:s}...")
        continue
    # materials that are a metal in QPG0W0
    # (literature review confirmed that they are just weird)
    # these are only recorded in the audit; their (zero) gap is still written below
    if cse.parameters["metal_flag_qsgw"] or gap_qpg0w0 <= 0:
        audit["qpg0w0_metals"].append(cse.entry_id)
    # QPG0W0
    df.loc[df["ICSD-ID"] == icsd_id, "QPG0W0"] = np.round(gap_qpg0w0, 2)
    # QPG0W0+SOC (see band structure analysis above for omitted materials)
    # note: the SOC gap is only clamped to >= 0, no band structure estimate is involved
    if not cse.entry_id in [
            "MoS2_icsd_95569_nsites_6", # large gap difference to band structure
            "SiC_icsd_86253_nsites_4", # large gap difference to band structure
            "WS2_icsd_202366_nsites_6", # large gap difference to band structure
        ]:
        gap_qpg0w0_soc = max(cse.data["gap_qpg0w0_soc"], 0)
        df.loc[df["ICSD-ID"] == icsd_id, "QPG0W0+SOC"] = np.round(gap_qpg0w0_soc, 2)
    else:
        print(f"Omitting the QPG0W0+SOC results for {cse.entry_id:s}...")
    # QSGW
    if cse.parameters["qsgw_flag"] == True:
        # report (but do not discard) cycles that hit the iteration limit,
        # showing the last two entries of the stored SCF history
        if cse.parameters["qsgw_scf_conv_error_flag"]:
            print(f"The QSGW self-consistency cycle did not converge after 25 iterations for {cse.entry_id:s}...")
            print("     Iteration |   RMS    | Band gap")
            print(f"    {int(cse.data["qsgw_scf_data"][-2][0]):>10d} | {cse.data["qsgw_scf_data"][-2][1]:.2e} | {cse.data["qsgw_scf_data"][-2][2]:2.2f}")
            print(f"    {int(cse.data["qsgw_scf_data"][-1][0]):>10d} | {cse.data["qsgw_scf_data"][-1][1]:.2e} | {cse.data["qsgw_scf_data"][-1][2]:2.2f}")
        gap_qsgw = min(max(cse.data["gap_qsgw"], 0), bs_gap_estimation(cse.data["bs_qsgw"], vbm_idx))
        # NOTE(review): this `continue` also skips the QSGW^ / QSGW^+SOC section
        # below for these two materials, although the message only mentions
        # QSGW — confirm that this is intended
        if cse.entry_id in [
            "CsAu_icsd_58427_nsites_2", # unstable self-consistency cycle (became metallic...)
            "BaF2_icsd_64717_nsites_3", # unstable self-consistency cycle (large gap changes)
        ]:
            print(f"Omitting the QSGW results for {cse.entry_id:s}...")
            continue
        if gap_qsgw == 0:
            audit["qsgw_metals"].append(cse.entry_id)
        df.loc[df["ICSD-ID"] == icsd_id, "QSGW"] = np.round(gap_qsgw, 2)
    # QSGW^, QSGW^+SOC
    if cse.parameters["finish"] == True:
        if cse.parameters["qsgwbse_scf_conv_error_flag"]:
            print(f"The QSGW^ self-consistency cycle did not converge after 25 iterations for {cse.entry_id:s}...")
            print("     Iteration |   RMS    | Band gap")
            print(f"    {int(cse.data["qsgwbse_scf_data"][-2][0]):>10d} | {cse.data["qsgwbse_scf_data"][-2][1]:.2e} | {cse.data["qsgwbse_scf_data"][-2][2]:2.2f}")
            print(f"    {int(cse.data["qsgwbse_scf_data"][-1][0]):>10d} | {cse.data["qsgwbse_scf_data"][-1][1]:.2e} | {cse.data["qsgwbse_scf_data"][-1][2]:2.2f}")
        gap_qsgwbse = min(max(cse.data["gap_qsgwbse"], 0), bs_gap_estimation(cse.data["bs_qsgwbse"], vbm_idx))
        # the QSGW^ gap is stored before the SOC exclusion list is checked, so
        # the materials listed below keep their QSGW^ value and only lose QSGW^+SOC
        df.loc[df["ICSD-ID"] == icsd_id, "QSGW^"] = np.round(gap_qsgwbse, 2)
        if cse.entry_id in [
            "CaMg2N2_icsd_79123_nsites_5", # unstable self-consistency cycle (large gap changes)
            "BaSe_icsd_52696_nsites_2", # unstable self-consistency cycle (large gap changes)
            "BaO_icsd_616005_nsites_2", # unstable self-consistency cycle (large gap changes)
            "BeO_icsd_391224_nsites_4", # unstable self-consistency cycle (large gap changes)
            "MoS2_icsd_95569_nsites_6", # large gap difference to band structure
            "SiC_icsd_86253_nsites_4", # large gap difference to band structure
            "WS2_icsd_202366_nsites_6", # large gap difference to band structure
        ]:
            print(f"Omitting the QSGW^+SOC results for {cse.entry_id:s}...")
            continue
        gap_qsgwbse_soc = max(cse.data["gap_qsgwbse_soc"], 0)
        df.loc[df["ICSD-ID"] == icsd_id, "QSGW^+SOC"] = np.round(gap_qsgwbse_soc, 2)

In [None]:
# audit reports: number of database entries overall and per audit category
print("------------------------------------------\nAudit results:\n------------------------------------------")
print(f"{'Total':35} | {len(questaal_data):>3d}")
for category, entry_ids in audit.items():
    print(f"{category:35} | {len(entry_ids):>3d}")

In [None]:
# load all Quantum ESPRESSO + Yambo results (G0W0-PPA)
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the outcome if two entries map to the same ICSD ID) reproducible
db_dir = "./qe_yambo_database"
qe_yambo_data = []
for mat in tqdm(sorted(os.listdir(db_dir)), "Going through all database files"):
    db_path = os.path.join(db_dir, mat)
    cse = load_db_entry(db_path)
    qe_yambo_data.append(cse)

In [None]:
# add the Quantum ESPRESSO and Yambo band gaps to the dataframe
# the workflow calculates the band gap as CBM - VBM, therefore,
# gaps can be negative if the band structure is inverted
# (negative values are clamped to zero before they are stored)
functionals = (
    # (DFT gap key, G0W0 key, DFT column, G0W0 column)
    ("indirect_gap_lda", "g0w0_ppa_lda", "LDA NC", "G0W0@LDA-PPA"),
    ("indirect_gap_pbe", "g0w0_ppa_pbe", "PBE NC", "G0W0@PBE-PPA"),
)
for cse in qe_yambo_data:
    icsd_id = int(cse.parameters["id"])
    row = df["ICSD-ID"] == icsd_id
    for dft_key, gw_key, dft_col, gw_col in functionals:
        if dft_key not in cse.parameters:
            continue
        dft_gap = np.round(max(0, cse.parameters[dft_key]), 2)
        df.loc[row, dft_col] = dft_gap
        # the G0W0 gap is the (clamped, rounded) DFT gap plus the scissor shift
        if gw_key in cse.parameters:
            scissor = cse.parameters[gw_key]["scissor"]
            df.loc[row, gw_col] = np.round(max(0, dft_gap + scissor), 2)

In [None]:
# some statistics, i.e., how many materials did we calculate for each method
# (count of non-NaN entries per column)
for method in ("G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"):
    print(f"{method:<12s} - {df[method].count():d}")

# Main

In [None]:
# rank materials by the absolute QPG0W0 deviation from experiment (largest first, NaN last)
abs_error = (df["QPG0W0"] - df["Experimental"]).abs()
ranking = abs_error.sort_values(ascending=False, na_position="last").index
df_sorted = df.loc[ranking]
df_sorted

In [None]:
# rank materials by the signed QSGW error, strongest underestimation first (NaN last)
signed_error = df["QSGW"] - df["Experimental"]
ranking = signed_error.sort_values(ascending=True, na_position="last").index
df_sorted = df.loc[ranking]
df_sorted

In [None]:
# rank materials by the signed QSGW error, strongest overestimation first (NaN last)
signed_error = df["QSGW"] - df["Experimental"]
ranking = signed_error.sort_values(ascending=False, na_position="last").index
df_sorted = df.loc[ranking]
df_sorted

In [None]:
# rank materials by the signed QSGW^ error, strongest underestimation first (NaN last)
pd.set_option("display.max_rows", 0)
signed_error = df["QSGW^"] - df["Experimental"]
ranking = signed_error.sort_values(ascending=True, na_position="last").index
df_sorted = df.loc[ranking]
df_sorted

In [None]:
# rank materials by the signed QSGW^ error, strongest overestimation first (NaN last)
pd.set_option("display.max_rows", 0)
signed_error = df["QSGW^"] - df["Experimental"]
ranking = signed_error.sort_values(ascending=False, na_position="last").index
df_sorted = df.loc[ranking]
df_sorted

In [None]:
# estimate the percentage by which QSGW systematically overestimates the experimental values
# (only rows with both a QSGW and an experimental gap enter the sums)
paired = df[["QSGW", "Experimental"]].dropna(how="any")
qs = paired["QSGW"].to_numpy()
exp = paired["Experimental"].to_numpy()
# single scaling factor that maps the summed QSGW gaps onto the summed experimental gaps
multiplier = exp.sum() / qs.sum()
overestimate = 1 / multiplier - 1
print(f"Optimal QSGW gap multiplier    {100 * multiplier:.1f}%")
print(f"Average QSGW gap overestimate: {100 * overestimate:.1f}%")

In [None]:
# main plot: histograms of (calculated - experimental) band gaps, one panel per method
c = "tab:orange"
bins = np.arange(-2.1, 2.1 + 0.1, 0.1)
fig, axes = plt.subplots(2, 3, figsize=(6, 4.5))

# one entry per panel: (axes position, dataframe column, title, y-ticks or None, labels)
# each label is (selector column, selector value, x shift, y, offset in points, text);
# the arrow points at the error of the selected material (plus x shift) at height y
panels = [
    ((0, 0), "MBJ", "MBJ", None, []),
    ((0, 1), "HSE06", "HSE06", None, []),
    ((0, 2), "G0W0@LDA-PPA", r"$G_{0}W_{0}$@LDA-PPA", None, []),
    ((1, 0), "QPG0W0", r"QP$G_{0}W_{0}$", None, [
        ("Composition", "CuLaO2", 0.0, 1, [-10, 20], "CuLaO$_2$"),
        ("Composition", "AgF", 0.0, 1, [10, 20], "AgF"),
    ]),
    ((1, 1), "QSGW", r"QS$GW$", [0, 5, 10, 15, 20], [
        ("Composition", "CuLaO2", 0.0, 1, [-10, 20], "CuLaO$_2$"),  # ~2.7, its off the chart...
        ("ICSD-ID", 20560, -0.05, 2, [-10, 20], "Cu$_2$S\nCu$_2$Se"),
        ("Composition", "BeSe", 0.0, 2, [10, 20], "BeSe\nRbAu"),
    ]),
    ((1, 2), "QSGW^", r"QS$G\hat{W}$", [0, 5, 10, 15, 20], [
        ("Composition", "BeSe", 0.0, 1, [-5, 35], "BeSe"),
        ("Composition", "BaO", 0.0, 1, [-13, 20], "BaO"),
        ("Composition", "BaTe", 0.0, 1, [-10, 30], "BaTe"),
        ("Composition", "Ar", 0.0, 4, [-10, 20], "Ar\nKr\nCuCl\nCuBr"),
        ("Composition", "LiF", 0.0, 2, [0, 20], "LiF"),
        ("Composition", "PbF2", 0.0, 1, [8, 25], "PbF$_2$"),
        ("Composition", "Ne", 0.0, 2, [2.5, 30], "Ne\nMgCl$_2$"),
        ("Composition", "SrF2", 0.0, 1, [0, 15], "SrF$_2$"),
    ]),
]

for position, key, title, yticks, labels in panels:
    ax = axes[position]
    # compute the error and its statistics once per panel
    error = df[key] - df["Experimental"]
    mean = np.nanmean(error)
    std = np.nanstd(error)
    ax.hist(error, bins=bins, color="tab:blue", alpha=1.00, edgecolor="k", linewidth=0.1)
    # solid line: mean error, dashed lines: mean -/+ one standard deviation
    ax.axvline(mean, linestyle="-", color=c)
    ax.axvline(mean - std, linestyle="--", color=c)
    ax.axvline(mean + std, linestyle="--", color=c)
    for column, value, x_shift, y, offset, text in labels:
        x = float(error.loc[df[column] == value].iloc[0]) + x_shift
        annotate_mat(ax, x, y, offset, text)
    ax.set_xticks([-2, -1, 0, 1, 2])
    if yticks is not None:
        ax.set_yticks(yticks)
    ax.set_title(title, pad=3)

fig.supxlabel(r"$E_{\mathrm{gap}}^{\mathrm{calc.}} - E_{\mathrm{gap}}^{\mathrm{exp.}}$ (eV)", x=0.53, y=0.04)
fig.supylabel(r"Occurrence", x=0.025, y=0.525)

fig.tight_layout()
fig.savefig(os.path.join(paper_fig_path, "histo.pdf"))

In [None]:
# setup a dataframe for the main table:
# per-method error statistics with respect to the experimental gaps
skip_cols = {"Composition", "ICSD-ID", "MP-ID", "DOI", "num_sites"}
metric_cols = [name for name in df.columns if name not in skip_cols]
df_metrics = df[metric_cols].copy(deep=True)
errors = df_metrics.sub(df_metrics["Experimental"], axis=0)
results = {}
for col, e in errors.items():
    if col == "Experimental":
        continue
    relative = e / df_metrics["Experimental"]
    squared = e ** 2
    results[col] = {
        "$n$": e.count(),
        "ME (eV)": e.mean(numeric_only=True),
        r"$\sigma$ (eV)": e.std(numeric_only=True),
        "MAE (eV)": e.abs().mean(numeric_only=True),
        "RMSE (eV)": np.sqrt(squared.mean(numeric_only=True)),
        r"MAPE (\%)": relative.abs().mean(numeric_only=True) * 100,
    }
metrics_df = pd.DataFrame(results)
# percentages are reported as whole numbers
metrics_df.loc[r"MAPE (\%)"] = metrics_df.loc[r"MAPE (\%)"].round().astype("Int64")

# main table
header = [r"MBJ", r"HSE06", r"$G_{0}W_{0}$@LDA-PPA", r"$G_{0}W_{0}$@PBE-PPA",
          r"QP$G_{0}W_{0}$", r"QP$G_{0}W_{0}$+SOC", r"QS$GW$", r"QS$G\hat{{W}}$", r"QS$G\hat{{W}}$+SOC"]
table_methods = ["MBJ", "HSE06", "G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"]
latex_table = metrics_df[table_methods].to_latex(
    index=True,
    header=header,
    float_format=smart_fmt,
    column_format="l" + 9*"c",
)
print(latex_table)

In [None]:
# second version of the main table, restricted to the materials for which we
# have data for all methods
# (complete-case metrics, i.e., drop any row that contains a NaN)
skip_cols = {"Composition", "ICSD-ID", "MP-ID", "DOI", "num_sites"}
metric_cols = [name for name in df.columns if name not in skip_cols]
df_complete = df.dropna(subset=metric_cols, inplace=False)
df_metrics = df_complete[metric_cols].copy(deep=True)
errors = df_metrics.sub(df_metrics["Experimental"], axis=0)
results = {}
for col, e in errors.items():
    if col == "Experimental":
        continue
    relative = e / df_metrics["Experimental"]
    squared = e ** 2
    results[col] = {
        "$n$": e.count(),
        "ME (eV)": e.mean(numeric_only=True),
        r"$\sigma$ (eV)": e.std(numeric_only=True),
        "MAE (eV)": e.abs().mean(numeric_only=True),
        "RMSE (eV)": np.sqrt(squared.mean(numeric_only=True)),
        r"MAPE (\%)": relative.abs().mean(numeric_only=True) * 100,
    }
metrics_df = pd.DataFrame(results)
# percentages are reported as whole numbers
metrics_df.loc[r"MAPE (\%)"] = metrics_df.loc[r"MAPE (\%)"].round().astype("Int64")
header = [r"MBJ", r"HSE06", r"$G_{0}W_{0}$@LDA-PPA", r"$G_{0}W_{0}$@PBE-PPA",
          r"QP$G_{0}W_{0}$", r"QP$G_{0}W_{0}$+SOC", r"QS$GW$", r"QS$G\hat{{W}}$", r"QS$G\hat{{W}}$+SOC"]
table_methods = ["MBJ", "HSE06", "G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"]
latex_table = metrics_df[table_methods].to_latex(
    index=True,
    header=header,
    float_format=smart_fmt,
    column_format="l" + 9*"c",
)
print(latex_table)

In [None]:
# band structure plots comparing different methods
fig, axes = plt.subplots(1, 2, figsize=(6.5, 3))


def _find_entry(entries, entry_id):
    """Return the database entry named ``entry_id`` or fail loudly if it is missing."""
    for entry in entries:
        if entry.entry_id == entry_id:
            return entry
    raise LookupError(f"No Questaal database entry named '{entry_id:s}' found!")


def _plot_lda_reference(ax, bs):
    """Draw the LDA bands (all but the last band) as a gray background."""
    x0 = 0
    for path in bs["bs_paths"]:
        x = np.arange(x0, x0 + path["nk"])
        bands = np.array(path["bands"])
        for i in range(bs["n_bands"] - 1):
            ax.plot(x, bands[:, i], "-", color="tab:gray", alpha=0.5, zorder=-1)
        # consecutive path segments share their boundary point
        x0 = x[-1]


# MgO
ax = axes[0]
cse = _find_entry(questaal_data, "MgO_icsd_9863_nsites_2")
_plot_lda_reference(ax, cse.data["bs_lda"])
plot_bs(ax, cse.data["bs_qpg0w0"], lcs="r-")
plot_bs(ax, cse.data["bs_qsgw"], lcs="b-")
plot_bs(ax, cse.data["bs_qsgwbse"], lcs="k-")
# dummy lines left of the visible x range, only there to create the legend entries
xlim = ax.get_xlim()
ax.plot([-1, -2], [1, 1], "-", color="tab:gray", alpha=0.5, label=r"LDA")
ax.plot([-1, -2], [1, 1], "r-", label=r"QP$G_{0}W_{0}$")
ax.plot([-1, -2], [1, 1], "b-", label=r"QS$GW$")
ax.plot([-1, -2], [1, 1], "k-", label=r"QS$G\hat{W}$")
ax.set_xlim(xlim)
ax.set_ylim([-5.5, 20])
ax.set_yticks([-4, 0, 4, 8, 12, 16, 20])
ax.set_title(f"{cse.composition.reduced_formula:s}~~~(ICSD {get_material_id(cse.entry_id):d})", pad=3)
# experimental gap as a horizontal reference line
ax.axhline(y=7.67, color="k", linestyle="-.", lw=0.5, zorder=-1)
ax.text(x=78, y=3.5, s=r"Exp. $7.67$~eV")

# ScN
ax = axes[1]
cse = _find_entry(questaal_data, "ScN_icsd_644666_nsites_2")
_plot_lda_reference(ax, cse.data["bs_lda"])
plot_bs(ax, cse.data["bs_qpg0w0"], lcs="r-")
plot_bs(ax, cse.data["bs_qsgw"], lcs="b-", deco=False)
plot_bs(ax, cse.data["bs_qsgwbse"], lcs="k-", deco=False)
# experimental gap as a horizontal reference line
ax.axhline(y=1.1, color="k", linestyle="-.", lw=0.5, zorder=-1)
ax.text(x=75, y=0.325, s=r"Exp. $1.1$~eV")
ax.set_ylim([-5.7, 7])
ax.set_ylabel("")
ax.set_title(f"{cse.composition.reduced_formula:s}~~~(ICSD {get_material_id(cse.entry_id):d})", pad=3)

# global figure settings
fig.legend(ncol=4, handlelength=1.25, bbox_to_anchor=(0.72, 0.04), columnspacing=0.8, frameon=True, fancybox=False)
fig.tight_layout()
fig.align_xlabels()
fig.savefig(os.path.join(paper_fig_path, "bandstructures.pdf"))

# Supplement

In [None]:
# long table with all GW band gaps (not actually used anywhere...)
gw_cols = ["G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"]
table_df = df[["Composition", "ICSD-ID"] + gw_cols + ["Experimental"]].copy(deep=True)
# keep only materials with at least one GW result
mask = table_df[gw_cols].notna().any(axis=1)
table_df = table_df[mask]
table_df["Composition"] = table_df["Composition"].apply(pretty_formula)
header = [r"Material", r"ICSD-ID", r"$G_{0}W_{0}$@LDA-PPA", r"$G_{0}W_{0}$@PBE-PPA", r"QP$G_{0}W_{0}$",
          r"QP$G_{0}W_{0}$+SOC", r"QS$GW$", r"QS$G\hat{{W}}$", r"QS$G\hat{{W}}$+SOC", r"Exp."]
# missing values are rendered as "---" via na_rep; a textual replacement of
# "NaN" in the finished LaTeX string would also hit formulas containing "NaN"
latex_str = table_df.to_latex(
    index=False,
    header=header,
    float_format=smart_fmt,
    na_rep="---",
    column_format="l" + (table_df.shape[1] - 1)*"c",
)
print(latex_str)

In [None]:
# metrics highlighting how close the band gaps between Quantum Espresso and Vasp are
(df["LDA NC"] - df["LDA"]).abs().describe()

In [None]:
# metrics highlighting how close the band gaps between Quantum Espresso and Questaal are
(df["LDA NC"] - df["LDA AE"]).abs().describe()

In [None]:
# metrics highlighting how close the band gaps between Vasp and Questaal are
(df["LDA"] - df["LDA AE"]).abs().describe()

In [None]:
# materials with zero LDA gap in the NC calculation but a finite gap in the PAW calculation
mask = (df["LDA NC"] == 0) & (df["LDA"] > 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA NC", "LDA"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA NC (eV)", "LDA PAW (eV)"], float_format="%.2f",column_format="cccc"))

In [None]:
# materials with a finite LDA gap in the NC calculation but zero gap in the PAW calculation
mask = (df["LDA NC"] > 0) & (df["LDA"] == 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA NC", "LDA"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA NC (eV)", "LDA PAW (eV)"], float_format="%.2f",column_format="cccc"))

In [None]:
# materials with zero LDA gap in the NC calculation but a finite gap in the AE calculation
mask = (df["LDA NC"] == 0) & (df["LDA AE"] > 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA NC", "LDA AE"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA NC (eV)", "LDA AE (eV)"], float_format="%.2f",column_format="cccc"))

In [None]:
# materials with a finite LDA gap in the NC calculation but zero gap in the AE calculation
mask = (df["LDA NC"] > 0) & (df["LDA AE"] == 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA NC", "LDA AE"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA NC (eV)", "LDA AE (eV)"], float_format="%.2f",column_format="cccc"))

In [None]:
# materials with zero LDA gap in the PAW calculation but a finite gap in the AE calculation
mask = (df["LDA"] == 0) & (df["LDA AE"] > 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA", "LDA AE"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA PAW (eV)", "LDA AE (eV)"], float_format="%.2f",column_format="cccc"))

In [None]:
# materials with a finite LDA gap in the PAW calculation but zero gap in the AE calculation
mask = (df["LDA"] > 0) & (df["LDA AE"] == 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA", "LDA AE"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA PAW (eV)", "LDA AE (eV)"], float_format="%.2f",column_format="cccc"))

In [None]:
# make the actual table for the supplement:
# materials with zero LDA gap in the NC calculation but a finite gap in the PAW
# calculation, listed together with the AE value
mask = (df["LDA NC"] == 0) & (df["LDA"] > 0)
lda_df = df.loc[mask, :].copy(deep=True)
# use the shared helper for the LaTeX subscripts instead of a local copy of its regex
lda_df["Composition"] = lda_df["Composition"].apply(pretty_formula)
print(lda_df[["Composition", "ICSD-ID", "LDA", "LDA NC", "LDA AE"]].to_latex(index=False, header=["Composition", "ICSD ID", "LDA PAW (eV)", "LDA NC (eV)", "LDA AE (eV)"], float_format="%.2f",column_format="ccccc"))

In [None]:
# compare DFT band gaps obtained from different codes and highlight where different codes (Vasp, Yambo and Questaal) predict a different LDA ground state
# layout: one column per pair of data sets; top row = full range, bottom row = zoom on small gaps
fig, axes = plt.subplots(2, 3, figsize=(6.0, 4.0))

# panel letters a-f (row by row); the first column is shifted a little further to the left
xpos = -0.275
ypos = 1.1
for (row, col), letter in zip(np.ndindex(2, 3), "abcdef"):
    panel = axes[row, col]
    letter_x = xpos - 0.032 if col == 0 else xpos
    panel.text(letter_x, ypos, rf"\textbf{{{letter}}}", transform=panel.transAxes, size=10)

# axis label belonging to each data column
gap_label = {
    "LDA NC": r"$E_{\mathrm{gap}}^{\mathrm{NC}}$ (eV)",
    "LDA": r"$E_{\mathrm{gap}}^{\mathrm{PAW}}$ (eV)",
    "LDA AE": r"$E_{\mathrm{gap}}^{\mathrm{AE}}$ (eV)",
}

# (x column, y column, marker color) for the three figure columns
comparisons = [
    ("LDA NC", "LDA", "tab:blue"),
    ("LDA NC", "LDA AE", "tab:green"),
    ("LDA", "LDA AE", "tab:orange"),
]

# bottom row only: (axis window, y ticks, annotated materials as (composition, offset, text))
zoom_specs = [
    (
        [-0.01, 0.1],
        [0.00, 0.05, 0.10],
        [
            ("Ag2Se", [17, 20], r"Ag$_{2}$Se"),
            ("CuGaSe2", [10, 20], r"CuGaSe$_{2}$"),
            ("P", [15, 20], "P"),
        ],
    ),
    ([-0.05, 0.4], [0.0, 0.1, 0.2, 0.3, 0.4], []),
    ([-0.05, 0.4], [0.0, 0.1, 0.2, 0.3, 0.4], [("P", [25, 0], r"P")]),
]

for col, (x_col, y_col, marker_color) in enumerate(comparisons):
    window, y_ticks, notes = zoom_specs[col]
    for row in (0, 1):
        ax = axes[row, col]
        # diagonal = perfect agreement between the two data sets
        ax.axline([0, 0], [1, 1])
        ax.plot(df[x_col], df[y_col], "o", color=marker_color)
        if row == 1:
            for composition, offset, text in notes:
                hit = df[df["Composition"] == composition]
                annotate_mat(ax, hit[x_col], hit[y_col], offset, text)
        ax.set_xlabel(gap_label[x_col])
        ax.set_ylabel(gap_label[y_col])
        if row == 1:
            ax.set_xlim(window)
            ax.set_ylim(window)
            ax.set_yticks(y_ticks)

fig.tight_layout()
fig.align_labels()
fig.subplots_adjust(wspace=0.4, hspace=0.5)
fig.savefig(si_fig_path + "lda_cmp.pdf")

In [None]:
# materials with a vanishing PBE NC gap but a finite PBE PAW gap, printed as a LaTeX table
mask = (df["PBE NC"] == 0) & (df["PBE"] > 0)
pbe_df = df[mask].copy()
# typeset the stoichiometric numbers as LaTeX subscripts
pbe_df["Composition"] = pbe_df["Composition"].map(pretty_formula)
table_cols = ["Composition", "ICSD-ID", "PBE NC", "PBE"]
table_header = ["Composition", "ICSD ID", "PBE NC (eV)", "PBE PAW (eV)"]
print(
    pbe_df[table_cols].to_latex(
        index=False,
        header=table_header,
        float_format="%.2f",
        column_format="cccc",
    )
)

In [None]:
# materials with a finite PBE NC gap but a vanishing PBE PAW gap, printed as a LaTeX table
mask = (df["PBE NC"] > 0) & (df["PBE"] == 0)
pbe_df = df[mask].copy()
# typeset the stoichiometric numbers as LaTeX subscripts
pbe_df["Composition"] = pbe_df["Composition"].map(pretty_formula)
table_cols = ["Composition", "ICSD-ID", "PBE NC", "PBE"]
table_header = ["Composition", "ICSD ID", "PBE NC (eV)", "PBE PAW (eV)"]
print(
    pbe_df[table_cols].to_latex(
        index=False,
        header=table_header,
        float_format="%.2f",
        column_format="cccc",
    )
)

In [None]:
# supplement table: all materials where exactly one of the two PBE data sets (NC, PAW) has a zero gap
nc_zero_paw_gap = (df["PBE NC"] == 0) & (df["PBE"] > 0)
nc_gap_paw_zero = (df["PBE NC"] > 0) & (df["PBE"] == 0)
mask = nc_zero_paw_gap | nc_gap_paw_zero
pbe_df = df[mask].copy()
# typeset the stoichiometric numbers as LaTeX subscripts
pbe_df["Composition"] = pbe_df["Composition"].map(pretty_formula)
table_cols = ["Composition", "ICSD-ID", "PBE NC", "PBE"]
table_header = ["Composition", "ICSD ID", "PBE NC (eV)", "PBE PAW (eV)"]
print(
    pbe_df[table_cols].to_latex(
        index=False,
        header=table_header,
        float_format="%.2f",
        column_format="cccc",
    )
)

In [None]:
# compare DFT band gaps obtained from different codes and highlight where different codes (Vasp, Yambo and Questaal) predict a different PBE ground state
# left panel: full range, right panel: zoom on the small-gap region with annotated materials
fig, axes = plt.subplots(1, 2, figsize=(4, 2))

# panel letters
ypos = 1.1
for panel, letter_x, letter in zip(axes, (-0.25, -0.32), "ab"):
    panel.text(letter_x, ypos, rf"\textbf{{{letter}}}", transform=panel.transAxes, size=10)

# both panels show the same data; the diagonal marks perfect agreement
for ax in axes:
    ax.axline([0, 0], [1, 1])
    ax.plot(df["PBE NC"], df["PBE"], "o", color="tab:blue", label="PBE")
    ax.set_xlabel(r"$E_{\mathrm{gap}}^{\mathrm{NC}}$ (eV)")
    ax.set_ylabel(r"$E_{\mathrm{gap}}^{\mathrm{PAW}}$ (eV)")

# annotated materials in the zoomed panel: (row selector, text offset, label)
ax = axes[1]
labelled = [
    (df["Composition"] == "Ag2PdO2", [7.5, 25], r"Ag$_{2}$PdO$_{2}$"),
    (df["Composition"] == "HgSnO3", [20, 20], r"HgSnO$_{3}$"),
    (df["ICSD-ID"] == 192171, [35, 0], r"Cu$_{2}$GeSe$_{3}$"),
    (df["ICSD-ID"] == 180272, [35, -3], r"TlInSe$_{2}$"),
]
for selector, offset, text in labelled:
    hit = df[selector]
    annotate_mat(ax, hit["PBE NC"], hit["PBE"], offset, text)
ax.set_xlim([-0.01, 0.1])
ax.set_ylim([-0.01, 0.1])
ax.set_yticks([0.00, 0.05, 0.10])

fig.tight_layout()
fig.align_labels()
fig.savefig(si_fig_path + "pbe_cmp.pdf")

In [None]:
# rank the materials by how close their PBE NC band gap is to 0.01 eV (i.e., almost zero);
# materials without a PBE NC value are listed last
distance = (df["PBE NC"] - 0.01).abs()
ranking = distance.sort_values(ascending=True, na_position="last")
df_sorted = df.loc[ranking.index]
df_sorted

In [None]:
# rank the materials by |QPG0W0 - QSGW^|, largest difference first (missing values last)
difference = (df["QPG0W0"] - df["QSGW^"]).abs()
ranking = difference.sort_values(ascending=False, na_position="last")
df_sorted = df.loc[ranking.index]
df_sorted

In [None]:
# summary statistics of |QPG0W0 - QSGW^|, i.e., how close the QPG0W0 band gap is to the QSGW^ band gap
(df["QPG0W0"] - df["QSGW^"]).abs().describe()

In [None]:
# rank the materials by |QPG0W0 - QPG0W0+SOC|, largest difference first (missing values last)
difference = (df["QPG0W0"] - df["QPG0W0+SOC"]).abs()
ranking = difference.sort_values(ascending=False, na_position="last")
df_sorted = df.loc[ranking.index]
df_sorted

In [None]:
# rank the materials by |QSGW^ - QSGW^+SOC|, largest difference first (missing values last)
difference = (df["QSGW^"] - df["QSGW^+SOC"]).abs()
ranking = difference.sort_values(ascending=False, na_position="last")
df_sorted = df.loc[ranking.index]
df_sorted

In [None]:
# additional histograms not shown in the main text:
# distribution of (calculated - experimental) band gaps for three more methods
c = "tab:orange"
bins = np.arange(-2.1, 2.1 + 0.1, 0.1)
fig, axes = plt.subplots(1, 3, figsize=(6, 2.25))

# (data column, panel title, y ticks) for each panel
panels = [
    ("G0W0@PBE-PPA", r"$G_{0}W_{0}$@PBE-PPA", [0, 5, 10, 15, 20, 25]),
    ("QPG0W0+SOC", r"QP$G_{0}W_{0}$+SOC", [0, 5, 10, 15, 20, 25]),
    ("QSGW^+SOC", r"QSG$\hat{\mathrm{W}}$+SOC", [0, 5, 10, 15, 20]),
]

for ax, (key, title, y_ticks) in zip(axes, panels):
    deviation = df[key] - df["Experimental"]
    center = np.nanmean(deviation)
    spread = np.nanstd(deviation)
    ax.hist(deviation, bins=bins, color="tab:blue", alpha=1.00, edgecolor="k", linewidth=0.1)
    # solid line: mean error, dashed lines: mean -/+ one standard deviation
    ax.axvline(center, linestyle="-", color=c)
    ax.axvline(center - spread, linestyle="--", color=c)
    ax.axvline(center + spread, linestyle="--", color=c)
    ax.set_xticks([-2, -1, 0, 1, 2])
    ax.set_yticks(y_ticks)
    ax.set_title(title, pad=3)

fig.supxlabel(r"$E_{\mathrm{gap}}^{\mathrm{calc.}} - E_{\mathrm{gap}}^{\mathrm{exp.}}$ (eV)", x=0.53, y=0.08)
fig.supylabel(r"Occurence", x=0.025, y=0.525)

fig.tight_layout()
fig.savefig(si_fig_path + "additional_histo.pdf")

In [None]:
# an additional figure highlights that the MAPE is a poor metric...
# mape_vals holds the absolute percentage error (APE) of QSGW^ for every single material;
# the MAPE is the mean of these values
fig, axes = plt.subplots(1, 2, figsize=(5, 2.5))
mape_vals = 100 * np.abs((df["QSGW^"] - df["Experimental"]) / df["Experimental"])
# a vanishing experimental gap yields an infinite APE -> treat it as missing
mape_vals = mape_vals.replace(np.inf, np.nan)
mean_ape = np.nanmean(mape_vals)
median_ape = np.nanmedian(mape_vals)

# normal x-axis
ax = axes[0]
bins = np.arange(0, 400 + 10, 10)
ax.hist(mape_vals, bins=bins, color="tab:blue", alpha=1.00, edgecolor="k", linewidth=0.1)
ax.axvline(x=mean_ape, color="limegreen", label="Mean")
ax.axvline(x=median_ape, color="magenta", label="Median")
ax.set_xlabel(r"APE (\%)")
ax.set_ylabel(r"Occurence")
ax.legend(handlelength=1.25)
fig.tight_layout()

# log x-axis (logarithmically spaced bins; the lower edge is shifted by 0.1 so that log10 stays finite for a minimum APE of zero)
ax = axes[1]
bins = np.logspace(np.log10(np.nanmin(mape_vals) + 0.1), np.log10(np.nanmax(mape_vals)))
ax.hist(mape_vals, bins=bins, color="tab:blue", alpha=1.00, edgecolor="k", linewidth=0.1)
ax.axvline(x=mean_ape, color="limegreen")
ax.axvline(x=median_ape, color="magenta")
ax.set_xscale("log")
ax.set_xlabel(r"APE (\%)")
ax.set_ylabel(r"Occurence")
fig.tight_layout()
fig.savefig(os.path.join(si_fig_path, "ape_analysis.pdf"))

# print the mean and median
print(f"Mean APE   = {mean_ape:.2f} %")
print(f"Median APE = {median_ape:.2f} %\n ")

# find the five materials with the largest APE and create a table (sorted by the rounded APE, largest first)
top_idx = mape_vals.nlargest(5).index
worst = df.assign(MAPE=mape_vals.round(0)).loc[top_idx].sort_values("MAPE", ascending=False)
header = [r"Material", r"ICSD-ID", r"QS$G\hat{{W}}$", r"QS$G\hat{{W}}$+SOC", r"Exp.", r"MAPE (\%)"]
print(
    worst[["Composition", "ICSD-ID", "QSGW^", "QSGW^+SOC", "Experimental", "MAPE"]].to_latex(
        index=False,
        header=header,
        float_format=smart_fmt,
        column_format="l" + 5*"c",
    )
)

# Sanitized dataset

In [None]:
"""
Sanitized version of the benchmark dataset. We removed/adjusted materials where we identified questionable experimental data.
(Read the paper for details...)
"""

cleaned_df = df.copy(deep=True)

# materials where we could not find more plausible experiments or where the crystal structure appears to be problematic (Cu2S & Cu2Se)...
drop_mats = [
    18102, # CuLaO2
    20560, # Cu2S
    56025, # Cu2Se
    86439, # MgCl2
    86738, # PbF2
    40414, # SrF2
    58428, # RbAu
    616165, # BaTe
]
# remove the rows of these materials (selected via their ICSD ID)
rejected = cleaned_df.index[cleaned_df["ICSD-ID"].isin(drop_mats)]
cleaned_df = cleaned_df.drop(index=rejected)

# materials where we could find more plausible experiments:
# ICSD ID -> (revised experimental band gap in eV, reference)
revised_experiments = {
    616419: (3.8, "https://doi.org/10.1103/PhysRevB.73.115212"),  # BeSe
    616005: (4.1, "https://doi.org/10.1103/PhysRev.113.1019"),  # BaO
    644666: (1.1, "https://doi.org/10.1103/PhysRevMaterials.8.L071601"),  # ScN
}
for icsd_id, (gap, reference) in revised_experiments.items():
    selected = cleaned_df["ICSD-ID"] == icsd_id
    cleaned_df.loc[selected, "Experimental"] = gap
    cleaned_df.loc[selected, "DOI"] = reference

In [None]:
# supplemental version of the main plot (sanitized dataset):
# distribution of (calculated - experimental) band gaps, one panel per method
bins = np.arange(-2.1, 2.1 + 0.1, 0.1)
fig, axes = plt.subplots(2, 3, figsize=(6, 4.5))

c = "tab:orange"

# (data column, panel title, y ticks or None to keep the automatic ticks), row by row
panels = [
    ("MBJ", "MBJ", None),
    ("HSE06", "HSE06", None),
    ("G0W0@LDA-PPA", r"$G_{0}W_{0}$@LDA-PPA", None),
    ("QPG0W0", r"QP$G_{0}W_{0}$", None),
    ("QSGW", r"QS$GW$", [0, 5, 10, 15, 20]),
    ("QSGW^", r"QS$G\hat{W}$", [0, 5, 10, 15, 20]),
]

for ax, (key, title, y_ticks) in zip(axes.flat, panels):
    deviation = cleaned_df[key] - cleaned_df["Experimental"]
    center = np.nanmean(deviation)
    spread = np.nanstd(deviation)
    ax.hist(deviation, bins=bins, color="tab:blue", alpha=1.00, edgecolor="k", linewidth=0.1)
    # solid line: mean error, dashed lines: mean -/+ one standard deviation
    ax.axvline(center, linestyle="-", color=c)
    ax.axvline(center - spread, linestyle="--", color=c)
    ax.axvline(center + spread, linestyle="--", color=c)
    ax.set_xticks([-2, -1, 0, 1, 2])
    if y_ticks is not None:
        ax.set_yticks(y_ticks)
    ax.set_title(title, pad=3)

fig.supxlabel(r"$E_{\mathrm{gap}}^{\mathrm{calc.}} - E_{\mathrm{gap}}^{\mathrm{exp.}}$ (eV)", x=0.53, y=0.04)
fig.supylabel(r"Occurence", x=0.025, y=0.525)

fig.tight_layout()
fig.savefig(si_fig_path + "histo_clean_part_1.pdf")

In [None]:
# additional histograms not shown in the main text (sanitized dataset):
# distribution of (calculated - experimental) band gaps for three more methods
c = "tab:orange"
bins = np.arange(-2.1, 2.1 + 0.1, 0.1)
fig, axes = plt.subplots(1, 3, figsize=(6, 2.25))

# (data column, panel title, y ticks or None to keep the automatic ticks)
panels = [
    ("G0W0@PBE-PPA", r"$G_{0}W_{0}$@PBE-PPA", None),
    ("QPG0W0+SOC", r"QP$G_{0}W_{0}$+SOC", [0, 5, 10, 15, 20, 25]),
    ("QSGW^+SOC", r"QSG$\hat{\mathrm{W}}$+SOC", [0, 5, 10, 15, 20]),
]

for ax, (key, title, y_ticks) in zip(axes, panels):
    deviation = cleaned_df[key] - cleaned_df["Experimental"]
    center = np.nanmean(deviation)
    spread = np.nanstd(deviation)
    ax.hist(deviation, bins=bins, color="tab:blue", alpha=1.00, edgecolor="k", linewidth=0.1)
    # solid line: mean error, dashed lines: mean -/+ one standard deviation
    ax.axvline(center, linestyle="-", color=c)
    ax.axvline(center - spread, linestyle="--", color=c)
    ax.axvline(center + spread, linestyle="--", color=c)
    ax.set_xticks([-2, -1, 0, 1, 2])
    if y_ticks is not None:
        ax.set_yticks(y_ticks)
    ax.set_title(title, pad=3)

fig.supxlabel(r"$E_{\mathrm{gap}}^{\mathrm{calc.}} - E_{\mathrm{gap}}^{\mathrm{exp.}}$ (eV)", x=0.53, y=0.08)
fig.supylabel(r"Occurence", x=0.025, y=0.525)

fig.tight_layout()
fig.savefig(si_fig_path + "histo_clean_part_2.pdf")

In [None]:
# setup a dataframe for the cleaned version of the main metric table:
# every column that is not bookkeeping information is treated as a band gap (in eV) of one method
skip_cols = {"Composition", "ICSD-ID", "MP-ID", "DOI", "num_sites"}
metric_cols = [col for col in cleaned_df.columns if col not in skip_cols]
df_metrics = cleaned_df[metric_cols].copy(deep=True)
# signed error of every method with respect to experiment
errors = df_metrics.sub(df_metrics["Experimental"], axis=0)
results = {}
for col in df_metrics.columns:
    if col == "Experimental":
        continue
    e = errors[col]
    # relative error; a vanishing experimental gap gives inf, which is treated as missing
    # so that the mean (and the integer cast below) stays well defined
    rel = (e / df_metrics["Experimental"]).abs().replace(np.inf, np.nan)
    # the Series reductions skip NaN entries by default, so each metric only uses
    # the materials for which this method has a value
    results[col] = {
        "$n$": e.count(),
        "ME (eV)": e.mean(),
        r"$\sigma$ (eV)": e.std(),
        "MAE (eV)": e.abs().mean(),
        "RMSE (eV)": np.sqrt((e ** 2).mean()),
        r"MAPE (\%)": rel.mean() * 100,
    }
metrics_df = pd.DataFrame(results)
# report the MAPE as whole percent
metrics_df.loc[r"MAPE (\%)"] = metrics_df.loc[r"MAPE (\%)"].round().astype("Int64")

# supplemental metric table (cleaned dataset)
header = [r"MBJ", r"HSE06", r"$G_{0}W_{0}$@LDA-PPA", r"$G_{0}W_{0}$@PBE-PPA",
          r"QP$G_{0}W_{0}$", r"QP$G_{0}W_{0}$+SOC", r"QS$GW$", r"QS$G\hat{{W}}$", r"QS$G\hat{{W}}$+SOC"]
print(
    metrics_df[["MBJ", "HSE06", "G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"]].to_latex(
        index=True,
        header=header,
        float_format=smart_fmt,
        column_format="l" + 9*"c",
    )
)

In [None]:
# expand the supplemental metric table by the metrics evaluated only on the materials for which we have data for all methods
# (complete-case metrics, i.e., drop any row that contains a NaN)
skip_cols = {"Composition", "ICSD-ID", "MP-ID", "DOI", "num_sites"}
metric_cols = [col for col in cleaned_df.columns if col not in skip_cols]
df_complete = cleaned_df.dropna(subset=metric_cols, inplace=False)
df_metrics = df_complete[metric_cols].copy(deep=True)
# signed error of every method with respect to experiment
errors = df_metrics.sub(df_metrics["Experimental"], axis=0)
results = {}
for col in df_metrics.columns:
    if col == "Experimental":
        continue
    e = errors[col]
    # relative error; a vanishing experimental gap gives inf, which is treated as missing
    # so that the mean (and the integer cast below) stays well defined
    rel = (e / df_metrics["Experimental"]).abs().replace(np.inf, np.nan)
    results[col] = {
        "$n$": e.count(),
        "ME (eV)": e.mean(),
        r"$\sigma$ (eV)": e.std(),
        "MAE (eV)": e.abs().mean(),
        "RMSE (eV)": np.sqrt((e ** 2).mean()),
        r"MAPE (\%)": rel.mean() * 100,
    }
metrics_df = pd.DataFrame(results)
# report the MAPE as whole percent
metrics_df.loc[r"MAPE (\%)"] = metrics_df.loc[r"MAPE (\%)"].round().astype("Int64")
header = [r"MBJ", r"HSE06", r"$G_{0}W_{0}$@LDA-PPA", r"$G_{0}W_{0}$@PBE-PPA",
          r"QP$G_{0}W_{0}$", r"QP$G_{0}W_{0}$+SOC", r"QS$GW$", r"QS$G\hat{{W}}$", r"QS$G\hat{{W}}$+SOC"]
print(
    metrics_df[["MBJ", "HSE06", "G0W0@LDA-PPA", "G0W0@PBE-PPA", "QPG0W0", "QPG0W0+SOC", "QSGW", "QSGW^", "QSGW^+SOC"]].to_latex(
        index=True,
        header=header,
        float_format=smart_fmt,
        column_format="l" + 9*"c",
    )
)

# Spreadsheets

In [None]:
# entries without a reference are attributed to 10.1007/978-3-642-18865-7,
# i.e., "Semiconductors: Data Handbook, Otfried Madelung, Springer-Verlag Berlin Heidelberg, 2004"
fallback_doi = "10.1007/978-3-642-18865-7"
df["DOI"] = df["DOI"].fillna(fallback_doi)
cleaned_df["DOI"] = cleaned_df["DOI"].fillna(fallback_doi)

In [None]:
# save the dataframe as a nicely formatted spreadsheet
with pd.ExcelWriter(os.path.join(sheet_path, "bandgap_benchmark.xlsx"), engine="openpyxl") as writer:
    # write without index and format floats
    df.to_excel(
        writer,
        sheet_name="Benchmark",
        index=False,
        float_format="%.2f",
    )
    # grab the worksheet object that pandas just created
    ws = writer.sheets["Benchmark"]
    # prepare a center alignment object
    center = Alignment(horizontal="center", vertical="center")
    # loop through all cells in the used range and apply centering
    for row in ws.iter_rows(
        min_row=1,
        max_row=ws.max_row,
        min_col=1,
        max_col=ws.max_column,
    ):
        for cell in row:
            cell.alignment = center
            # data rows (everything below the header): floats are always shown with two
            # decimal places; integer cells are deliberately left untouched
            if cell.row > 1 and isinstance(cell.value, float):
                cell.number_format = "0.00"
    # determine column widths: the long text columns are sized by their content,
    # all other columns by their header (plus some padding in both cases)
    for idx, col in enumerate(df.columns, 1):
        if col in ["MP-ID", "DOI"]:
            max_len = max(df[col].astype(str).map(len).max(), len(col))
        else:
            max_len = len(col)
        ws.column_dimensions[get_column_letter(idx)].width = max_len + 5

In [None]:
# save the cleaned dataframe as a nicely formatted spreadsheet
with pd.ExcelWriter(os.path.join(sheet_path, "revised_bandgap_benchmark.xlsx"), engine="openpyxl") as writer:
    # write without index and format floats
    cleaned_df.to_excel(
        writer,
        sheet_name="Benchmark",
        index=False,
        float_format="%.2f",
    )
    # grab the worksheet object that pandas just created
    ws = writer.sheets["Benchmark"]
    # prepare a center alignment object
    center = Alignment(horizontal="center", vertical="center")
    # loop through all cells in the used range and apply centering
    for row in ws.iter_rows(
        min_row=1,
        max_row=ws.max_row,
        min_col=1,
        max_col=ws.max_column,
    ):
        for cell in row:
            cell.alignment = center
            # data rows (everything below the header): floats are always shown with two
            # decimal places; integer cells are deliberately left untouched
            if cell.row > 1 and isinstance(cell.value, float):
                cell.number_format = "0.00"
    # determine column widths: the long text columns are sized by their content,
    # all other columns by their header (plus some padding in both cases)
    for idx, col in enumerate(cleaned_df.columns, 1):
        if col in ["MP-ID", "DOI"]:
            max_len = max(cleaned_df[col].astype(str).map(len).max(), len(col))
        else:
            max_len = len(col)
        ws.column_dimensions[get_column_letter(idx)].width = max_len + 5

# CSVs

In [None]:
# also export both data frames as CSV files so people can easily load or view them without Excel
csv_exports = (
    (df, "bandgap_benchmark.csv"),
    (cleaned_df, "revised_bandgap_benchmark.csv"),
)
for frame, filename in csv_exports:
    frame.to_csv(os.path.join(csv_path, filename), index=False)