This notebook is optimized to run on Noctua2 (PC2 in Paderborn, Germany).

It analyzes all calculations in a calculation directory and all database entries.

We check if the calculation finished normally or if it crashed for various reasons.

In [None]:
"""
START USER INPUT 
"""

# directory whose entries are treated as one calculation directory per material
calc_dir = "../../questaal_calc"
# directory that holds the database entries (looked up as '<material>.json')
db_dir = "../../questaal_database"
# job name handed to 'squeue_pretty -n' to look up the ids of the listed jobs
job_name = "benchmark"

"""
END USER INPUT 
"""

# external imports
import os
import re
import sys
import pickle 
import numpy as np
from tqdm import tqdm

# internal imports
from qsgw_workflow.utils.helper import load_db_entry

# check that all needed directories exist
if not os.path.exists(calc_dir):
    sys.exit("The calculation directory does not exist!")
if not os.path.exists(db_dir):
    sys.exit("The database directory does not exist!")

# list of all materials with a calculation directory
# (only sub-directories count: every entry is listed with 'os.listdir' below,
#  which would fail on a stray file, and stray files would inflate the total)
mat_dirs = [
    entry
    for entry in os.listdir(calc_dir)
    if os.path.isdir(os.path.join(calc_dir, entry))
]

# list of all materials with a database entry
db_files = os.listdir(db_dir)

# get the job ids of all jobs that 'squeue_pretty' lists for this job name
# (the id is taken to be the leading integer of each output line)
queue_str = os.popen(f"squeue_pretty -n {job_name:s}").read()
pattern = re.compile(r"^\s*(\d+)", re.MULTILINE)
job_ids = [int(job_id) for job_id in pattern.findall(queue_str)]

# go through all calculation directories (and associated database entries)
# (also find materials with a large BSE transition space)
# every material ends up in exactly one of the following lists
audit = {
    "pending_jobs": [],
    "running_jobs": [],
    "crashed_before_qpg0w0": [],
    "crashed_during_qpg0w0": [],
    "crashed_during_qsgw": [],
    "crashed_during_qsgw^": [],
    "finished": [],
}
# material -> [nv, nc] for the materials that crashed after 'qsgw_flag' was set
band_space = {}
for mat in tqdm(mat_dirs, "Going through all calculation directories"):
    # get a list of all files in the calculation directory of the material
    files = os.listdir(os.path.join(calc_dir, mat))
    # get all log files; no log file means that no job has written output yet
    log_files = [f for f in files if "slurm-" in f and f.endswith("log")]
    if not log_files:
        audit["pending_jobs"].append(mat)
        continue
    # the highest job id belongs to the last job
    log_id = max(int(re.findall(r"\d+", f)[0]) for f in log_files)
    # check if the calculation is still running
    if log_id in job_ids:
        audit["running_jobs"].append(mat)
        continue
    # check if a database entry exists
    db_name = mat + ".json"
    if db_name not in db_files:
        audit["pending_jobs"].append(mat)
        continue
    # the job is gone: use the database entry to find out how far it got
    # (the checks run from the last workflow stage to the first one)
    cse = load_db_entry(os.path.join(db_dir, db_name))
    if cse.parameters["finish"] == True:
        status = "finished"
    elif cse.parameters["qsgw_flag"] == True:
        if "nv" in cse.parameters and "nc" in cse.parameters:
            band_space[mat] = [cse.parameters["nv"], cse.parameters["nc"]]
        status = "crashed_during_qsgw^"
    elif "gap_qpg0w0_soc" in cse.data:
        status = "crashed_during_qsgw"
    elif "eps_kpts" in cse.parameters:
        status = "crashed_during_qpg0w0"
    else:
        status = "crashed_before_qpg0w0"
    audit[status].append(mat)

# reports
print("\n----------------------------------------\nAudit results:\n----------------------------------------\n")
print(f"{'Total':27} | {len(mat_dirs):>5d}")
counter = 0
for key, value in audit.items():
    num_mats = len(value)
    print(f"{key:27} | {num_mats:>5d}")
    counter += num_mats
# the sum over all classes has to reproduce the total printed above
print(f"{'Sum (sanity check)':27} | {counter:>5d}")

In [None]:
# now we analyze the reasons for the crashes
keys = [
    "crashed_before_qpg0w0",
    "crashed_during_qpg0w0",
    "crashed_during_qsgw",
    "crashed_during_qsgw^",
]

# all error classes (in the order in which they are reported)
error_categories = [
    "dft_kpt_conv_max_iter",
    "eps_kpt_conv_max_iter",
    "qsgw_kpt_conv_max_iter",
    "pqmap_error",
    "bloch_sum_error",
    "basis_problem",
    "out_of_memory",
    "metals",
    "timeout",
    "bsw_parallelization_error",
    "band_idx_problem",
    "inv_problem",
    "other_bse_crashes",
    "other_crashes",
]

# "normal" crashes: marker file in the calculation directory -> error class
# (checked in this order, the first hit wins)
# NOTE(review): the marker is spelled 'block_sum_error.txt' while the class is
# 'bloch_sum_error' - confirm which file name the workflow really writes
marker_files = [
    ("dft_kpt_conv_max_iter.txt", "dft_kpt_conv_max_iter"),
    ("eps_kpt_conv_max_iter.txt", "eps_kpt_conv_max_iter"),
    ("qsgw_kpt_conv_max_iter.txt", "qsgw_kpt_conv_max_iter"),
    ("pqmap_error.txt", "pqmap_error"),
    ("block_sum_error.txt", "bloch_sum_error"),
]

# messages in the BSE log file 'lbsw-b1' -> error class
# (checked in this order, the first hit wins)
bse_messages = [
    ("OOM Killed", "out_of_memory"),
    ("mkw4a: number of threads per group < 1", "bsw_parallelization_error"),
    ("bse: any(num_valnsp > num_val_sp)", "band_idx_problem"),
    ("Exit -1  matinv: degtrf", "inv_problem"),
    ("bse: fractional occupancies not supported", "metals"),
]


def _read_text(path):
    """Return the complete content of the text file 'path'."""
    with open(path, "r") as f:
        return f.read()


def _classify_crash(mat_dir, files, log_id, log_file):
    """
    Helper function to find the reason for the crash of one material.
    Input:
        mat_dir:    Calculation directory of the material
        files:      Names of all files in 'mat_dir'
        log_id:     Job id of the last job of the material
        log_file:   Name of the Slurm log file of the last job
    Output:
        category:   One of the entries of 'error_categories'
    """
    # catch some "normal" crashes
    for marker, category in marker_files:
        if marker in files:
            return category
    # check if the job used too much memory - check 1
    # (skipped if the output of 'seff' has no 'Memory Efficiency' line)
    job_eff = os.popen(f"seff {log_id:d}").read()
    match_eff = re.search(r"Memory Efficiency:\s+(\d+(?:\.\d+)?)%", job_eff)
    if match_eff is not None and float(match_eff.group(1)) > 100:
        return "out_of_memory"
    # check if the job used too much memory - check 2
    # (skipped if the job left no error file behind)
    err_file = f"slurm-{log_id:d}.err"
    if err_file in files and "oom_kill" in _read_text(os.path.join(mat_dir, err_file)):
        return "out_of_memory"
    # check if the job used too much memory - check 3
    # there are cases where the out of memory error appears in the 'llmf' log file
    if "llmf" in files and "Out Of Memory" in _read_text(os.path.join(mat_dir, "llmf")):
        return "out_of_memory"
    # parse the log file
    log_str = _read_text(os.path.join(mat_dir, log_file))
    if "The calculated material appears to be a metal in the QSGW." in log_str:
        return "metals"
    if "DUE TO TIME LIMIT ***" in log_str:
        return "timeout"
    # check the BSE log file (if there is one)
    bsw_path = os.path.join(mat_dir, "lbsw-b1")
    if not os.path.exists(bsw_path):
        return "other_crashes"
    bsw_str = _read_text(bsw_path)
    for message, category in bse_messages:
        if message in bsw_str:
            return category
    return "other_bse_crashes"


# crash class -> error class -> list of materials
error_audit = {}
for key in keys:
    error_audit[key] = {category: [] for category in error_categories}
    for mat in audit[key]:
        # get a list of all files in the calculation directory of the material
        mat_dir = os.path.join(calc_dir, mat)
        files = os.listdir(mat_dir)
        # get all log files and find the one with the highest id (last job)
        log_files = [f for f in files if "slurm-" in f and f.endswith("log")]
        log_ids = [int(re.findall(r"\d+", f)[0]) for f in log_files]
        log_idx = int(np.argmax(log_ids))
        category = _classify_crash(mat_dir, files, log_ids[log_idx], log_files[log_idx])
        error_audit[key][category].append(mat)

In [None]:
# sanity check: the error classification has to cover every crashed material,
# i.e. both numbers in a row have to agree
print(f"{'Class':27} | {'Audit Materials':20} | {'Error Classification':20}")
for key in keys:
    num_classified = sum(len(mats) for mats in error_audit[key].values())
    print(f"{key:27} | {len(audit[key]):<20d} | {num_classified:<20d}")

# reports (one table per crash class)
for key in keys:
    print(f"\n----------------------------------------\n{key:27s}\n----------------------------------------\n")
    print(f"{'Total':27} | {len(audit[key]):>5d}")
    for err_key, mats in error_audit[key].items():
        print(f"{err_key:27} | {len(mats):>5d}")

# save the outputs (used by 'restart_noctua.py')
# one pickle file per crash class, named '<crash class>.pkl'
audit_dir = "../audit/benchmark/"
os.makedirs(audit_dir, exist_ok=True)
for key in keys:
    pkl_path = audit_dir + key + ".pkl"
    with open(pkl_path, "wb") as f:
        pickle.dump(error_audit[key], f)

In [None]:
def bs_check(vbm_idx, bs):
    """
    Helper function to check if a band structure has artifacts.
    Only the band directly above the valence band maximum (index 'vbm_idx + 1')
    is inspected: a jump of more than 1 eV between two neighboring points
    of a path counts as an artifact.
    Input:
        vbm_idx:    Zero-based index of the valence band maximum
        bs:         Band structure object (see 'qsgw.utils.helper');
                    bs["bs_paths"] is iterated and the "bands" entry of each
                    path is indexed as [point, band]
    Output:
        flag:       False if everything is fine, True if there is an artifact
    """
    band_idx = vbm_idx + 1
    # 'any' stops at the first path with a jump
    return any(
        np.max(np.abs(np.diff(segment["bands"][:, band_idx]))) > 1  # eV
        for segment in bs["bs_paths"]
    )
        
# look for band structure problems (mostly caused by interpolation errors)
# every band structure that is available for a material is checked on its own
for mat in mat_dirs:
    db_name = mat + ".json"
    # materials without a database entry cannot be checked
    if db_name not in db_files:
        continue
    cse = load_db_entry(os.path.join(db_dir, db_name))
    params, data = cse.parameters, cse.data
    vbm_idx = params["vbm_idx"]
    if params["finish"] == True and bs_check(vbm_idx, data["bs_qsgwbse"]):
        print(f"{mat:s} - QSGW^ band structure problem.")
    if params["qsgw_flag"] == True and bs_check(vbm_idx, data["bs_qsgw"]):
        print(f"{mat:s} - QSGW band structure problem.")
    if "gap_qpg0w0_soc" in data and bs_check(vbm_idx, data["bs_qpg0w0"]):
        print(f"{mat:s} - QPG0W0 band structure problem.")

In [None]:
# clean up the directory of the crashed materials
# (the shell command is executed inside the calculation directory of each material)
base_dir = os.getcwd()
for key in keys:
    for err_key in error_audit[key]:
        for mat in error_audit[key][err_key]:
            os.chdir(os.path.join(calc_dir, mat))
            try:
                os.system("touch meta bz.h5; rm -rf [0-9]*run meta mixm.mat mixsigma; lmgwclear")
            finally:
                # always go back, even if the cleanup is interrupted:
                # 'calc_dir' and 'db_dir' are relative to the start directory
                os.chdir(base_dir)

Analyze how memory usage correlates with the number of sites and number of electrons in a structure.

In [None]:
"""
START USER INPUT 
"""

# directory whose entries are treated as one calculation directory per material
calc_dir = "../../questaal_calc"
# directory that holds the database entries (looked up as '<material>.json')
db_dir = "../../questaal_database"
# NOTE(review): 'job_name' is not referenced by the visible code of this cell
job_name = "benchmark"

"""
END USER INPUT 
"""

# external imports
import os
import re
import sys
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# internal imports
from qsgw_workflow.utils.helper import load_db_entry

# check that all needed directories exist
if not os.path.exists(calc_dir):
    sys.exit("The calculation directory does not exist!")
if not os.path.exists(db_dir):
    sys.exit("The database directory does not exist!")

# Noctua 2 has no LaTeX...
plt.rcParams["text.usetex"] = False
plt.rcParams["font.family"] = "DejaVu Serif"

# factors that convert a memory value printed by 'seff' to GB
# (powers of two, so the conversion does not add rounding errors)
to_gb = {"KB": 1 / 1024**2, "MB": 1 / 1024, "GB": 1.0, "TB": 1024.0}

# go through all calculation directories (and associated database entries)
# material -> database entry, number of sites, number of electrons, used memory
memory_audit = {}
db_files = os.listdir(db_dir)
mat_dirs = os.listdir(calc_dir)
for mat in tqdm(mat_dirs, "Going through all materials"):
    db_path = os.path.join(db_dir, mat + ".json")
    if not os.path.exists(db_path):
        continue
    cse = load_db_entry(db_path)
    num_sites = cse.structure.num_sites
    # the number of electrons is the sum of the atomic numbers of all sites
    num_elec = sum(site.specie.Z for site in cse.structure)
    # get all log files and analyze the one with the highest id
    files = os.listdir(os.path.join(calc_dir, mat))
    log_files = [f for f in files if "slurm-" in f]
    if not log_files:
        continue
    log_id = max(int(re.findall(r"\d+", f)[0]) for f in log_files)
    # check how much memory a job used
    job_eff = os.popen(f"seff {log_id:d}").read()
    match_mem = re.search(r"Memory Utilized:\s+([\d\.]+)\s*(KB|MB|GB|TB)", job_eff)
    if match_mem is None:
        # 'seff' reported no memory usage for this job: nothing to plot
        continue
    memory_in_gb = float(match_mem.group(1)) * to_gb[match_mem.group(2)]
    memory_audit[mat] = {
        "cse": cse,
        "num_sites": num_sites,
        "num_elec": num_elec,
        "memory_in_gb": memory_in_gb,
    }

# makes plotting easier
num_sites = [entry["num_sites"] for entry in memory_audit.values()]
num_elecs = [entry["num_elec"] for entry in memory_audit.values()]
memory = [entry["memory_in_gb"] for entry in memory_audit.values()]
# 'default' keeps the (then empty) plot working if no material was analyzed
max_sites = max(num_sites, default=0)

# show how the number of sites and the number of electrons in the structure relate to the maximum memory used
fig, axes = plt.subplots(1, 2, figsize=(4, 2))
ax = axes[0]
ax.plot(num_sites, memory, "o")
ax.set_xlabel("Number of Sites")
ax.set_ylabel("Max. Memory (GB)")
ax.set_xlim(0, max_sites+1)
ax.set_xticks(np.arange(1, max_sites+1, 1))
ax = axes[1]
ax.plot(num_elecs, memory, "o")
ax.set_xlabel("Number of Electrons")
ax.set_ylabel("Max. Memory (GB)")
# NOTE(review): fixed range, materials with more than 400 electrons are cut off
ax.set_xlim(0, 400)
ax.set_xticks([0, 100, 200, 300, 400])
fig.tight_layout()