We have a web-scraped dataset from the NIST website for spectral lines
https://physics.nist.gov/PhysRefData/ASD/lines_form.html

This was compiled into a set of .npy files (numpy archive) under the slim_db directory
This code further processes that data into a single python pickle file containing a dict that maps each element symbol to an array of [wavelength, relative intensity] rows.


In [1]:
#Imports and env
import re
import pickle
import numpy as np
from pathlib import Path

#Data lives in the 'data' folder next to the directory this notebook runs from
top_dir = Path.cwd().parent
rel_path = 'data'
datapath = top_dir.joinpath(rel_path)

In [25]:
#Inputs for turning the database into a dict keyed by element.
#atom_dict has the form {element: ndarray[[wavelength, rel_intensity]]}
atom_dict = {}
#lazy iterator over the per-element .npy files in the slim_db directory
files = (datapath / 'slim_db').glob('*.npy')

#Elements to include, one periodic-table row per line
el_include = [
    'H', 'He',
    'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    'Rb', 'Sr', 'Y', 'Zr', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    'Cs', 'Ba', 'La', 'Ce', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta',
    'W', 'Os', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At',
    'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U',
]

#Elements the database has no data for
#TODO verify that no lines exist for these elements
no_lines = ["Pr", "Es", "Rn", "Pu", "Ir", "Nb", "Tb", "Re"]

#Fail early if the include list asks for an element known to have no data
if any(symbol in el_include for symbol in no_lines):
    raise ValueError("Trying to include an element with no spectral data")

#Convert each element's raw line table into a normalised spectrum.
#Reads `files`, `el_include` and `no_lines`; stores one ndarray of
#[wavelength, rel_intensity] rows per element in `atom_dict` and appends to
#`no_lines` every element whose table has no usable rows.

#loop invariants: the columns to extract (in this order) and the cleaning patterns
col_keep = ['ritz_wl_vac(nm)','gA(s^-1)','g_k']
element_re = re.compile(r'([A-Za-z]+)\.npy$')   #element symbol = letters right before '.npy'
letter_re = re.compile(r'[a-df-zA-Z]')          #any letter except 'e' (scientific notation, e.g., 10e+4)
number_re = re.compile(r'[0-9]+(?:\.[0-9]+)?')  #first plain number in a wavelength entry (drops e.g. '+')

for file in files:
    #get element from filename
    m = element_re.search(file.name)
    if m is None:
        #no element symbol in the name, so the file cannot be matched to el_include
        continue
    element = m[1]

    if element in no_lines or element not in el_include:
        continue

    #load file data into ndarray; the first row holds the column names:
    #'element', 'sp_num', 'obs_wl_vac(nm)', 'ritz_wl_vac(nm)', 'intens', 'gA(s^-1)', 'Acc', 'Ei(eV)', 'Ek(eV)', 'conf_i',
    #'term_i', 'J_i', 'conf_k', 'term_k', 'J_k', 'g_i', 'g_k', 'Type', 'tp_ref', 'line_ref'
    filedata = np.load(file)
    col_names = list(filedata[0])
    if any(name not in col_names for name in col_keep):
        #report the problem instead of dropping the element without a trace
        print('Skipping ' + element + ': missing one of ' + str(col_keep))
        continue
    #look columns up by name so that position 0/1/2 is always wavelength/gA/g_k
    col_idx = [col_names.index(name) for name in col_keep]
    filedata = filedata[1:][:, col_idx] #remove headers, keep the three columns
    #note this is still string data at this point

    #remove rows with blank entries
    filedata = filedata[~np.any(filedata == '', axis=1)]

    #reject rows containing stray letters; strip annotations such as '+' from the wavelength
    gooddata = np.ones(filedata.shape[0], dtype=bool)
    for i, row in enumerate(filedata):
        if any(letter_re.search(field) for field in row):
            gooddata[i] = False
            continue
        wl = number_re.search(row[0])
        if wl is None:
            #no number in the wavelength field, nothing to convert
            gooddata[i] = False
        else:
            filedata[i, 0] = wl[0]
    filedata = filedata[gooddata]

    if len(filedata) == 0:
        no_lines.append(element)
        continue

    #convert to float arrays before doing any arithmetic: writing floats back into
    #the fixed-width string array could truncate them
    wavelengths = filedata[:, 0].astype(float)
    #convert the g_A data to A_ki data which is ~ to intensity
    a_ki = filedata[:, 1].astype(float) / filedata[:, 2].astype(float)

    #aggregate by unique wavelength (e.g Li has multiple rows/probs at same wl).
    #Group on the numeric values: grouping on the strings orders them
    #lexicographically ('100.3' < '99.5'), which disagrees with the numeric order
    unique_wl, group = np.unique(wavelengths, return_inverse=True)
    #sum the intensities at each distinct wl
    wavesums = np.bincount(group, weights=a_ki, minlength=len(unique_wl))
    #store wavelengths in ascending order with intensities normalised to sum to 1
    atom_dict[element] = np.column_stack((unique_wl, wavesums / np.sum(wavesums)))
print('no neutral lines for: ' + str(np.sort(no_lines)))
print('valid elements: ' + str(len(atom_dict)))

#persist the data for ongoing usage
out_path = datapath / 'rel_int/valid77_spec.pickle'
#create the output folder if needed so the dump cannot fail after all the processing
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, 'wb') as f:
    #Pickle the relative intensity spectra with default protocol (4 as of py3.8)
    pickle.dump(atom_dict, f)

#To load in other modules at top level of repo
#(pickle can execute code on load, so only open files you generated yourself)
#import pickle
#with open(datapath / 'rel_int/valid77_spec.pickle', 'rb') as f:
#    atom_dict = pickle.load(f)

no neutral lines for: ['At' 'Es' 'Ir' 'Nb' 'Os' 'Pa' 'Pm' 'Po' 'Pr' 'Pu' 'Re' 'Rn' 'Se' 'Tb'
 'Th' 'U' 'Zr']
valid elements: 77
