# Preface
This is a simple script to pull NIST line and spectral data into a notebook, specifically from the **NIST Atomic Spectra Database Lines Data** area of the NIST database.


In [3]:
#Numpy is good practice, pandas is what we will load our web scrape into
import numpy as np
import pandas as pd

#Not required, but can give some surface-level insight into the data via a profile report

from pandas_profiling import ProfileReport


from bs4 import BeautifulSoup
import requests

### 1. Load in the data

In [47]:
link = "https://physics.nist.gov/cgi-bin/ASD/lines1.pl?spectra=W&limits_type=0&low_w=1&upp_w=100&unit=1&submit=Retrieve+Data&de=0&format=0&line_out=0&en_unit=0&output=0&bibrefs=1&page_size=15&show_obs_wl=1&show_calc_wl=1&unc_out=1&order_out=0&max_low_enrg=&show_av=2&max_upp_enrg=&tsb_value=0&min_str=&A_out=0&intens_out=on&max_str=&allowed_out=1&forbid_out=1&min_accur=&min_intens=&conf_out=on&term_out=on&enrg_out=on&J_out=on"
#link2 = "https://physics.nist.gov/cgi-bin/ASD/lines1.pl?spectra=Mg&limits_type=0&low_w=1&upp_w=1000&unit=1&submit=Retrieve+Data&de=0&format=0&line_out=0&en_unit=0&output=0&bibrefs=1&page_size=15&show_obs_wl=1&show_calc_wl=1&unc_out=1&order_out=0&max_low_enrg=&show_av=2&max_upp_enrg=&tsb_value=0&min_str=&A_out=0&intens_out=on&max_str=&allowed_out=1&forbid_out=1&min_accur=&min_intens=&conf_out=on&term_out=on&enrg_out=on&J_out=on"

#Use requests to get the page; the timeout stops the kernel from hanging if the server never answers
response = requests.get(link, timeout = 60)
#Fail loudly on an HTTP error instead of parsing an error page as if it were line data
response.raise_for_status()
res = response.text
print('Request is done')
#Make the request beautiful
soup = BeautifulSoup(res,'lxml')
print('Soup`s hot')
#Find all the tables (find_all is the current spelling of the deprecated findAll);
#this list is not used by the parsing step below, which works from soup directly
table = soup.find_all('table')
print('The unecessary table is tabulated')

Request is done
Soup`s hot
The unecessary table is tabulated


### 2. Parse out the data
Like Moses parsing the sea.

In [48]:
#Every <tr> element in the parsed page, whichever table it belongs to
table_rows = soup.find_all('tr')

#One list per row holding the whitespace-stripped text of each <td> cell;
#a row without any <td> cells contributes an empty list
pre_pandas_table = [
    [cell.text.strip() for cell in row_tag.find_all('td')]
    for row_tag in table_rows
]

### 3. Let's equip our experimentalist hats & look at the table
We see it has 20 columns, which we identify in one of the cells below. Also, the top 10 rows are defunct, so we can chop them off. Note that sometimes **the number of cols can change**. It should be 20, but check to make sure.

In [53]:
#Preview the first 10 scraped rows before any cleaning
pd.DataFrame(pre_pandas_table).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,,,,,,,,,,,,,,,,,,,,
1,NIST Atomic Spectra Database Lines Data\nW (al...,Example of how to reference these results:\n\n...,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,
3,Wavelength range: 1 - 100 nm\nWavelength in: v...,,,,,,,,,,,,,,,,,,,
4,Query NIST Bibliographic Databases for\n\n\nW ...,Query NIST Bibliographic Databases for,W (new window),,,,,,,,,,,,,,,,,
5,Query NIST Bibliographic Databases for,,,,,,,,,,,,,,,,,,,
6,W (new window),,,,,,,,,,,,,,,,,,,
7,W Energy Levels,,,,,,,,,,,,,,,,,,,
8,W Line Wavelengths and Classification,,,,,,,,,,,,,,,,,,,
9,W Transition Probabilities,,,,,,,,,,,,,,,,,,,


In [None]:
#(rows, columns) of the raw scrape; the column count must match the number of names given to the table below
pd.DataFrame(pre_pandas_table).shape

### 4. Create column names based on what is on the site, and load the pre_pandas_table into a table

In [56]:
#Column names, in the order the cells appear in each scraped row
#NOTE(review): 'TP Ref,' carries a trailing comma inside the name; kept as-is so existing lookups by that name keep working
cols = ['Ion','Observed Wavelength (nm)','O Uncertainty (nm)'
       ,'Ritz Wavelength (nm)','R Uncertainty (nm)'
       ,'Rel. Int.',r'A$_{kl}$ (s$^{-1}$)','Acc'
       ,'Ei','TBD','Ek'
       ,'LL Config','LL Term','LL J'
       ,'UL Config','UL Term','UL J'
       ,'Type', 'TP Ref,','Line Ref']

#Columns to parse as numbers. 'Acc' is left out on purpose: NIST accuracy ratings are
#letter codes, so coercing them to numeric would turn every rating into NaN
numeric_cols = ['Observed Wavelength (nm)','O Uncertainty (nm)'
               ,'Ritz Wavelength (nm)','R Uncertainty (nm)'
               ,'Rel. Int.',r'A$_{kl}$ (s$^{-1}$)'
               ,'Ei','TBD','Ek']

#How many rows do we have to chop
rows_to_be_chopped = 10

#pandas pads short rows up to the widest one, so the widest row sets the column count.
#Check it up front to get a readable error when the page layout differs from cols
widest_row = max((len(row) for row in pre_pandas_table), default = 0)
if widest_row != len(cols):
    raise ValueError(f'Expected {len(cols)} columns but the scraped table has {widest_row}; update cols to match the page')

df = pd.DataFrame(pre_pandas_table, columns = cols)[rows_to_be_chopped:]
#Remove empty rows and reset index
df = df[df['Ion'].astype(str) != ""].reset_index(drop = True)
#Drop the 0th row, also empty
df = df.drop([0]).reset_index(drop = True)

#Remove spaces inside the energy values; regex = False makes this a literal replace on every pandas version
df['Ei'] = df['Ei'].str.replace(' ', '', regex = False)
df['Ek'] = df['Ek'].str.replace(' ', '', regex = False)

#Convert columns that have numeric values to numerics, not strings;
#anything that does not parse as a number becomes NaN
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col],downcast = 'float',errors = 'coerce')

df.head(5)

Unnamed: 0,Ion,Observed Wavelength (nm),O Uncertainty (nm),Ritz Wavelength (nm),R Uncertainty (nm),Rel. Int.,A$_{kl}$ (s$^{-1}$),Acc,Ei,TBD,Ek,LL Config,LL Term,LL J,UL Config,UL Term,UL J,Type,"TP Ref,",Line Ref
0,W XLVI,1.0017,0.0004,1.0017,0.0004,15.0,,,2819600.0,,12803000.0,3d104d,2D,3/2,3d106f,2F°,5/2,,,L15201c89
1,W XLVI,1.0181,0.0004,1.0181,0.0004,20.0,,,2993550.0,,12816000.0,3d104d,2D,5/2,3d106f,2F°,7/2,,,L15201c89
2,W LXX,1.0397,0.0005,1.0397,0.0005,4.0,,,,,,2s2p2(3P0),"(1/2,0)",1/2,2s22p,2P°,3/2,,,L16568
3,W XLVI,1.151,0.0015,1.1511,0.0015,40.0,,,4293400.0,,12981000.0,3d104f,2F°,5/2,3d106g,2G,7/2,,,L10257
4,W XLVI,1.1567,0.0006,1.1568,0.0006,40.0,,,4337200.0,,12982000.0,3d104f,2F°,7/2,3d106g,2G,9/2,,,L15201c89


### 5. Reward your valiant efforts with a pint
Also note that the `TBD` column can be dropped.