### Using lxml parser for Matt's uranus_occs_obs_syst_to_spreadsheet.py script:

Note: I've renamed Matt's output `uranus_occs_info_auto.csv` to `uranus_occs_info_auto_matt.csv` (which I use below to check that my code works).

In [None]:
from bs4 import BeautifulSoup as bsoup # Import BeautifulSoup4 (not BeautifulSoup3)
from astropy.table import Table
import os

# Input: text file of observing-system templates; output: CSV written at the end of this cell
infile = 'obs_syst-templates-2021-03-22a_matt.txt'
outfile = 'uranus_occs_info_auto_mia.csv'

# Check input file exists; stop with exit status 1 if it does not
if not os.path.isfile(infile):
    print( infile, 'not found!' )
    # `sys` is not imported in this cell, so `sys.exit(1)` would fail with a
    # NameError. Raising SystemExit is exactly what sys.exit() does internally.
    raise SystemExit(1)
print( 'Found source file: ', infile )
    
# Load the input file line by line (kept in `content` as a list of lines)
with open(infile, "r") as fin:
    content = list(fin)

# Glue the lines back into a single string and hand it to BeautifulSoup,
# selecting lxml as the underlying parser.
# NOTE(review): the "lxml" feature is BeautifulSoup's HTML parser (the XML one is
# "lxml-xml"); the lower-case tag lookups further down presumably rely on that - confirm.
xml_text = "".join(content)
bsoup_content = bsoup(xml_text, "lxml")

# Initialise lists for storage purposes:
facility_names = []; telescope_names = []; instrument_names = []
facility_lids = []; telescope_lids = []; instrument_lids = []

# Each recognised component <type> feeds one (names, lids) pair of lists
targets_by_type = {
    "Host": (facility_names, facility_lids),
    "Telescope": (telescope_names, telescope_lids),
    "Instrument": (instrument_names, instrument_lids),
}

# Gotcha: The HST does not have a <type>="Telescope" (only "Host" and "Instrument"),
# so every component of this observing system is left out.
# NOTE(review): presumably this keeps the three sets of lists the same length,
# which the table built from them requires - confirm.
skipped_system = "Hubble Space Telescope FOS"

for obsyscomp in bsoup_content.find_all('observing_system_component'):
    type_tag = obsyscomp.find("type")
    if type_tag is None:
        # Previously this case crashed with a TypeError (iterating over None)
        print("Observing system component has no <type>, therefore skipping")
        continue

    # Read the tag's text directly instead of looping over its child nodes;
    # surrounding whitespace is ignored.
    component_type = type_tag.get_text(strip=True)
    if component_type not in targets_by_type:
        print("Unrecognised component type", repr(component_type), "- skipping")
        continue

    # The parent's first <name> identifies the observing system this component belongs to
    if obsyscomp.parent.find('name').text == skipped_system:
        print("Hubble Space Telescope has no type = Telescope, therefore skipping")
        continue

    names, lids = targets_by_type[component_type]
    names.append(obsyscomp.find("name").text)
    lids.append(obsyscomp.find("lid_reference").text)

# Columns of the output CSV, in the order they are written.
# All six lists must have the same length for the Table to be built.
columns = {
    'facility_lid': facility_lids,
    'telescope_lid': telescope_lids,
    'instrument_lid': instrument_lids,
    'facility_name': facility_names,
    'telescope_name': telescope_names,
    'instrument_name': instrument_names,
}

# Derive the column order from the dict itself so the names are listed only once
table = Table(columns, names=tuple(columns))

# Write to the path defined at the top of this cell rather than repeating the literal
table.write(outfile, format='ascii', delimiter=",", overwrite=True)

In [None]:
# IPython shell escape: run the system `diff` on Matt's reference CSV and the CSV written by the cell above.
!diff ./uranus_occs_info_auto_matt.csv ./uranus_occs_info_auto_mia.csv # The only difference is that I gave each column a header

In [None]:
# Display the astropy Table built above inline in the notebook for a visual check.
table.show_in_notebook()

In [None]:
### Using lxml parser for Matt's `uranus_occs_create_facility_conprods.py`

In [None]:
from lxml import etree
import sys
import os
from datetime import datetime

# lxml parser that discards whitespace-only text between elements, plus the
# prefix -> URI map used for every namespaced lookup in the template
parser = etree.XMLParser(remove_blank_text=True)
namespace = {'ns':'http://pds.nasa.gov/pds4/pds/v1'}

# Inputs: the CSV describing the facilities and the XML template to fill in
infofile = './uranus_occs_info_facilities.csv'
templatefile = './observatory.template.nohash.xml'

# Both inputs are required; stop with exit status 1 at the first one that is missing
for file in (infofile, templatefile):
    if not os.path.isfile(file):
        print( file, 'not found!' )
        sys.exit(1)
    print( 'Found source file: ', file )
        
# Read the whole info file; each element of infolines is one row (with its newline)
with open( infofile, 'r' ) as info:
    infolines = info.readlines()

# An empty file has no header row to take the tags from, so stop with a clear
# message instead of an IndexError
if not infolines:
    print( infofile, 'is empty!' )
    sys.exit(1)

# Get field tags from the first line of infofile
header_fields = infolines[0].split(',')

# The first column does not have (or need) a well-behaved tag, so it gets a
# fixed name; every other tag has its leading/trailing "$" and the concluding
# newline trimmed off.
infotags = ['create_conprod'] + [tag.strip("$,\n") for tag in header_fields[1:]]
    
def _set_first(tree, tag, value):
    """Set the text of the first <tag> element (in the `ns` namespace) of *tree*."""
    # ".//" rather than "//": lxml's ElementTree.find() emits a FutureWarning for
    # paths starting with "/", and currently treats them as "./..." anyway.
    tree.find('.//ns:' + tag, namespaces=namespace).text = value


def _fill_numbered(tree, tag, infodict, key_prefix, drop_parent=False):
    """Fill the n-th <tag> element of *tree* from infodict[key_prefix + str(n)].

    Elements whose value is empty are deleted from the tree. With
    drop_parent=True the element's parent is deleted instead, so sibling
    elements that belong to the same entry go with it.

    Raises KeyError if the template has more <tag> elements than infodict has
    matching numbered keys.
    """
    # findall() returns a list, so deleting elements while looping is safe
    for n, elem in enumerate(tree.findall('.//ns:' + tag, namespaces=namespace), start=1):
        value = infodict[key_prefix + str(n)]
        if value:
            elem.text = value
            continue
        doomed = elem.getparent() if drop_parent else elem
        doomed.getparent().remove(doomed)


for infoline in infolines:
    # Only rows whose first character is '1' produce an output file.
    # NOTE(review): presumably a flag Matt set by hand to mark records with
    # enough information to be worth writing - confirm.
    if not infoline.startswith('1'):
        continue

    # NOTE(review): plain split(',') cannot cope with quoted fields containing commas
    infofields = infoline.split(',')
    infofields[-1] = infofields[-1].rstrip("\n")   # Remove the concluding newline
    infodict = dict(zip( infotags, infofields ))   # (header tag, value) pairs for this row

    # Use the last ':'-separated part of the second column to name the output file
    # (assumes that column holds the facility URN - TODO confirm)
    outfile = 'outputs_lxml/'+infofields[1].rsplit(':')[-1]+'_1.0.xml'

    # Never overwrite an existing product
    if os.path.isfile(outfile):
        print( outfile, 'already exists!' )
        sys.exit(1)
    print( 'Writing to', outfile )

    # Parse a fresh copy of the template for every record, because it is edited in place
    tree = etree.parse(templatefile, parser)

    # Find the placeholders and replace with desired values from infodict
    _set_first(tree, 'logical_identifier', infodict['facility_lid'])
    _set_first(tree, 'title', infodict['facility_title'])

    # Numbered placeholders; unpopulated ones are removed from the output
    _fill_numbered(tree, 'alternate_id', infodict, 'alternate_id')
    _fill_numbered(tree, 'alternate_title', infodict, 'alternate_title')

    _set_first(tree, 'modification_date', datetime.today().strftime('%Y-%m-%d'))

    # For an empty telescope LID the parent of <lid_reference> is removed, which
    # also takes the corresponding <reference_type> with it
    _fill_numbered(tree, 'lid_reference', infodict, 'telescope_lid', drop_parent=True)

    _set_first(tree, 'name', infodict['facility_title'])
    _set_first(tree, 'country', infodict['facility_country'])

    # Save output to file, creating the output directory if it is not there yet
    os.makedirs(os.path.dirname(outfile), exist_ok=True)
    tree.write(outfile, encoding='utf8', pretty_print=True)
        