In [1]:
import pprint

def _collect_access_urls(resource, pub):
    """Group the accepted access URLs of a resource by their access rights.

    Args:
        resource: the resource element (e.g. NumericalData/DisplayData) whose
            AccessInformation children are scanned.
        pub: publisher found so far; when it is the empty string, the text
            after "Repository/" in a RepositoryID is used as a backup.

    Returns:
        A ``(rights_to_urls, pub)`` tuple. ``rights_to_urls`` has the keys
        "Open", "PartiallyRestricted" and "Restricted", each mapping a URL to
        the list of its product keys (empty list when none are given).
    """
    rights_to_urls = {"Open": {}, "PartiallyRestricted": {}, "Restricted": {}}
    # a URL is kept when any of these appears anywhere in its text
    accepted_domains = ("nasa.gov", "virtualsolar.org")

    for section in resource:
        if not section.tag.endswith("AccessInformation"):
            continue

        # Resolve the rights for this section up front so that a value left
        # over from a previous AccessInformation is never reused.
        access = next(
            (item.text for item in section if item.tag.endswith("AccessRights")),
            None,
        )
        # anything other than the two named values is filed under "Restricted"
        group = access if access in ("Open", "PartiallyRestricted") else "Restricted"
        bucket = rights_to_urls[group]

        for item in section:
            if item.tag.endswith("AccessURL"):
                url = None
                for field in item:
                    if field.tag.endswith("URL"):
                        text = field.text or ""
                        if not any(domain in text for domain in accepted_domains):
                            # not a URL for consideration: skip its keys too
                            break
                        url = text
                        # setdefault keeps keys already collected for a
                        # URL that is listed more than once
                        bucket.setdefault(url, [])
                    elif field.tag.endswith("ProductKey") and url is not None:
                        # always a list, whatever the access rights are
                        bucket[url].append(field.text)
            # find backup Publisher if needed
            elif pub == "" and item.tag.endswith("RepositoryID"):
                pub = (item.text or "").partition("Repository/")[2]

    return rights_to_urls, pub


def SPASE_Scraper(path):
    """Scrape citation metadata and access URLs from a SPASE XML record.

    The second child of the XML root is treated as the resource element
    (NumericalData/DisplayData). Author, publication date, publisher and
    dataset name are taken from PublicationInfo when present; otherwise the
    author falls back to a Contact PersonID (PrincipalInvestigator/PI roles
    first, then any role not in the unapproved list), the publisher falls
    back to a Contact with the Publisher role and then to the RepositoryID,
    and the dataset name falls back to ResourceName.

    Args:
        path: path to the SPASE record; must be an existing ".xml" file.

    Returns:
        A tuple ``(RID, author, pub, pubDate, dataset, PI, AccessRights)``.
        The first six items are strings ("" when not found); ``PI`` is the
        DOI. ``AccessRights`` maps "Open", "PartiallyRestricted" and
        "Restricted" to ``{url: [product keys]}`` dictionaries.

    Raises:
        ValueError: if ``path`` is not an existing ".xml" file, or the record
            has no second root child to use as the resource element.
    """
    import xml.etree.ElementTree as ET
    import os

    # establish path of XML file
    print("You entered " + path)
    if not (os.path.isfile(path) and path.endswith(".xml")):
        # fail here with a clear message rather than later on an unbound name
        raise ValueError(path + " is not a file or not an xml file")

    file_size_bytes = os.path.getsize(path)
    file_size = file_size_bytes / (1024 * 1024 * 1024)
    print(f"File size is: {file_size:.2f} GB")

    # root = Spase, root[1] = NumericalData/DisplayData
    root = ET.parse(path).getroot()
    if len(root) < 2:
        raise ValueError(path + " does not contain a resource element")
    resource = root[1]

    # obtain ResourceID and locate ResourceHeader
    RID = ""
    header = None
    for child in resource:
        if child.tag.endswith("ResourceID"):
            RID = child.text or ""
        elif child.tag.endswith("ResourceHeader"):
            header = child

    # obtain Author, Publication Date, Publisher, Persistent Identifier, and Dataset Name
    author = ""
    pubDate = ""
    pub = ""
    dataset = ""
    PI = ""
    # holds role values that are not considered for author var
    UnapprovedAuthors = ["MetadataContact", "ArchiveSpecialist", "HostContact", "Publisher", "User"]

    publication_info = None
    # once a principal investigator is found, other roles must not replace it
    author_from_pi = False

    # "is not None" because an Element without children is falsy
    for child in (header if header is not None else ()):
        # find backup Dataset Name
        if child.tag.endswith("ResourceName"):
            dataset = child.text or ""
        # find Persistent Identifier
        elif child.tag.endswith("DOI"):
            PI = child.text or ""
        # find Publication Info
        elif child.tag.endswith("PublicationInfo"):
            publication_info = child
        # find Contact
        elif child.tag.endswith("Contact"):
            # read the whole Contact first so the result does not depend on
            # whether PersonID or Role comes first in the record
            person_id = ""
            roles = []
            for field in child:
                if field.tag.endswith("PersonID"):
                    person_id = field.text or ""
                elif field.tag.endswith("Role"):
                    roles.append(field.text)
            if not person_id:
                continue
            for role in roles:
                # backup author
                if role in ("PrincipalInvestigator", "PI"):
                    author = person_id
                    author_from_pi = True
                # backup publisher
                elif role == "Publisher":
                    pub = person_id
                # backup author, only while no principal investigator is known
                elif role not in UnapprovedAuthors and not author_from_pi:
                    author = person_id

    # access Publication Info; its values override the backups found above
    for child in (publication_info if publication_info is not None else ()):
        text = child.text
        if not text:
            # keep the backup value rather than replacing it with nothing
            continue
        if child.tag.endswith("Authors"):
            author = text
        elif child.tag.endswith("PublicationDate"):
            pubDate = text
        elif child.tag.endswith("PublishedBy"):
            pub = text
        elif child.tag.endswith("Title"):
            dataset = text

    # obtain data links and license, plus the backup publisher if still needed
    AccessRights, pub = _collect_access_urls(resource, pub)

    # return stmt
    return RID, author, pub, pubDate, dataset, PI, AccessRights

    
# test path : "../NASA/NumericalData/DE2/IDM/PT0.25S.xml"
# Interactive entry point: prompt for a SPASE record path, scrape it and
# print every extracted field. Guarded so that importing this code does not
# block on input().
if __name__ == "__main__":
    # raw string: "\d" and "\s" in the example path are not valid escape sequences
    print(r"Enter path of SPASE record ... such as C:\dir1\dir2\spaseRecord_x.xml")
    path = input()
    RID, author, pub, pubDate, dataset, PI, AccessRights = SPASE_Scraper(path)

    # f-strings render a missing (None) value instead of raising TypeError
    print(f"The ResourceID is {RID}")
    print(f"The author is {author}")
    # pubDate is printed exactly as returned by SPASE_Scraper
    print(f"The publication year is {pubDate}")
    print(f"The publisher is {pub}")
    print(f"The dataset is {dataset}")
    print(f"The persistent identifier is {PI}")
    print("The URLs with their associated product keys and license(s) are:")
    pprint.pprint(AccessRights)

Enter path of SPASE record ... such as C:\dir1\dir2\spaseRecord_x.xml


 ../NASA/NumericalData/DE2/IDM/PT0.25S.xml


You entered ../NASA/NumericalData/DE2/IDM/PT0.25S.xml
File size is: 0.00 GB
The ResourceID is spase://NASA/NumericalData/DE2/IDM/PT0.25S
The author is Heelis, Roderick, A.; Candey, Robert, M.
The publication year is 2023-01-01T00:00:00
The publisher is NASA Space Physics Data Facility
The dataset is DE 2 250ms IDM Ion Drift Velocities
The persistent identifier is https://doi.org/10.48322/reef-jt02
The URLs with their associated product keys and license(s) are:
{'Open': {'ftps://spdf.gsfc.nasa.gov/pub/data/de/de2/plasma_idm/': [],
          'https://cdaweb.gsfc.nasa.gov/cgi-bin/eval2.cgi?dataset=DE2_VION250MS_IDM&index=sp_phys': ['DE2_VION250MS_IDM'],
          'https://cdaweb.gsfc.nasa.gov/hapi': ['DE2_VION250MS_IDM@0',
                                                'DE2_VION250MS_IDM@1',
                                                'DE2_VION250MS_IDM@2'],
          'https://spdf.gsfc.nasa.gov/pub/data/de/de2/plasma_idm/': []},
 'PartiallyRestricted': {},
 'Restricted': {}}
