In [None]:
# import functions from .py files
import pprint, sqlite3
from SPASE_Scraper_Script import SPASE_Scraper
from PathGrabber import getPaths
from SQLiteFun import (create_tables, add_Metadata, add_Sources)

# list that holds paths returned by PathGrabber
SPASE_paths = []

# print Flag (True prints, False does not)
printFlag = False

# get user input and extract all SPASE records
print("Enter root folder you want to search")
folder = input()
print("You entered " + folder)
SPASE_paths = getPaths(folder, SPASE_paths)
if printFlag:
    print("The number of records is "+ str(len(SPASE_paths)))
    print("The SPASE records found are:")
    print(SPASE_paths)
print("===========================================================================================================")

# list that holds SPASE records already checked
searched = []

# iterate through all SPASE records returned by PathGrabber
for record in SPASE_paths:
    # scrape metadata for each record
    if record not in searched:
        (RID, RIDField, author, authorField, authorRole, pub, pubField, pubDate, pubDateField, dataset, datasetField, 
         desc, descField, PI, PIField, AccessRights, licenseField, datalinkField) = SPASE_Scraper(record)
        
        # list that holds required fields
        # required = [RID, description, author, authorRole, url]
        
        searched.append(record)
        pubYear = pubDate[0:4]
        author = ", ".join(author)
        authorRole = ", ".join(authorRole)
        if printFlag:
            print("The ResourceID is " + RID + " which was obtained from " + RIDField)
            print("The author(s) are " + author + " who are " + authorRole + " which was obtained from " + authorField)
            print("The publication year is " + pubYear + " which was obtained from " + pubDateField)
            print("The publisher is " + pub + " which was obtained from " + pubField)
            print("The dataset is " + dataset + " which was obtained from " + datasetField)
            print("The description is " + desc + " which was obtained from " + descField)
            print("The persistent identifier is " + PI + " which was obtained from " + PIField)
            print("The URLs with their associated product keys obtained from " + datalinkField + """ and their 
                  license(s) obtained from """ + licenseField + " are: ")
            pprint.pprint(AccessRights)

        url = []
        prodKey= []
        license = ""
        desired = False

        # separate license, url, and product keys from AccessRights to store in db
        for k, v in AccessRights.items():
            # check if no URLs returned for any access
            if not v:                
                continue
            else:
                license = k
                # check if any urls are for consideration
                for link in v.keys():
                    if ("nasa.gov" or "virtualsolar.org") in link:
                        desired = True
                for key, val in v.items():
                    # allow desired URLs to be added to database
                    if ("nasa.gov" or "virtualsolar.org") in key:
                        url.append(str(key))
                        prodKey.append(str(val))
                    else:
                        # if no desired URLs present
                        if not desired:
                            url = ["No NASA Links"]
                            prodKey = [""]
                            
                if printFlag:
                    print(k + " was assigned to license")
                    print(str(url) + " was assigned to url")
                    print(str(prodKey) + " was assigned to prodKey")

                    
        # add tables to existing database
        create_tables()
        
        # insert metadata entries into table
        i = 0        
        try:
            with sqlite3.connect('SPASE_Data.db') as conn:
                # add a new SPASE Record
                for urls in url:
                    Metadata = (RID,author,authorRole,pub,pubYear,dataset,license,url[i],prodKey[i],desc,PI)
                    Record_id = add_Metadata(conn, Metadata)
                    print(f'Created a Metadata entry with the row number {Record_id}')
                    i += 1
                # add a new Source record
                Sources = (RID,authorField,pubField,pubDateField,datasetField,licenseField,
                           datalinkField,descField,PIField)
                Record_id = add_Sources(conn, Sources)
                print(f'Created a Sources entry with row number {Record_id}')

        except sqlite3.Error as e:
            print(e)

        print("===========================================================================================================")

    else:
        continue

# call .py file directly from notebook
#%run ./SPASE_test.py {path} in notebook
#import sys
#args = sys.argv
#path = args[1] in source file

# test path : Small = "/home/jovyan/NASA/NumericalData/DE2" or Big = "/home/jovyan/NASA/NumericalData/ACE"
# complex author ex: /home/jovyan/NASA/NumericalData/Cassini/MAG/PT60S.xml 
# OR /home/jovyan/NASA/NumericalData/ACE/Attitude/Definitive/PT1H.xml
# complex URL ex: /home/jovyan/NASA/NumericalData/ACE/CRIS/L2/P1D.xml


In [2]:
from SPASE_Scraper_Script import SPASE_Scraper

SPASE_Scraper.__doc__

'Takes path of a .xml SPASE record file and returns a tuple of values of varying types which hold all \n    desired metadata and the fields they came from. This will collect the desired metadata following the \n    priority rules determined by SPASE record experts. If any desired metadata is not found, the default \n    value assigned is an empty string.\n    \n    :param path: A string of the absolute/relative path of the SPASE record to be scraped.\n    :type path: String\n    :return: A tuple containing the metadata desired and where they were obtained.\n    :rtype: tuple\n    '

In [16]:
import sqlite3
from SQLiteFun import execution

def CountRemover(stmt, nested):
    # if nested = 0 (stmt not a nested query)
    before, sep, after = stmt.partition("COUNT(")
    b4, sep, after = after.partition(")")
    cleanStmt = before + b4 + after
    # add ending parentheses if stmt will be in a nested query (nested=1)
    if nested:
        before, sep, after = cleanStmt.partition(";")
        cleanStmt = before + ")" + sep + after
    return cleanStmt

totalStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries;"""
authorStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE author NOT LIKE "" ;"""
pubStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE publisher NOT LIKE "" ;"""
pubYrStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE publicationYr NOT LIKE "" ;"""
datasetStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE dataset NOT LIKE "" ;"""
licenseStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE license LIKE "%cc0%" 
                    OR license LIKE "%Creative Commons Zero v1.0 Universal%" ;"""
urlStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE url NOT LIKE "" ;"""
NASAurlStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE url NOT LIKE "" AND url!="No NASA Links";"""

PIStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE PI NOT LIKE "" ;"""
descStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries WHERE description NOT LIKE "" ;"""
citationStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE author NOT LIKE ""
                AND dataset NOT LIKE ""
                AND publicationYr NOT LIKE ""
                AND publisher NOT LIKE "";"""
complianceStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE description NOT LIKE ""
                AND dataset NOT LIKE ""
                AND PI NOT LIKE "";"""

# edit once get data link checker to include working data links as a check
AL1Stmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE (author NOT LIKE ""
                AND dataset NOT LIKE ""
                AND publicationYr NOT LIKE ""
                AND publisher NOT LIKE "")
                OR (description NOT LIKE ""
                AND dataset NOT LIKE ""
                AND PI NOT LIKE "")
                OR PI NOT LIKE ""
                OR license LIKE "%cc0%" OR license LIKE "%Creative Commons Zero v1.0 Universal%";"""
# improve by using a list to store strings and using a nested for loop to iterate thru all combos 
#   given the number of items needed, n (this example is n=2)
AL2Stmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE ((author NOT LIKE ""
                    AND dataset NOT LIKE ""
                    AND publicationYr NOT LIKE ""
                    AND publisher NOT LIKE "")
                AND
                    (description NOT LIKE ""
                    AND dataset NOT LIKE ""
                    AND PI NOT LIKE "")) 
                OR
                    ((author NOT LIKE ""
                    AND dataset NOT LIKE ""
                    AND publicationYr NOT LIKE ""
                    AND publisher NOT LIKE "")
                AND
                    PI NOT LIKE "")
                OR
                    ((author NOT LIKE ""
                    AND dataset NOT LIKE ""
                    AND publicationYr NOT LIKE ""
                    AND publisher NOT LIKE "")
                AND
                    license LIKE "%cc0%" OR license LIKE "%Creative Commons Zero v1.0 Universal%")
                OR
                    ((description NOT LIKE ""
                    AND dataset NOT LIKE ""
                    AND PI NOT LIKE "")
                AND
                    license LIKE "%cc0%" OR license LIKE "%Creative Commons Zero v1.0 Universal%")
                OR
                    ((description NOT LIKE ""
                    AND dataset NOT LIKE ""
                    AND PI NOT LIKE "")
                AND 
                    PI NOT LIKE "")
                OR  
                    (PI NOT LIKE ""
                AND
                    license LIKE "%cc0%" OR license LIKE "%Creative Commons Zero v1.0 Universal%");"""
# not actually all but all at this moment
allStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE (author NOT LIKE ""
                AND dataset NOT LIKE ""
                AND publicationYr NOT LIKE ""
                AND publisher NOT LIKE "")
                AND (description NOT LIKE ""
                AND dataset NOT LIKE ""
                AND PI NOT LIKE "")
                AND PI NOT LIKE ""
                AND license LIKE "%cc0%" OR license LIKE "%Creative Commons Zero v1.0 Universal%";"""
# test for above queries with specified publisher
SDACStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                    WHERE (publisher LIKE "%SDAC" OR publisher LIKE 
                            "%Solar Data Analysis Center")"""
SPDFStmt = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                    WHERE (publisher LIKE "%SPDF" OR publisher LIKE 
                            "%Space Physics Data Facility")"""
SDACIntersect = """SELECT COUNT(DISTINCT SPASE_id) FROM (
                    SELECT DISTINCT SPASE_id FROM MetadataEntries 
                    WHERE (publisher LIKE "%SDAC" OR publisher LIKE 
                            "%Solar Data Analysis Center")
                    INTERSECT """
SPDFIntersect = """SELECT COUNT(DISTINCT SPASE_id) FROM (
                    SELECT DISTINCT SPASE_id FROM MetadataEntries 
                    WHERE (publisher LIKE "%SPDF" OR publisher LIKE 
                            "%Space Physics Data Facility")
                    INTERSECT """
# can pass any stmt to CountRemover to return records instead of counts of records
#SDAC_NumStmt = CountRemover(SDACStmt,0)
#SDACauthor = CountRemover(SDACIntersect,0) + CountRemover(authorStmt,1)
SDACauthor = SDACIntersect + CountRemover(authorStmt,1)
SPDFauthor = SPDFIntersect + CountRemover(authorStmt,1)
SDAC_AL1 = SDACIntersect + CountRemover(AL1Stmt,1)
SPDF_AL1 = SPDFIntersect + CountRemover(AL1Stmt,1)
SDAC_AL2 = SDACIntersect + CountRemover(AL2Stmt,1)
SPDF_AL2 = SPDFIntersect + CountRemover(AL2Stmt,1)


rows = execution(totalStmt)
for row in rows:
    print("There are " + str(row[0]) + " records total")

rows = execution(authorStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with at least one author")

rows = execution(pubStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a publisher")

rows = execution(pubYrStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a publication year")
    
rows = execution(datasetStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a dataset")
    
rows = execution(licenseStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a license")
    
rows = execution(urlStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a url")
    
rows = execution(NASAurlStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a NASA url")

rows = execution(PIStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a persistent identifier")
    
rows = execution(descStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with a description")
    
rows = execution(citationStmt)
for row in rows:
    print("There are " + str(row[0]) + " records with citation info")
    
rows = execution(complianceStmt)
for row in rows:
    print("There are " + str(row[0]) + " records that meet DCAT-US3 compliance")
    
rows = execution(atLeastOneStmt)
for row in rows:
    print("There are " + str(row[0]) + " records that have at least one desired field")
    
rows = execution(atLeastTwoStmt)
for row in rows:
    print("There are " + str(row[0]) + " records that have at least two desired fields")
    
rows = execution(allStmt)
for row in rows:
    print("There are " + str(row[0]) + " records that have all desired fields")
    
rows = execution(SDACStmt)
for row in rows:
    print("There are " + str(row[0]) + " records published by SDAC")

# if need links instead of count use this print
#rows = execution(SDAC_NumStmt)
#for row in rows:
 #   print(row)
    
rows = execution(SPDFStmt)
for row in rows:
    print("There are " + str(row[0]) + " records published by SPDF")
    
rows = execution(SDACauthor)
for row in rows:
    #print(row) in case need links
    print("There are " + str(row[0]) + " records with at least one author published by SDAC")
    
rows = execution(SPDFauthor)
for row in rows:
    print("There are " + str(row[0]) + " records with at least one author published by SPDF")
    
rows = execution(SDAC_AL1)
for row in rows:
    print("There are " + str(row[0]) + " records with at least one desired field published by SDAC")
    
rows = execution(SPDF_AL1)
for row in rows:
    print("There are " + str(row[0]) + " records with at least one desired field published by SPDF")
    
rows = execution(SDAC_AL2)
for row in rows:
    print("There are " + str(row[0]) + " records with at least two desired fields published by SDAC")
    
rows = execution(SPDF_AL2)
for row in rows:
    print("There are " + str(row[0]) + " records with at least two desired fields published by SPDF")

There are 3017 records total
There are 2890 records with at least one author
There are 3017 records with a publisher
There are 1864 records with a publication year
There are 3017 records with a dataset
There are 0 records with a license
There are 3017 records with a url
There are 2236 records with a NASA url
There are 1853 records with a persistent identifier
There are 3017 records with a description
There are 1864 records with citation info
There are 1853 records that meet DCAT-US3 compliance
There are 1870 records that have at least one desired field
There are 1853 records that have at least two desired fields
There are 0 records that have all desired fields
There are 7 records published by SDAC
There are 2170 records published by SPDF
There are 7 records with at least one author published by SDAC
There are 2165 records with at least one author published by SPDF
There are 6 records with at least one desired field published by SDAC
There are 1676 records with at least one desired fiel

In [1]:
import sqlite3
from SQLiteFun import SDAC_records

SDAC_records("1996")

There are 5 records published by SDAC in the year 1996


In [15]:
# unfinished!
desiredFields = ["(author NOT LIKE '' AND dataset NOT LIKE '' AND publicationYr NOT LIKE '' AND publisher NOT LIKE '')",
                 "(description NOT LIKE '' AND dataset NOT LIKE '' AND PI NOT LIKE '')",
                 "PI NOT LIKE ''", "license LIKE '%cc0%' OR license LIKE '%Creative Commons Zero v1.0 Universal%'"]
def getCombos(n):
    combos = []
    for i in range(len(desiredFields)):
        for j in range(i+1,len(desiredFields)):
            combo = desiredFields[i] + "AND" + desiredFields[j] + ")"
            combos.append(combo)
    return combos

# test of getCombos
AL2StmtTest = """SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE ("""
k=0
combos = getCombos(2)
for combo in combos:
    AL2StmtTest += combos[k] + " OR ("
    k += 1
AL2StmtTest += ";"
print(AL2StmtTest)

SELECT COUNT(DISTINCT SPASE_id) FROM MetadataEntries 
                WHERE ((author NOT LIKE '' AND dataset NOT LIKE '' AND publicationYr NOT LIKE '' AND publisher NOT LIKE '')AND(description NOT LIKE '' AND dataset NOT LIKE '' AND PI NOT LIKE '')) OR ((author NOT LIKE '' AND dataset NOT LIKE '' AND publicationYr NOT LIKE '' AND publisher NOT LIKE '')ANDPI NOT LIKE '') OR ((author NOT LIKE '' AND dataset NOT LIKE '' AND publicationYr NOT LIKE '' AND publisher NOT LIKE '')ANDlicense LIKE '%cc0%' OR license LIKE '%Creative Commons Zero v1.0 Universal%') OR ((description NOT LIKE '' AND dataset NOT LIKE '' AND PI NOT LIKE '')ANDPI NOT LIKE '') OR ((description NOT LIKE '' AND dataset NOT LIKE '' AND PI NOT LIKE '')ANDlicense LIKE '%cc0%' OR license LIKE '%Creative Commons Zero v1.0 Universal%') OR (PI NOT LIKE ''ANDlicense LIKE '%cc0%' OR license LIKE '%Creative Commons Zero v1.0 Universal%') OR (;
