In [1]:
"""
1. lift table from text

2. extract study ID from text

3. get metadata from NOAA API with study ID

"""

import requests
import re
from collections import OrderedDict


# get metadata from extracted study ID
def getMetaData(url):
    """Download a NOAA paleo text file and look up its study metadata.

    The file at ``url`` is fetched and split into lines, the data table is
    lifted from those lines with ``getTable``, and the first run of five
    digits found after the word "study" (case-insensitive) is taken as the
    NOAA study ID. That ID is then used to query the NOAA paleo-search API.

    Parameters
    ----------
    url : str
        Address of the text file to download.

    Returns
    -------
    tuple or None
        ``(table, metadata)``, where ``table`` is the value returned by
        ``getTable`` and ``metadata`` is the decoded JSON body of the API
        response. ``None`` when no line contains "study" followed by five
        consecutive digits.
    """
    txt_data = requests.get(url).text
    txt_data = txt_data.replace('\r', '')  # remove "\r" in text
    splited_data = txt_data.split('\n')

    # getTable may modify the list it receives in place (tab-separated lines
    # can be replaced by lists of cells). Hand it a copy so the scan below
    # always sees plain strings and never calls .lower() on a list.
    table = getTable(list(splited_data))

    # find "study" to get study id: 5 digits after "study"
    study_id_pattern = re.compile(r'\d{5}')
    for line in splited_data:
        lowered = line.lower()
        if "study" not in lowered:
            continue

        match = study_id_pattern.search(lowered.partition("study")[2])
        if match:
            study_id = match.group(0)

            # NOAA API
            api = "https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=" + study_id
            return table, requests.get(api).json()

    # No study ID anywhere in the file.
    return None


def getTable(splited_data):
    """Lift the last tab-separated table out of a list of text lines.

    A leading "#" (and the whitespace after it) is stripped from every line
    first. A table is a run of more than two consecutive lines that all
    contain tabs and all split into the same number (> 2) of tab-separated
    cells. The first line of the run is treated like any other row, so a
    header line is returned as the first row.

    The input list is not modified.

    Parameters
    ----------
    splited_data : list of str
        Lines of the text file, without line terminators.

    Returns
    -------
    list of list of str
        The rows of the last table found, each row being its list of cells.

    Raises
    ------
    IndexError
        If the lines contain no table.
    """
    # Remove the '#' in front. New lists are built so the caller's list is left untouched.
    stripped = [
        line[1:].lstrip() if line.startswith("#") else line
        for line in splited_data
    ]

    # Split tab-separated lines into cells and record each line's column
    # count; lines without a tab count as 0 columns.
    rows = [line.split('\t') if "\t" in line else line for line in stripped]
    num_cols = [len(row) if isinstance(row, list) else 0 for row in rows]

    # A table starts where a line has more than two columns, the next line
    # has the same column count and the previous line (if any) does not.
    last = len(num_cols) - 1
    start_index = []
    for i, num in enumerate(num_cols):
        if num <= 2 or i == last:
            continue
        if num != num_cols[i + 1]:
            continue
        if i == 0 or num != num_cols[i - 1]:
            start_index.append(i)

    # Collect the rows of every candidate table.
    tables = []
    for i in start_index:
        num = num_cols[i]
        table = [rows[i]]
        for j in range(i + 1, len(rows)):
            # Compare column counts, not len(): a plain text line whose
            # character count happens to equal `num` is not a table row.
            if num_cols[j] != num:
                break
            table.append(rows[j])

        if len(table) > 2:  # number of rows > 2
            tables.append(table)

        # TBC: differentiate types of table: chronological and paleo information

    if not tables:
        raise IndexError("no tab-separated table found in the given lines")

    return tables[-1]  # TBC: need to update for multiple tables
            


In [2]:
"""
#. Convert metadata to LiPD
1) geo data
2) publication
3) paleodata

#. reference
lipd.net
https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016-discussion.html#discussion

LiPD utilities in Python: load, manipulate and write out lipd file
https://github.com/nickmckay/LiPD-utilities

https://lipdverse.org
https://lipdverse.org/Temp12k/current_version/CangoCave.Talma.1992.html
"""

def getLiPD(table, metadata, url):
    """Assemble a LiPD-style dictionary from a data table and NOAA metadata.

    Only the first entry of ``metadata['study']``, and within it the first
    site and the first publication, are used.

    Parameters
    ----------
    table : list of list
        Table rows; passed through to ``getPaleoData``.
    metadata : dict
        Decoded NOAA study-search response. Must provide
        ``['study'][0]['site'][0]['geo']``, ``['study'][0]['publication'][0]``
        and ``['study'][0]['contactInfo']['dataCenterUrl']``.
    url : str
        Address of the data file; passed through to ``getPaleoData``.

    Returns
    -------
    dict
        Keys ``geo``, ``pub``, ``paleoData``, ``archiveType``, ``createdBy``,
        ``dataSetName`` and ``originalDataUrl``.
    """
    study = metadata['study'][0]

    D = dict()

    # NOTE(review): getGeo and getPub each return a dict that is already keyed
    # by 'geo' / 'pub', so the values end up at D['geo']['geo'] and
    # D['pub']['pub'] -- confirm whether that nesting is intended.
    D['geo'] = getGeo(study['site'][0]['geo'])
    D['pub'] = getPub(study['publication'][0])

    # Open problem: the NOAA API carries no information about a notes column
    # in the text file, so the number of columns may not match.
    D['paleoData'] = getPaleoData(table, metadata, url)

    # Needs a converter that maps the NOAA data type to the archive type used
    # in LinkedEarth (see issue #3), e.g. 'Speleothem'.
    D['archiveType'] = "TBC"

    D["createdBy"] = "NOAAconverter"

    # Planned format: SiteName.Author.Year. From the NOAA API, lift as:
    # siteName, last name of first author in the contributor field, most
    # recent "pubYear".
    # The key was previously misspelled "datatSetName".
    D["dataSetName"] = "TBC"

    D["originalDataUrl"] = study['contactInfo']['dataCenterUrl']

    return D

In [3]:
# needs to update
def getGeo(noaa_geo):
    """Re-shape a NOAA ``geo`` entry into the converter's geo dictionary.

    Parameters
    ----------
    noaa_geo : dict
        Must provide ``geoType``, ``geometry`` (with ``type`` and
        ``coordinates``) and ``properties`` (with ``minElevationMeters``).

    Returns
    -------
    dict
        ``{"geo": {"type", "geometry", "properties"}}``. The geometry type is
        capitalized, the coordinates are a new list with the minimum
        elevation appended, and ``properties`` is the input's own
        ``properties`` object (not a copy).
    """
    feature_type = noaa_geo['geoType']

    geometry = noaa_geo['geometry']
    geometry_type = geometry['type'].capitalize()

    # List concatenation builds a fresh list, so the input's coordinates are
    # left as they were.
    elevation = noaa_geo['properties']['minElevationMeters']
    coordinates = geometry['coordinates'] + [elevation]

    feature = {}
    feature["type"] = feature_type
    feature["geometry"] = {"type": geometry_type, "coordinates": coordinates}
    feature["properties"] = noaa_geo['properties']

    return {"geo": feature}
    


In [4]:
# needs to update
def getPub(noaa_pub):
    """Re-shape a NOAA publication entry into the converter's pub dictionary.

    Parameters
    ----------
    noaa_pub : dict
        Must provide ``author``, ``type``, ``identifier`` (with ``type``,
        ``id`` and ``url``) and ``pubYear``.

    Returns
    -------
    dict
        ``{"pub": {"author", "type", "identifier", "year"}}`` where ``author``
        is the list produced by ``getAuthor`` and ``identifier`` is a
        one-element list.
    """
    authors = getAuthor(noaa_pub['author'])
    publication_type = noaa_pub['type']

    source_identifier = noaa_pub['identifier']
    identifier = {
        "type": source_identifier['type'],
        "id": source_identifier['id'],
        "url": source_identifier['url'],
    }

    publication = {}
    publication["author"] = authors
    publication["type"] = publication_type
    publication["identifier"] = [identifier]
    publication["year"] = noaa_pub['pubYear']

    return {"pub": publication}
    

In [5]:
def getPaleoData(table, metadata, url, missing_value=-999.9):
    """Build the ``paleoData`` section from a lifted table and NOAA metadata.

    Parameters
    ----------
    table : list of list
        Table rows. ``table[0]`` supplies the column names; the remaining
        rows supply the values.
    metadata : dict
        Decoded NOAA study-search response. The first dictionary found in it
        (via ``flatten2``) that has a value equal to ``url`` must hold a
        ``variables`` list whose items each have a ``cvUnit`` string.
    url : str
        Address of the data file, used to locate the matching entry in
        ``metadata``.
    missing_value : float, optional
        Numeric sentinel that is replaced by the string ``'nan'``. Defaults
        to ``-999.9`` (TBC: should eventually come from the file itself).

    Returns
    -------
    OrderedDict
        ``paleo0 -> measurementTable -> paleo0measurement0`` holding
        ``tableName``, ``missingValue``, ``filename`` and ``columns``. Each
        column has ``number`` (1-based), ``variableName``, ``units`` (``None``
        when the metadata lists fewer variables than the table has columns)
        and ``value``.
    """
    measurement = dict()
    measurement['tableName'] = 'paleo0measurement0'
    measurement['missingValue'] = 'nan'
    measurement['filename'] = None

    columns = OrderedDict()
    measurement['columns'] = columns

    # units: the text after the last '>' of each variable's cvUnit
    units = [
        variable['cvUnit'].split('>')[-1]
        for variable in flatten2(metadata, url)[0]['variables']
    ]

    # Transpose the data rows once instead of once per column.
    data_columns = list(zip(*table[1:]))

    for i, name in enumerate(table[0]):
        value_list = []
        for string in data_columns[i]:
            try:
                num = float(string)
            except (TypeError, ValueError):
                # Not numeric: keep the raw cell text.
                value_list.append(string)
                continue

            value_list.append('nan' if num == missing_value else num)

        columns[name] = {
            'number': i + 1,
            'variableName': name,
            'units': units[i] if i < len(units) else None,
            'value': value_list,
        }

    paleodata = OrderedDict()
    paleodata['paleo0'] = OrderedDict()
    paleodata['paleo0']['measurementTable'] = OrderedDict()
    paleodata['paleo0']['measurementTable']['paleo0measurement0'] = measurement

    return paleodata

In [6]:
def getAuthor(author):
    """Turn a NOAA author entry into a list of ``{"name": ...}`` dictionaries.

    The entry is flattened with ``flatten``; every truthy value is split on
    commas and each piece, stripped of surrounding whitespace, becomes one
    name. A falsy value (for example ``None``) yields a single
    ``{"name": None}`` entry.
    """
    entries = []
    for candidate in flatten(author):
        if not candidate:
            entries.append({"name": None})
            continue

        entries.extend({"name": piece.strip()} for piece in candidate.split(','))

    return entries

In [7]:
# get values from nested dict
def flatten(value):
    """Return the leaf values of a nested dictionary as a flat list.

    A non-dictionary ``value`` is returned wrapped in a one-element list.
    Lists found inside the dictionary count as leaves and are not expanded.
    """
    if not isinstance(value, dict):
        return [value]

    leaves = []
    helper(value, leaves)
    return leaves


def helper(dict2, item):
    """Append every non-dict value of ``dict2`` to ``item``, depth first.

    Nested dictionaries are descended into in iteration order. ``item`` is
    extended in place and also returned.
    """
    for entry in dict2.values():
        if not isinstance(entry, dict):
            item.append(entry)
            continue
        helper(entry, item)

    return item

# find specific value from nested dict, list
def flatten2(value, url):
    """Find every dictionary inside ``value`` that has a value equal to ``url``.

    Parameters
    ----------
    value : object
        Usually a nested structure of dictionaries and lists.
    url : object
        The value to look for.

    Returns
    -------
    list
        For a dictionary input: the dictionaries (the objects themselves, not
        copies) that directly hold a value equal to ``url``, in depth-first
        order; a dictionary appears once per matching value. For any other
        input: ``[value]`` if it equals ``url``, otherwise an empty list.
    """
    # Start from an empty list so a non-matching, non-dict input returns []
    # instead of failing on an unassigned local.
    item = []
    if isinstance(value, dict):
        helper2(value, item, url)
    elif value == url:
        item.append(value)

    return item


def helper2(dict2, item, url):
    """Search ``dict2`` for values equal to ``url``, collecting into ``item``.

    Nested dictionaries are searched recursively and lists are handed to
    ``helper3``. ``dict2`` itself is appended to ``item`` for each of its
    other values that equals ``url``. Returns ``item``.
    """
    for key, value in dict2.items():
        if isinstance(value, dict):
            helper2(value, item, url)
        elif isinstance(value, list):
            helper3(value, item, url)
        elif value == url:
            item.append(dict2)

    return item


def helper3(list2, item, url):
    """Search every dictionary element of ``list2`` with ``helper2``.

    Elements that are not dictionaries (including nested lists) are skipped.
    """
    for value in list2:
        if isinstance(value, dict):
            helper2(value, item, url)


In [8]:
# Example NOAA text files used by the cells below to exercise the converter.
url = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"



In [9]:
# Download the first example file: lift its data table and fetch its NOAA study metadata.
table, metadata = getMetaData (url)

In [10]:
# Same for the second example file.
table1, metadata1 = getMetaData (url1)

In [11]:
# Convert the first example to a LiPD-style dictionary; the cell output is its repr.
getLiPD (table, metadata, url)

{'geo': {'geo': {'type': 'Feature',
   'geometry': {'type': 'Point',
    'coordinates': ['27.4835', '-112.0743', '-881']},
   'properties': {'southernmostLatitude': '27.4835',
    'northernmostLatitude': '27.4835',
    'westernmostLongitude': '-112.0743',
    'easternmostLongitude': '-112.0743',
    'minElevationMeters': '-881',
    'maxElevationMeters': '-881'}}},
 'pub': {'pub': {'author': [{'name': 'Tripti Bhattacharya'},
    {'name': 'Jessica E. Tierney'},
    {'name': 'Jason A. Addison'},
    {'name': 'James W. Murray'}],
   'type': 'publication',
   'identifier': [{'type': 'doi',
     'id': '10.1038/s41561-018-0220-7',
     'url': 'http://dx.doi.org/10.1038/s41561-018-0220-7'}],
   'year': 2018}},
 'paleoData': OrderedDict([('paleo0',
               OrderedDict([('measurementTable',
                             OrderedDict([('paleo0measurement0',
                                           {'tableName': 'paleo0measurement0',
                                            'missingValu

In [12]:
# Convert the second example to a LiPD-style dictionary; the cell output is its repr.
getLiPD (table1, metadata1, url1)

{'geo': {'geo': {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': ['6.3', '125.83', '-2114']},
   'properties': {'southernmostLatitude': '6.3',
    'northernmostLatitude': '6.3',
    'westernmostLongitude': '125.83',
    'easternmostLongitude': '125.83',
    'minElevationMeters': '-2114',
    'maxElevationMeters': '-2114'}}},
 'pub': {'pub': {'author': [{'name': None}],
   'type': 'publication',
   'identifier': [{'type': 'doi',
     'id': '10.1038/nature0293',
     'url': 'http://dx.doi.org/10.1038/nature0293'}],
   'year': 2004}},
 'paleoData': OrderedDict([('paleo0',
               OrderedDict([('measurementTable',
                             OrderedDict([('paleo0measurement0',
                                           {'tableName': 'paleo0measurement0',
                                            'missingValue': 'nan',
                                            'filename': None,
                                            'columns': OrderedDict([('depth_cm',
  