In [1]:
"""
1. lift table from text

2. extract study ID from text

3. get metadata from NOAA API with study ID

"""

import requests
import re
from collections import OrderedDict


# get metadata from extracted study ID
def getMetaData (url):    
    txt_data = requests.get(url).text
    txt_data = txt_data.replace('\r', '')  # remove "\r" in text
    splited_data = txt_data.split('\n')   
    
#     table = getTable(splited_data)
    
    # find "study" to get study id: 5 digits after "study"
    for line in splited_data:
        if "study" in line.lower():
            study_line = line.lower().partition("study")[2]
            temp = re.search('\d{% s}'% 5, study_line)
            if temp:
                study_id = temp.group(0)
#                 print(study_id)
                
            #    NOAA API
                api = "https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=" + study_id

                return requests.get(api).json()


def getTable (url):
    txt_data = requests.get(url).text
    txt_data = txt_data.replace('\r', '')  # remove "\r" in text
    splited_data = txt_data.split('\n')   
    
    length = len(splited_data)
    
    # remove the ‘#’ in front
    for i in range(length):
        if splited_data[i].startswith("#"):
            splited_data[i] = splited_data[i][1:].lstrip()          
   
  
    """
    tabular data: 1. tab seperated 2. the number of columns is the same in the table.     
    """
    
    # if tab seperated, get the number of tab seperated words (the number of columns)
    num_cols = []
    for i in range(length):
        if "\t" in splited_data[i]:
            splited_data[i] = splited_data[i].split('\t')
            num_cols.append(len(splited_data[i]))
        else:
            num_cols.append(0)
            
    # get start index of table with # of columns            
    start_index = []
    for i, num in enumerate(num_cols):
        if num > 2 and i != len(num_cols)-1 and num == num_cols[i+1] and num != num_cols[i-1] or num>2 and i == 0 and num ==num_cols[i+1]:
            start_index.append(i)
            
    # get tabular data    
    tables = []
    for i in start_index:            
        
        table = [splited_data[i]]
        num = len(splited_data[i])
        for j in range(i+1, len(splited_data)):            
            if len(splited_data[j]) == num:      
                
                table.append(splited_data[j])
            else:
                break
                
        if len(table) > 2: # # of raw > 2
            tables.append(table)       
        
        # TBC: differentiate types of table: chronological and paleo information        
            
    return tables[-1]  # TBC: need to update for multiple table, missing value: change        
            


In [2]:
"""
#. Convert metadata to LiPD
1) geo data
2) publication
3) paleodata

#. reference
lipd.net
https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016-discussion.html#discussion

LiPD utilities in Python: load, manipulate and write out lipd file
https://github.com/nickmckay/LiPD-utilities

https://lipdverse.org
https://lipdverse.org/Temp12k/current_version/CangoCave.Talma.1992.html
"""

def getLiPD (table, metadata, url):    
    # convert metadata to LiPD
    
    D = dict()   

   
    D['geo'] = getGeo(metadata['study'][0]['site'], url)
    
    # use original NOAA metadata: can't decide lipd 'pub' format yet (TBC)
    D['pub'] = metadata['study'][0]['publication']
#     D['pub'] = getPub(metadata['study'][0]['publication'])
    
    #    # check # of columns are same => problem: no info in NOAA api about notes column in text      
    D['paleoData'] = getPaleoData (table, metadata['study'][0]['site'], url)

    D['archiveType'] = "TBC" # needs a converter for NOAA paleo data type, see issue #3, e.g. 'Speleothem'
    # Map NOAA data type to the archive type used in LinkedEarth
    
    D["createdBy"] = "NOAAconverter"
    
    D["datatSetName"] = "TBC"
    # SiteName.Author.Year. From the NOAA API, lift as: siteName, Last name of first author in contributor field, most recent "pubYear"
    
#   originalDataUrl": "onlineResourceLink"
    D["originalDataUrl"] = metadata['study'][0]['onlineResourceLink']
        
    return D

In [3]:
# updated
def getGeo(noaa_site, url):
    if len(noaa_site) == 1:
        noaa_geo = noaa_site[0]
    else: 
        for site in noaa_site:
            if site['paleoData'][0]['dataFile'][0]['fileUrl'] == url:
                noaa_geo = site                
  
    
    lipd_geo = {"type": noaa_geo['geo']['geoType'], 
                        "geometry": {"type": noaa_geo['geo']['geometry']['type'].capitalize(), 
                                     "coordinates": noaa_geo['geo']['geometry']['coordinates']
                                    + [noaa_geo['geo']['properties']['maxElevationMeters']]
                                    }}
    
    tmp = noaa_geo['geo']['properties'] 
    
    # add other properties: e.g. 'NOAASiteId', 'siteName', 'siteCode', 'mappable','locationName'
    for key, value in noaa_geo.items():
        if key != 'geo' and key != 'paleoData':
            tmp[key] = value   
    
    lipd_geo["properties"] = tmp       

    
    return lipd_geo   
    

In [4]:
# # needs to update
# def getPub(noaa_pubs):
#     lipd_pub = []
    
#     for noaa_pub in noaa_pubs:
#         tmp_dict =         
    
    
#     {"pub": {
#         "author": getAuthor(noaa_pub['author']),
#         "type" : noaa_pub['type'],
#         "identifier" : [
#             {"type": noaa_pub['identifier']['type'],
#              "id": noaa_pub['identifier']['id'],
#              "url": noaa_pub['identifier']['url']}],
#         "year": noaa_pub['pubYear']}}
    
#     return lipd_pub
    

In [5]:
def getPaleoData (table, noaa_site, url): 
    paleodata = OrderedDict()
    paleodata['paleo0'] = OrderedDict()
    paleodata['paleo0']['measurementTable'] = OrderedDict()
    paleodata['paleo0']['measurementTable']['paleo0measurement0'] = dict()
    
    paleodata['paleo0']['measurementTable']['paleo0measurement0']['tableName'] = 'paleo0measurement0'
    paleodata['paleo0']['measurementTable']['paleo0measurement0']['missingValue'] = 'nan' 
    paleodata['paleo0']['measurementTable']['paleo0measurement0']['filename'] = None
    
    paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'] = OrderedDict()
    
    
    # units    
    # updated: remove recursions
    if len(noaa_site) == 1:
        for item in noaa_site[0]['paleoData']:
            if item['dataFile'][0]['fileUrl'] == url:
                noaa_paleo = item['dataFile'][0]
    else: 
        for site in noaa_site:
            if site['paleoData'][0]['dataFile'][0]['fileUrl'] == url:
                noaa_paleo = site['paleoData'][0]['dataFile'][0]     
    
    variables = []    
    for variable in noaa_paleo['variables']:
        variables.append(variable['cvUnit'].split('>')[-1])
    
    for i, name in enumerate(table[0]):
        paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'][name] = dict()
        
        paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'][name]['number'] = i+1
        paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'][name]['variableName'] = name
               
        try:
            paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'][name]['units'] = variables[i]
        except:
            paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'][name]['units'] = None
        
    
        # value
        value_list = []
        for string in list(zip(*table[1:]))[i]:
            try:
                # change string 
                num = float(string) # 
                
                '''need to update for checking missing value, missing value=-999.9 (TBC)'''
                if num == -999.9:                     
                    value_list.append('nan')
                else:
                    value_list.append(string)
            except:
                value_list.append(string)
        paleodata['paleo0']['measurementTable']['paleo0measurement0']['columns'][name]['value'] = value_list
    
    return paleodata

In [6]:
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url2 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"

# https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=24890
# https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=16055

In [13]:
metadata1 = getMetaData (url1)

In [9]:
table1 = getTable (url1)

In [10]:
metadata2 = getMetaData (url2)

In [11]:
table2 = getTable (url2)

In [14]:
D1 = getLiPD (table1, metadata1, url1)

In [15]:
import lipd

In [16]:
lipd.extractTs(D1)

extracting paleoData...
Error: Unable to extractTs: local variable '_dsn' referenced before assignment


[]

In [17]:
D2 = getLiPD (table2, metadata2, url2)
ts2 = lipd.extractTs(D2)

extracting paleoData...
Error: Unable to extractTs: local variable '_dsn' referenced before assignment


In [18]:
D1

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point', 'coordinates': ['27.47', '-112.1', '-881']},
  'properties': {'southernmostLatitude': '27.47',
   'northernmostLatitude': '27.47',
   'westernmostLongitude': '-112.1',
   'easternmostLongitude': '-112.1',
   'minElevationMeters': '-881',
   'maxElevationMeters': '-881',
   'NOAASiteId': '57715',
   'siteName': 'JPC-56',
   'siteCode': None,
   'mappable': 'Y',
   'locationName': 'Ocean>Pacific Ocean>Eastern Pacific Ocean'}},
 'pub': [{'author': {'name': 'Tripti Bhattacharya, Jessica E. Tierney, Jason A. Addison, James W. Murray'},
   'pubYear': 2018,
   'title': 'Ice-sheet modulation of deglacial North American monsoon intensification',
   'journal': 'Nature Geoscience',
   'volume': None,
   'edition': None,
   'issue': None,
   'pages': None,
   'reportNumber': None,
   'citation': 'Tripti Bhattacharya, Jessica E. Tierney, Jason A. Addison, James W. Murray. 2018. Ice-sheet modulation of deglacial North American monsoon inten

In [19]:
D2

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point', 'coordinates': ['6.3', '125.83', '-2114']},
  'properties': {'southernmostLatitude': '6.3',
   'northernmostLatitude': '6.3',
   'westernmostLongitude': '125.83',
   'easternmostLongitude': '125.83',
   'minElevationMeters': '-2114',
   'maxElevationMeters': '-2114',
   'NOAASiteId': '19265',
   'siteName': 'MD98-2181',
   'siteCode': None,
   'mappable': 'Y',
   'locationName': 'Ocean>Pacific Ocean>Western Pacific Ocean'}},
 'pub': [{'author': None,
   'pubYear': 2004,
   'title': 'Decline of surface temperature and salinity in the western tropical Pacific Ocean in the Holocene epoch',
   'journal': 'Nature',
   'volume': None,
   'edition': None,
   'issue': None,
   'pages': None,
   'reportNumber': None,
   'citation': 'Stott, L.D., K.G. Cannariato, R.C. Thunell, G.H. Haug, A. Koutavas, and S.P. Lund. 2004. Decline of surface temperature and salinity in the western tropical Pacific Ocean in the Holocene epoch. Nature, v. 4