In [1]:
"""
1. lift table from text

2. extract study ID from text

3. get metadata from NOAA API with study ID

"""

import requests
import re
from collections import OrderedDict
import time

# get metadata from extracted study ID
def getMetaData (url, max_retries=10, delay=3):
    """Download a NOAA paleo text file and fetch the matching study metadata.

    The text at ``url`` is downloaded, its tables are detected with
    ``getTable``, and the first paleo table is kept.  The text is then
    scanned for the first line containing "study" followed by five digits;
    those digits are used as the NOAA study ID to query the NOAA study
    search API.

    Args:
        url: Address of the NOAA text data file.
        max_retries: Maximum number of download attempts before giving up
            (the download is retried while the response is an HTML page).
        delay: Seconds to sleep before every download attempt.

    Returns:
        A ``(table, metadata)`` tuple: the first paleo table (list of rows,
        each row a list of strings) and the decoded JSON returned by the
        NOAA study search API.

    Raises:
        RuntimeError: if every attempt returned an HTML page instead of data.
        IndexError: if no paleo table was detected in the text.
        ValueError: if no five-digit study ID follows the word "study".
    """
    txt_data = None
    for _ in range(max_retries):
        # Pause before each request, including the first one.
        time.sleep(delay)
        response_text = requests.get(url).text
        # An HTML document here means an error page was served, not the data.
        if not response_text.startswith("<!DOCTYPE"):
            txt_data = response_text
            break

    if txt_data is None:
        raise RuntimeError(
            "got an HTML page instead of data after %d attempts: %s" % (max_retries, url))

    # Normalise line endings, then work line by line.
    splited_data = txt_data.replace('\r', '').split('\n')

    paleo_tables, chron_tables = getTable(splited_data)

    # TBC: multi tables
    if not paleo_tables:
        raise IndexError("no paleo data table found in " + url)
    table = paleo_tables[0]

    # find "study" to get study id: 5 digits after "study"
    study_id_pattern = re.compile(r'\d{5}')
    for line in splited_data:
        lowered = line.lower()
        if "study" not in lowered:
            continue

        match = study_id_pattern.search(lowered.partition("study")[2])
        if match:
            study_id = match.group(0)

            # NOAA API
            api = "https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=" + study_id
            metadata = requests.get(api).json()

            return table, metadata

    # Fail loudly instead of implicitly returning None to an unpacking caller.
    raise ValueError("no 5-digit NOAA study ID found in " + url)


In [2]:
"""
# updated
1. differentiate types of table: chronological and paleo information
2. change missing value -> "nan"
"""

def getTable (splited_data):
    """Detect tabular blocks in the lines of a NOAA text file.

    A table is a run of at least four consecutive lines that all have the
    same number of columns (three or more).  Columns are tab separated when
    the first line of the run contains a tab, otherwise whitespace separated.
    A leading '#' (and the whitespace after it) is ignored on every line.

    The lines between a table and the nearest preceding line containing
    "-----" are treated as its header: if one of them mentions "chronology"
    the table is classified as chronological, and if one mentions both
    "missing" and "value" the text after its last ':' is taken as the
    missing-value marker, whose occurrences in the data rows become 'nan'.

    Args:
        splited_data: List of text lines.  It is not modified.

    Returns:
        A ``(paleo_tables, chron_tables)`` tuple.  Each table is a list of
        rows and each row is a list of strings; row 0 is the header row.
    """
    # Strip the comment marker on a copy so the caller's list stays intact.
    lines = [line[1:].lstrip() if line.startswith("#") else line
             for line in splited_data]

    # differentiate types of table: chronological and paleo information
    chron_tables = []
    paleo_tables = []

    for start_index, end_index in _find_table_spans(lines):
        block = lines[start_index:end_index + 1]
        if '\t' in lines[start_index]:
            table = [line.split('\t') for line in block]
        else:
            table = [line.split() for line in block]

        is_chronology, missing_values = _scan_table_header(lines, start_index)

        # missing value => change to nan (once per detected marker)
        for missing_value in missing_values:
            _replace_missing(table, missing_value)

        if is_chronology:
            chron_tables.append(table)
        else:
            paleo_tables.append(table)

    return paleo_tables, chron_tables


def _count_columns(line, tabbed):
    """Return the number of tab-separated (or whitespace-separated) fields."""
    return len(line.split('\t')) if tabbed else len(line.split())


def _find_table_spans(lines, min_columns=3, min_rows=4):
    """Return (start, end) inclusive index pairs of runs with a constant column count."""
    spans = []
    total = len(lines)
    i = 0
    while i < total:
        # The separator is decided by the first line of the candidate run.
        tabbed = "\t" in lines[i]
        num = _count_columns(lines[i], tabbed)
        if num < min_columns:
            i += 1
            continue

        start_index = i  # candidate of the first row of the table
        i += 1
        while i < total and _count_columns(lines[i], tabbed) == num:
            i += 1
        end_index = i - 1

        if end_index - start_index + 1 >= min_rows:
            spans.append((start_index, end_index))
    return spans


def _scan_table_header(lines, start_index):
    """Walk upwards from a table to the "-----" rule; return (is_chronology, missing_values)."""
    is_chronology = False
    missing_values = []
    # Stop value -1 so that line 0 is inspected as well.
    for k in range(start_index - 1, -1, -1):
        if "-----" in lines[k]:
            break

        lower = lines[k].lower()
        if "chronology" in lower:
            is_chronology = True

        if "missing" in lower and "value" in lower:
            missing_values.append(lower.split(":")[-1].strip())
    return is_chronology, missing_values


def _replace_missing(table, missing_value):
    """Replace cells of the data rows equal to missing_value (as text or number) with 'nan'."""
    try:
        missing_number = float(missing_value)
    except ValueError:
        missing_number = None  # marker is not numeric: compare as text only

    for row in table[1:]:  # row 0 is the header
        for column_i, cell in enumerate(row):
            if cell.lower() == missing_value:
                row[column_i] = 'nan'
            elif missing_number is not None:
                try:
                    # Catches e.g. "-999.0" when the marker is "-999".
                    if float(cell) == missing_number:
                        row[column_i] = 'nan'
                except ValueError:
                    pass  # non-numeric cell, cannot be the numeric marker

In [3]:
"""
#. Convert metadata to LiPD
1) geo data
2) publication
3) paleodata

#. reference
lipd.net
https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016-discussion.html#discussion

LiPD utilities in Python: load, manipulate and write out lipd file
https://github.com/nickmckay/LiPD-utilities

https://lipdverse.org
https://lipdverse.org/Temp12k/current_version/CangoCave.Talma.1992.html
"""

def getLiPD (table, metadata, url):
    """Assemble a LiPD-style dictionary from a data table and NOAA metadata.

    Only the first entry of ``metadata['study']`` is used.

    Args:
        table: Data table (list of rows, row 0 being the header).
        metadata: Decoded JSON from the NOAA study search API.
        url: Address of the data file the table was lifted from; used to
            pick the matching site / data file in the metadata.

    Returns:
        dict with the keys 'geo', 'pub', 'paleoData', 'archiveType',
        'createdBy', 'dataSetName' and 'originalDataUrl'.
    """
    study = metadata['study'][0]

    geo = getGeo(study['site'], url)

    # use original NOAA metadata: can't decide lipd 'pub' format yet (TBC)
    publications = study['publication']

    # check # of columns are same => problem: no info in NOAA api about notes column in text
    paleo_data = getPaleoData (table, study['site'], url)

    dataset_name = getDataSetName(geo['properties']['siteName'], publications,
                                  study['investigators'])

    return {
        'geo': geo,
        'pub': publications,
        'paleoData': paleo_data,
        # needs a converter for NOAA paleo data type, see issue #3, e.g. 'Speleothem'
        # Map NOAA data type to the archive type used in LinkedEarth
        'archiveType': "TBC",
        "createdBy": "NOAAconverter",
        "dataSetName": dataset_name,
        "originalDataUrl": study['onlineResourceLink'],
    }

In [4]:
# updated
def getGeo(noaa_site, url):
    """Build the LiPD 'geo' entry from the NOAA site metadata.

    Args:
        noaa_site: List of NOAA site dictionaries.  With a single site that
            site is used; otherwise the site whose first paleoData data file
            has ``fileUrl == url`` is used (the last one if several match).
        url: Address of the data file being converted.

    Returns:
        dict with 'type', 'geometry' (type and coordinates, with the site's
        maxElevationMeters appended to the coordinates) and 'properties'
        (the NOAA geo properties plus the remaining top-level site fields).
        The input metadata is left unmodified.

    Raises:
        ValueError: if no site matches ``url``.
    """
    noaa_geo = None
    if len(noaa_site) == 1:
        noaa_geo = noaa_site[0]
    else:
        for site in noaa_site:
            if site['paleoData'][0]['dataFile'][0]['fileUrl'] == url:
                noaa_geo = site

    if noaa_geo is None:
        raise ValueError("no NOAA site found for data file " + str(url))

    geo = noaa_geo['geo']

    lipd_geo = {"type": geo['geoType'],
                "geometry": {"type": geo['geometry']['type'].capitalize(),
                             # list concatenation builds a new list
                             "coordinates": geo['geometry']['coordinates']
                             + [geo['properties']['maxElevationMeters']]
                             }}

    # Copy so the extra keys below do not leak into the caller's metadata.
    properties = dict(geo['properties'])

    # add other properties: e.g. 'NOAASiteId', 'siteName', 'locationName'
    skipped_keys = ('geo', 'paleoData', 'siteCode', 'mappable')
    for key, value in noaa_geo.items():
        if key not in skipped_keys:
            properties[key] = value

    lipd_geo["properties"] = properties

    return lipd_geo
    

In [5]:
# # needs to update
# def getPub(noaa_pubs):
#     lipd_pub = []
    
#     for noaa_pub in noaa_pubs:
#         tmp_dict =         
    
    
#     {"pub": {
#         "author": getAuthor(noaa_pub['author']),
#         "type" : noaa_pub['type'],
#         "identifier" : [
#             {"type": noaa_pub['identifier']['type'],
#              "id": noaa_pub['identifier']['id'],
#              "url": noaa_pub['identifier']['url']}],
#         "year": noaa_pub['pubYear']}}
    
#     return lipd_pub
    

In [6]:
def getPaleoData (table, noaa_site, url):
    """Build the LiPD 'paleoData' entry for a single measurement table.

    Args:
        table: List of rows (lists of strings); row 0 holds the column names.
        noaa_site: List of NOAA site dictionaries.  With a single site, each
            of its paleoData items is checked; otherwise only the first
            paleoData item of every site is checked.  In both cases the
            first data file's ``fileUrl`` must equal ``url`` (the last
            match wins).
        url: Address of the data file the table was lifted from.

    Returns:
        OrderedDict ``{'paleo0': {'measurementTable': {'paleo0measurement0':
        {...}}}}`` whose 'columns' entry maps every column name to its
        1-based 'number', 'variableName', 'units' (None when the NOAA
        metadata has no unit for that position) and 'value' list.

    Raises:
        ValueError: if no data file in the metadata matches ``url``.
    """
    # Locate the NOAA description of this data file.
    noaa_paleo = None
    if len(noaa_site) == 1:
        for item in noaa_site[0]['paleoData']:
            if item['dataFile'][0]['fileUrl'] == url:
                noaa_paleo = item['dataFile'][0]
    else:
        for site in noaa_site:
            if site['paleoData'][0]['dataFile'][0]['fileUrl'] == url:
                noaa_paleo = site['paleoData'][0]['dataFile'][0]

    if noaa_paleo is None:
        raise ValueError("no NOAA data file found for " + str(url))

    # units: keep the text after the last '>' of each cvUnit
    units = []
    for variable in noaa_paleo['variables']:
        cv_unit = variable['cvUnit']
        units.append(cv_unit.split('>')[-1] if cv_unit is not None else None)

    # Transpose the data rows once instead of once per column.
    column_values = list(zip(*table[1:]))

    columns = OrderedDict()
    for i, name in enumerate(table[0]):
        columns[name] = {
            'number': i + 1,
            'variableName': name,
            # The text table may have more columns than the metadata describes.
            'units': units[i] if i < len(units) else None,
            # A header-only table yields empty value lists.
            'value': list(column_values[i]) if i < len(column_values) else [],
        }

    measurement = dict()
    measurement['tableName'] = 'paleo0measurement0'
    measurement['missingValue'] = 'nan'
    measurement['filename'] = None
    measurement['columns'] = columns

    paleodata = OrderedDict()
    paleodata['paleo0'] = OrderedDict()
    paleodata['paleo0']['measurementTable'] = OrderedDict()
    paleodata['paleo0']['measurementTable']['paleo0measurement0'] = measurement

    return paleodata

In [7]:
def getDataSetName(sitename, pubs, investigators):
    """Compose the data set name as ``SiteName.Author.Year``.

    Args:
        sitename: Site name from the NOAA metadata.
        pubs: List of publication dictionaries, each with a 'pubYear' entry.
        investigators: Comma-separated investigator string; the text before
            the first comma is used as the author.

    Returns:
        The joined name.  The year is the largest truthy 'pubYear', or the
        string 'None' when no publication year exceeds 0.
    """
    # Seed with 0 so that an empty list or only non-positive years give 0.
    candidate_years = [0] + [pub['pubYear'] for pub in pubs if pub['pubYear']]
    latest = max(candidate_years)

    year_label = 'None' if latest == 0 else str(latest)
    first_author = investigators.split(',')[0]

    return '.'.join([sitename, first_author, year_label])

In [8]:
# Example NOAA paleo text data files that the driver cells below convert.
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url2 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"
url3 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2011/khider2011.txt"

# NOAA study search API queries, presumably for the three files above in the
# same order -- TODO confirm the pairing.
# https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=24890
# https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=16055
# https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=18315

In [9]:
# Download the first example file; returns its first paleo table and the NOAA study metadata.
table1, metadata1 = getMetaData (url1)

In [10]:
# Build the LiPD-style dictionary for the first example file.
D1 = getLiPD (table1, metadata1, url1)

# Pass the dictionary to the LiPD utilities' extractTs; the result is kept in ts1.
import lipd
ts1 = lipd.extractTs(D1)

extracting paleoData...
extracting: JPC-56.Bhattacharya.2018
Created time series: 7 entries


In [11]:
D1

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point', 'coordinates': ['27.47', '-112.1', '-881']},
  'properties': {'southernmostLatitude': '27.47',
   'northernmostLatitude': '27.47',
   'westernmostLongitude': '-112.1',
   'easternmostLongitude': '-112.1',
   'minElevationMeters': '-881',
   'maxElevationMeters': '-881',
   'NOAASiteId': '57715',
   'siteName': 'JPC-56',
   'locationName': 'Ocean>Pacific Ocean>Eastern Pacific Ocean'}},
 'pub': [{'author': {'name': 'Tripti Bhattacharya, Jessica E. Tierney, Jason A. Addison, James W. Murray'},
   'pubYear': 2018,
   'title': 'Ice-sheet modulation of deglacial North American monsoon intensification',
   'journal': 'Nature Geoscience',
   'volume': None,
   'edition': None,
   'issue': None,
   'pages': None,
   'reportNumber': None,
   'citation': 'Tripti Bhattacharya, Jessica E. Tierney, Jason A. Addison, James W. Murray. 2018. Ice-sheet modulation of deglacial North American monsoon intensification. Nature Geoscience. . doi: 10.

In [12]:
# Download the second example file; returns its first paleo table and the NOAA study metadata.
table2, metadata2 = getMetaData (url2)

In [13]:
# Build the LiPD-style dictionary for the second example file and pass it to
# lipd.extractTs (relies on `lipd` having been imported in an earlier cell).
D2 = getLiPD (table2, metadata2, url2)
ts2 = lipd.extractTs(D2)

extracting paleoData...
extracting: MD98-2181.Khider.2014
Created time series: 7 entries


In [14]:
D2

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point', 'coordinates': ['6.3', '125.83', '-2114']},
  'properties': {'southernmostLatitude': '6.3',
   'northernmostLatitude': '6.3',
   'westernmostLongitude': '125.83',
   'easternmostLongitude': '125.83',
   'minElevationMeters': '-2114',
   'maxElevationMeters': '-2114',
   'NOAASiteId': '19265',
   'siteName': 'MD98-2181',
   'locationName': 'Ocean>Pacific Ocean>Western Pacific Ocean'}},
 'pub': [{'author': None,
   'pubYear': 2004,
   'title': 'Decline of surface temperature and salinity in the western tropical Pacific Ocean in the Holocene epoch',
   'journal': 'Nature',
   'volume': None,
   'edition': None,
   'issue': None,
   'pages': None,
   'reportNumber': None,
   'citation': 'Stott, L.D., K.G. Cannariato, R.C. Thunell, G.H. Haug, A. Koutavas, and S.P. Lund. 2004. Decline of surface temperature and salinity in the western tropical Pacific Ocean in the Holocene epoch. Nature, v. 431, p. 56-59.',
   'type': 'publication',

In [15]:
# Download the third example file; returns its first paleo table and the NOAA study metadata.
table3, metadata3 = getMetaData (url3)

In [16]:
# Build the LiPD-style dictionary for the third example file and pass it to
# lipd.extractTs (relies on `lipd` having been imported in an earlier cell).
D3 = getLiPD (table3, metadata3, url3)
ts3 = lipd.extractTs(D3)

extracting paleoData...
extracting: MD98-2177.Khider.2011
Created time series: 7 entries


In [17]:
D3

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point', 'coordinates': ['1.4033', '119.078', '-968']},
  'properties': {'southernmostLatitude': '1.4033',
   'northernmostLatitude': '1.4033',
   'westernmostLongitude': '119.078',
   'easternmostLongitude': '119.078',
   'minElevationMeters': '-968',
   'maxElevationMeters': '-968',
   'NOAASiteId': '53040',
   'siteName': 'MD98-2177',
   'locationName': 'Ocean>Pacific Ocean>Western Pacific Ocean'}},
 'pub': [{'author': {'name': 'Khider, D., L. Stott, J. Emile-Geay, R. Thunell, and D.E. Hammond'},
   'pubYear': 2011,
   'title': 'Assessing El Nino Southern Oscillation variability during the past millennium',
   'journal': 'Paleoceanography',
   'volume': '26',
   'edition': None,
   'issue': None,
   'pages': None,
   'reportNumber': 'PA3222',
   'citation': 'Khider, D., L. Stott, J. Emile-Geay, R. Thunell, and D.E. Hammond. 2011. Assessing El Nino Southern Oscillation variability during the past millennium. Paleoceanography, 26, PA3