In [1]:
"""
1. URL example
url = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"

2. extract study ID

3. get metadata from NOAA API with study ID

4. convert metadata to LiPD
1) geo data
2) publication


5. References
lipd.net
https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016-discussion.html#discussion

LiPD utilities in Python: load, manipulate, and write out LiPD files
https://github.com/nickmckay/LiPD-utilities

https://lipdverse.org
https://lipdverse.org/Temp12k/current_version/CangoCave.Talma.1992.html
"""

import requests
import re

# get metadata from extracted study ID
def getMetaData(url, timeout=30):
    """Fetch a NOAA data file, look up its study, and build LiPD-style metadata.

    The text at ``url`` is scanned for lines containing the word "study"; the
    text following that word is handed to ``getStudyID`` to extract the study
    ID. The ID is then used to query the NOAA paleo-search API, and the first
    site's geo record and the first publication of the first returned study
    are converted to LiPD-style dictionaries.

    Parameters
    ----------
    url : str
        URL of a NOAA text data file that mentions its study ID.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    tuple
        ``(noaa_geo, noaa_pub, lipd_geo, lipd_pub)`` where the first two are
        the raw NOAA records and the last two are the converted
        ``{"geo": ...}`` and ``{"pub": ...}`` dictionaries.

    Raises
    ------
    requests.HTTPError
        If the data file or the API responds with an HTTP error status.
    ValueError
        If no study ID can be found in the data file.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    file_response = requests.get(url, timeout=timeout)
    file_response.raise_for_status()

    # Keep only the text that follows the word "study" on each matching line;
    # the ID is expected to appear there.
    study_lines = []
    for line in file_response.text.split('\n'):
        lowered = line.lower()
        if "study" in lowered:
            study_lines.append(lowered.partition("study")[2])

    study_id = getStudyID(study_lines)
    if not study_id:
        # Previously this surfaced as a TypeError from str + None below.
        raise ValueError("No NOAA study ID found in %s" % url)

    # NOAA API
    api = "https://www.ncdc.noaa.gov/paleo-search/study/search.json"
    api_response = requests.get(
        api, params={"NOAAStudyId": study_id}, timeout=timeout
    )
    api_response.raise_for_status()
    metadata = api_response.json()

    # convert metadata to LiPD (first site / first publication of first study)
    noaa_geo = metadata['study'][0]['site'][0]['geo']
    noaa_pub = metadata['study'][0]['publication'][0]

    # NOTE(review): coordinates are passed through in the order and type the
    # API returns them, with minElevationMeters appended. GeoJSON positions
    # are [longitude, latitude, elevation] numbers -- confirm whether the
    # values need reordering / numeric conversion for LiPD consumers.
    lipd_geo = {"geo": {"type": noaa_geo['geoType'],
                        "geometry": {"type": noaa_geo['geometry']['type'].capitalize(),
                                     "coordinates": noaa_geo['geometry']['coordinates']
                                     + [noaa_geo['properties']['minElevationMeters']]
                                     },
                        "properties": noaa_geo['properties']}}

    # A publication may carry no identifier; emit an empty list in that case
    # instead of failing on subscripting None.
    noaa_identifier = noaa_pub.get('identifier')
    lipd_identifiers = []
    if noaa_identifier:
        lipd_identifiers.append({"type": noaa_identifier['type'],
                                 "id": noaa_identifier['id'],
                                 "url": noaa_identifier['url']})

    lipd_pub = {"pub": {
        "author": getAuthor(noaa_pub['author']),
        "type": noaa_pub['type'],
        "identifier": lipd_identifiers,
        "year": noaa_pub['pubYear']}}

    return noaa_geo, noaa_pub, lipd_geo, lipd_pub

            
def getStudyID(study_lines, n_digits=5):
    """Return the first run of ``n_digits`` consecutive digits in ``study_lines``.

    Lines are scanned in order and the first match wins. A longer digit run
    matches on its first ``n_digits`` digits.

    Parameters
    ----------
    study_lines : iterable of str
        Text fragments to search.
    n_digits : int, optional
        Number of consecutive digits that make up an ID (default 5).

    Returns
    -------
    str or None
        The matched digits, or None when no line contains such a run.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions. Compiled once instead
    # of being rebuilt for every line.
    id_pattern = re.compile(r'\d{%d}' % n_digits)
    for line in study_lines:
        match = id_pattern.search(line)
        if match:
            return match.group(0)
    return None

def getAuthor(author):
    """Convert a NOAA author record into a list of ``{"name": ...}`` dicts.

    The record is first passed through ``flatten``. Each truthy value is split
    on commas and every piece, stripped of surrounding whitespace, becomes its
    own entry. A falsy value (for example ``None``) yields a single
    ``{"name": None}`` entry.
    """
    entries = []
    for candidate in flatten(author):
        if not candidate:
            # Missing author information is kept as an explicit placeholder.
            entries.append({"name": None})
            continue
        entries.extend({"name": piece.strip()} for piece in candidate.split(','))
    return entries

# get values from nested dict
def flatten(value):
    """Return the leaf values of a nested dict as a flat list.

    A non-dict ``value`` is returned wrapped in a one-element list. For a dict,
    nested dicts are walked depth-first in key order and every non-dict value
    is collected.
    """
    if not isinstance(value, dict):
        return [value]
    leaves = []
    helper(value, leaves)
    return leaves


def helper(dict2, item):
    """Append every non-dict value found in ``dict2`` (recursively) to ``item``.

    ``item`` is modified in place and also returned.
    """
    for entry in dict2.values():
        if isinstance(entry, dict):
            helper(entry, item)
            continue
        item.append(entry)
    return item

In [2]:
# Example NOAA data-file URLs used as inputs to getMetaData in the cells below.
url = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"

In [3]:
# Fetch and convert the metadata for `url`; getMetaData issues HTTP requests.
noaa_geo, noaa_pub, lipd_geo, lipd_pub = getMetaData (url)

In [4]:
# Same conversion for `url1`; results are kept under separate "1"-suffixed names.
noaa_geo1, noaa_pub1, lipd_geo1, lipd_pub1 = getMetaData (url1)

In [5]:
noaa_geo

{'geoType': 'Feature',
 'geometry': {'type': 'POINT', 'coordinates': ['27.4835', '-112.0743']},
 'properties': {'southernmostLatitude': '27.4835',
  'northernmostLatitude': '27.4835',
  'westernmostLongitude': '-112.0743',
  'easternmostLongitude': '-112.0743',
  'minElevationMeters': '-881',
  'maxElevationMeters': '-881'}}

In [6]:
lipd_geo

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point',
   'coordinates': ['27.4835', '-112.0743', '-881']},
  'properties': {'southernmostLatitude': '27.4835',
   'northernmostLatitude': '27.4835',
   'westernmostLongitude': '-112.0743',
   'easternmostLongitude': '-112.0743',
   'minElevationMeters': '-881',
   'maxElevationMeters': '-881'}}}

In [7]:
noaa_pub

{'author': {'name': 'Tripti Bhattacharya, Jessica E. Tierney, Jason A. Addison, James W. Murray'},
 'pubYear': 2018,
 'title': 'Ice-sheet modulation of deglacial North American monsoon intensification',
 'journal': 'Nature Geoscience',
 'volume': None,
 'edition': None,
 'issue': None,
 'pages': None,
 'reportNumber': None,
 'citation': 'Tripti Bhattacharya, Jessica E. Tierney, Jason A. Addison, James W. Murray. 2018. Ice-sheet modulation of deglacial North American monsoon intensification. Nature Geoscience. . doi: 10.1038/s41561-018-0220-7',
 'type': 'publication',
 'identifier': {'type': 'doi',
  'id': '10.1038/s41561-018-0220-7',
  'url': 'http://dx.doi.org/10.1038/s41561-018-0220-7'},
 'abstract': 'The North American monsoon, the dominant source of rainfall for much of the arid US Southwest, remains one of the least understood monsoon systems. The late Pleistocene evolution of this monsoon is poorly constrained, largely because glacial changes in winter rainfall obscure summer mon

In [8]:
lipd_pub

{'pub': {'author': [{'name': 'Tripti Bhattacharya'},
   {'name': 'Jessica E. Tierney'},
   {'name': 'Jason A. Addison'},
   {'name': 'James W. Murray'}],
  'type': 'publication',
  'identifier': [{'type': 'doi',
    'id': '10.1038/s41561-018-0220-7',
    'url': 'http://dx.doi.org/10.1038/s41561-018-0220-7'}],
  'year': 2018}}

In [9]:
noaa_geo1

{'geoType': 'Feature',
 'geometry': {'type': 'POINT', 'coordinates': ['6.3', '125.83']},
 'properties': {'southernmostLatitude': '6.3',
  'northernmostLatitude': '6.3',
  'westernmostLongitude': '125.83',
  'easternmostLongitude': '125.83',
  'minElevationMeters': '-2114',
  'maxElevationMeters': '-2114'}}

In [10]:
lipd_geo1

{'geo': {'type': 'Feature',
  'geometry': {'type': 'Point', 'coordinates': ['6.3', '125.83', '-2114']},
  'properties': {'southernmostLatitude': '6.3',
   'northernmostLatitude': '6.3',
   'westernmostLongitude': '125.83',
   'easternmostLongitude': '125.83',
   'minElevationMeters': '-2114',
   'maxElevationMeters': '-2114'}}}

In [11]:
noaa_pub1

{'author': None,
 'pubYear': 2004,
 'title': 'Decline of surface temperature and salinity in the western tropical Pacific Ocean in the Holocene epoch',
 'journal': 'Nature',
 'volume': None,
 'edition': None,
 'issue': None,
 'pages': None,
 'reportNumber': None,
 'citation': 'Stott, L.D., K.G. Cannariato, R.C. Thunell, G.H. Haug, A. Koutavas, and S.P. Lund. 2004. Decline of surface temperature and salinity in the western tropical Pacific Ocean in the Holocene epoch. Nature, v. 431, p. 56-59.',
 'type': 'publication',
 'identifier': {'type': 'doi',
  'id': '10.1038/nature0293',
  'url': 'http://dx.doi.org/10.1038/nature0293'},
 'abstract': None,
 'pubRank': '3'}

In [12]:
lipd_pub1

{'pub': {'author': [{'name': None}],
  'type': 'publication',
  'identifier': [{'type': 'doi',
    'id': '10.1038/nature0293',
    'url': 'http://dx.doi.org/10.1038/nature0293'}],
  'year': 2004}}