In [1]:
def StudyID_LiPD(NOAAStudyID):
    """Look up a NOAA study by its ID and convert the result to LiPD dictionaries.

    Example study IDs: 24890, 16055, 18315 (30813 is noted as an lpd file).

    Parameters
    ----------
    NOAAStudyID : int
        NOAA study identifier, forwarded unchanged to ``queryNOAA``.

    Returns
    -------
    The value returned by ``getLiPD`` for the study metadata, or ``None``
    when ``queryNOAA`` produced no metadata.
    """
    metadata = queryNOAA(NOAAStudyID)
    if metadata is None:
        # queryNOAA returns None for IDs it does not query; handing None to
        # getLiPD would fail on the first metadata lookup.
        return None
    return getLiPD(metadata)

In [2]:
import requests
def queryNOAA(NOAAStudyID, timeout=30):
    """Query the NOAA paleo-search API for one study and return the decoded JSON.

    Parameters
    ----------
    NOAAStudyID : int
        NOAA study identifier. Only IDs of 13000 and above are queried.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    The decoded JSON response for a queried ID, or ``None`` (after printing a
    notice) when the ID is below 13000.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    #    NOAA file type
    if NOAAStudyID >= 13000:
        response = requests.get(
            "https://www.ncdc.noaa.gov/paleo-search/study/search.json",
            # let requests build and encode the query string
            params={"NOAAStudyId": NOAAStudyID},
            # without a timeout a stalled connection blocks forever
            timeout=timeout,
        )
        # surface HTTP errors here instead of as an obscure JSON decoding failure
        response.raise_for_status()
        return response.json()

    print("Study ID below 13000 is not available.")
    return None

In [3]:
def getLiPD (metadata):
    """Convert NOAA study metadata into a list of LiPD dictionaries.

    Only the first entry of ``metadata['study']`` is read. One dictionary is
    built per entry of that study's ``site`` list. When the study's data type
    is "software" or "repository" (e.g. study IDs 1002459, 1002682) a notice
    is printed and nothing is returned.
    """
    study = metadata['study'][0]
    data_type = study["dataType"].lower()

    # These data types carry no lpd file.
    if data_type in ("software", "repository"):
        print("Data type is " + data_type + ". No lpd file.")
        return None

    sites = study['site']
    publications = getPub(study['publication'])

    datasets = []
    # different site -> different lipd dictionary
    for site in sites:
        record = {}
        record['paleoData'] = getPaleoData(site)
        record["createdBy"] = "NOAAconverter"
        record['geo'] = getGeo(site)
        record['pub'] = publications
        record['archiveType'] = getArchiveType(site)
        record["dataSetName"] = getDataSetName(
            record['geo']['properties']['siteName'],
            record['pub'],
            study['investigators'],
        )
        # originalDataUrl <- onlineResourceLink
        record["originalDataUrl"] = study['onlineResourceLink']
        datasets.append(record)

    return datasets
    

In [4]:
def getArchiveType(site):
    """Map the NOAA data type of a site's variables to an archive type string.

    The last "|"-separated component of every variable's ``cvDataType`` is
    upper-cased and collected. If exactly one distinct value results and it is
    a key of the lookup table, the mapped value is returned; in every other
    case the result is "other".
    """
    # https://www.ncdc.noaa.gov/data-access/paleoclimatology-data/datasets
    noaa_to_archive = {
        "BOREHOLE": "borehole",
        "SPELEOTHEMS": "speleothem",
        "CORALS AND SCLEROSPONGES": "coral",
        "FAUNA": "other",
        "FIRE HISTORY": "other",
        "CLIMATE FORCING": "other",
        "HISTORICAL": "documents",
        "ICE CORES": "ice-other",
        "PALEOLIMNOLOGY": "lakesediment",
        "LAKE LEVELS": "other",
        "CLIMATE RECONSTRUCTIONS": "other",
        "LOESS AND PALEOSOL": "other",
        "PALEOCLIMATIC MODELING": "other",
        "PALEOCEANOGRAPHY": "marinesediment",
        "PLANT MACROFOSSILS": "other",
        "POLLEN": "lakesediment",
        "INSECT": "other",
        "TREE RING": "wood",
        "INSTRUMENTAL": "other",
    }

    # Distinct data types across every variable of every data file.
    data_types = {
        variable['cvDataType'].split("|")[-1].upper()
        for paleo in site['paleoData']
        for datafile in paleo['dataFile']
        for variable in datafile['variables']
    }

    # Zero or several distinct types cannot be resolved to one archive type.
    if len(data_types) != 1:
        return "other"

    (only_type,) = data_types
    return noaa_to_archive.get(only_type, "other")

In [5]:
def getDataSetName(sitename, pubs, investigators):
    """Build a data set name of the form ``SiteName.Author.Year``.

    Parameters
    ----------
    sitename : str
        Site name used as the first component.
    pubs : iterable of dict
        Publication dictionaries; the largest positive ``'year'`` is used.
        Entries with a missing, empty or non-numeric year are ignored.
    investigators : str
        Investigator string; the text before the first comma becomes the
        author component.

    Returns
    -------
    str
        ``sitename + '.' + author + '.' + year`` where ``year`` is the literal
        text ``'None'`` when no usable year was found.
    """
    latest_numeric = 0
    latest_label = None
    for pub in pubs:
        # .get: a publication without any year information has no 'year' key
        raw_year = pub.get('year')
        if not raw_year:
            continue
        try:
            # compare numerically so years delivered as text still work
            numeric_year = float(raw_year)
        except (TypeError, ValueError):
            continue
        if numeric_year > latest_numeric:
            latest_numeric = numeric_year
            latest_label = str(raw_year).strip()

    year = 'None' if latest_label is None else latest_label

    author = investigators.split(',')[0]

    return sitename + '.' + author + '.' + year

In [6]:
def getPub(noaa_pubs):
    """Translate NOAA publication dictionaries into LiPD-style dictionaries.

    For each publication:
      * ``pubYear`` is stored under ``year``;
      * the ``identifier`` dictionary is flattened: every entry other than
        ``type`` and ``id`` is copied as-is, and when ``type`` is "doi" the
        ``id`` is stored under ``doi``;
      * every other key is copied unchanged.

    A publication whose ``identifier`` is ``None`` contributes no identifier
    fields instead of raising.

    Parameters
    ----------
    noaa_pubs : iterable of dict

    Returns
    -------
    list of dict
        One converted dictionary per input publication, in input order.
    """
    lipd_pub = []

    for noaa_pub in noaa_pubs:
        entry = {}

        for key, value in noaa_pub.items():
            if key == "pubYear":
                entry["year"] = value
            elif key == "identifier":
                if value is None:
                    # no identifier recorded for this publication
                    continue
                for id_key, id_value in value.items():
                    if id_key not in ("type", "id"):
                        entry[id_key] = id_value
                # only a doi-typed identifier that actually has an id yields 'doi'
                if value.get("type") == "doi" and "id" in value:
                    entry['doi'] = value['id']
            else:
                entry[key] = value

        lipd_pub.append(entry)

    return lipd_pub

In [7]:
def getGeo(noaa_geo):
    """Build a LiPD ``geo`` dictionary from a NOAA site dictionary.

    Parameters
    ----------
    noaa_geo : dict
        NOAA site dictionary containing a ``'geo'`` entry with ``geoType``,
        ``geometry`` (``type``, ``coordinates``) and ``properties``
        (including ``maxElevationMeters``).

    Returns
    -------
    dict
        ``{"type", "geometry", "properties"}`` where the coordinates are the
        NOAA coordinates followed by ``maxElevationMeters`` and the
        properties are the NOAA geo properties plus every top-level site
        entry except ``geo``, ``paleoData``, ``siteCode`` and ``mappable``.

    The input dictionary is left unmodified.
    """
    geo = noaa_geo['geo']
    geometry = geo['geometry']

    # Copy first: extending the original properties dict in place would leak
    # the extra site-level keys back into the caller's metadata.
    properties = dict(geo['properties'])

    lipd_geo = {
        "type": geo['geoType'],
        "geometry": {
            "type": geometry['type'].capitalize(),
            # list concatenation builds a new list; elevation becomes the last coordinate
            "coordinates": geometry['coordinates'] + [properties['maxElevationMeters']],
        },
    }

    # add other properties: e.g. 'NOAASiteId', 'siteName', 'locationName'
    skipped_keys = ('geo', 'paleoData', 'siteCode', 'mappable')
    for key, value in noaa_geo.items():
        if key not in skipped_keys:
            properties[key] = value

    lipd_geo["properties"] = properties

    return lipd_geo
    

In [8]:
def getPaleoData (site):
    """Collect paleo data for one site and return the first converted table.

    Every data file of every ``paleoData`` entry is fetched with ``getTable``
    and each of its paleo tables is converted with ``getPaleoDict``. As soon
    as a file URL ending in "lpd" is met, the URL is printed and the
    placeholder string "tbc" is returned instead.
    """
    converted = []
    for paleo in site['paleoData']:
        for data_file in paleo['dataFile']:
            file_url = data_file['fileUrl']

            # lpd files are not parsed yet
            if file_url[-3:] == "lpd":
                print(file_url)
                return "tbc"

            # the chronology tables are fetched but not used here
            paleo_tables, _chron_tables = getTable(file_url)
            converted.extend(
                getPaleoDict(data_file, paleo_table) for paleo_table in paleo_tables
            )

    # TBC: multiple tables
    return converted[0]

In [9]:
def getPaleoDict(noaa_paleo, table):
    """Wrap one parsed table in the nested paleoData structure.

    Parameters
    ----------
    noaa_paleo : dict
        NOAA data-file dictionary; the last ">"-separated component of each
        variable's ``cvUnit`` is used as the unit of the column at the same
        position.
    table : list of list of str
        Parsed table whose first row holds the column names and whose
        remaining rows hold the values.

    Returns
    -------
    OrderedDict
        ``paleo0 -> measurementTable -> paleo0measurement0`` holding
        ``tableName``, ``missingValue`` ('nan'), ``filename`` (None) and
        ``columns``. Each column records its 1-based ``number``, its
        ``variableName``, its ``units`` (None when there are more columns
        than variables) and its ``value`` list (empty when the table has no
        data rows).
    """
    # TODO: units are matched to columns by position only; column names should
    # eventually be matched against the variables' cvWhat.
    units = [variable['cvUnit'].split('>')[-1] for variable in noaa_paleo['variables']]

    header = table[0]
    data_rows = table[1:]
    # Transpose once rather than once per column; a header-only table has no
    # rows to transpose, so every column simply gets an empty value list.
    if data_rows:
        column_values = list(zip(*data_rows))
    else:
        column_values = [() for _ in header]

    columns = OrderedDict()
    for i, name in enumerate(header):
        columns[name] = {
            'number': i + 1,
            'variableName': name,
            'units': units[i] if i < len(units) else None,
            'value': list(column_values[i]),
        }

    measurement = {
        'tableName': 'paleo0measurement0',
        'missingValue': 'nan',
        'filename': None,
        'columns': columns,
    }

    paleodata = OrderedDict()
    paleodata['paleo0'] = OrderedDict()
    paleodata['paleo0']['measurementTable'] = OrderedDict()
    paleodata['paleo0']['measurementTable']['paleo0measurement0'] = measurement

    return paleodata
    

In [10]:
"""
1. lift tables from a text file
e.g. https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018md022517.txt

# updated
1. differentiate types of table: chronological and paleo information
2. change missing value -> "nan"
"""

import re
from collections import OrderedDict

def _fetch_table_text(url, max_retries, timeout):
    """Download ``url`` as text, re-requesting a bounded number of times on HTML replies."""
    attempts = max(1, max_retries + 1)
    for _ in range(attempts):
        text = requests.get(url, timeout=timeout).text
        # A reply starting with "<!DOCTYPE" is an HTML page, not the data file.
        if not text.startswith("<!DOCTYPE"):
            return text
    raise RuntimeError(
        "No data file received from " + str(url) + " after "
        + str(attempts) + " attempt(s); the server kept returning an HTML page."
    )


def _split_row(line, tab_separated):
    """Split one line into cells on tabs, or on any whitespace otherwise."""
    return line.split('\t') if tab_separated else line.split()


def _find_table_spans(lines):
    """Return ``(start, end)`` index pairs of runs of lines that look like tables."""
    spans = []
    total = len(lines)
    i = 0
    while i < total:
        # The separator is decided by the candidate first row and then applied
        # to all following rows of the same run.
        tab_separated = "\t" in lines[i]
        width = len(_split_row(lines[i], tab_separated))
        if width <= 2:
            i += 1
            continue

        start = end = i  # candidate first row of a table
        i += 1
        while i < total and len(_split_row(lines[i], tab_separated)) == width:
            end = i
            i += 1

        # keep only runs of at least four lines
        if end - start > 2:
            spans.append((start, end))
    return spans


def _mask_missing_values(table, missing_value):
    """Replace, in place, data cells equal to ``missing_value`` with ``'nan'``."""
    try:
        missing_number = float(missing_value)
    except ValueError:
        missing_number = None  # marker is not numeric; only text comparison applies

    column_count = len(table[0])
    for row in table[1:]:  # the header row is never rewritten
        for col in range(column_count):
            cell = row[col]
            if cell.lower() == missing_value:
                row[col] = 'nan'
            elif missing_number is not None:
                try:
                    # numeric comparison catches e.g. "-999.0" for marker "-999"
                    if float(cell) == missing_number:
                        row[col] = 'nan'
                except ValueError:
                    pass


def getTable (url, max_retries=5, timeout=30):
    """Download a text data file and lift the tables it contains.

    Processing steps:
      1. fetch the file (an HTML reply is re-requested at most ``max_retries``
         further times), drop "\\r" and split into lines;
      2. for lines starting with "#", remove that character and the
         whitespace following it;
      3. detect tables: runs of at least four consecutive lines with the same
         number (> 2) of cells, split on tabs when the first line of the run
         contains a tab and on whitespace otherwise;
      4. for each table, scan the lines above it upwards until a line
         containing "-----" (the very first line of the file is not
         inspected): the word "chronology" marks the table as chronological,
         and a line containing both "missing" and "value" supplies a
         missing-value marker (the text after its last ":"), whose
         occurrences in the data rows are replaced by 'nan'.

    Parameters
    ----------
    url : str
        Address of the text file.
    max_retries : int, optional
        Additional requests allowed when an HTML page is returned (default 5).
    timeout : float, optional
        Seconds to wait for each HTTP request (default 30).

    Returns
    -------
    (paleo_tables, chron_tables) : tuple of lists
        Each table is a list of rows, each row a list of strings; the first
        row is the first line of the detected run.

    Raises
    ------
    RuntimeError
        If every attempt returned an HTML page.
    """
    txt_data = _fetch_table_text(url, max_retries, timeout)

    raw_lines = txt_data.replace('\r', '').split('\n')  # remove "\r" in text
    # remove the '#' in front
    lines = [
        line[1:].lstrip() if line.startswith("#") else line
        for line in raw_lines
    ]

    # differentiate types of table: chronological and paleo information
    chron_tables = []
    paleo_tables = []

    for start_index, end_index in _find_table_spans(lines):
        tab_separated = '\t' in lines[start_index]
        table = [
            _split_row(lines[k], tab_separated)
            for k in range(start_index, end_index + 1)
        ]

        # between "-----" and the table: word "chronology" => chronological
        is_paleo = True  # default: paleo info
        missing_values = []
        for k in range(start_index - 1, 0, -1):
            if "-----" in lines[k]:
                break

            lower = lines[k].lower()
            if "chronology" in lower:
                is_paleo = False

            # missing value detect
            if "missing" in lower and "value" in lower:
                missing_values.append(lower.split(":")[-1].strip())

        # missing value => change to nan (once per detected marker)
        for missing_value in missing_values:
            _mask_missing_values(table, missing_value)

        if is_paleo:
            paleo_tables.append(table)
        else:
            chron_tables.append(table)

    return paleo_tables, chron_tables

In [11]:
# Example study IDs: 24890, 16055, 18315; 30813 is noted as an lpd file.

# Convert study 18315; Ds keeps whatever StudyID_LiPD returns for it.
Ds = StudyID_LiPD(18315)

In [12]:
# Pass every converted dataset in Ds (set by an earlier cell) to lipd.extractTs.
# The return value of extractTs is discarded here.
import lipd
for D in Ds:
    lipd.extractTs(D)

extracting paleoData...
extracting: MD98-2177.Khider.2011
Created time series: 7 entries


In [13]:
# Convert study 24890 (this rebinds Ds) and pass each resulting dataset to
# lipd.extractTs; the return value of extractTs is discarded.
Ds = StudyID_LiPD(24890)

import lipd
for D in Ds:
    lipd.extractTs(D)

extracting paleoData...
extracting: MD02-2515.Bhattacharya.2018
Created time series: 7 entries
extracting paleoData...
extracting: MD02-2517.Bhattacharya.2018
Created time series: 7 entries
extracting paleoData...
extracting: NH-8P.Bhattacharya.2018
Created time series: 7 entries
extracting paleoData...
extracting: JPC-56.Bhattacharya.2018
Created time series: 7 entries


In [14]:
# Convert study 16055 (this rebinds Ds) and pass each resulting dataset to
# lipd.extractTs; the return value of extractTs is discarded.
Ds = StudyID_LiPD(16055)

import lipd
for D in Ds:
    lipd.extractTs(D)

extracting paleoData...
extracting: MD98-2181.Khider.2014
Created time series: 7 entries
