In [14]:
"""
Convert a NOAA paleoclimate text file into a LiPD-style dictionary.

Workflow:
1. Lift (extract) the data table from the NOAA text file.

2. Extract the NOAA study ID from the text file.

3. Get the study metadata from the NOAA API using the study ID.

4. Convert the metadata to LiPD:
   1) geo data
   2) publication
   3) paleodata

References:
lipd.net
https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016-discussion.html#discussion

LiPD utilities in Python: load, manipulate and write out LiPD files
https://github.com/nickmckay/LiPD-utilities

https://lipdverse.org
https://lipdverse.org/Temp12k/current_version/CangoCave.Talma.1992.html
"""

import requests
import re
from collections import OrderedDict

            
def getStudyID(study_lines, num_digits=5):
    """Return the first run of ``num_digits`` consecutive digits in ``study_lines``.

    Parameters
    ----------
    study_lines : iterable of str
        Text fragments to scan, in order.
    num_digits : int, optional
        Number of consecutive digits that make up an ID. Defaults to 5, the
        length that was previously hard-coded.

    Returns
    -------
    str or None
        The digits matched in the first line that contains such a run, or
        ``None`` when no line does.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    pattern = re.compile(r'\d{%d}' % num_digits)
    for line in study_lines:
        match = pattern.search(line)
        if match:
            return match.group(0)
    return None

def getAuthor(author):
    """Build a list of ``{"name": ...}`` dicts from an author value.

    ``author`` is first flattened with ``flatten``. Every truthy entry is
    split on commas and each stripped piece becomes one ``{"name": piece}``
    dict; every falsy entry contributes a single ``{"name": None}``.
    """
    authors = []
    for entry in flatten(author):
        if not entry:
            # Keep a placeholder so an empty author value is still recorded.
            authors.append({"name": None})
            continue
        authors.extend({"name": piece.strip()} for piece in entry.split(','))

    return authors

# get values from nested dict
def flatten(value):
    """Return the leaf values of ``value`` as a flat list.

    A dict is walked by ``helper``, which descends into nested dicts and
    collects every non-dict value. Anything that is not a dict is returned
    wrapped in a one-element list.
    """
    if not isinstance(value, dict):
        return [value]

    leaves = []
    helper(value, leaves)
    return leaves
def helper(dict2, item):
    """Append every non-dict value nested inside ``dict2`` to ``item``.

    Nested dicts are descended recursively in insertion order; all other
    values (lists included) are appended unchanged. ``item`` is mutated in
    place and also returned.
    """
    for entry in dict2.values():
        if not isinstance(entry, dict):
            item.append(entry)
        else:
            helper(entry, item)

    return item

# find specific value from nested dict, list
def flatten2(value, url):
    """Find every dict nested in ``value`` that holds ``url`` as a direct value.

    Parameters
    ----------
    value : dict, list or scalar
        Structure to search. Dicts and lists are searched recursively.
    url : object
        Value to look for (compared with ``==``).

    Returns
    -------
    list
        The matching dicts in traversal order. A dict appears once per key
        whose value equals ``url``. When ``value`` is a scalar the result is
        ``[value]`` if it equals ``url``; otherwise the list is empty.
    """
    # Bind the result up front: previously a scalar that did not equal
    # ``url`` (or a top-level list) left ``item`` undefined and raised
    # UnboundLocalError.
    item = []
    if isinstance(value, dict):
        helper2(value, item, url)
    elif isinstance(value, list):
        helper3(value, item, url)
    elif value == url:
        item.append(value)

    return item


def helper2(dict2, item, url):
    """Search ``dict2`` recursively and collect the dicts that contain ``url``.

    Dict values are descended with ``helper2`` and list values with
    ``helper3``. For any other value equal to ``url`` the enclosing dict
    ``dict2`` itself is appended to ``item``. ``item`` is mutated in place
    and returned.
    """
    for key, value in dict2.items():
        if isinstance(value, dict):
            helper2(value, item, url)
        elif isinstance(value, list):
            helper3(value, item, url)
        elif value == url:
            item.append(dict2)

    return item


def helper3(list2, item, url):
    """Search every dict element of ``list2`` with ``helper2``.

    Elements that are not dicts (nested lists included) are skipped.
    Results are appended to ``item``; nothing is returned.
    """
    for value in list2:
        if isinstance(value, dict):
            helper2(value, item, url)

In [15]:
import pandas as pd
import numpy as np

def getTable(url, txt_data=None, timeout=30):
    """Return the last tab-separated table found in a text file.

    A table is a run of more than two consecutive lines that all split into
    the same number (more than two) of tab-separated fields. A leading
    ``#`` and the whitespace that follows it are removed from a line before
    it is split.

    Parameters
    ----------
    url : str
        Location of the text file. Downloaded only when ``txt_data`` is
        not given.
    txt_data : str, optional
        Already-downloaded file content. When supplied, no request is made.
    timeout : float, optional
        Timeout in seconds for the download.

    Returns
    -------
    list of list of str
        The rows of the last table in the file, each row a list of fields.

    Raises
    ------
    IndexError
        If the text contains no table.
    requests.HTTPError
        If the download returns an error status.
    """
    if txt_data is None:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        txt_data = response.text

    tables = _splitTables(txt_data)
    if not tables:
        # Same exception type the bare ``tables[-1]`` used to raise, but
        # with a message that says what went wrong.
        raise IndexError("no tab-separated table found in %s" % url)

    # TBC: differentiate types of table: chronological and paleo information
    return tables[-1]  # TBC: need to update for multiple table, missing value: change


def _splitTables(txt_data):
    """Split ``txt_data`` into all tab-separated tables it contains, in order."""
    rows = []
    for line in txt_data.replace('\r', '').split('\n'):
        # remove the '#' in front
        if line.startswith("#"):
            line = line[1:].lstrip()
        # Lines without a tab are not table rows; mark them with None.
        rows.append(line.split('\t') if "\t" in line else None)

    # Number of columns per line (0 for lines that are not tab separated).
    widths = [len(row) if row is not None else 0 for row in rows]

    tables = []
    start = 0
    total = len(rows)
    while start < total:
        width = widths[start]
        end = start + 1
        # Extend the run by column count. Comparing ``len()`` of the raw
        # line would count characters for unsplit lines and could swallow a
        # plain text line whose length happens to equal the column count.
        while end < total and widths[end] == width:
            end += 1
        if width > 2 and end - start > 2:  # more than 2 columns and 2 rows
            tables.append(rows[start:end])
        start = end

    return tables

In [16]:
def getPaleoData(metadata, table, url, missing_value=-999.9):
    """Build the LiPD-style ``paleoData`` section for one measurement table.

    Parameters
    ----------
    metadata : dict
        Metadata searched with ``flatten2`` for the dict that holds ``url``.
        That dict's ``'variables'`` list supplies the units (presumably the
        NOAA API response; confirm against the caller).
    table : list of list of str
        ``table[0]`` holds the column names; the remaining rows hold data.
    url : str
        URL of the data file, used to locate its entry in ``metadata``.
    missing_value : float or None, optional
        Numeric sentinel that is replaced by the string ``'nan'``. Pass
        ``None`` to disable the replacement.

    Returns
    -------
    OrderedDict
        ``paleo0 -> measurementTable -> paleo0measurement0`` with one entry
        per column under ``'columns'``. Each entry carries ``number``,
        ``variableName``, ``units`` and ``value``.

    Notes
    -----
    Units are matched to columns by position and are best-effort: a column
    gets ``None`` when the file is not found in ``metadata``, when there are
    fewer variables than columns, or when a variable has no string
    ``cvUnit``.
    """
    # units: text after the last '>' of each variable's cvUnit
    units = []
    matches = flatten2(metadata, url)
    if matches and isinstance(matches[0], dict):
        for variable in matches[0].get('variables') or []:
            cv_unit = variable.get('cvUnit') if isinstance(variable, dict) else None
            units.append(cv_unit.split('>')[-1] if isinstance(cv_unit, str) else None)

    # Transpose the data rows once instead of once per column.
    data_columns = list(zip(*table[1:]))

    columns = OrderedDict()
    for i, name in enumerate(table[0]):
        raw_values = data_columns[i] if i < len(data_columns) else ()
        columns[name] = {
            'number': i + 1,
            'variableName': name,
            'units': units[i] if i < len(units) else None,
            'value': [_parseCell(cell, missing_value) for cell in raw_values],
        }

    paleodata = OrderedDict()
    paleodata['paleo0'] = OrderedDict()
    paleodata['paleo0']['measurementTable'] = OrderedDict()
    paleodata['paleo0']['measurementTable']['paleo0measurement0'] = {
        'tableName': 'paleo0measurement0',
        'missingValue': 'nan',
        'filename': None,
        'columns': columns,
    }

    return paleodata


def _parseCell(cell, missing_value):
    """Convert one table cell to float, ``'nan'`` for the sentinel, or leave it as is."""
    try:
        number = float(cell)
    except (TypeError, ValueError):
        # Not numeric: keep the original text.
        return cell
    # need to update for checking missing value (TBC)
    return 'nan' if number == missing_value else number

In [None]:

# get metadata from extracted study ID
def getMetaData(url, study_id=None, timeout=30):
    """Fetch NOAA study metadata for a data file and convert it to a LiPD-style dict.

    Parameters
    ----------
    url : str
        URL of the NOAA text file.
    study_id : str or int, optional
        NOAA study ID. When omitted it is extracted from the text file: the
        first 5-digit run that follows the word "study" on any line.
    timeout : float, optional
        Timeout in seconds for each HTTP request made here.

    Returns
    -------
    tuple
        ``(metadata, D)`` where ``metadata`` is the decoded JSON response of
        the NOAA study search API and ``D`` is a dict whose ``'paleoData'``
        entry is built by ``getPaleoData``.

    Raises
    ------
    ValueError
        If no study ID is given and none can be found in the file.
    requests.HTTPError
        If a request made here returns an error status.
    """
    if study_id is None:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        splited_data = response.text.split('\n')

        # find "study" to get study id: 5 digits after "study"
        study_lines = []
        for line in splited_data:
            if "study" in line.lower():
                study_lines.append(line.lower().partition("study")[2])

        # The ID used to be hard-coded to 24890, so every other file was
        # paired with the wrong study's metadata.
        study_id = getStudyID(study_lines)
        if not study_id:
            raise ValueError("could not find a NOAA study ID in %s" % url)

    # NOAA API
    api = "https://www.ncdc.noaa.gov/paleo-search/study/search.json"
    api_response = requests.get(api, params={"NOAAStudyId": str(study_id)}, timeout=timeout)
    api_response.raise_for_status()
    metadata = api_response.json()

    # convert metadata to LiPD

    D = dict()

#     noaa_geo = metadata['study'][0]['site'][0]['geo']
#     noaa_pub = metadata['study'][0]['publication'][0]

#     D['geo'] = {"geo": {"type": noaa_geo['geoType'],
#                         "geometry": {"type": noaa_geo['geometry']['type'].capitalize(),
#                                      "coordinates": noaa_geo['geometry']['coordinates']
#                                     + [noaa_geo['properties']['minElevationMeters']]
#                                     },
#                         "properties": noaa_geo['properties']}}

#     D['pub'] = {"pub": {
#         "author": getAuthor(noaa_pub['author']),
#         "type" : noaa_pub['type'],
#         "identifier" : [
#             {"type": noaa_pub['identifier']['type'],
#              "id": noaa_pub['identifier']['id'],
#              "url": noaa_pub['identifier']['url']}],
#         "year": noaa_pub['pubYear']}}

#     D['archiveType'] = ""

#    # check # of columns are same => problem: notes column
    table = getTable(url)
    D['paleoData'] = getPaleoData(metadata, table, url)

    return metadata, D

In [18]:
# NOAA text files used as inputs in this notebook.
url = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"

# getMetaData returns a pair: the metadata fetched from the NOAA API and the
# dict D that holds the converted 'paleoData' section.
metadata, D = getMetaData (url)
# metadata1, D1 = getMetaData (url1)

In [19]:
D

{'paleoData': OrderedDict([('paleo0',
               OrderedDict([('measurementTable',
                             OrderedDict([('paleo0measurement0',
                                           {'tableName': 'paleo0measurement0',
                                            'missingValue': 'nan',
                                            'filename': None,
                                            'columns': OrderedDict([('depth_cm',
                                                          {'number': 1,
                                                           'variableName': 'depth_cm',
                                                           'units': 'centimeter',
                                                           'value': [512.0,
                                                            537.0,
                                                            593.0,
                                                            609.0,
                                         

In [None]:
getTable(url1)

In [None]:
l = [0]
print(l[-1])

In [None]:
# Print the unit of each variable listed for `url`: flatten2 returns the dicts
# in `metadata` that hold `url`, and the first match's 'variables' are walked.
for variable in flatten2(metadata, url)[0]['variables']:
    # keep only the text after the last '>' in cvUnit
    print(variable['cvUnit'].split('>')[-1]) 

In [None]:
flatten2(metadata, url)

In [None]:
url = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/bhattacharya2018/bhattacharya2018jpc56.txt"
url1 = "https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-raw.txt"


"""
https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=24890
https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=16055


https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2014/khider2014-benth.txt
#  Online_Resource: http://www.hurricane.ncdc.noaa.gov/pls/paleox/f?p=519:1:::::P1_STUDY_ID:16055
https://www.ncdc.noaa.gov/paleo-search/study/search.json?NOAAStudyId=16055

"""

In [None]:
# getMetaData returns a (metadata, D) pair. Unpack it so that metadata1 is the
# API response (the cells below call metadata1.keys()) and D1 is the
# converted record.
metadata1, D1 = getMetaData (url1)

In [None]:
# getMetaData returns a (metadata, D) pair. Unpack it so that metadata stays
# the API response dict instead of becoming a tuple.
metadata, D = getMetaData (url)

In [None]:
metadata1.keys()

In [None]:
metadata.keys()

In [None]:
len(metadata1['study'])

In [None]:
len(metadata['study'])

In [None]:
metadata1['study'][0].keys()

In [None]:
metadata['study'][0].keys()

In [None]:
len(metadata1['study'][0]['site'])

In [None]:
len(metadata['study'][0]['site'])

In [None]:
metadata1['study'][0]['site'][0].keys()

In [None]:
len(metadata1['study'][0]['site'][0]['paleoData'])

In [None]:
len(metadata['study'][0]['site'][0]['paleoData'])

In [None]:
metadata['study'][0]['site'][0]['paleoData']

In [None]:
# Print the unit of every variable of the data file whose URL equals url1.
# Only the first entry of each 'dataFile' list is inspected.
for paleodata in metadata1['study'][0]['site'][0]['paleoData']:
    if paleodata['dataFile'][0]["fileUrl"] == url1:
        variables = paleodata['dataFile'][0]['variables']
        length = len(variables)  # not used again in this cell
        
        for variable in variables:
            # keep only the text after the last '>' in cvUnit
            print(variable['cvUnit'].split('>')[-1]) 

    

In [None]:
# List the URL of the first data file of every paleoData entry, and print the
# variable units for the entry whose URL equals url.
for paleodata in metadata['study'][0]['site'][0]['paleoData']:
    print(paleodata['dataFile'][0]["fileUrl"])
    if paleodata['dataFile'][0]["fileUrl"] == url:
        variables = paleodata['dataFile'][0]['variables']
        length = len(variables)  # not used again in this cell
        
        for variable in variables:
            # keep only the text after the last '>' in cvUnit
            print(variable['cvUnit'].split('>')[-1]) 

In [None]:
url

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][1]['dataFile'][0]["fileUrl"]

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][0]['dataFile']

# cvUnit gives the unit; the variable name is taken from the text file's column header

In [None]:
len(metadata1['study'][0]['site'][0]['paleoData'])

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][0].keys()

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][1].keys()

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][2].keys()

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][0]['dataFile']

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][1]['dataFile']

In [None]:
metadata1['study'][0]['site'][0]['paleoData'][2]['dataFile']

In [None]:
D1

In [None]:
metadata1['study'][0]['publication']

In [None]:
metadata['study'][0]['publication']

In [None]:
noaa_geo

In [None]:
lipd_geo

In [None]:
noaa_pub

In [None]:
lipd_pub

In [None]:
noaa_geo1

In [None]:
lipd_geo1

In [None]:
noaa_pub1

In [None]:
lipd_pub1