In [2]:
import requests
import pandas as pd
import openpyxl
import numpy as np
import re
import yaml
import os
import requests

In [30]:
class pyTups:
    """Small client for the NOAA/NCEI paleo-search study API.

    Typical flow:
        1. ``search_studies(...)`` queries the API and shows the matches as a table.
        2. ``select_study_url(index)`` remembers the data-file URL of one match.
        3. ``load_data_from_selected_url()`` downloads that file and splits it
           into a data table and a metadata dictionary.

    Attributes:
        BASE_URL: Search endpoint that every query is sent to.
        parameters: Query parameters built by the most recent search.
        response_data: Decoded JSON of the most recent successful search, else None.
        response_df: Table of the studies found by the most recent search, else None.
        studies_url: Data-file URL chosen with ``select_study_url``, else None.
        studies_df: Data table produced by the most recent load, else None.
        metadata: Metadata dictionary produced by the most recent load, else None.
    """

    # Seconds to wait for the server; requests waits indefinitely without a timeout.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.BASE_URL = "https://www.ncei.noaa.gov/access/paleo-search/study/search.json"
        self.parameters = {}
        self.response_data = None  # To store API response for display and selection later
        self.studies_url = None  # To store the URL of the chosen study
        self.response_df = None  # DataFrame to store the studies for selection
        self.studies_df = None
        self.metadata = None  # Filled in by load_data_from_selected_url

    @staticmethod
    def _clean_params(parameters):
        """Return the query parameters with unset (None or empty-string) values removed.

        A plain truthiness test is deliberately not used: it would also drop a
        latitude or longitude of 0.
        """
        return {key: value for key, value in parameters.items()
                if value is not None and value != ""}

    @staticmethod
    def _first_file_url(study):
        """Return study['site'][0]['paleoData'][0]['dataFile'][0]['fileUrl'], or "N/A".

        Any missing key or empty list along the path yields "N/A" instead of raising.
        """
        node = study
        for key in ("site", "paleoData", "dataFile"):
            children = node.get(key) if isinstance(node, dict) else None
            if not isinstance(children, list) or not children:
                return "N/A"
            node = children[0]
        if isinstance(node, dict):
            return node.get("fileUrl", "N/A")
        return "N/A"

    @staticmethod
    def _show(frame):
        """Render a DataFrame with IPython's rich display, or print it outside IPython."""
        try:
            from IPython.display import display
        except ImportError:
            print(frame)
        else:
            display(frame)

    @staticmethod
    def _parse_noaa_text(text):
        """Split the text of a data file into ``(DataFrame, metadata)``.

        Lines starting with '#' are metadata: ``key: value`` lines become
        dictionary entries (a repeated key keeps the last value) and every other
        '#' line is appended to ``metadata["comments"]``. All remaining non-blank
        lines are split on whitespace and become rows of the DataFrame. The
        column-name line is not promoted to a header; it stays as the first row.
        """
        metadata = {"comments": []}
        data_rows = []

        for raw_line in text.splitlines():
            if raw_line.startswith('#'):
                content = raw_line[1:].strip()
                if ':' in content:
                    key, value = content.split(':', 1)
                    metadata[key.strip()] = value.strip()
                else:
                    metadata["comments"].append(content)  # Store standalone comments
            else:
                fields = raw_line.split()
                # Blank lines would otherwise turn into rows that are entirely empty.
                if fields:
                    data_rows.append(fields)

        return pd.DataFrame(data_rows), metadata

    def search_studies(self, xml_id=None, noaa_id=None, data_publisher=None,
                       investigators=None, latitude=None, longitude=None, location=None):
        """
        Search for studies based on provided parameters and display the matches.

        Parameters:
            xml_id (str): XML ID of the study (if known).
            noaa_id (str): NOAA ID of the study (if known).
            data_publisher (str): Name of the data publisher.
            investigators (str): Name of the investigator(s).
            latitude (str): Latitude for geographic search.
            longitude (str): Longitude for geographic search.
            location (str): Location name for geographic search.

        If `xml_id` or `noaa_id` is provided, they take precedence, and other parameters are ignored.

        Returns:
            None. On HTTP 200 the decoded response is stored in ``response_data``
            and shown through ``display_results``; otherwise the status code is
            printed. Results of any earlier search are discarded first.

        Raises:
            requests.RequestException: If the request cannot be completed
                (including a timeout after ``REQUEST_TIMEOUT`` seconds).
        """
        # Forget the previous search so a failed query cannot leave stale rows
        # behind for select_study_url to pick from.
        self.response_data = None
        self.response_df = None

        # Prepare parameters based on provided arguments
        if xml_id:
            self.parameters = {"xmlId": xml_id}
        elif noaa_id:
            self.parameters = {"NOAAStudyId": noaa_id}
        else:
            self.parameters = {
                "dataPublisher": data_publisher,
                "investigators": investigators,
                "latitude": latitude,
                "longitude": longitude,
                "location": location
            }

        response = requests.get(self.BASE_URL,
                                params=self._clean_params(self.parameters),
                                timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            self.response_data = response.json()
            self.display_results()
        else:
            print(f"Error: {response.status_code}")

    def display_results(self):
        """Build ``response_df`` from ``response_data`` and display it.

        Prints a message and leaves ``response_df`` as None when there is no
        response data or the response contains no studies.
        """
        # Ensure response data is available
        if not self.response_data:
            self.response_df = None
            print("No data available to display. Please try with other parameters.")
            return

        # Check for 'study' key in response and display in DataFrame
        studies = self.response_data.get("study", [])
        if not studies:
            self.response_df = None
            print("No data available for the given parameters.")
            return

        # Extract relevant information into a structured DataFrame
        study_list = [
            {
                "xmlId": study.get("xmlId", "N/A"),
                "NOAAStudyId": study.get("NOAAStudyId", "N/A"),
                "Study Name": study.get("studyName", "N/A"),
                "Data Publisher": study.get("dataPublisher", "N/A"),
                "Investigator": study.get("investigators", "N/A"),
                "Publication Year": study.get("pubYear", "N/A"),
                "Data URL": self._first_file_url(study),
            }
            for study in studies
        ]

        # Store and display the DataFrame
        self.response_df = pd.DataFrame(study_list)
        self._show(self.response_df)

    def select_study_url(self, index):
        """Remember and return the "Data URL" of the row labelled ``index``.

        Parameters:
            index: Row label in ``response_df`` (the number shown at the left of
                the displayed table).

        Returns:
            The selected URL, or None (after printing a message) when there are
            no search results or ``index`` is not a valid row label. A failed
            selection leaves any previously selected URL unchanged.
        """
        try:
            self.studies_url = self.response_df.loc[index, "Data URL"]
        except (KeyError, IndexError, AttributeError, TypeError):
            # .loc raises KeyError for an unknown label; AttributeError covers
            # response_df still being None because no search has succeeded.
            print("Invalid index. Please ensure you've selected a valid entry from the displayed DataFrame.")
            return None

        print(f"Selected URL: {self.studies_url}")
        return self.studies_url

    def load_data_from_selected_url(self):
        """Download the selected study file and parse it.

        Returns:
            ``(studies_df, metadata)`` on success; both are also stored on the
            instance. Returns None (after printing a message) when no study has
            been selected, the selected study has no data-file URL, or the
            server answers with a status other than 200.

        Raises:
            requests.RequestException: If the download cannot be completed
                (including a timeout after ``REQUEST_TIMEOUT`` seconds).
        """
        # Fetch data from the selected URL only if it has been set
        if not self.studies_url:
            print("Error: No study selected. Run search_studies and select_study_url first.")
            return

        # "N/A" is the placeholder display_results uses when a study lists no file.
        if self.studies_url == "N/A":
            print("Error: The selected study has no data file URL.")
            return

        response = requests.get(self.studies_url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            # Do not parse an error page as if it were study data.
            print(f"Error: {response.status_code}")
            return

        # Extract metadata and data rows
        self.studies_df, self.metadata = self._parse_noaa_text(response.text)

        return self.studies_df, self.metadata


# Create the client; it keeps the search results and the selected study between calls.
pytups = pyTups()

# Pass search parameters directly to the search_studies function.
# Each call displays its own results table; a successful call also replaces the
# stored results, so a later select_study_url index refers to the last table shown.
pytups.search_studies(xml_id=16017)
pytups.search_studies(data_publisher="NOAA", investigators="Khider")

# To access the URL of a particular study, call the select_study_url function with an index
# Example:
# pytups.select_study_url(0)  # Replace 0 with the desired index

Unnamed: 0,xmlId,NOAAStudyId,Study Name,Data Publisher,Investigator,Publication Year,Data URL
0,16017,18315,Makassar Strait - Single specimens of P. obliq...,NOAA,"Khider, D.; Stott, L.D.; Emile-Geay, J.; Thune...",,https://www.ncei.noaa.gov/pub/data/paleo/contr...


Unnamed: 0,xmlId,NOAAStudyId,Study Name,Data Publisher,Investigator,Publication Year,Data URL
0,16017,18315,Makassar Strait - Single specimens of P. obliq...,NOAA,"Khider, D.; Stott, L.D.; Emile-Geay, J.; Thune...",,https://www.ncei.noaa.gov/pub/data/paleo/contr...
1,2151,8630,"Reuter et al. 2009 Cascayunga Cave, Peru 1000 ...",NOAA,"Cheng, H.; Edwards, R.L.; Khider, D.; Sinha, A...",,https://www.ncei.noaa.gov/pub/data/paleo/spele...
2,13818,16055,Western Tropical Pacific SST and Isotope Data ...,NOAA,"Khider, D.; Jackson, C.S.; Stott, L.D.",,https://www.ncei.noaa.gov/pub/data/paleo/contr...


In [32]:
# Pick a study by its row label in the stored results table; the URL is returned
# and also kept on pytups.studies_url for the load step.
url = pytups.select_study_url(0)  # Replace 0 with the desired index

Selected URL: https://www.ncei.noaa.gov/pub/data/paleo/contributions_by_author/khider2011/khider2011.txt


In [33]:
# Download the selected study file and split it into a data table and a metadata dict.
# NOTE: the method returns None when no study has been selected, in which case
# this tuple unpacking raises TypeError.
df, metadata = pytups.load_data_from_selected_url()


Extracted Metadata: {'comments': ['Makassar Strait - Single specimens of P. obliquiloculata d18O and d13C from 704-1851 AD', '-----------------------------------------------------------------------', 'World Data Center for Paleoclimatology, Boulder', 'and', 'NOAA Paleoclimatology Program', '-----------------------------------------------------------------------', 'If there is no publication information, please cite Investigators, Title, and Online_Resource and date accessed.', '', '', 'Description/Documentation lines begin with #', 'Data lines have no #', '', '--------------------', 'Contribution_Date', '--------------------', 'Title', '--------------------', 'Investigators', '--------------------', 'Description_Notes_and_Keywords', '--------------------', 'Publication', '------------------', 'Funding_Agency', '------------------', 'Site_Information', '------------------', 'Data_Collection', '------------------', 'Depth      \tMid-Depth of the interval (cm)', '14C.raw  \tConventional 

In [29]:
# Columns are numbered 0..n-1 and every value is a string, because the loader
# splits each data line on whitespace without promoting the column-name line
# to a header (it appears as row 0).
df

Unnamed: 0,0,1,2,3,4,5,6
0,depth_top,depth_bottom,age_ADbot,age_ADtop,d13CcarbVPDB,d18OcarbVPDB,wgt-ind
1,0,1,1843,1851,0.936,-2.254,34
2,0,1,1843,1851,0.895,-2.356,37
3,0,1,1843,1851,0.514,-2.63,20
4,0,1,1843,1851,0.9,-2.48,25
...,...,...,...,...,...,...,...
1161,96,98,704,734,0.711,-1.719,23
1162,96,98,704,734,0.852,-1.69,23
1163,96,98,704,734,0.68,-2.093,18
1164,96,98,704,734,0.69,-2.023,17


In [18]:
# Parsed from the '#'-prefixed lines of the file: "key: value" lines become
# entries, and all other '#' lines are collected under the "comments" key.
metadata

{'NOTE': 'Please cite Publication, and Online_Resource and date accessed when using these data.',
 'Online_Resource': 'ftp://ftp.ncdc.noaa.gov/pub/data/paleo/contributions_by_author/khider2011/khider2011.txt',
 'Archive': 'Paleoceanography',
 'Date': '2015-03-31',
 'Study_Name': 'Makassar Strait - Single specimens of P. obliquiloculata d18O and d13C from 704-1851 AD',
 'Investigators': 'Khider, D.; Stott, L.D.; Emile-Geay, J.; Thunell, R.; Hammond D.E.',
 'Description': 'This dataset contains the d18O, d13C, and weights of single specimens of P. obliquiloculata used in the reconstruction of ENSO variability over the past 2,000 years"',
 'Authors': 'Khider, D., L. Stott, J. Emile-Geay, R. Thunell, and D.E. Hammond',
 'Published_Date': '2011-09-15',
 'Published_Title': 'Assessing El Nino Southern Oscillation variability during the past millennium',
 'Journal_Name': 'Paleoceanography',
 'Volume': '26',
 'Report_Number': 'PA3222',
 'DOI': '10.1029/2011PA002139',
 'Abstract': 'We present a 