In [1]:
import requests
import pandas as pd

In [None]:
class NOAAStudies:
    def __init__(self):
        """
        Initialize the NOAAStudies class with base URL and dictionaries to hold studies and data table indices.
        """
        self.BASE_URL = "https://www.ncei.noaa.gov/access/paleo-search/study/search.json"
        self.studies = {}
        self.data_table_index = {}

    def search_studies(self, xml_id=None, noaa_id=None, data_publisher=None,
                       investigators=None, latitude=None, longitude=None, location=None):
        """
        Search for NOAA studies using specific search parameters.
        
        Parameters:
            xml_id (str): XML ID of the study.
            noaa_id (str): NOAA study ID.
            data_publisher (str): Data publisher's name.
            investigators (str): Name(s) of investigators.
            latitude (float): Latitude for location-based search.
            longitude (float): Longitude for location-based search.
            location (str): Location description.

        @TODO 
        - (Prioirty 2 - Nov 13): Add support for other parameters like KeyWord, publication (to be implemented using searchText), cvWhat, datTypeID, search etc. 
        - (Prioirty 2): Add validation for parameters like latitude/longitude"""

        params = {
            'xmlId': xml_id,
            'NOAAStudyId': noaa_id,
            'dataPublisher': data_publisher,
            'investigators': investigators,
            'latitude': latitude,
            'longitude': longitude,
            'location': location
        }
        # Filtering out None values
        params = {k: v for k, v in params.items() if v is not None}
        response = requests.get(self.BASE_URL, params=params)
        if response.status_code == 200:
            self.response_parser(response.json())
        else:
            print(f"Error fetching studies: {response.status_code}")

    def response_parser(self, data):
        """
        Parse the JSON response from NOAA and populate the studies dictionary.
        
        Parameters:
            data (dict): The JSON data returned from a search query.
        """
        for study in data.get('study', []):
            study_id = study.get('xmlId') or study.get('NOAAStudyId')
            self.studies[study_id] = {
                'base_meta': self.load_base_meta(study),
                'investigators': self.load_investigators(study),
                'publications': self.load_publications(study),
                'sites': self.load_sites(study, study_id),
                # 'number of sites': len(sites)
            }

    def load_base_meta(self, study):
        """
        Load base metadata for a study.
        
        Parameters:
            study (dict): Part of the JSON data pertaining to a single study.
        """
        fields = ['xmlId', 'studyName', 'dataType', 'earliestYearBP', 'mostRecentYearBP',
                  'earliestYearCE', 'mostRecentYearCE', 'studyNotes', 'scienceKeywords']
        return {field: study.get(field, 'N/A') for field in fields}

    def load_investigators(self, study):
        """
        Extract investigator details from the study data.
        
        Parameters:
            study (dict): Part of the JSON data pertaining to a single study.
        """
        investigators = study.get("investigatorDetails", [])
        if investigators:
            return ", ".join([f"{i.get('firstName', 'N/A')} {i.get('lastName', 'N/A')}" for i in investigators])
        return "N/A"

    def load_publications(self, study):
        """
        Extract and format publication data from the study.
        
        Parameters:
            study (dict): Part of the JSON data pertaining to a single study.
        """
        publications = []
        for pub in study.get('publication', []):
            # Safely access 'author' and 'identifier' which might be None
            author_info = pub.get('author') or {}
            identifier_info = pub.get('identifier') or {}
            pub_details = {
                'author': author_info.get('name', 'N/A'),
                'year': pub.get('pubYear', 'N/A'),
                'citation': pub.get('citation', 'N/A'),
                'url': identifier_info.get('url', 'N/A'),
                'pubRank': pub.get('pubRank', 'N/A')
            }
            publications.append(pub_details)
        return publications

    def load_sites(self, study, study_id):
        """
        Load and format site data associated with the study.
        
        Parameters:
            study (dict): Part of the JSON data pertaining to a single study.
            study_id (str): The unique identifier of the study for reference.
        """        
        return {
            site.get('NOAASiteId', 'N/A'): {
                'siteName': site.get('siteName', 'N/A'),
                'locationName': site.get('locationName', 'N/A'),
                'coordinates': ",".join(site.get('geo', {}).get('geometry', {}).get('coordinates', ['N/A', 'N/A'])),
                'paleoData': self.load_paleo_data(site.get('paleoData', []), study_id, site.get('NOAASiteId', 'N/A'))
            }
            for site in study.get('site', [])
        }

    def load_paleo_data(self, paleoData, study_id, site_id):
        """
        Extract and format paleo data associated with a site.
        
        Parameters:
            paleoData (list): List of paleo data from the site.
            study_id (str): The unique identifier of the study.
            site_id (str): The unique identifier of the site.
        """
        paleo_dict = {}
        for paleo in paleoData:
            # Safe access to 'dataFile' list
            data_files = paleo.get('dataFile', [])
            file_url = data_files[0].get('fileUrl', 'N/A') if data_files else 'N/A'
            variables = []
            if data_files:  # Check if 'dataFile' is not empty
                variables = [var.get('cvShortName', 'N/A') for var in data_files[0].get('variables', [])]

            paleo_details = {
                'NOAADataTableId': paleo.get('NOAADataTableId', 'N/A'),
                'dataTableName': paleo.get('dataTableName', 'N/A'),
                'timeUnit': paleo.get('timeUnit', 'N/A'),
                'fileUrl': file_url,
                'variables': variables
            }
            paleo_dict[paleo.get('NOAADataTableId', 'N/A')] = paleo_details

            self.data_table_index[paleo.get('NOAADataTableId', 'N/A')] = {
                'file_url': file_url,
                'study_id': study_id,
                'site_id': site_id
            }
        return paleo_dict


    def display_responses(self):
        """
        Compile and return a DataFrame of all loaded studies along with their detailed metadata and linked information.
        
        Returns:
            DataFrame: A DataFrame representing the consolidated data of all studies, including metadata, investigators,
                       publications, and site details.
        """
        data = [{
            **study['base_meta'],
            'Investigators': study['investigators'],
            'publications': study['publications'],
            'sites': study['sites']
        } for study in self.studies.values()]
        return pd.DataFrame(data)
    
    def display_publications(self, study_id):
        """
        Return a DataFrame of publications associated with a specific study.
        
        Parameters:
            study_id (str): The unique identifier for the study.
            
        Returns:
            DataFrame: A DataFrame containing publication details for the specified study or an empty DataFrame if the
                       study ID is not found or there are no publications.
        """
        if study_id in self.studies:
            publications = self.studies[study_id].get('publications', [])
            return pd.DataFrame(publications)
        else:
            return pd.DataFrame()
        
    
    def display_sites(self, study_id):
        """
        Display all sites associated with a given study ID as a DataFrame, flatten out the sites with multiple paleoData attributes.
        
        Parameters:
            study_id (str): The unique identifier for the study.

        Returns:
            DataFrame: A DataFrame containing the site information for the specified study.
        """
        # Check if the study ID is in the dictionary
        if study_id not in self.studies:
            print("Study ID not found.")
            return pd.DataFrame()  # Return an empty DataFrame if study ID is not found
        
        sites_data = self.studies[study_id].get('sites', {})
        # Creating a list to hold data for DataFrame construction
        sites_list = []
        for site_id, site_info in sites_data.items():
            site_info['NOAASiteId'] = site_id  # Ensure the site ID is included in the data
            # Flatten paleoData if necessary and create a record for each paleo data entry
            paleo_list = site_info.pop('paleoData', {})
            for paleo_id, paleo_info in paleo_list.items():
                # Prepare a single record for the DataFrame
                record = {**site_info, **paleo_info, 'NOAADataTableId': paleo_id}
                sites_list.append(record)

        # Create DataFrame from the list of dictionaries
        if sites_list:
            df = pd.DataFrame(sites_list)
            df.set_index('NOAASiteId', inplace=True)  # Set NOAASiteId as the index
            return df
        else:
            return pd.DataFrame()  # Return an empty DataFrame if no sites are found
        

    def get_data(self, dataTableID=None, file_url=None):
        """
        Fetch and return the data from a specified dataTableID or file URL.
        
        Parameters:
            dataTableID (str): Optional. The unique identifier for the data table.
            file_url (str): Optional. Direct URL to the data file.
        
        Returns:
            DataFrame: A DataFrame containing the fetched data or an empty DataFrame if there are errors or no data available.

        @TODO: 
        - (Priority 1 - Nov 12): implement error handling if .txt can not be parsed/ request timed out, etc.  
        - (Priority 2): implement metadata collection (like delimiter and NaN types) from the .txt (raw data file)
        - (Priority 2): rearrangethe data with correct data type
        - (Priority 3): implement Table Understanding
        """
        
        if dataTableID:
            file_url = self.data_table_index.get(dataTableID, {}).get('file_url')
            if not file_url:
                print(f"Data Table ID {dataTableID} not found or no associated file URL.")
                return pd.DataFrame()
        
        response = requests.get(file_url)
        if response.status_code == 200:
            lines = response.text.split('\n')
            data_lines = [line for line in lines if not line.startswith('#') and line.strip()]
            if data_lines:
                # Assuming the first line of data contains headers
                headers = data_lines[0].split('\t')
                # Process the actual data lines
                data = [line.split('\t') for line in data_lines[1:]]
                return pd.DataFrame(data, columns=headers)
            else:
                print("No data lines found in the file.")
                return pd.DataFrame()
        else:
            print("Failed to fetch data.")
            return pd.DataFrame()

In [3]:
# Example usage:
studies = NOAAStudies()
studies.search_studies(investigators="Khider")
display(studies.display_responses())

Unnamed: 0,xmlId,studyName,dataType,earliestYearBP,mostRecentYearBP,earliestYearCE,mostRecentYearCE,studyNotes,scienceKeywords,Investigators,publications,sites
0,16017,Makassar Strait - Single specimens of P. obliq...,PALEOCEANOGRAPHY,1246.0,99.0,704.0,1851.0,"This dataset contains the d18O, d13C, and weig...","[Medieval Climate Anomaly (MCA), ENSO, Little ...","Deborah Khider, Lowell Stott, Julien Emile-Gea...","[{'author': 'Khider, D., L. Stott, J. Emile-Ge...","{'53040': {'siteName': 'MD98-2177', 'locationN..."
1,2151,"Reuter et al. 2009 Cascayunga Cave, Peru 1000 ...",SPELEOTHEMS,862.0,-55.0,1088.0,2005.0,,"[PAGES 2k Network, PAGES LOTRED SA2k]","Hai Cheng, R. Lawrence Edwards, Deborah Khider...","[{'author': 'Cheng, H.; Edwards, R.L.; Khider,...","{'22935': {'siteName': 'Cascayunga Cave', 'loc..."
2,13818,Western Tropical Pacific SST and Isotope Data ...,PALEOCEANOGRAPHY,11031.0,199.0,-9081.0,1751.0,Benthic (Cibicicoides mundulus) foraminifera c...,,"Deborah Khider, Charles Jackson, Lowell Stott","[{'author': 'Stott, L.D., K.G. Cannariato, R.C...","{'19265': {'siteName': 'MD98-2181', 'locationN..."
3,54361,(Table 2) Age determination of sediment core M...,,,,,,For all details see the full metadata descript...,"[MD98-2177, MD982177, Calypso Corer, MD111]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."
4,54362,(Table 3) Lead 214 and Lead 210 concentration ...,,,,,,For all details see the full metadata descript...,"[MD982177, Calypso Corer, MD111, MD98-2177]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."
5,54360,(Table S1) Stable carbon and oxygen isotope ra...,,,,,,For all details see the full metadata descript...,"[MD98-2177, MD111, MD982177, Calypso Corer]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."
6,64805,CTD data from a long-running sediment trap tim...,,,,,,For all details see the full metadata descript...,"[GMT_Gulf_of_Mexico, Mooring]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."
7,64706,Carbonate measurements from a long-running sed...,,,,,,For all details see the full metadata descript...,"[Mooring, GMT_Gulf_of_Mexico]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."
8,64463,Entire water column analysis of d18O and salin...,,,,,,For all details see the full metadata descript...,"[GMT_Gulf_of_Mexico, Mooring]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."
9,64547,Globigerinoides ruber flux analysis from a lon...,,,,,,For all details see the full metadata descript...,"[GMT_Gulf_of_Mexico, Mooring]",,"[{'author': 'N/A', 'year': None, 'citation': '...","{'Unspecified': {'siteName': 'Unspecified', 'l..."


In [None]:
# Displays all publications for a specified study
display(studies.display_publications('13818'))

Unnamed: 0,author,year,citation,url,pubRank
0,"Stott, L.D., K.G. Cannariato, R.C. Thunell, G....",2004,"Stott, L.D., K.G. Cannariato, R.C. Thunell, G....",http://dx.doi.org/10.1038/nature02903,3
1,"Stott, L.D., A. Timmermann, and R.C. Thunell",2007,"Stott, L.D., A. Timmermann, and R.C. Thunell. ...",http://dx.doi.org/10.1126/science.1143791,2
2,"Khider, D., C.S. Jackson, and L.D. Stott",2014,"Khider, D., C.S. Jackson, and L.D. Stott. 2014...",http://dx.doi.org/10.1002/2013PA002534,1


In [None]:
"""Displays detailed information for all sites associated with a specific study.

This method lists each site related to the study in individual rows. For studies that include multiple sites, each site is presented in a separate row.

If a site contains multiple entries under 'paleoData', the information is flattened such that each 'paleoData' entry is expanded into its own row. As a result, each row in the display represents a single 'paleoData' entry from a site, ensuring that detailed paleo data are clearly and separately visible."""

display(studies.display_sites('13818'))

Unnamed: 0_level_0,siteName,locationName,coordinates,NOAADataTableId,dataTableName,timeUnit,fileUrl,variables
NOAASiteId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19265,MD98-2181,Ocean>Pacific Ocean>Western Pacific Ocean,"6.3,125.83",26023,MD98-2181 Raw K14,,https://www.ncei.noaa.gov/pub/data/paleo/contr...,"[notes, depth_cm, Mg/Ca-g.rub-w, d13Cg.rub-w, ..."
19265,MD98-2181,Ocean>Pacific Ocean>Western Pacific Ocean,"6.3,125.83",26024,MD98-2181 Benthic K14,cal yr BP,https://www.ncei.noaa.gov/pub/data/paleo/contr...,"[depth_cm, age_calBP, age2.5%, age34%, age68%,..."
19265,MD98-2181,Ocean>Pacific Ocean>Western Pacific Ocean,"6.3,125.83",26025,MD98-2181 SST K14,cal yr BP,https://www.ncei.noaa.gov/pub/data/paleo/contr...,"[SST, SST2.5%, SST34%, SST68%, SST97.5%, d18Os..."


In [6]:
data_df = studies.get_data('26023')
data_df

Unnamed: 0,depth_cm,Mg/Ca-g.rub-w,d18Og.rub-w,d13Cg.rub-w,d18Oc.mund,d13Cc.mund,notes\r
0,1,5.21,-999.90,-999.90,-999.90,-999.90,"data from Stott et al. (2004, 2007)\r"
1,1,-999.90,-999.90,-999.90,-0.30,2.45,\r
2,2.5,5.04,0.76,-2.83,-999.90,-999.90,\r
3,3,-999.90,1.53,-2.81,-999.90,-999.90,"data from Stott et al. (2004, 2007)\r"
4,4,-999.90,0.90,-2.78,-999.90,-999.90,\r
...,...,...,...,...,...,...,...
842,843,5.45,0.80,-2.64,-999.90,-999.90,"data from Stott et al. (2004, 2007)\r"
843,845,-999.90,-999.90,-999.90,-0.44,2.91,\r
844,847,5.42,0.75,-2.53,-999.90,-999.90,"data from Stott et al. (2004, 2007)\r"
845,849,-999.90,-999.90,-999.90,-0.29,2.73,\r


In [7]:
data_df.isna().value_counts()

depth_cm  Mg/Ca-g.rub-w  d18Og.rub-w  d13Cg.rub-w  d18Oc.mund  d13Cc.mund  notes\r
False     False          False        False        False       False       False      847
Name: count, dtype: int64

In [8]:
# Example usage:
studies_2 = NOAAStudies()
studies_2.search_studies(investigators="Bhattacharya")
display(studies_2.display_responses())


Unnamed: 0,xmlId,studyName,dataType,earliestYearBP,mostRecentYearBP,earliestYearCE,mostRecentYearCE,studyNotes,scienceKeywords,Investigators,publications,sites
0,78437,Benguela Upwelling System Hydrogen and Carbon ...,PALEOCEANOGRAPHY,5183867,0,-5181917,1950,Hydrogen Isotopic Reconstruction of North Amer...,[hydrology],"Claire Rubbelke, Tripti Bhattacharya, Ran Feng...","[{'author': 'Rubbelke, Claire; Bhattacharya, T...","{'59689': {'siteName': 'ODP 1081', 'locationNa..."
1,78037,California Margin Hydrogen and Carbon Isotope ...,PALEOCEANOGRAPHY,3700000,0,-3698050,1950,Hydrogen Isotopic Reconstruction of North Amer...,"[Monsoon, hydrology]","Tripti Bhattacharya, Ran Feng, Jessica Tierney...","[{'author': 'Bhattacharya, Tripti; Feng, Ran; ...","{'30617': {'siteName': 'ODP 1012', 'locationNa..."
2,80339,"Clayton Valley, Nevada Hydrogen and Carbon Iso...",PALEOLIMNOLOGY,2800000,0,-2798050,1950,Hydrogen and carbon isotopes of long-chain lea...,,"Tripti Bhattacharya, Peter Brennan, Daniel Iba...","[{'author': 'Gagnon, Catherine; Butler, Kristi...","{'60188': {'siteName': 'Clayton Valley', 'loca..."
3,79938,Eastern Equatorial Pacific Leaf Wax Isotope Da...,PALEOCEANOGRAPHY,4956,517,-3006,1433,Age assigned based on age models from Etournea...,,"David Fastovich, Tripti Bhattacharya, Lina Pér...","[{'author': 'Fastovich, David; Bhattacharya, T...","{'56876': {'siteName': 'ODP 1239', 'locationNa..."
4,77552,Eastern Pacific Alkenone Sea Surface Temperatu...,CLIMATE RECONSTRUCTIONS,144467,2430,-142517,-480,,,"Dervla Meegan Kumar, Jessica Tierney, Tripti B...","[{'author': 'Meegan Kumar, Dervla; Tierney, Je...","{'19117': {'siteName': 'NH22P', 'locationName'..."
5,78817,Geochemical Proxies from a Northeast Mexico Sp...,SPELEOTHEMS,62500,5100,-60550,-3150,,"[Last Glacial Maximum, Monsoon, Tropics, Milan...","Clay Tabor, Gregory Goldsmith, David McGee, Ga...","[{'author': 'Kevin T. Wright, Kathleen R. John...","{'59509': {'siteName': 'Cueva Bonita', 'locati..."
6,23014,Guaymas Basin 9-12kYrBP Leaf Wax Isotopes and ...,PALEOCEANOGRAPHY,24266,1460,-22316,490,Stable isotope (dD and d13C) data on leaf waxe...,[Monsoon],"Tripti Bhattacharya, Jessica Tierney, Jason Ad...","[{'author': 'Tripti Bhattacharya, Jessica E. T...","{'53312': {'siteName': 'MD02-2515', 'locationN..."
7,15397,"Laguna de Aljojuca, Mexico 4000 Year Stable Is...",PALEOLIMNOLOGY,4139,-65,-2189,2015,Fluctuations in climate over the past 4000 cal...,[drought],"Tripti Bhattacharya, None Byrne, Harald Böhnel...","[{'author': 'Tripti Bhattacharya, Roger Byrne,...","{'56460': {'siteName': 'Laguna de Aljojuca', '..."
8,78038,Leaf Wax Hydrogen Isotope Data from the Chilea...,PALEOCEANOGRAPHY,24225,-26,-22275,1976,,"[Last Glacial Maximum, westerlies]","Jessica Tierney, Tripti Bhattacharya, Jiang Zh...","[{'author': 'Blumm, Aria, Jessica E. Tierney, ...","{'19251': {'siteName': 'ODP 1233', 'locationNa..."
9,74654,Mesoamerica Last Millennium Lake and Speleothe...,CLIMATE RECONSTRUCTIONS,1100,100,850,1850,Synthesis of lacustrine records of hydroclimat...,[Other Hydroclimate Reconstruction],"Tripti Bhattacharya, Sloan Coats","[{'author': 'Tripti Bhattacharya, Sloan Coats'...","{'58864': {'siteName': 'Mesoamerica', 'locatio..."


In [9]:
display(studies_2.display_sites('78037'))

Unnamed: 0_level_0,siteName,locationName,coordinates,NOAADataTableId,dataTableName,timeUnit,fileUrl,variables
NOAASiteId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
30617,ODP 1012,Ocean>Pacific Ocean>North Pacific Ocean,"32.28,-118.38",49503,ODP1012-leafwax Bhattacharya2022,cal yr BP,https://www.ncei.noaa.gov/pub/data/paleo/paleo...,"[age_ma, dD_wax, dD_wax_err, d13C_wax, d13C_wa..."
59597,DSDP 475,Ocean>Pacific Ocean>Eastern Pacific Ocean,"23.03,-109.03",49502,DSDP475-leafwax Bhattacharya2022,cal yr BP,https://www.ncei.noaa.gov/pub/data/paleo/paleo...,"[age_ma, dD_wax, dD_wax_err, d13C_wax, d13C_wa..."
