In [20]:
## This file will be a combination of functions that both clean the GNIP data and combine it with other data sources
## depending on the features we want to include in the model

In [21]:
# Importing the necessary libraries
import glob
import os

import numpy as np
import pandas as pd
import xarray as xr

In [22]:
# Function to clean the GNIP data
def cleanData(path='GNIP_Uncleaned.csv'):
    """Load the raw GNIP measurements and reshape them to one row per sample.

    The raw file is read in long format: every row carries one measurement,
    with the kind of measurement in ``Symbol`` and its value in ``Amount``.
    Each kind is moved into its own column, rows sharing the same
    ``Lat``/``Lon``/``Date``/``Alt`` are merged, and rows missing either
    isotope value are discarded.

    Parameters
    ----------
    path : str or path-like, default 'GNIP_Uncleaned.csv'
        CSV file to read. It must contain the columns ``Lat``, ``Lon``,
        ``Date``, ``Alt``, ``Amount``, ``Symbol``, ``Units`` and ``SampleType``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Lat``, ``Lon``, ``Date``, ``Alt``, ``Precip (mm)``,
        ``Temp (°C)``, ``O18 (‰)`` and ``H2 (‰)``. Only rows where both
        ``O18 (‰)`` and ``H2 (‰)`` are present are kept; the index keeps the
        labels from before the drop, so it may contain gaps.
    """
    # read_csv already hands back a fresh frame, so it can be modified directly.
    data = pd.read_csv(path)

    # Maps each value of `Symbol` to the wide-format column that receives it.
    symbolToColumn = {
        'Precipitation': 'Precip (mm)',
        'TempAir': 'Temp (\u00B0C)',
        'O18': 'O18 (\u2030)',
        'H2': 'H2 (\u2030)',
    }

    # Changing the measurement symbol/unit and amount into separate columns:
    # keep `Amount` where the row is of that kind, NaN everywhere else.
    for symbol, column in symbolToColumn.items():
        data[column] = data['Amount'].where(data['Symbol'] == symbol)
    data = data.drop(columns=['Amount', 'Symbol', 'Units', 'SampleType'])

    # Changing the date to a datetime object
    # NOTE(review): the frame displayed in this notebook shows Date values with
    # differing UTC offsets (+01:00 and +00:00) - confirm whether they should
    # be normalised (e.g. utc=True) before grouping.
    data['Date'] = pd.to_datetime(data['Date'])

    # Combine the rows with the same date, lat, lon and alt into one row.
    # 'first' takes the first non-NaN value per column, which merges the
    # per-measurement rows into a single record.
    # NOTE(review): groupby drops rows whose key columns are NaN by default,
    # so samples without e.g. `Alt` vanish here - confirm that is intended.
    measurementColumns = list(symbolToColumn.values())
    dataAgg = (
        data.groupby(['Lat', 'Lon', 'Date', 'Alt'])
        .agg(dict.fromkeys(measurementColumns, 'first'))
        .reset_index()
    )

    # Remove rows with NaN values in the O18 and H2 columns as they are the target variables
    dataDrop = dataAgg.dropna(subset=['O18 (\u2030)', 'H2 (\u2030)'])

    return dataDrop

In [None]:
# Function to index the HydroGFD files, returns a dictionary with date tuples as keys and lists of file paths as values
def loadHydroGFD(pattern="../HydroGFD/*.nc"):
    """Group HydroGFD file paths by the date pair encoded in their file name.

    No file is opened. For every path matching ``pattern``, the third
    underscore-separated field of the file name (with everything from the
    first dot onward removed) is split on ``-`` and its first two parts are
    converted to integers to form the key.

    Parameters
    ----------
    pattern : str, default "../HydroGFD/*.nc"
        Glob pattern selecting the files to index.

    Returns
    -------
    dict[tuple[int, int], list[str]]
        Maps each parsed integer pair to the sorted list of matching paths.
        Presumably the pair is (year, month) - verify against the actual
        HydroGFD file naming.

    Raises
    ------
    IndexError
        If a file name has fewer than three underscore-separated fields, or
        its date field contains no ``-``.
    ValueError
        If either part of the date field is not an integer.
    """
    dictHydroGFD = {}
    # glob gives no ordering guarantee; sorting keeps every list reproducible.
    for file in sorted(glob.glob(pattern)):
        # Parse the file name only, so underscores or dots in the directory
        # part of the path cannot shift the fields.
        name = os.path.basename(file)
        dates = name.split('_')[2].split('.')[0]

        parts = dates.split('-')
        dateTuple = (int(parts[0]), int(parts[1]))

        dictHydroGFD.setdefault(dateTuple, []).append(file)

    return dictHydroGFD

In [23]:
def main():
    """Run the cleaning step and build the HydroGFD file index.

    Returns
    -------
    tuple
        ``(df, dictHydroGFD)``: the return values of ``cleanData()`` and
        ``loadHydroGFD()``, in that order.
    """
    df = cleanData()
    dictHydroGFD = loadHydroGFD()
    # Return both results so the work done here is usable by the caller
    # instead of being left in unused locals.
    return df, dictHydroGFD

loadHydroGFD()

  data['Date'] = pd.to_datetime(data['Date'])


Unnamed: 0,Lat,Lon,Date,Alt,Precip (mm),Temp (°C),O18 (‰),H2 (‰)
0,-90.00,0.000000,1990-01-01 00:00:00+01:00,2880.0,,-49.3,-51.200,-398.3
1,-89.88,114.370000,1990-01-01 00:00:00+01:00,2880.0,,-49.6,-51.700,-404.7
2,-89.53,108.283333,1990-01-01 00:00:00+01:00,2900.0,,-50.3,-51.600,-403.2
3,-89.37,-91.650000,1990-01-01 00:00:00+01:00,2850.0,,-51.1,-50.900,-395.4
4,-89.18,105.580000,1990-01-01 00:00:00+01:00,2950.0,,-51.5,-51.600,-398.7
...,...,...,...,...,...,...,...,...
159448,82.50,-62.333333,2021-10-15 00:00:00+00:00,62.0,4.3,,-27.675,-212.2
159449,82.50,-62.333333,2021-11-15 00:00:00+00:00,62.0,6.1,,-32.800,-245.5
159450,82.50,-62.333333,2021-12-15 00:00:00+00:00,62.0,6.9,,-32.150,-248.8
159452,82.50,-62.333333,2022-02-15 00:00:00+00:00,62.0,10.5,,-33.377,-255.5
