Redo of hydroGFD and CNIP extraction
================
My first hydroGFD extraction contained a mistake, so this notebook redoes the whole extraction from the beginning. The mistake was that I did not convert the station longitude values to negative numbers, which is required because the stations are in the Western Hemisphere.

In [None]:
import pandas as pd
import numpy as np
import netCDF4 as nc
import glob
from datetime import datetime, timedelta

## CNIP data

In [None]:
# List the stations within the CNIP dataset to extract
stationList = ["BAB", "BON", "CPA", "EGB", "ELA", "EST", "GOB", "HAB", "SAT", "SNA", "SKT", "OTT", "BRA", "CAM", "EUR", "RES", "ALR", "HAL"]
cnipDict = {} # Dictionary to store the CNIP data, keyed by station code
stationCoords = {} # Dict to store the (lat, long) of each station, keyed by station code

# Read every station sheet in ONE pass over the workbook. Passing a list of sheet
# names makes read_excel return a dict {sheet name: DataFrame}, so the file is
# opened and parsed once instead of once per station.
stationSheets = pd.read_excel("CNIP Updated Data Stations 10.08.2009..xls", header=None, sheet_name=stationList, skiprows=[0, 1])

# Loop through each station and fill in the per-station constant columns
for station in stationList:
    data = stationSheets[station]

    # The sheets are read without a header, so columns are addressed by position:
    # 0 = Station, 1 = Lat, 2 = Long, 3 = Alt. Each is broadcast from the first row
    # so the whole column holds one constant value.
    data[0] = station #Station
    data[1] = data[1].iloc[0] #Lat
    # Force the longitude to be negative (Western Hemisphere, as stated in the
    # notebook introduction). -abs() gives the same result as "* -1" for a positive
    # input but does not flip the sign back if the sheet already stores a negative value.
    data[2] = -abs(data[2].iloc[0]) #Long
    data[3] = data[3].iloc[0] #Alt
    stationCoords[station] = (data[1].iloc[0], data[2].iloc[0]) #Store the station coordinates
    #Add the station data to the dictionary
    cnipDict[station] = data

columns = ["Station", "Lat", "Long", "Alt", "Date", "Month", "O18(1)", "O18(2)", "O18Avg", "H2(1)", "H2(2)", "H2avg", "dex", "Temp", "Prec(1)", "Prec(2)", "Prec(3)"]
# Combine the CNIP data into one dataframe and give the positional columns real names
cnip = pd.concat(cnipDict.values(), ignore_index=True)
cnip.columns = columns

# Display the CNIP dataframe
cnip

In [None]:
# Parse the Date column into pandas datetimes
cnip["Date"] = pd.to_datetime(cnip["Date"], format="%Y-%m-%d")

# Treat empty or whitespace-only strings as missing values
cnip = cnip.replace(r'^\s*$', np.nan, regex=True)

# Drop the columns that are not needed further on
cnip = cnip.drop(
    columns=["Month", "O18(1)", "O18(2)", "H2(1)", "H2(2)", "Prec(1)", "Prec(2)", "Prec(3)", "Temp"]
)

# Keep only the rows that have a date (no NaT). reset_index() keeps the previous
# row labels in a new "index" column.
hasDate = cnip["Date"].notna()
cnip = cnip[hasDate].reset_index()
cnip

In [None]:
# Make the CNIP dates datetime objects with no time of day: normalize() sets every
# timestamp to midnight. The HydroGFD dates are normalized the same way before the
# two date sets are intersected, so a stray time component here would stop
# otherwise-equal dates from matching.
cnip["Date"] = pd.to_datetime(cnip["Date"], format="%Y-%m-%d").dt.normalize()

## HydroGFD data

### Precipitation


In [None]:
# All the precipitation files start with "prAdjust" and end with ".nc"
path = "HydroGFD/prAdjust*"
# Relative paths of all the precipitation flux files. glob returns them in arbitrary
# order, so sort to make the processing order deterministic.
precipFiles = sorted(glob.glob(path))

# One small dataframe per (file, station) is collected here and concatenated once at
# the end; growing a dataframe with concat inside the loop re-copies all rows each time.
precipFrames = []

# Loop through each file and pull out the data at each time step for every lat and lon
# coordinate we have in the CNIP dataset that is stored in the stationCoords dictionary
for file in precipFiles:
    # The context manager closes the dataset even if the extraction below raises
    with nc.Dataset(file, "r") as ncid:
        # Pull out the time data and coordinate data
        time = ncid.variables["time"][:].filled(np.nan)
        lat = ncid.variables["lat"][:].filled(np.nan)
        lon = ncid.variables["lon"][:].filled(np.nan)

        for stat, coords in stationCoords.items():
            # Nearest grid cell to the station: smallest absolute difference on each axis
            latIndex = (np.abs(lat - coords[0])).argmin()
            lonIndex = (np.abs(lon - coords[1])).argmin()

            # Pull out the precipitation data at each time step
            precipData = ncid.variables["prAdjust"][:, latIndex, lonIndex].filled(0) #Filling with 0 is an assumption that if there is no data, then there is no precipitation

            # Place the lat, lon, time, and precipitation data into a dataframe
            precipFrames.append(pd.DataFrame({"Station": stat, "Lat": coords[0], "Long": coords[1], "Time": time, "Precipitation": precipData}))
    print("Finished extracting data from " + file[-20:-3])

# Store everything in a single dataframe; fall back to an empty frame with the
# expected columns when no file matched (pd.concat raises on an empty list).
if precipFrames:
    precip = pd.concat(precipFrames, ignore_index=True)
else:
    precip = pd.DataFrame(columns=["Station", "Lat", "Long", "Time", "Precipitation"])

# Convert the time data to datetime format
# NOTE(review): assumes the time axis is "days since 1850-01-01" - confirm against
# the units attribute of the "time" variable in the files.
precip["Time"] = precip["Time"].apply(lambda x: datetime(1850, 1, 1) + timedelta(days=x))

### Temperature

In [None]:
# All the temperature files start with "tasAdjust" and end with ".nc"
path = "HydroGFD/tasAdjust*"
# Relative paths of all the temperature files. glob returns them in arbitrary
# order, so sort to make the processing order deterministic.
tempFiles = sorted(glob.glob(path))

# One small dataframe per (file, station) is collected here and concatenated once at
# the end; growing a dataframe with concat inside the loop re-copies all rows each time.
temperatureFrames = []

# Loop through each file and pull out the data at each time step for every lat and lon
# coordinate we have in the CNIP dataset that is stored in the stationCoords dictionary
for file in tempFiles:
    # The context manager closes the dataset even if the extraction below raises
    with nc.Dataset(file, "r") as ncid:
        # Pull out the time data and coordinate data
        time = ncid.variables["time"][:].filled(np.nan)
        lat = ncid.variables["lat"][:].filled(np.nan)
        lon = ncid.variables["lon"][:].filled(np.nan)

        for stat, coords in stationCoords.items():
            # Nearest grid cell to the station: smallest absolute difference on each axis
            latIndex = (np.abs(lat - coords[0])).argmin()
            lonIndex = (np.abs(lon - coords[1])).argmin()

            # Pull out the temperature data at each time step
            tempData = ncid.variables["tasAdjust"][:, latIndex, lonIndex].filled(np.nan) #Filling with nan, as we can't make an assumption about the temperature

            # Place the lat, lon, time, and temperature data into a dataframe
            temperatureFrames.append(pd.DataFrame({"Station": stat, "Lat": coords[0], "Long": coords[1], "Time": time, "Temperature": tempData}))
    print("Finished extracting data from " + file[-20:-3])

# Store everything in a single dataframe; fall back to an empty frame with the
# expected columns when no file matched (pd.concat raises on an empty list).
if temperatureFrames:
    temperature = pd.concat(temperatureFrames, ignore_index=True)
else:
    temperature = pd.DataFrame(columns=["Station", "Lat", "Long", "Time", "Temperature"])

# Convert the time data to datetime format
# NOTE(review): assumes the time axis is "days since 1850-01-01" - confirm against
# the units attribute of the "time" variable in the files.
temperature["Time"] = temperature["Time"].apply(lambda x: datetime(1850, 1, 1) + timedelta(days=x))

### HydroGFD

In [None]:
# Outer-join precipitation and temperature on station, coordinates and time, keep
# only the six data columns, attach units to the value columns (and rename Time to
# Date), then order the rows by date.
hydroGFD = (
    precip.merge(temperature, how="outer", on=["Station", "Lat", "Long", "Time"])
    .loc[:, ["Station", "Lat", "Long", "Time", "Precipitation", "Temperature"]]
    .rename(
        columns={
            "Precipitation": "Precipitation (kg/m^2/s)",
            "Temperature": "Temperature (K)",
            "Time": "Date",
        }
    )
    .sort_values(by=["Date"])
)

# Persist the result as a CSV file (without the row index) so the extraction
# does not have to be repeated
hydroGFD.to_csv("hydroGFD.csv", index=False)

In [38]:
# Load hydroGFD csv data so I don't have to run the above code every time.
# Lat/Long are used as float merge keys against the CNIP coordinates further down,
# so parse floats with the round-trip converter: it guarantees the values written
# by to_csv are read back exactly, which the default converter does not promise.
hydroGFD = pd.read_csv("hydroGFD.csv", float_precision="round_trip")

In [48]:
# Parse the date strings read from the CSV into datetimes and set the time of
# day of every timestamp to 00:00:00, in a single chained expression
hydroGFD["Date"] = pd.to_datetime(
    hydroGFD["Date"], format="%Y-%m-%d %H:%M:%S"
).dt.normalize()


## Combining the data

In [49]:
# Distinct dates present in each dataset
hydroGFDDates = hydroGFD["Date"].unique()
cnipDates = cnip["Date"].unique()

# Sorted array of the dates that occur in both CNIP and HydroGFD
commonDates = np.intersect1d(ar1=cnipDates, ar2=hydroGFDDates)

commonDates

array(['1960-01-15T00:00:00.000000000', '1960-02-15T00:00:00.000000000',
       '1960-03-15T00:00:00.000000000', '1960-04-15T00:00:00.000000000',
       '1960-05-15T00:00:00.000000000', '1960-06-15T00:00:00.000000000',
       '1960-07-15T00:00:00.000000000', '1960-08-15T00:00:00.000000000',
       '1960-09-15T00:00:00.000000000', '1960-10-15T00:00:00.000000000',
       '1960-11-15T00:00:00.000000000', '1960-12-15T00:00:00.000000000',
       '1961-01-15T00:00:00.000000000', '1961-02-15T00:00:00.000000000',
       '1961-03-15T00:00:00.000000000', '1961-04-15T00:00:00.000000000',
       '1961-05-15T00:00:00.000000000', '1961-06-15T00:00:00.000000000',
       '1961-07-15T00:00:00.000000000', '1961-08-15T00:00:00.000000000',
       '1961-09-15T00:00:00.000000000', '1961-10-15T00:00:00.000000000',
       '1961-11-15T00:00:00.000000000', '1961-12-15T00:00:00.000000000',
       '1962-01-15T00:00:00.000000000', '1962-02-15T00:00:00.000000000',
       '1962-03-15T00:00:00.000000000', '1962-04-15

In [50]:
# Restrict both datasets to the rows whose date appears in commonDates.
# reset_index() keeps the previous row labels as an extra column in each frame.
cnipInCommon = cnip["Date"].isin(commonDates)
hydroGFDInCommon = hydroGFD["Date"].isin(commonDates)
cnip = cnip.loc[cnipInCommon].reset_index()
hydroGFD = hydroGFD.loc[hydroGFDInCommon].reset_index()


In [51]:
# Work on independent copies from here on so the filtered CNIP and HydroGFD
# dataframes stay untouched and the cells above do not need to be rerun
cnipCopy, hydroGFDCopy = cnip.copy(), hydroGFD.copy()

In [54]:
# Outer-join the CNIP and HydroGFD copies on station, coordinates and date so the
# isotope data and the climate data for the same station and date share one row.
# Rows that exist in only one of the datasets are kept, with NaN in the other's columns.
merged = cnipCopy.merge(
    hydroGFDCopy,
    how="outer",
    on=["Station", "Lat", "Long", "Date"],
)

In [55]:
# Order the rows by date, renumber them, and discard every leftover index column:
# "index" (added by this reset_index), "level_0" and the suffixed "index_x"/"index_y"
merged = (
    merged
    .sort_values(by=["Date"])
    .reset_index()
    .drop(columns=["index", "level_0", "index_x", "index_y"])
)

In [56]:
# Drop all rows that have NaN values in the O18Avg, Precipitation (kg/m^2/s), or Temperature (K) columns.
# drop=True renumbers the rows without storing the old labels in a new "index" column,
# which would otherwise end up as a stray column in the exported data and would make
# this cell produce a different frame each time it is re-run.
merged = merged.dropna(subset=["O18Avg", "Precipitation (kg/m^2/s)", "Temperature (K)"]).reset_index(drop=True)

In [57]:
# Write the final merged dataframe to disk as CSV, leaving out the row index
merged.to_csv(path_or_buf="merged.csv", index=False)