I F'd up
================
I made a mistake in the hydroGFD extraction. And I need to try again but starting from the beginning. 

In [7]:
import pandas as pd
import numpy as np
import netCDF4 as nc
import glob
from datetime import datetime, timedelta

## CNIP data

In [16]:
# List the stations within the CNIP dataset to extract
stationList = ["BAB", "BON", "CPA", "EGB", "ELA", "EST", "GOB", "HAB", "SAT", "SNA", "SKT", "OTT", "BRA", "CAM", "EUR", "RES", "ALR", "HAL"]
cnipDict = {} # Dictionary to store the CNIP data
stationCoords = {} # Dict to store the station coordinates
# Loop through each station and extract the data
for station in stationList:
    # Read in the data
    data = pd.read_excel("CNIP Updated Data Stations 10.08.2009..xls", header=None, sheet_name=station, skiprows=[0, 1])

    #Making sure the columns are filled properly with constant values: Station, Lat, Long, Alt
    data[0] = station #Station
    data[1] = data[1].iloc[0] #Lat
    data[2] = data[2].iloc[0] *-1 #Long
    data[3] = data[3].iloc[0] #Alt
    stationCoords[station] = tuple([data[1].iloc[0], data[2].iloc[0]]) #Store the station coordinates
    #Add the station data to the dictionary
    cnipDict[station] = data

columns = ["Station", "Lat", "Long", "Alt", "Date", "Month", "O18(1)", "O18(2)", "O18Avg", "H2(1)", "H2(2)", "H2avg", "dex", "Temp", "Prec(1)", "Prec(2)", "Prec(3)"]
# Combine the CNIP data into one dataframe
cnip = pd.concat(cnipDict.values(), ignore_index=True)
cnip.columns = columns

# Print the CNIP dataframe
cnip

Unnamed: 0,Station,Lat,Long,Alt,Date,Month,O18(1),O18(2),O18Avg,H2(1),H2(2),H2avg,dex,Temp,Prec(1),Prec(2),Prec(3)
0,BAB,47.98,-55.82,190.0,1997-02-02,2.0,,,-10.520,,,-50.700,33.460,-8.2,84.5,,8.1
1,BAB,47.98,-55.82,190.0,1997-03-02,3.0,,,-8.605,,,-48.040,20.800,-6.4,164.6,,12.1
2,BAB,47.98,-55.82,190.0,1997-04-02,4.0,,,-10.880,,,-27.485,59.555,1.7,95.4,,9.0
3,BAB,47.98,-55.82,190.0,1997-05-02,5.0,,,-7.550,,,-50.455,9.945,,,,12.4
4,BAB,47.98,-55.82,190.0,1997-06-02,6.0,,,-5.835,,,-76.480,-29.800,,,,9.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3265,HAL,68.47,-81.15,8.0,2007-03-02,3.0,,,,,,,,,,,
3266,HAL,68.47,-81.15,8.0,2007-04-02,4.0,,,,,,,,,,,
3267,HAL,68.47,-81.15,8.0,2007-05-02,5.0,,,,,,,,,,,
3268,HAL,68.47,-81.15,8.0,2007-06-02,6.0,,,,,,,,,,,


In [17]:
# Convert the date column to datetime format
cnip["Date"] = pd.to_datetime(cnip["Date"], format="%Y-%m-%d")

# Filling in empty strings with NaN
cnip = cnip.replace(r'^\s*$', np.nan, regex=True)

# Removing unnecessary columns
cnip = cnip.drop(["Month", "O18(1)", "O18(2)", "H2(1)", "H2(2)", "Prec(1)", "Prec(2)", "Prec(3)", "Temp"], axis=1)

# Removing rows in the Date column with NaT values
cnip = cnip[cnip.Date.isnull() == False].reset_index()
cnip

Unnamed: 0,index,Station,Lat,Long,Alt,Date,O18Avg,H2avg,dex
0,0,BAB,47.98,-55.82,190.0,1997-02-02,-10.520,-50.700,33.460
1,1,BAB,47.98,-55.82,190.0,1997-03-02,-8.605,-48.040,20.800
2,2,BAB,47.98,-55.82,190.0,1997-04-02,-10.880,-27.485,59.555
3,3,BAB,47.98,-55.82,190.0,1997-05-02,-7.550,-50.455,9.945
4,4,BAB,47.98,-55.82,190.0,1997-06-02,-5.835,-76.480,-29.800
...,...,...,...,...,...,...,...,...,...
3244,3265,HAL,68.47,-81.15,8.0,2007-03-02,,,
3245,3266,HAL,68.47,-81.15,8.0,2007-04-02,,,
3246,3267,HAL,68.47,-81.15,8.0,2007-05-02,,,
3247,3268,HAL,68.47,-81.15,8.0,2007-06-02,,,


In [36]:
# Make the CNIP dates to datetime objects with no time
cnip["Date"] = pd.to_datetime(cnip["Date"], format="%Y-%m-%d")

## HydroGFD data

### Precipitation


In [18]:
# All the precipitation file starts with a "prAdjust" and ends with a ".nc"
path = "HydroGFD/prAdjust*"
precipFiles = glob.glob(path) #Creates a list of all the precipitation flux files relative paths

# Loop through each precipitation file and extract the data which should only be separated by time
# and store them in a single dataframe
precip = pd.DataFrame(columns=["Station","Lat", "Long", "Time", "Precipitation"])

# Loop through each file and pull out the data at each time step for every lat and lon coordinate we have in the CNIP dataset that is stored in the stationCoords dictionary
for file in precipFiles:
    ncid = nc.Dataset(file, "r")

    #Pull out the time data and coordiante data
    time = ncid.variables["time"][:].filled(np.nan)
    lat = ncid.variables["lat"][:].filled(np.nan)
    lon = ncid.variables["lon"][:].filled(np.nan)

    for stat, coords in stationCoords.items():
        latIndex = (np.abs(lat - coords[0])).argmin()
        lonIndex = (np.abs(lon - coords[1])).argmin()

        # Pull out the precipitation data at each time step
        precipData = ncid.variables["prAdjust"][:, latIndex, lonIndex].filled(0) #Filling with 0 is an assumption that if there is no data, then there is no precipitation
        
        # Place the lat, lon, time, and precipitation data into a dataframe
        df = pd.DataFrame({"Station": stat, "Lat": coords[0], "Long": coords[1], "Time": time, "Precipitation": precipData})
        precip = pd.concat([precip, df], ignore_index=True)
    print("Finished extracting data from " + file[-20:-3])
    ncid.close()

# Convert the time data to datetime format
precip["Time"] = precip["Time"].apply(lambda x: datetime(1850, 1, 1) + timedelta(days=x))

  precip = pd.concat([precip, df], ignore_index=True)


Finished extracting data from 19600101-19641231
Finished extracting data from 19650101-19691231
Finished extracting data from 19700101-19741231
Finished extracting data from 19750101-19791231
Finished extracting data from 19800101-19841231
Finished extracting data from 19850101-19891231
Finished extracting data from 19900101-19941231
Finished extracting data from 19950101-19991231
Finished extracting data from 20000101-20041231
Finished extracting data from 20050101-20051231
Finished extracting data from 20060101-20101231


### Temperature

In [19]:
# All the temperature files starts with a "tasAdjust" and ends with a ".nc"
path = "HydroGFD/tasAdjust*"
tempFiles = glob.glob(path) #Creates a list of all the temperature flux files relative paths

# Loop through each temperature file and extract the data which should only be separated by time
# and store them in a single dataframe
temperature = pd.DataFrame(columns=["Station","Lat", "Long", "Time", "Temperature"])

# Loop through each file and pull out the data at each time step for every lat and lon coordinate we have in the CNIP dataset that is stored in the stationCoords dictionary
for file in tempFiles:
    ncid = nc.Dataset(file, "r")

    #Pull out the time data and coordiante data
    time = ncid.variables["time"][:].filled(np.nan)
    lat = ncid.variables["lat"][:].filled(np.nan)
    lon = ncid.variables["lon"][:].filled(np.nan)

    for stat, coords in stationCoords.items():
        latIndex = (np.abs(lat - coords[0])).argmin()
        lonIndex = (np.abs(lon - coords[1])).argmin()

        # Pull out the temperature data at each time step
        tempData = ncid.variables["tasAdjust"][:, latIndex, lonIndex].filled(np.nan) #Filling with nan, as we can't make an assumption about the temperature
        
        # Place the lat, lon, time, and temperature data into a dataframe
        df = pd.DataFrame({"Station": stat, "Lat": coords[0], "Long": coords[1], "Time": time, "Temperature": tempData})
        temperature = pd.concat([temperature, df], ignore_index=True)
    print("Finished extracting data from " + file[-20:-3])
    ncid.close()

# Convert the time data to datetime format
temperature["Time"] = temperature["Time"].apply(lambda x: datetime(1850, 1, 1) + timedelta(days=x))

  temperature = pd.concat([temperature, df], ignore_index=True)


Finished extracting data from 19600101-19641231
Finished extracting data from 19650101-19691231
Finished extracting data from 19700101-19741231
Finished extracting data from 19750101-19791231
Finished extracting data from 19800101-19841231
Finished extracting data from 19850101-19891231
Finished extracting data from 19900101-19941231
Finished extracting data from 19950101-19991231
Finished extracting data from 20000101-20041231
Finished extracting data from 20050101-20051231
Finished extracting data from 20060101-20101231


### HydroGFD

In [20]:
# Merge the precipitation and temperature dataframes
hydroGFD = pd.merge(precip, temperature, on=["Station", "Lat", "Long", "Time"], how="outer")

# Keep only the needed columns of the dataframe: station, lat, long, time, precipitation, temperature
# removing the index columns
hydroGFD = hydroGFD[["Station", "Lat", "Long", "Time", "Precipitation", "Temperature"]]

# Renaming some columns to include units
hydroGFD = hydroGFD.rename(columns={
    "Precipitation": "Precipitation (kg/m^2/s)", 
    "Temperature": "Temperature (K)",
    "Time": "Date"})

hydroGFD = hydroGFD.sort_values(["Date"])

# Finally saving this data as a CSV file just in case
hydroGFD.to_csv(r"hydroGFD.csv", index=False)

In [31]:
# Make hydroGFD dates to datetime objects from strings
hydroGFD["Date"] = pd.to_datetime(hydroGFD["Date"], format="%Y-%m-%d")


## Combining the data

In [40]:
# First I will pull out all the dates in CNIP and HydroGFD
cnipDates = cnip["Date"].unique()
hydroGFDDates = hydroGFD["Date"].unique()

# Find the dates that are in both CNIP and HydroGFD
commonDates = np.intersect1d(cnipDates, hydroGFDDates)

# Remove the rows in the CNIP dataset that are not in the HydroGFD dataset and vice versa
cnip = cnip[cnip.Date.isin(hydroGFD.Date)].reset_index()
hydroGFD = hydroGFD[hydroGFD.Date.isin(cnip.Date)].reset_index()

In [41]:
# Create copies of the CNIP and HydroGFD dataframes so I don't have to rerun the code above
cnipCopy = cnip.copy()
hydroGFDCopy = hydroGFD.copy()

In [42]:
# Combining the copies of the CNIP and HydroGFD dataframes. 
# This will allow me to compare the data between the two datasets
cnipCopy = cnipCopy.drop(["index"], axis=1)
hydroGFDCopy = hydroGFDCopy.drop(["index"], axis=1)

# Merge the CNIP and HydroGFD dataframes
merged = pd.merge(cnipCopy, hydroGFDCopy, on=["Station", "Lat", "Long", "Date"], how="outer")

In [43]:
merged = merged.sort_values(["Date"]).reset_index()
merged = merged.drop(["index", "level_0"], axis=1)

In [44]:
# Drop all rows that have Nan values in the O18Avg
merged = merged[merged["O18Avg"].isnull() == False].reset_index()
merged

Unnamed: 0,index,Station,Lat,Long,Alt,Date,O18Avg,H2avg,dex,Precipitation (kg/m^2/s),Temperature (K)
0,2179,OTT,45.32,-75.67,114.0,1970-02-15,-17.300000,,,0.000146,266.265289
1,2212,OTT,45.32,-75.67,114.0,1970-03-15,-16.790000,,,0.000000,274.185883
2,2230,OTT,45.32,-75.67,114.0,1970-04-15,-12.570000,,,0.000000,275.309753
3,2242,OTT,45.32,-75.67,114.0,1970-05-15,-7.960000,,,0.000000,285.693298
4,2267,OTT,45.32,-75.67,114.0,1970-06-15,-6.840000,,,0.000000,285.053406
...,...,...,...,...,...,...,...,...,...,...,...
2569,12023,OTT,45.32,-75.67,114.0,2007-08-02,-6.524852,-43.06,9.138815,0.000000,290.785278
2570,12040,OTT,45.32,-75.67,114.0,2007-09-02,-10.019821,-68.22,11.938571,0.000042,295.114288
2571,12058,OTT,45.32,-75.67,114.0,2007-10-02,-6.432879,-37.64,13.823032,0.000000,288.457428
2572,12076,OTT,45.32,-75.67,114.0,2007-11-02,-10.933000,-74.53,12.934000,0.000036,278.316406


In [None]:
# Export the merged dataframe to a CSV file
merged.to_csv(r"merged.csv", index=False)