In [None]:
%matplotlib inline
import pandas as pd
import nivapy3 as nivapy
import matplotlib.pyplot as plt
import xarray as xr
import numpy as np
import seaborn as sn
from calendar import monthrange

# Apply the "ggplot" matplotlib style sheet to the figures drawn below
plt.style.use("ggplot")

In [None]:
# Connect to RESA. 'eng' is passed to the station lookup and to
# 'pd.read_sql' in later cells
eng = nivapy.da.connect()

# 1000 Lakes 2019 (Part 2: Process ERA5 data)

This notebook extracts ERA5 data for temperature, precipitation and runoff for the 1000 Lakes stations. The ERA5 dataset was downloaded from Copernicus [here](https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview).

## 1. Get station co-ordinates 

The relevant project ID in RESA is 4530.

In [None]:
# Stations for the 1000 Lakes project (RESA project ID 4530)
stn_df = nivapy.da.select_resa_project_stations([4530], eng)

# Stations without co-ordinates cannot be matched to an ERA5 grid cell.
# Reassign instead of mutating in place so the step reads as a plain transform
stn_df = stn_df.dropna(subset=["latitude", "longitude"])
print(len(stn_df), "stations in project.")

# Save for use on Hub
csv_path = r'../output/resa_1000_lakes_stn_list.csv'
stn_df.to_csv(csv_path, index=False)

stn_df.head()

In [None]:
# Map of station locations (popups keyed on station code, clustering enabled)
nivapy.spatial.quickmap(stn_df, popup="station_code", cluster=True)

## 2. ERA5

In [None]:
# Read nc
nc_path = r"../../../era5_tmp_pptn_runoff_1979-2019_monthly.nc"

# Open inside a context manager so the file handle is released as soon as the
# selected data have been loaded into memory
with xr.open_dataset(nc_path) as nc_ds:
    # 'expver' = 1 is up to end of 2019; 'expver' = 2 is for most recent data.
    # After selecting, 'expver' is dropped with 'drop_vars', which replaces
    # the deprecated 'Dataset.drop'
    ds = nc_ds.sel(expver=1).drop_vars("expver").load()

ds

### 2.1. Extract monthly data

Values for `tp` and `ro` are monthly means of daily totals, in m/day (see [here](https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Monthlymeans)). This is slightly awkward, because each value must be multiplied by the number of days in its month to obtain a monthly total. This will be easier to deal with in `pandas` *after* extracting monthly series for the station co-ordinates.

In [None]:
# Pull the monthly series from the nearest ERA5 grid cell for each station
stn_coords = stn_df[["station_id", "latitude", "longitude"]]
station_frames = []
for stn_id, lat, lon in stn_coords.itertuples(index=False, name=None):
    nearest_cell = ds.sel(latitude=lat, longitude=lon, method="nearest")
    stn_ts = nearest_cell.to_dataframe().reset_index()

    # Tag the series with its station and split the timestamp into year/month
    stn_ts = stn_ts.assign(
        station_id=stn_id,
        year=stn_ts["time"].dt.year,
        month=stn_ts["time"].dt.month,
    )
    station_frames.append(stn_ts)

# Stack all stations, keep the columns of interest and reshape to long format
df = (
    pd.concat(station_frames, sort=True)
    .reset_index(drop=True)[["station_id", "year", "month", "t2m", "tp", "ro"]]
    .melt(id_vars=["station_id", "year", "month"])
    .query("year < 2020")  # Exclude 2020 as incomplete
)

df.head()

### 2.2. Aggregate to annual

In [None]:
def days_in_month(row):
    """Return the number of days in the calendar month described by 'row'.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) with
            'year' and 'month' entries.

    Returns:
        Int. Number of days in that month, accounting for leap years.

    Raises:
        calendar.IllegalMonthError: If 'month' is outside 1 to 12.
    """
    # Cast explicitly: a row taken from an all-numeric frame arrives as floats,
    # which 'monthrange' rejects
    year = int(row["year"])
    month = int(row["month"])

    # 'monthrange' returns (weekday of first day, number of days)
    return monthrange(year, month)[1]

In [None]:
# 'df' was produced by a row filter ('query'), so take an explicit copy before
# adding columns; otherwise pandas raises SettingWithCopyWarning
df = df.copy()

# Add days in month
df["days_in_month"] = df.apply(days_in_month, axis=1)

# Weight values by days in month
df["month_accum"] = df["value"] * df["days_in_month"]

# Aggregate to annual. Only the day counts and the day-weighted values are
# needed; summing 'month' or the raw per-day 'value' would be meaningless
df = (
    df.groupby(["station_id", "year", "variable"])[["days_in_month", "month_accum"]]
    .sum()
    .reset_index()
)

# Temperature: day-weighted annual mean, converted from K to C
is_temp = df["variable"] == "t2m"
df.loc[is_temp, "month_accum"] = (
    df.loc[is_temp, "month_accum"] / df.loc[is_temp, "days_in_month"]
) - 273.15  # K to C

# Precipitation and runoff: annual totals, converted from m to mm
is_water = df["variable"].isin(["tp", "ro"])
df.loc[is_water, "month_accum"] = df.loc[is_water, "month_accum"] * 1000  # m to mm

# Tidy
df = df[["station_id", "year", "variable", "month_accum"]].rename(
    columns={"month_accum": "value"}
)

# Some stations lie just "offshore" in ERA5, where 'ro' is always 0
# Drop these rows
df = df.query("not((value == 0) and (variable == 'ro'))")

# Save
csv_path = r"../output/1000_lakes_temp_pptn_runoff.csv"
df.to_csv(csv_path, index=False)

df.head()

In [None]:
# Plot annual values against year, one facet row per variable. Axes are not
# shared between facets
facet_opts = {"sharey": False, "sharex": False}
line_opts = {"kind": "line", "legend": False, "alpha": 0.5}
sn.relplot(
    data=df, x="year", y="value", row="variable",
    height=4, aspect=3, facet_kws=facet_opts, **line_opts,
)

## 3. Add to database

### 3.1. Add metadata parameters to database

I have created new entries in the `station_parameter_definitions` table in RESA, as shown below.

In [None]:
# Get metadata IDs: parameter definitions entered by 'JES' on or after 2020-05-26
sql_clauses = [
    "SELECT * FROM resa2.station_parameter_definitions",
    "WHERE entered_by = 'JES'",
    "AND entered_date >= DATE '2020-05-26'",
]
sql = " ".join(sql_clauses)
par_df = pd.read_sql(sql, con=eng)

par_df

### 3.2. Extract data of interest

In [None]:
def to_par_values(data, var_ids):
    """Replace ERA5 variable names with their parameter IDs.

    Args:
        data: DataFrame with 'station_id', 'variable' and 'value' columns.
        var_ids: Dict mapping each ERA5 variable name ('t2m', 'tp', 'ro') to
            the integer ID stored in 'var_id'.

    Returns:
        DataFrame with columns 'station_id', 'var_id' and 'value'.

    Raises:
        ValueError: If 'data' contains a variable that is not in 'var_ids'.
    """
    out = data[["station_id", "variable", "value"]].copy()

    # 'map' gives a clean integer column; 'replace' from str to int on an
    # object column relies on silent downcasting, which pandas has deprecated
    out["var_id"] = out["variable"].map(var_ids)
    if out["var_id"].isna().any():
        unknown = sorted(set(out["variable"]) - set(var_ids))
        raise ValueError(f"No parameter ID defined for variable(s): {unknown}")
    out["var_id"] = out["var_id"].astype(int)

    return out[["station_id", "var_id", "value"]]


def period_mean(data, first_year, last_year):
    """Average annual values per station and variable over a range of years.

    Args:
        data: DataFrame with 'station_id', 'year', 'variable' and 'value' columns.
        first_year: Int. First year of the period (inclusive).
        last_year: Int. Last year of the period (inclusive).

    Returns:
        DataFrame with columns 'station_id', 'variable' and 'value', one row
        per station and variable.
    """
    in_period = data["year"].between(first_year, last_year)

    # Average only 'value'; the mean of 'year' is not needed
    return (
        data[in_period]
        .groupby(["station_id", "variable"], as_index=False)["value"]
        .mean()
    )


# 1995
df_1995 = to_par_values(
    df.query("year == 1995"), {"t2m": 341, "tp": 342, "ro": 343}
)

# 2019
df_2019 = to_par_values(
    df.query("year == 2019"), {"t2m": 344, "tp": 345, "ro": 346}
)

# Average 1981 - 2010
df_ltav = to_par_values(
    period_mean(df, 1981, 2010), {"t2m": 347, "tp": 348, "ro": 349}
)

# Average 1990 - 2019
df_ltav2 = to_par_values(
    period_mean(df, 1990, 2019), {"t2m": 354, "tp": 355, "ro": 356}
)

# Combine
stn_par_df = pd.concat([df_1995, df_2019, df_ltav, df_ltav2], axis=0)

stn_par_df.head()

### 3.3. Add to database

In [None]:
# NOTE(review): the insert below is left commented out. Presumably this is so
# that re-running the notebook does not append the same rows again (it uses
# if_exists="append") - confirm before re-enabling
#stn_par_df.to_sql(
#   "stations_par_values",
#   schema="resa2",
#   con=eng,
#   if_exists="append",
#   index=False,
#)