In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import nivapy3 as nivapy
import seaborn as sn
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
from datetime import datetime

# Warning suppression is left disabled so that library warnings stay visible
# warnings.filterwarnings("ignore")
# Use matplotlib's "ggplot" style sheet for any figures drawn in this notebook
plt.style.use("ggplot")

In [2]:
# Connect to NIVABASE
# 'eng' is the database handle used by every SQL statement later in this notebook.
# NOTE(review): the recorded output of this cell shows username/password prompts,
# so the call appears to be interactive - confirm before running unattended
eng = nivapy.da.connect()

Username:  ···
Password:  ········


Connection successful.


# ICPW Thematic Report 2020 - Nitrogen (Part 5: Update catchment properties)

During 2020, Focal Centres were asked to supply up-to-date land cover information for their ICPW stations. The request was co-ordinated by Ingvild and responses are collated here:

    K:\Prosjekter\langtransporterte forurensninger\O-23300 - ICP-WATERS - HWI\Faglige rapporter\2020 report\Land cover
    
(See also the e-mail from Ingvild received 15.04.2020 at 09.25 for more details). I've compiled the responses into a single Excel file here:

    ./land_cover/replies_combined_04-05-2020.xlsx
    
and also updated the land cover classes in `RESA.STATION_PARAMETER_DEFINITIONS` to match the categories in the recent land cover request.
    
This notebook first removes any existing ICPW land cover and catchment area data from the database and then adds the new data.

## 1. Get stations

In [3]:
# Load the master list of ICPW stations from the project spreadsheet
stn_path = r"../data/all_icpw_sites_may_2019.xlsx"
stn_df = pd.read_excel(stn_path, sheet_name="all_icpw_stns")

# Split out the 'trends' and 'core' subsets as a sanity check on the station
# groupings. Sites tagged 'Trends+Core' belong to both subsets
trend_df = stn_df[stn_df["group"].isin(["Trends", "Trends+Core"])]
core_df = stn_df[stn_df["group"].isin(["Core", "Trends+Core"])]

n_stns = len(stn_df)
print(f"There are {n_stns} unique stations within the ICPW project as a whole.")
stn_df.head()

There are 556 unique stations within the ICPW project as a whole.


Unnamed: 0,station_id,station_code,station_name,latitude,longitude,altitude,continent,country,region,group
0,38115,Tr18_CA_DO1,Blue Chalk Lake,45.1999,-78.9432,344.0,North America,Canada,Ont,Trends
1,38116,Tr18_CA_DO2,Chub Lake,45.2138,-78.9836,343.0,North America,Canada,Ont,Trends
2,38117,Tr18_CA_DO3,Crosson Lake,45.084,-79.036,371.0,North America,Canada,Ont,Trends
3,38118,Tr18_CA_DO4,Dickie Lake,45.151,-79.0876,379.0,North America,Canada,Ont,Trends
4,38119,Tr18_CA_DO5,Harp Lake,45.3798,-79.1335,327.0,North America,Canada,Ont,Trends


## 2. Remove previous land cover data

The land cover classes used in the 2020 data request are listed in `./land_cover/resa_2020_land_cover_classes.xlsx`.

In [4]:
# Read classes
# Load the table of land cover classes used in the 2020 data request. The
# 'var_id', 'var_name' and 'unit' columns are used later in this notebook to
# select rows for deletion and to map column headings to parameter IDs
class_path = r"./land_cover/resa_2020_land_cover_classes.xlsx"
class_df = pd.read_excel(class_path)
# Display the class table as the cell output
class_df

Unnamed: 0,var_id,var_name,unit,type,description,category
0,324,Bare rock,%,n,“Bare rock” in Corine (332),2
1,325,Coniferous,%,n,“Coniferous forest” in Corine (312),2
2,326,Cultivated,%,n,"As defined in Corine, main level 2",2
3,327,Deciduous,%,n,“Broad-leaved forest” in Corine (311),2
4,328,Glacier,%,n,“Glaciers and perpetual snow” in Corine (335),2
5,329,Grasslands,%,n,“Natural grasslands” in Corine (321),2
6,330,Heathlands,%,n,“Moors and heathlands” in Corine (322),2
7,331,Lake,%,n,Lake area as a percenatge of the catchment,2
8,332,Mixed forest,%,n,“Mixed forest” in Corine (313),2
9,333,Other,%,n,Land class not fitting within other categories,2


In [5]:
# Delete land cover data for ICPW stations
# Build comma-separated ID lists for the IN (...) clauses. "%d" formatting forces
# every ID to be rendered as an integer, so only digits and commas reach the SQL.
# 'bind_stns' is reused by the catchment area delete in the next cell
bind_stns = ",".join("%d" % i for i in stn_df["station_id"])
bind_vars = ",".join("%d" % i for i in class_df["var_id"])

sql = (
    f"DELETE FROM resa2.stations_par_values "
    f"WHERE station_id IN ({bind_stns}) "
    f"AND var_id IN ({bind_vars})"
)

# Run the delete inside a single transaction (committed on success, rolled back
# on error) and report the number of rows the DELETE itself removed. A separate
# SELECT count(*) beforehand can disagree with what is actually deleted
with eng.begin() as conn:
    n_recs = conn.execute(sql).rowcount

print(f"{n_recs} records deleted.")

0 records deleted.


In [6]:
# Delete catch area data for ICPW stations
# Relies on 'bind_stns' (comma-separated ICPW station IDs) built in the previous
# cell. var_id 16 is the ID this notebook uses for "Catchment area (km2)"
sql = (
    "DELETE FROM resa2.stations_par_values "
    f"WHERE station_id IN ({bind_stns}) "
    "AND var_id = 16"
)

# Run the delete inside a single transaction and report the number of rows the
# DELETE itself removed, rather than the result of a separate count query
with eng.begin() as conn:
    n_recs = conn.execute(sql).rowcount

print(f"{n_recs} records deleted.")

0 records deleted.


## 3. Check catchment data for duplicates

In [7]:
# Read the combined land cover replies
lc_path = r"./land_cover/replies_combined_04-05-2020.xlsx"
lc_df = pd.read_excel(lc_path, sheet_name="land_cover")

# Get duplicates (keep=False flags every row that shares a station_code)
dup_df = lc_df[lc_df.duplicated(subset="station_code", keep=False)]

# Keep the first row per station_code and drop rows with no station ID yet.
# Built as one chain so the result is a new frame and the integer cast does not
# raise SettingWithCopyWarning.
# DEAL WITH POLISH STATIONS AND REMOVE THIS!
lc_df = (
    lc_df.drop_duplicates(subset="station_code")
    .query("station_id != '??'")
    .assign(station_id=lambda d: d["station_id"].astype(int))
)

# Save dups (transposed, so each duplicated record becomes a column)
dup_df = dup_df.sort_values("station_id").T
dup_csv = r"./land_cover/poland_duplicates.csv"
dup_df.to_csv(dup_csv)

dup_df

station_id
station_code
station_name
Catchment area (km2)
Urban (%)
Cultivated (%)
Total forest (%)
Deciduous (%)
Coniferous (%)
Mixed forest (%)
Total shrub and/or herbaceous vegetation (%)


## 4. Add to database

### 4.1. Update `STATIONS` table

The `STATIONS` table stores some station metadata used by the RESA application. Unfortunately, this needs updating separately to the main `STATIONS_PAR_VALUES` table (see section 4.2, below).

In [8]:
# Round to 3 d.p. and convert NaN to None, so that missing values are bound as
# NULL. The frame is cast to object first: depending on the pandas version,
# .where(..., None) on a float column may otherwise leave NaN in place
lc_df2 = lc_df.round(3)
lc_df2 = lc_df2.astype(object).where(pd.notnull(lc_df2), None)

# The statement is identical for every station, so build it once outside the loop.
# catchment_peat_area is explicitly reset to NULL (no peat value is supplied here)
sql = (
    "UPDATE resa2.stations "
    "SET catchment_area = :area, "
    "  catchment_total_forest = :tot_forest, "
    "  catchment_deciduous_forest = :decid_forest, "
    "  catchment_coniferous_forest = :conif_forest, "
    "  catchment_agric_area = :cult, "
    "  catchment_glacier_area = :glac, "
    "  catchment_peat_area = NULL, "
    "  catchment_water_area = :water "
    "WHERE station_id = :stn"
)

# All updates run in one transaction: committed together, or rolled back on error
with eng.begin() as conn:
    for idx, row in lc_df2.iterrows():
        # Bind values for this station, keyed by the placeholder names in the SQL
        var_dict = {
            "area": row["Catchment area (km2)"],
            "tot_forest": row["Total forest (%)"],
            "decid_forest": row["Deciduous (%)"],
            "conif_forest": row["Coniferous (%)"],
            "cult": row["Cultivated (%)"],
            "glac": row["Glacier (%)"],
            "water": row["Water (excl. Lake) (%)"],
            "stn": row["station_id"],
        }

        # Update table
        conn.execute(sql, **var_dict)

### 4.2. Update `STATIONS_PAR_VALUES`

The best (i.e. normalised) way to store these data is in `STATIONS_PAR_VALUES`. All of this data will be available under the **Additional station data** tab in the RESA application. I recommend using this, rather than the (incomplete) information taken from the `STATIONS` table and displayed on the **Basic station data** page.

In [9]:
# Melt to long: one row per (station_id, variable, value)
lc_df = lc_df.drop(columns=["station_code", "station_name"]).melt(
    id_vars="station_id"
)

# Map var names to var IDs. Column headings in the replies have the form
# "<var_name> (<unit>)", so build the same label from the class table
class_df["var_unit"] = class_df["var_name"] + " (" + class_df["unit"] + ")"
class_df = class_df.set_index("var_unit")
var_id_dict = class_df["var_id"].to_dict()

# Add catchment area
var_id_dict["Catchment area (km2)"] = 16

# Reclass land cover data. The result is assigned back to the column: an
# in-place replace on lc_df["variable"] acts on an intermediate Series and does
# not reliably modify the DataFrame (it has no effect under pandas Copy-on-Write)
lc_df["variable"] = lc_df["variable"].replace(var_id_dict)

# Drop missing values, then tidy for database
lc_df = (
    lc_df.dropna(how="any")
    .rename(columns={"variable": "var_id"})
    .assign(
        # Value is entered as string
        value=lambda d: d["value"].round(3).astype(str),
        var_id=lambda d: d["var_id"].astype(int),
        entered_by="JES",
        entered_date=datetime.today(),
    )
)

lc_df.head()

Unnamed: 0,station_id,var_id,value,entered_by,entered_date
0,23472,16,2.941,JES,2020-05-04 15:33:00.689690
1,23474,16,1.816,JES,2020-05-04 15:33:00.689690
2,23475,16,1.247,JES,2020-05-04 15:33:00.689690
3,23478,16,0.722,JES,2020-05-04 15:33:00.689690
4,23488,16,1.104,JES,2020-05-04 15:33:00.689690


In [10]:
# Append the long-format records to the existing resa2.stations_par_values table
# (index=False: the DataFrame index is not written as a column)
n_added = len(lc_df)
lc_df.to_sql(
    name="stations_par_values",
    con=eng,
    schema="resa2",
    if_exists="append",
    index=False,
)

print(f"{n_added} records added.")

8907 records added.


## 5. Update co-ordinates and elevation data

Some Focal Centres have reported new co-ordinates and/or site elevation information.

In [11]:
# Read the station locations reported by the Focal Centres. The path defined
# here is the one actually read, so this cell does not depend on 'lc_path'
# from an earlier cell
xl_path = r"./land_cover/replies_combined_04-05-2020.xlsx"
df = pd.read_excel(xl_path, sheet_name="stn_locs")

df = df.round(6)

# PL10 and PL11 are not yet added as ICPW stations
df = df.drop_duplicates(subset="station_id").query("station_id != '??'")

# Convert NaN to None, so that missing values are bound as NULL. The frame is
# cast to object first: depending on the pandas version, .where(..., None) on a
# float column may otherwise leave NaN in place
df = df.astype(object).where(pd.notnull(df), None)

# The statement is identical for every station, so build it once outside the loop
sql = (
    "UPDATE resa2.stations "
    "SET latitude = :lat, "
    "  longitude = :lon, "
    "  altitude = :alt "
    "WHERE station_id = :stn"
)

# All updates run in one transaction: committed together, or rolled back on error
with eng.begin() as conn:
    for idx, row in df.iterrows():
        # Bind values for this station, keyed by the placeholder names in the SQL
        var_dict = {
            "lat": row["latitude"],
            "lon": row["longitude"],
            "alt": row["altitude"],
            "stn": row["station_id"],
        }

        # Update table
        conn.execute(sql, **var_dict)

# NOTE: this is the number of rows processed from the spreadsheet, not a count
# of rows matched in the database
print(len(df), "records updated.")

554 records updated.
