In [1]:
# # Run this and then restart the kernel at the start of each session to install
# # 'teotil3' in development mode
# !pip install -e /home/jovyan/projects/teotil3/

In [2]:
import glob
import os

import geopandas as gpd
import nivapy3 as nivapy
import pandas as pd
import teotil3 as teo
from sqlalchemy import text

In [3]:
eng = nivapy.da.connect_postgis(admin=True)
# eng = nivapy.da.connect_postgis()

Username:  ········
Password:  ········


Connection successful.


# TEOTIL3: Annual data processing

Each year, raw datasets for the TEOTIL3 model are gathered from a variety of sources:

 * **Monitored discharges for "large" wastewater treatment plants (>50 p.e.) from SSB**. This dataset is split into two parts: the raw data that SSB receives from Miljødirektoratet (which includes all variables, and is often call the "miljøgifter dataset"), and a statistically interpolated dataset for total N and P. The statistically interpolated dataset includes the data for TOTN and TOTP from the miljøgifter dataset (i.e. there is some duplication) but, for TOTN and TOTP, SSB fills data gaps by estimating discharges for non-reporting sites based on similar sites that have reported
 
 * **Aggregated, kommune-level discharges from smaller wastewater treatment plants (≤50 p.e.) from SSB**. This is often call the "spredt dataset" and includes e.g. discharges from septic tanks and other sources not connected to the main wastewater treatment network
 
 * **Industrial discharges from Miljødirektoratet** based on information in their discharge licensing database
 
 * **Aquaculture discharges based on productivity figures supplied by Fiskeridirektoratet**. The workflow for this component has been substantially revised and improved in the new model - see [Task 2.9](https://github.com/NIVANorge/teotil3/tree/main/notebooks/development#task-29-improve-workflow-for-aquaculture) for details
 
 * **Agricultural inputs from NIBIO**. Note that the agricultural modelling workflow is being revised and updated by NIBIO during 2022/3, so this component will change in the near future
 
`teo.preprocessing` includes functions for reading each of the raw datasets, estimating derived parameters and adding everything to the `teotil3` database on the JupyterHub. Many of these functions have been migrated and refactored from the [RID GitHub repository](https://github.com/JamesSample/rid) created for Elveovervåkingsprogrammet.

This notebook processes and uploads data for a user-specified range of years.

In [4]:
st_yr, end_yr = 2013, 2022
years = range(st_yr, end_yr + 1)

In [5]:
# Delete ALL industry, wastewater and aquaculture data already in the database for these years
sql = text(
    """DELETE FROM teotil3.point_source_values
       WHERE year >= :st_yr
       AND year <= :end_yr
"""
)
eng.execute(sql, st_yr=st_yr, end_yr=end_yr)

sql = text(
    """DELETE FROM teotil3.point_source_locations
       WHERE year >= :st_yr
       AND year <= :end_yr
"""
)
eng.execute(sql, st_yr=st_yr, end_yr=end_yr)

sql = text(
    """DELETE FROM teotil3.spredt_inputs
       WHERE year >= :st_yr
       AND year <= :end_yr
"""
)
eng.execute(sql, st_yr=st_yr, end_yr=end_yr)

  eng.execute(sql, st_yr=st_yr, end_yr=end_yr)


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f96ea072740>

## 1. Large wastewater treatment plants and industry

These three datasets (two for wastewater and one for industry) are treated similarly.

 * The **store anlegg** dataset is in wide format. Copy and rename the file to `avlop_stor_anlegg_{year}_raw.xlsx`, then change the worksheet name to `store_anlegg_{year}`. The header must also be tidied (see example datasets from previous years) and blank rows at the end of the worksheet can be deleted. It is a good idea to check the co-ordinate columns (for both site and outlet) for obvious errors, such as strange UTM zones. SSB also provides a file named `RID_Totalpopulasjon_{year}.csv`, which includes the treatment type for each facility together with statistical estimates for discharges of `BOF5` and `KOF`. This dataset should be saved as `avlop_stor_anlegg_{year}_treatment_types_bof_kof.xlsx` and the worksheet named `data`

 * The **miljøgifter** dataset is in wide format. Copy and rename the file to `avlop_miljogifter_{year}_raw.xlsx`, then change the worksheet name to `miljogifter_{year}`. In these files, site co-ordinates are stored in the columns named `SONEBELTE`, `UTMOST` and `UTMNORD`, while outlet co-ordinates are in columns named `RESIP2`, `RESIP3` and `RESIP4`. It is worth checking these for obvious errors (like inappropriate UTM zones).

 * The **industry** dataset is in long format and usually contains data for multiple years. Copy and rename the file to `industry_{year}_raw.xlsx`, then rename the worksheet to `industry_{year}`. Delete rows above the header and check the header is the same as in previous years. Remember to filter the data to only include the year of interest (i.e. delete rows for other years)
 
The tidied files should be added to a single folder, structured as follows:

    data_fold/
    ├─ avlop_stor_anlegg_{year}_raw.xlsx
    │  ├─ store_anlegg_{year} [worksheet]
    │
    ├─ avlop_stor_anlegg_{year}_treatment_types_bof_kof.xlsx
    │  ├─ data [worksheet]
    │
    ├─ avlop_miljogifter_{year}_raw.xlsx
    │  ├─ miljogifter_{year} [worksheet]
    │
    ├─ industry_{year}_raw.xlsx
    │  ├─ industry_{year} [worksheet]

Files for each year are hosted on `shared`

    shared/teotil3/point_data/

In [6]:
for year in years:
    print("#", year, "#################")
    data_fold = f"/home/jovyan/shared/common/teotil3/point_data/{year}"
    loc_gdf, df = teo.preprocessing.read_large_wastewater_and_industry_data(
        data_fold, year, eng
    )

    # Add new sites to the database
    loc_gdf.to_postgis(
        "point_source_locations",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

    # Add values to database
    df.to_sql(
        "point_source_values",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

# 2013 #################
1 locations do not have outlet co-ordinates in this year's data.
      site_id                          name
548  0940AL11  Brokke vatn og avlaupsanlegg
# 2014 #################
11 locations do not have outlet co-ordinates in this year's data.
      site_id                           name
306  0621AL37               Nedre Eggedal RA
549  0935AL57                    Skisland RA
557  0940AL11   Brokke vatn og avlaupsanlegg
719  1133AL29                        Følsvik
720  1133AL30                         Mosnes
721  1133AL31              Ølesund på Randøy
723  1133AL33                Årdal - Hortane
953  1224AL28      Gjermundshavn, Hamnavågen
955  1224AL30      Industriområdet på Husnes
956  1224AL31  Bogsnes (tidligare Søral sin)
958  1224AL33            Matre slamavskiljar
# 2015 #################
14 locations do not have outlet co-ordinates in this year's data.
       site_id                                    name
49    0226AL71             MIRA renseanlegg (

## 2. Small wastewater treatment plants

First, **download the latest fylke and kommune boundaries** from [Geonorge](https://www.geonorge.no/). These should be processed and linked to regines by following the workflow described in notebooks `T2-1a` to `T2-1c`. TEOTIL will raise an error if administrative boundaries for the year of interest cannot be found in PostGIS in the table `teotil3.regines`.

Next, copy and rename the data file to `avlop_sma_anlegg_{year}_raw.xlsx`, and rename the worksheet `sma_anlegg_{year}`. Delete rows above the header and delete unnecessary columns: the only columns required are `KOMMUNENR` and the `NITROGEN`, `FOSFOR` and `BOF` discharges from the 14 different types of plant (i.e. delete the `KOMMUNENAVN` column, the population columns and the total columns for N and P). Also delete the rows with totals and unit, to give a single-row header.

In [7]:
for year in years:
    print("#", year, "#################")
    xl_path = f"/home/jovyan/shared/common/teotil3/point_data/{year}/avlop_sma_anlegg_{year}_raw.xlsx"
    sheet_name = f"sma_anlegg_{year}"
    df = teo.preprocessing.read_raw_small_wastewater_data(
        xl_path, sheet_name, year, eng
    )

    # Add to database
    df.to_sql(
        "spredt_inputs",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

# 2013 #################
# 2014 #################
# 2015 #################
# 2016 #################
# 2017 #################
# 2018 #################
# 2019 #################
# 2020 #################
# 2021 #################
# 2022 #################


## 3. Aquaculture

The aquaculture dataset from Fiskeridirektoratet is usually encrypted and must be stored securely. Copy and rename the file to `fiske_oppdret_{year}_raw.xlsx`, and change the worksheet name to `fiskeoppdrett_{year}`. Check that data have only been provided for one year and that the column names match submissions from previous years.

As part of reporting to OSPAR, we usually also estimate data for **copper usage in aquaculture** and add it to the database. Miljødirektoratet will supply an annual total for the amount of copper used in aquaculture (in tonnes), which should be added to the file [here](https://github.com/NIVANorge/teotil3/blob/main/data/aquaculture_annual_copper_usage.csv) and pushed to GitHub.

In [8]:
for year in years:
    print("#", year, "#################")
    xl_path = f"/home/jovyan/shared/common/teotil3/point_data/{year}/fiske_oppdret_{year}_raw.xlsx"
    cu_tonnes = teo.preprocessing.get_annual_copper_usage_aquaculture(year)
    loc_gdf, df = teo.preprocessing.read_raw_aquaculture_data(
        xl_path,
        f"fiskeoppdrett_{year}",
        year,
    )
    df = teo.preprocessing.estimate_aquaculture_nutrient_inputs(
        df, year, eng, cu_tonnes=cu_tonnes, species_ids=[71401, 71101]
    )

    # Add new sites to the database
    loc_gdf.to_postgis(
        "point_source_locations",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

    # Add values to database
    df.to_sql(
        "point_source_values",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

# 2013 #################
The total annual copper lost to water from aquaculture is 923.1 tonnes.
# 2014 #################
The total annual copper lost to water from aquaculture is 960.5 tonnes.
# 2015 #################
The total annual copper lost to water from aquaculture is 980.9 tonnes.
# 2016 #################
The total annual copper lost to water from aquaculture is 1134.8 tonnes.
# 2017 #################
The total annual copper lost to water from aquaculture is 1217.2 tonnes.
# 2018 #################
The total annual copper lost to water from aquaculture is 1382.1 tonnes.
# 2019 #################
The total annual copper lost to water from aquaculture is 1443.3 tonnes.
# 2020 #################
The total annual copper lost to water from aquaculture is 1308.1 tonnes.
# 2021 #################
The total annual copper lost to water from aquaculture is 932.4 tonnes.
# 2022 #################
The total annual copper lost to water from aquaculture is 374.0 tonnes.


## 4. Agriculture

**To do**. The agricultural models are being updated by NIBIO. Code will eventually be required here to add the new agricultural data to the TEOTIL database.

## 5. HBV modelled discharge from NVE

In [9]:
# Period of data to upload (i.e. final year in dataset; the data_delivery_year
# is final_year + 1)
final_year = 2022

In [10]:
# # Folder containing modelled data
# data_fold = r"/home/jovyan/shared/common/teotil3/nve_hbv_data"

# df_list = []
# search_path = os.path.join(data_fold, f"RID_{final_year}", "hbv_*.var")
# flist = glob.glob(search_path)

# # Get number of days between 1990 and year of interest
# days = len(pd.date_range(start="1990-01-01", end="%s-12-31" % final_year, freq="D"))

# for fpath in flist:
#     name = os.path.split(fpath)[1]
#     vassom = name.split("_")[1][-7:-4]

#     df = pd.read_csv(
#         fpath, delim_whitespace=True, header=None, names=["date", "flow_m3/s"]
#     )
#     df["date"] = pd.to_datetime(df["date"], format="%Y%m%d/1200")
#     df["vassom"] = vassom
#     df["data_supply_year"] = year + 1
#     df = df[["data_supply_year", "vassom", "date", "flow_m3/s"]]

#     # Check st, end and length
#     assert df["date"].iloc[0] == pd.Timestamp(
#         "1990-01-01"
#     ), "Series does not start on 01/01/1990."
#     assert df["date"].iloc[-1] == pd.Timestamp("%s-12-31" % year), (
#         "Series does not end on 31/12/%s." % year
#     )
#     assert len(df) == days, "Unexpected length for new series."

#     df_list.append(df)

# df = pd.concat(df_list, axis="rows")
# assert df.duplicated(["data_supply_year", "vassom", "date"], keep=False).sum() == 0

# print(f"{len(df)/1e6:.1f} million rows to insert.")

In [11]:
# %%time

# # The databasse can't cope with writing millions of rows directly from pandas
# # Instead, manually split the dataframe into chunks and write one
# # at a time
# chunk_size = 100000

# table_name = "nve_hbv_discharge"

# # Write chunks in append mode
# chunks = [df[i : i + chunk_size] for i in range(0, df.shape[0], chunk_size)]
# for chunk in tqdm(chunks):
#     chunk.to_sql(
#         table_name,
#         eng,
#         schema="teotil3",
#         index=False,
#         if_exists="append",
#         method="multi",
#     )

## 6. Checking

### 6.1. Check point source locations

The code below plots the point source locations for a specific year. The map provides a useful way to identify obvious co-ordinate errors.

**It is a good idea to check one year at a time, because errors often overlap**.

In [12]:
# Year to plot
year = 2022
teo.vis.point_sources_map(year, eng, loc_type="outlet")