In [1]:
# # Run this and then restart the kernel at the start of each session to install
# # 'teotil3' in development mode
# !pip install -e /home/jovyan/projects/teotil3/

In [2]:
import nivapy3 as nivapy
import teotil3 as teo
from sqlalchemy import text

In [3]:
eng = nivapy.da.connect_postgis(admin=True)
# eng = nivapy.da.connect_postgis()

Username:  ········
Password:  ········


Connection successful.


# TEOTIL3: Annual data processing

Each year, raw datasets for the TEOTIL model are gathered from a variety of sources:

 * **Monitored discharges for "large" wastewater treatment plants (>50 p.e.) from SSB**. This dataset is split into two parts: the raw data that SSB receives from Miljødirektoratet (which includes all variables, and is often call the "miljøgifter dataset"), and a statistically interpolated dataset for total N and P. The statistically interpolated dataset includes the data for TOTN and TOTP from the miljøgifter dataset (i.e. there is some duplication) but, for TOTN and TOTP, SSB fills data gaps by estimating discharges for non-reporting sites based on similar sites that have reported
 
 * **Aggregated, kommune-level discharges from smaller wastewater treatment plants (≤50 p.e.) from SSB**. This is often call the "spredt dataset" and includes e.g. discharges from septic tanks and other sources not connected to the main wastewater treatment network
 
 * **Industrial discharges from Miljødirektoratet** based on information in their discharge licensing database
 
 * **Aquaculture discharges based on productivity figures supplied by Fiskeridirektoratet**. The workflow for this component has been substantially revised and improved in the new model - see [Task 2.9](https://github.com/NIVANorge/teotil3/tree/main/notebooks/development#task-29-improve-workflow-for-aquaculture) for details
 
 * **Agricultural inputs from NIBIO**. Note that the agricultural modelling workflow is being revised and updated by NIBIO during 2022/3, so this component will change in the near future
 
`teo.preprocessing` includes functions for reading each of the raw datasets, estimating derived parameters and adding everything to the `teotil3` database on the JupyterHub. Many of these functions have been migrated and refactored from the [RID GitHub repository](https://github.com/JamesSample/rid) created for Elveovervåkingsprogrammet.

This notebook processes and uploads data for a user-specified range of years.

In [4]:
st_yr, end_yr = 2016, 2021
years = range(st_yr, end_yr + 1)

In [5]:
# Delete ALL industry, wastewater and aquaculture data values already
# in the database for these years
sql = text(
    """DELETE FROM teotil3.point_source_values
       WHERE year >= :st_yr
       AND year <= :end_yr
"""
)
eng.execute(sql, st_yr=st_yr, end_yr=end_yr)

# Delete all spredt values in the database for these years
sql = text(
    """DELETE FROM teotil3.spredt_inputs
       WHERE year >= :st_yr
       AND year <= :end_yr
"""
)
eng.execute(sql, st_yr=st_yr, end_yr=end_yr)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f9648729ae0>

## 1. Large wastewater treatment plants and industry

These three datasets (two for wastewater and one for industry) are treated similarly, and there is some duplication between the files.

 * The **store anlegg** dataset is in wide format. Copy and rename the file to `avlop_stor_anlegg_{year}_raw.xlsx`, then change the worksheet name to `store_anlegg_{year}`. The header must also be tidied (see example datasets from previous years) and blank rows at the end of the worksheet can be deleted. SSB also provide a file named `RID_Totalpopulasjon_{year}.csv`, which includes the treatment type for each facility. This dataset should be saved as `avlop_stor_anlegg_{year}_treatment_types.xlsx` and the worksheet named `data`

 * The **miljøgifter** dataset is in wide format. Copy and rename the file to `avlop_miljogifter_{year}_raw.xlsx`, then change the worksheet name to `miljogifter_{year}`

 * The **industry** dataset is in long format and usually contains data for multiple years. Copy and rename the file to `industry_{year}_raw.xlsx`, then rename the worksheet to `industry_{year}`. Delete rows above the header and check the header is the same as in previous years. Remember to filter the data to only include the year of interest (i.e. delete rows for other years), otherwise the TEOTIL will raise and error
 
The tidied files should be added to a single folder, structured as follows:

    data_fold/
    ├─ avlop_stor_anlegg_{year}_raw.xlsx
    │  ├─ store_anlegg_{year} [worksheet]
    │
    ├─ avlop_stor_anlegg_{year}_treatment_types.xlsx
    │  ├─ data [worksheet]
    │
    ├─ avlop_miljogifter_{year}_raw.xlsx
    │  ├─ miljogifter_{year} [worksheet]
    │
    ├─ industry_{year}_raw.xlsx
    │  ├─ industry_{year} [worksheet]

Files for each year are hosted on `shared`

    shared/teotil3/point_data/

In [6]:
for year in years:
    print("#", year, "#################")
    data_fold = f"/home/jovyan/shared/teotil3/point_data/{year}"
    nidb_gdf, df = teo.preprocessing.read_large_wastewater_and_industry_data(
        data_fold, year, eng
    )

    # Add new sites to the database
    if nidb_gdf is not None:
        nidb_gdf.to_postgis(
            "point_source_locations",
            con=eng,
            schema="teotil3",
            if_exists="append",
            index=False,
        )
    # Add values to database
    df.to_sql(
        "point_source_values",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

# 2016 #################
28 locations do not have co-ordinates in this year's data.
3052 locations are not in the database.
28 locations are not in the database and do not have co-ordinates (and therefore must be ignored)
# 2017 #################
           site_id   variable  value
1517  5001.0121.01  SS_tonnes    5.7
1518  5001.0121.01  SS_tonnes    5.7
44 locations do not have co-ordinates in this year's data.
120 locations are not in the database.
40 locations are not in the database and do not have co-ordinates (and therefore must be ignored)
# 2018 #################
19 locations do not have co-ordinates in this year's data.
55 locations are not in the database.
14 locations are not in the database and do not have co-ordinates (and therefore must be ignored)
# 2019 #################
13 locations do not have co-ordinates in this year's data.
271 locations are not in the database.
7 locations are not in the database and do not have co-ordinates (and therefore must be ignored)
# 2020

## 2. Small wastewater treatment plants

First, **download the latest fylke and kommune boundaries** from [Geonorge](https://www.geonorge.no/). These should be processed and linked to regines by following the workflow described in notebooks `T2-1a` to `T2-1c`. TEOTIL will raise an error if administrative boundaries for the year of interest cannot be found in PostGIS in the table `teotil3.regine_2022`.

Next, copy and rename the data file to `avlop_sma_anlegg_{year}_raw.xlsx`, and rename the worksheet `sma_anlegg_{year}`. Delete rows above the header and delete unnecessary columns: the only columns required are `KOMMUNENR` and the `NITROGEN` and `FOSFOR` discharges from the 14 different types of plant (i.e. delete the `KOMMUNENAVN` column, the population columns and the total columns for N and P). Also delete the rows with totals and unit, to give a single-row header.

In [7]:
for year in years:
    print("#", year, "#################")
    xl_path = f"/home/jovyan/shared/teotil3/point_data/{year}/avlop_sma_anlegg_{year}_raw.xlsx"
    sheet_name = f"sma_anlegg_{year}"
    df = teo.preprocessing.read_raw_small_wastewater_data(
        xl_path, sheet_name, year, eng
    )

    # Add to database
    df.to_sql(
        "spredt_inputs",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

# 2016 #################
# 2017 #################
# 2018 #################
# 2019 #################
# 2020 #################
# 2021 #################


## 3. Aquaculture

The aquaculture dataset from Fiskeridirektoratet is usually encrypted and must be stored securely. Copy and rename the file to `fiske_oppdret_{year}_raw.xlsx`, and change the worksheet name to `fiskeoppdrett_{year}`. Check that data have only been provided for one year and that the column names match submissions from previous years.

As part of reporting to OSPAR, we usually also estimate data for **copper usage in aquaculture** and add it to the database. Miljødirektoratet will supply an annual total for the amount of copper used in aquaculture (in tonnes), which should be added to the file [here](https://github.com/NIVANorge/teotil3/blob/main/data/aquaculture_annual_copper_usage.csv) and pushed to GitHub.

In [8]:
for year in years:
    print("#", year, "#################")
    xl_path = (
        f"/home/jovyan/shared/teotil3/point_data/{year}/fiske_oppdret_{year}_raw.xlsx"
    )
    sheet_name = f"fiskeoppdrett_{year}"

    cu_tonnes = teo.preprocessing.get_annual_copper_usage_aquaculture(year)
    nidb_gdf, df = teo.preprocessing.read_raw_aquaculture_data(
        xl_path, sheet_name, year, eng
    )
    df = teo.preprocessing.estimate_aquaculture_nutrient_inputs(
        df, year, eng, cu_tonnes=cu_tonnes, species_ids=[71401, 71101]
    )

    # Add new sites to the database
    if nidb_gdf is not None:
        nidb_gdf.to_postgis(
            "point_source_locations",
            con=eng,
            schema="teotil3",
            if_exists="append",
            index=False,
        )
    # Add values to database
    df.to_sql(
        "point_source_values",
        con=eng,
        schema="teotil3",
        if_exists="append",
        index=False,
    )

# 2016 #################
0 locations do not have co-ordinates in this year's data.
824 locations are not in the database.
The total annual copper lost to water from aquaculture is 1134.8 tonnes.
# 2017 #################
0 locations do not have co-ordinates in this year's data.
88 locations are not in the database.
The total annual copper lost to water from aquaculture is 1217.2 tonnes.
# 2018 #################
0 locations do not have co-ordinates in this year's data.
47 locations are not in the database.
The total annual copper lost to water from aquaculture is 1382.1 tonnes.
# 2019 #################
0 locations do not have co-ordinates in this year's data.
45 locations are not in the database.
The total annual copper lost to water from aquaculture is 1443.3 tonnes.
# 2020 #################
0 locations do not have co-ordinates in this year's data.
30 locations are not in the database.
The total annual copper lost to water from aquaculture is 1308.1 tonnes.
# 2021 #################
0 lo

## 4. Agriculture

**To do**. The agricultural models are being updated by NIBIO. Code will eventually be required here to add the new agricultural data to the TEOTIL database.