In [1]:
import nivapy3 as nivapy
import pandas as pd

# Task 2.12: Process historic data

For the new model, I have requested historic datasets from some providers going back in time as far as possible. These datasets are sometimes provided in a format that is different to the standard annual data submissions. This notebook restructures the historic data to match the expected annual format and generates one file per year and data type.

In [2]:
# Period of interest
st_yr, end_yr = 2013, 2022

## 1. Wastewater data from SSB

During autumn 2023, Gisle Berge sent updated estimates of discharges from wastewater treatment plants, including preliminary estimates of BOF and KOF for both large and small sites (which have not been provided before).

### 1.1. Large wastewater sites

#### 1.1.1. Treatment type, BOF and KOF

In [3]:
fpath = r"/home/jovyan/shared/common/teotil3/point_data/historic_data_delivered_2023/RID_Totalpopulasjon_incl_BOF5_KOF_2013-2022.sdv"
df = pd.read_csv(fpath, sep=";", encoding="cp1252")
df.rename(
    {"anleggsnavn": "ANLEGGSNAVN", "aargang": "year"}, axis="columns", inplace=True
)
df.dropna(subset=["utslipp_BOF5", "utslipp_KOF"], inplace=True)
df.head()

Unnamed: 0,KOMMUNE_NR,ANLEGGSNR,ANLEGGSNAVN,KAPASITET1,RENSPRINS,utslipp_BOF5,utslipp_KOF,year
0,101,0101AL02,Bakke,350.0,Kjemisk-biologisk,723.0,2063.0,2013
1,101,0101AL06,Kornsjø,300.0,Kjemisk-biologisk,654.0,1866.0,2013
2,101,0101AL07,Remmendalen,29500.0,Kjemisk,204372.0,551545.0,2013
3,104,0104AL01,Kambo,25000.0,Kjemisk,111116.0,204671.0,2013
4,105,0105AL00,Alvim Renseanlegg,62100.0,Kjemisk,389015.0,689394.0,2013


In [4]:
for year in range(st_yr, end_yr + 1):
    df2 = df.query("year == @year").copy()
    xl_path = f"/home/jovyan/shared/common/teotil3/point_data/{year}/avlop_stor_anlegg_{year}_treatment_types_bof_kof.xlsx"
    df2.to_excel(xl_path, sheet_name="data", index=False)

### 1.2. TOTN and TOTP

Also join in the outlet locations provided by Miljødirektoratet - see e-mail from Torstein received 22.11.2023 at 14:42.

In [5]:
fpath = r"/home/jovyan/shared/common/teotil3/point_data/historic_data_delivered_2023/TIDSSERIE TEOTIL store anlegg 2008-2022.sdv"
df = pd.read_csv(fpath, sep=";", encoding="cp1252", decimal=",")

outlet_xls = r"/home/jovyan/shared/common/teotil3/point_data/historic_data_delivered_2023/large_wastewater_outlet_locations.xlsx"
out_df = pd.read_excel(outlet_xls, sheet_name="Treffliste")[
    ["Kilderefnr.", "Sone (utslipp)", "Øst (utslipp)", "Nord (utslipp)"]
].dropna()
out_df.columns = ["ANLEGGSNR", "Sone_Utslipp", "UTM_E_Utslipp", "UTM_N_Utslipp"]

df.rename(
    {
        "anleggsnavn": "ANLEGGSNAVN",
        "AARGANG": "year",
        "SONEBELTE": "Sone",
        "UTMOST": "UTM_E",
        "UTMNORD": "UTM_N",
        "PBERINN": "MENGDE_P_INN_kg",
        "PBERUT": "MENGDE_P_UT_kg",
        "prenseeff": "RENSEEFFEKT_P",
        "NBERINN": "MENGDE_N_INN_kg",
        "NBERUT": "MENGDE_N_UT_kg",
        "nrenseeff": "RENSEEFFEKT_N",
    },
    axis="columns",
    inplace=True,
)
df.dropna(subset=["MENGDE_P_UT_kg", "MENGDE_N_UT_kg"], inplace=True)

df = pd.merge(df, out_df, how="left", on="ANLEGGSNR")

df.head()

Unnamed: 0,ANLEGGSNR,ANLEGGSNAVN,Sone,UTM_E,UTM_N,MENGDE_P_INN_kg,MENGDE_P_UT_kg,RENSEEFFEKT_P,MENGDE_N_INN_kg,MENGDE_N_UT_kg,RENSEEFFEKT_N,year,Sone_Utslipp,UTM_E_Utslipp,UTM_N_Utslipp
0,0101AL02,Bakke,32.0,640300.0,6544800.0,180.79,5.23,0.97,919.8,689.85,0.25,2008,32.0,640376.0,6544312.0
1,0101AL03,Brække,32.0,646100.0,6559300.0,9.86,0.49,0.95,65.7,49.28,0.25,2008,32.0,646800.0,6559250.0
2,0101AL06,Kornsjø,32.0,653600.0,6535900.0,74.0,5.6,0.92,876.0,657.0,0.25,2008,32.0,653636.0,6535860.0
3,0101AL07,Remmendalen,32.0,635100.0,6555900.0,15329.75,2306.1,0.85,107428.02,87106.64,0.19,2008,32.0,634990.0,6555480.0
4,0101AL07,Remmendalen,32.0,635100.0,6555900.0,15329.75,2306.1,0.85,107428.02,87106.64,0.19,2008,32.0,597468.0,6639263.0


In [6]:
for year in range(st_yr, end_yr + 1):
    df2 = df.query("year == @year").copy()
    xl_path = f"/home/jovyan/shared/common/teotil3/point_data/{year}/avlop_stor_anlegg_{year}_raw.xlsx"
    df2.to_excel(xl_path, sheet_name=f"store_anlegg_{year}", index=False)

### 1.2. Small wastewater sites

In [7]:
# Mapping for SSB codes => TEOTIL3 codes for små anlegg
ssb_csv = "../../data/ssb_sma_anlegg_type_codes.csv"
ssb_df = pd.read_csv(ssb_csv)

fpath = r"/home/jovyan/shared/common/teotil3/point_data/historic_data_delivered_2023/tmp_teotil_smaa_anlegg_2002_2022.sdv"
df = pd.read_csv(fpath, sep=";", encoding="cp1252")
del df["KOMMUNE_NAVN"]
df.rename({"KOMMUNE_NR": "KOMMUNENR", "aargang": "year"}, axis="columns", inplace=True)
df = df.melt(id_vars=["KOMMUNENR", "year"])
df = df[df["variable"].str.startswith(("B", "N", "P"))]
df[["variable", "ssb_code"]] = df["variable"].str.split("_", n=1, expand=True)
code_list = list(ssb_df["ssb_code"].unique())
df = df.query("ssb_code in @code_list")
df.dropna(subset=["value"], inplace=True)
df = pd.merge(df, ssb_df, how="left", on="ssb_code")
df["variable"].replace({"P": "FOSFOR ", "N": "NITROGEN ", "B": "BOF "}, inplace=True)
df["variable"] = df["variable"] + df["teotil_type"]
df.drop(["ssb_code", "ssb_desc", "teotil_type"], inplace=True, axis="columns")
df = df.set_index(["KOMMUNENR", "year", "variable"]).unstack("variable").fillna(0)
df.columns = df.columns.get_level_values(1)
df.reset_index(inplace=True)
df.columns.name = ""
df["KOMMUNENR"] = df["KOMMUNENR"].astype(str).str.zfill(4)

df.head()

Unnamed: 0,KOMMUNENR,year,BOF Annen løsning,BOF Biologisk,BOF Biologisk og kjemisk,BOF Biologisk toalett,"BOF Biologisk toalett, gråvannsfilter",BOF Direkte utslipp,BOF Infiltrasjonsanlegg,BOF Kjemisk,...,"NITROGEN Biologisk toalett, gråvannsfilter",NITROGEN Direkte utslipp,NITROGEN Infiltrasjonsanlegg,NITROGEN Kjemisk,NITROGEN Konstruert våtmark,NITROGEN Sandfilteranlegg,NITROGEN Slamavskiller,NITROGEN Tett tank (for alt avløpsvann),NITROGEN Tett tank for svartvann,"NITROGEN Tett tank for svartvann, gråvannsfilter"
0,101,2002,0.0,0.0,411.72,0.0,0.0,2803.2,821.25,639.48,...,0.0,560.64,1314.0,271.779,0.0,495.159,6724.176,0.0,314.046,527.79
1,101,2003,0.0,0.0,411.72,0.0,0.0,2803.2,821.25,639.48,...,0.0,560.64,1314.0,271.779,0.0,495.159,6724.176,0.0,314.046,527.79
2,101,2004,0.0,0.0,411.72,0.0,0.0,2803.2,821.25,639.48,...,0.0,560.64,1314.0,271.779,0.0,495.159,6724.176,0.0,314.046,527.79
3,101,2005,0.0,0.0,433.62,0.0,0.0,2715.6,814.68,770.88,...,0.0,543.12,1303.488,327.624,0.0,495.159,6686.727,0.0,331.566,136.656
4,101,2006,0.0,0.0,462.09,0.0,0.0,2452.8,830.01,805.92,...,0.0,490.56,1328.016,342.516,0.0,495.159,6678.405,0.0,331.128,137.094


In [8]:
for year in range(st_yr, end_yr + 1):
    df2 = df.query("year == @year").copy()
    xl_path = f"/home/jovyan/shared/common/teotil3/point_data/{year}/avlop_sma_anlegg_{year}_raw.xlsx"
    df2.to_excel(xl_path, sheet_name=f"sma_anlegg_{year}", index=False)

## 2. Industry data from Miljødirektoratet

Also join in the outlet locations provided by Miljødirektoratet - see e-mail from Torstein received 23.11.2023 at 10.12.

In [9]:
ind_xl_path = r"/home/jovyan/shared/common/teotil3/point_data/historic_data_delivered_2023/glenn_indutsry_data_tidy.xlsx"
df = pd.read_excel(ind_xl_path, sheet_name="Teotiluttrekket til NIVA")
del df["CASNr"]
df.dropna(subset=["Mengde"], inplace=True)

outlet_xls = r"/home/jovyan/shared/common/teotil3/point_data/historic_data_delivered_2023/industry_outlet_locations.xlsx"
out_df = pd.read_excel(outlet_xls, sheet_name="Treffliste")[
    ["Anleggsnr.", "Sone (utslipp)", "Øst (utslipp)", "Nord (utslipp)"]
].dropna()
out_df.columns = ["Anleggsnr", "Sone_Utslipp", "UTM_E_Utslipp", "UTM_N_Utslipp"]
out_df["Sone_Utslipp"] = out_df["Sone_Utslipp"].astype(int)
out_df = nivapy.spatial.utm_to_wgs84_dd(
    out_df, "Sone_Utslipp", "UTM_E_Utslipp", "UTM_N_Utslipp"
)
out_df.drop(
    ["Sone_Utslipp", "UTM_E_Utslipp", "UTM_N_Utslipp"], axis="columns", inplace=True
)
out_df.rename(
    {"lat": "Lat_Utslipp", "lon": "Lon_Utslipp"}, axis="columns", inplace=True
)

# Database contains some exact duplicates. Sum for now
agg_cols = [col for col in df.columns if col != "Mengde"]
df = df.groupby(agg_cols).sum().reset_index()

df = pd.merge(df, out_df, how="left", on="Anleggsnr")

df.head()

Unnamed: 0,Anleggsnr,Anleggsnavn,Anleggsaktivitet,Komm. nett,Status,År,Komp. Id,Komp.kode,Komponent,Enhet,Komm.nr,Kommune,Geografisk Longitude,Geografisk Latitude,Orgnr,Ansvarlig enhet,NACE,Mengde,Lat_Utslipp,Lon_Utslipp
0,0301.0012.01,Nordox,Kjemisk industri,True,Aktiv,2010,74,Cd,kadmium,kg,301,Oslo,10.807122,59.911775,971744391,NORDOX AS,20.12,0.021,59.911775,10.807122
1,0301.0012.01,Nordox,Kjemisk industri,True,Aktiv,2010,101,Cu,kobber,kg,301,Oslo,10.807122,59.911775,971744391,NORDOX AS,20.12,6.39,59.911775,10.807122
2,0301.0012.01,Nordox,Kjemisk industri,True,Aktiv,2011,74,Cd,kadmium,kg,301,Oslo,10.807122,59.911775,971744391,NORDOX AS,20.12,0.034,59.911775,10.807122
3,0301.0012.01,Nordox,Kjemisk industri,True,Aktiv,2011,101,Cu,kobber,kg,301,Oslo,10.807122,59.911775,971744391,NORDOX AS,20.12,8.87,59.911775,10.807122
4,0301.0012.01,Nordox,Kjemisk industri,True,Aktiv,2012,74,Cd,kadmium,kg,301,Oslo,10.807122,59.911775,971744391,NORDOX AS,20.12,0.023,59.911775,10.807122


In [10]:
for year in range(st_yr, end_yr + 1):
    df2 = df.query("`År` == @year").copy()
    xl_path = (
        f"/home/jovyan/shared/common/teotil3/point_data/{year}/industri_{year}_raw.xlsx"
    )
    df2.to_excel(xl_path, sheet_name=f"industri_{year}", index=False)