In [1]:
import numpy as np
import pandas as pd

# TEOTIL3: Tidy annual data

## Part 4: Industry

This notebook processes the raw industrial data provided by Miljødirektoratet (Glenn Storbråten).

## Workflow overview

 1. Glenn provides an Excel file with data on industrial discharges.
    
 2. Delete the two metadata rows from the top of the file. There is also usually a weird, half-merged column (`K`) that needs to be deleted. Save the tidied file to

    /home/jovyan/shared/common/teotil3/point_data/raw_data_delivered_{deliv_year}/industry_data_tidy.xlsx

 4. Run the code below to tidy the data and save annual input files.

In [2]:
# Final year for which emissions will be estimated
final_year = 2023

# Raw datasets to use i.e.
# /home/jovyan/shared/common/teotil3/point_data/raw_data_delivered_{deliv_year}
deliv_year = 2025

## 1. Read raw data

In [3]:
# Read data
ind_xl_path = f"/home/jovyan/shared/common/teotil3/point_data/raw_data_delivered_{deliv_year}/industry_data_tidy.xlsx"
df = pd.read_excel(ind_xl_path, sheet_name="Teotiluttrekket til NIVA")

# Only consider sites NOT connected to the municipal network
assert df["Komm. nett"].isna().sum() == 0
df = df[df["Komm. nett"] == False]

del df["CASNr"]
df.dropna(subset=["Mengde"], inplace=True)

df.rename(
    columns={
        "Anlegg Latitude": "Geografisk Latitude",
        "Anlegg Longitude": "Geografisk Longitude",
        "Utslipp Latitude": "Lat_Utslipp",
        "Utslipp Longitude": "Lon_Utslipp",
    },
    inplace=True,
)

# Database contains some exact duplicates. Sum for now.
# Aggregation drops columns with NaNs, so fill temporarily and then put NaNs back
agg_cols = [col for col in df.columns if col != "Mengde"]
df[agg_cols] = df[agg_cols].fillna("NaN")
df = df.groupby(agg_cols).sum().reset_index()
df[agg_cols] = df[agg_cols].replace("NaN", np.nan)
df = df.infer_objects(copy=False)

df.head()

  df[agg_cols] = df[agg_cols].replace("NaN", np.nan)


Unnamed: 0,Anleggsnr,Anleggsnavn,Anleggsaktivitet,Komm. nett,Status,År,Komp. Id,Komp.kode,Komponent,Enhet,Komm.nr,Kommune,Geografisk Longitude,Geografisk Latitude,Lon_Utslipp,Lat_Utslipp,Orgnr,Ansvarlig enhet,NACE,Mengde
0,0301.0234.01,Ekeberg Oljelager,Tanklagring,False,Aktiv,2010,190,OLJE,olje,tonn,301,Oslo,10.760945,59.889866,,,884191742,SISTERNE DRIFT DA,52.1,0.364
1,0301.0234.01,Ekeberg Oljelager,Tanklagring,False,Aktiv,2011,190,OLJE,olje,tonn,301,Oslo,10.760945,59.889866,,,884191742,SISTERNE DRIFT DA,52.1,0.395
2,0301.0234.01,Ekeberg Oljelager,Tanklagring,False,Aktiv,2012,190,OLJE,olje,tonn,301,Oslo,10.760945,59.889866,,,884191742,SISTERNE DRIFT DA,52.1,0.276
3,0301.0234.01,Ekeberg Oljelager,Tanklagring,False,Aktiv,2013,190,OLJE,olje,tonn,301,Oslo,10.760945,59.889866,,,884191742,SISTERNE DRIFT DA,52.1,0.733
4,0301.0234.01,Ekeberg Oljelager,Tanklagring,False,Aktiv,2014,190,OLJE,olje,tonn,301,Oslo,10.760945,59.889866,,,884191742,SISTERNE DRIFT DA,52.1,0.555


## 2. Save annual files

In [4]:
# Save
for year, ann_df in df.groupby("År"):
    if (year >= 2013) and (year <= final_year):
        xl_path = f"/home/jovyan/shared/common/teotil3/point_data/{year}/industry_{year}_raw.xlsx"
        ann_df.to_excel(xl_path, index=False)