In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from pathlib import Path
import json
import locale
from IPython.display import display
from datetime import datetime
from pandas.tseries.offsets import MonthEnd

from helpers import rename_postaja, simplify_column, lower_sumniki
from scoring_system import ScoringSystem
from functools import reduce

Step 1: Build a DataFrame for each air pollutant

- read each CSV file
- load it into a DataFrame
- convert it into long format


In [4]:
# Pollutants loaded from the "mesecne" directories, and the subset that is
# also loaded from the "presezki" directories.
NEVARNE_SNOVI = ["SO2", "PM10", "NOX", "NO2"]
NEVARNE_SNOVI_p = ["SO2", "PM10"]

# Month column labels (Slovenian abbreviations) in calendar order; they are the
# value columns melted into the "Mesec" column.
MESECI = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "Maj",
    "Jun",
    "Jul",
    "Avg",
    "Sep",
    "Okt",
    "Nov",
    "Dec",
]

# Keyword arguments for DataFrame.sort_values(**YEAR_MONTH_SORT): order rows by
# year, then by calendar month.
# The month rank comes from the label's position in MESECI rather than from
# strptime("%b"), so sorting does not depend on the process locale having been
# switched to Slovenian beforehand. An unknown label still raises ValueError.
YEAR_MONTH_SORT = {
    "by": ["Leto", "Mesec"],
    "key": lambda col: (
        col
        if col.name != "Mesec"
        else col.map(lambda mesec: MESECI.index(mesec) + 1)
    ),
}


def presezki_mapper(snov: str, columns: list[str]):
    """Build the column-rename mapping for the exceedance columns that exist.

    Args:
        snov: Pollutant name used as prefix of the new column names.
        columns: Column names that are actually present.

    Returns:
        A dict ``old name -> new name`` containing only the entries whose old
        name appears in ``columns`` (hourly entry first, then daily).
    """
    candidates = [
        ("Ura presežek", f"{snov} - Urnih presežkov"),
        ("Dan presežek", f"{snov} - Dnevnih presežkov"),
    ]
    return dict(pair for pair in candidates if pair[0] in columns)

In [None]:
def extend_dataframe(repo: dict[str, pd.DataFrame], snov: str, to_add: pd.DataFrame):
    """Store ``to_add`` under ``repo[snov]``, appending to an existing frame.

    The first frame for a key is stored as-is; later frames are concatenated
    below the stored one with a fresh 0..n-1 index. ``repo`` is modified in
    place and nothing is returned.
    """
    if snov not in repo:
        repo[snov] = to_add
        return

    repo[snov] = pd.concat([repo[snov], to_add], ignore_index=True)


def dict_to_csv(repo: dict[str, pd.DataFrame], form: str, cas: str):
    """Write every DataFrame in ``repo`` to ``podatki/df_{form}/{cas}/{snov}.csv``.

    Args:
        repo: Mapping of pollutant name to the frame to save.
        form: Inserted into the ``df_{form}`` directory name.
        cas: Name of the sub-directory below ``df_{form}``.

    The target directory (including missing parents) is created up front, so
    the function also works on a fresh checkout where ``podatki/df_{form}``
    does not exist yet. Files are written without the index and a
    ``saved: <path>`` line is printed for each one.
    """
    # The directory depends only on form/cas, so resolve and create it once.
    path_dir = Path(f"podatki/df_{form}/{cas}")
    path_dir.mkdir(parents=True, exist_ok=True)

    for snov, df in repo.items():
        filepath = path_dir / f"{snov}.csv"
        df.to_csv(filepath, index=False)
        print(f"saved: {filepath}")


def load_air_pollutants(nevarne_snovi: list[str], cas: str) -> dict[str, pd.DataFrame]:
    """Load all files of each pollutant into one DataFrame per pollutant.

    For every pollutant the files in ``podatki/{snov}/{cas}`` are read in
    sorted path order; the file stem is stored (as a string) in a "Leto"
    column. Depending on ``cas``:

    * ``"mesecne"``: the MESECI columns are melted into a "Mesec" column and a
      value column named after the pollutant.
    * ``"presezki"``: rows whose present exceedance columns are all 0 are
      dropped and those columns are renamed via ``presezki_mapper``.
    * anything else: the cleaned frame is kept in its original layout.

    Args:
        nevarne_snovi: Pollutant names (sub-directories of ``podatki``).
        cas: Name of the sub-directory below each pollutant directory.

    Returns:
        Dict mapping pollutant name to the concatenation of its per-file
        frames. Pollutants whose directory holds no files get no entry.
    """
    pd_dict = {}

    for snov in nevarne_snovi:
        dir_path = f"podatki/{snov}/{cas}"
        file_list = sorted(Path(dir_path).iterdir())

        for file_path in file_list:
            leto = file_path.stem

            df = pd.read_csv(file_path, na_values=["", ",", " ", "/"])

            # cleanup via project helpers
            # NOTE(review): presumably these normalise the station column name
            # and its values — confirm against helpers.py
            df = rename_postaja(df)
            df = simplify_column(df, "Postaja", True)

            # insert year column
            df.insert(1, "Leto", leto)

            # wide to long conversion
            if cas == "mesecne":
                df = df.melt(
                    id_vars=["Postaja", "Leto"],
                    value_vars=MESECI,
                    var_name="Mesec",
                    value_name=snov,
                )
            elif cas == "presezki":
                columns = [
                    col for col in ["Ura presežek", "Dan presežek"] if col in df.columns
                ]

                # NOTE(review): when neither column is present, all() over zero
                # columns is True for every row, so every row is dropped —
                # confirm that is intended.
                df = df.loc[~(df[columns] == 0).all(axis=1)]

                # Rebind instead of inplace=True: df is the result of a boolean
                # .loc filter, and renaming such a frame in place can trigger
                # pandas' chained-assignment warning.
                df = df.rename(columns=presezki_mapper(snov, columns))

            # concat the dataframe
            extend_dataframe(pd_dict, snov, df)

    return pd_dict


# Monthly data: load one frame per pollutant, then save each as CSV.
mesecne_postaje_dict = load_air_pollutants(nevarne_snovi=NEVARNE_SNOVI, cas="mesecne")
dict_to_csv(repo=mesecne_postaje_dict, form="postaje", cas="mesecne")

# Exceedance data: same steps for the pollutants listed in NEVARNE_SNOVI_p.
presezki_postaje_dict = load_air_pollutants(
    nevarne_snovi=NEVARNE_SNOVI_p, cas="presezki"
)
dict_to_csv(repo=presezki_postaje_dict, form="postaje", cas="presezki")


saved: podatki/df_postaje/mesecne/SO2.csv
saved: podatki/df_postaje/mesecne/PM10.csv
saved: podatki/df_postaje/mesecne/NOX.csv
saved: podatki/df_postaje/mesecne/NO2.csv
saved: podatki/df_postaje/presezki/SO2.csv
saved: podatki/df_postaje/presezki/PM10.csv


Step 2: Map stations to regions


In [None]:
def postaja_regija_map(slovar_path: str) -> pd.Series:
    """Build a station -> region lookup Series from a JSON file.

    The JSON object is iterated as ``{region: [station, ...]}`` and inverted
    into one entry per station. Station names are passed through
    ``lower_sumniki`` and, if that produces duplicate names, only the first
    occurrence is kept.

    Args:
        slovar_path: Path of the JSON file.

    Returns:
        Series named "Regija", indexed by the cleaned station name, whose
        values are region names.
    """
    # Context manager so the handle is closed; explicit UTF-8 because the file
    # content is not plain ASCII and the platform default encoding may differ.
    with open(slovar_path, encoding="utf-8") as slovar_file:
        postaje_po_regijah = json.load(slovar_file)

    # invert region -> [stations] into station -> region
    regija_dict = {
        postaja: regija
        for regija, postaje in postaje_po_regijah.items()
        for postaja in postaje
    }

    # create series for mapper; cleanup names, drop duplicates
    regija_series = pd.Series(regija_dict, name="Regija")
    regija_series.index = regija_series.index.map(lower_sumniki)
    regija_series = regija_series[~regija_series.index.duplicated()]
    return regija_series


def check_missing_mapping(postaje_dict: dict[str, pd.DataFrame], map_regija: pd.Series):
    """Print, per pollutant, the stations that have no entry in ``map_regija``.

    For every frame in ``postaje_dict`` the distinct "Postaja" values are
    compared with the index of ``map_regija``; when some are missing, one line
    ``"{snov}: {set of missing stations}"`` is printed. Nothing is returned.
    """
    mapped_postaje = set(map_regija.index.unique())

    for snov, df in postaje_dict.items():
        unmapped = set(df["Postaja"].unique()).difference(mapped_postaje)
        if unmapped:
            print(f"{snov}: {unmapped}")


# Station -> region lookup built from the dictionary file.
map_regija = postaja_regija_map(slovar_path="podatki/slovar.json")

# Print any station that appears in the data but has no region assigned.
check_missing_mapping(postaje_dict=presezki_postaje_dict, map_regija=map_regija)
check_missing_mapping(postaje_dict=mesecne_postaje_dict, map_regija=map_regija)

In [7]:
def add_regije_df(
    postaje_dict: dict[str, pd.DataFrame], mapper: pd.Series
) -> dict[str, pd.DataFrame]:
    """Return copies of the station frames with a "Regija" column added.

    Args:
        postaje_dict: Mapping of pollutant name to a frame with a "Postaja"
            column.
        mapper: Lookup applied to "Postaja" (via ``Series.map``) to obtain the
            region; stations without an entry get a missing value.

    Returns:
        A new dict with the same keys. Each value is a copy of the input frame
        with "Regija" inserted as the second column; the inputs are not
        modified.
    """

    def with_regija(frame: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so the caller's frame keeps its original columns.
        enriched = frame.copy()
        enriched.insert(loc=1, column="Regija", value=enriched["Postaja"].map(mapper))
        return enriched

    return {snov: with_regija(df) for snov, df in postaje_dict.items()}


# Step 3: parameter for the calculation
# Add the region column to the monthly frames and save them.
mesecne_regije_dict = add_regije_df(mesecne_postaje_dict, map_regija)
dict_to_csv(repo=mesecne_regije_dict, form="regije", cas="mesecne")

# Same for the exceedance frames.
presezki_regije_dict = add_regije_df(presezki_postaje_dict, map_regija)
dict_to_csv(repo=presezki_regije_dict, form="regije", cas="presezki")

saved: podatki/df_regije/mesecne/SO2.csv
saved: podatki/df_regije/mesecne/PM10.csv
saved: podatki/df_regije/mesecne/NOX.csv
saved: podatki/df_regije/mesecne/NO2.csv
saved: podatki/df_regije/presezki/SO2.csv
saved: podatki/df_regije/presezki/PM10.csv


In [21]:
# Process-wide locale switch.
# NOTE(review): presumably required so that "%b" below (and in YEAR_MONTH_SORT)
# matches the Slovenian month abbreviations stored in "Mesec" — confirm.
locale.setlocale(locale.LC_ALL, "sl_SI.utf8")

# Independent working copies of the monthly per-pollutant frames.
PM10, SO2, NO2, NOX = (
    mesecne_regije_dict[snov].copy() for snov in ("PM10", "SO2", "NO2", "NOX")
)

tmp_list = [PM10, SO2, NO2, NOX]

# Outer-join all pollutants on station/region/year/month so a row is kept even
# when only some pollutants have a value, then order the rows chronologically.
df_wide = reduce(
    lambda merged, following: merged.merge(
        following, on=["Postaja", "Regija", "Leto", "Mesec"], how="outer"
    ),
    tmp_list,
).sort_values(**YEAR_MONTH_SORT)

# "Datum" is the last day of the row's month: parse "<year>-<month abbrev>"
# and roll forward to the month end.
df_wide.insert(
    loc=2,
    column="Datum",
    value=(
        pd.to_datetime(
            df_wide["Leto"].astype(str) + "-" + df_wide["Mesec"].astype(str),
            format="%Y-%b",
        )
        + MonthEnd(0)
    ),
)

# Year and month are now redundant with "Datum".
df_wide = df_wide.drop(columns=["Leto", "Mesec"])

df_wide.to_csv("podatki/wide_df.csv", index=False)

df_wide

Unnamed: 0,Postaja,Regija,Datum,PM10,SO2,NO2,NOX
340,celje,Savinjska,1997-01-31,,72.0,42.0,70.0
628,dobovec,Zasavska,1997-01-31,,196.0,,
880,eis celje,Savinjska,1997-01-31,,66.0,50.0,
1012,eis krsko,Posavska,1997-01-31,,41.0,,
1408,graska gora,Koroška,1997-01-31,,71.0,,
...,...,...,...,...,...,...,...
9470,veliki vrh,Savinjska,2023-12-31,,5.0,,
9806,zadobrova,Osrednjeslovenska,2023-12-31,26.0,4.0,27.0,67.0
10130,zagorje,Zasavska,2023-12-31,26.0,0.0,22.0,63.0
10454,zavodnje,Savinjska,2023-12-31,,6.0,6.0,7.0


In [19]:
def _to_long(df: pd.DataFrame, snov: str) -> pd.DataFrame:
    """Return a copy of ``df`` in long layout for one pollutant.

    The column named ``snov`` is renamed to "Vrednost" and a constant "Snov"
    column holding ``snov`` is inserted at position 4.
    """
    long_part = df.rename(columns={snov: "Vrednost"})
    long_part.insert(loc=4, column="Snov", value=snov)
    return long_part


# One helper call per pollutant instead of four copy-pasted stanzas.
PM10_long = _to_long(PM10, "PM10")
SO2_long = _to_long(SO2, "SO2")
NO2_long = _to_long(NO2, "NO2")
NOX_long = _to_long(NOX, "NOX")

tmp_list = [PM10_long, SO2_long, NO2_long, NOX_long]

# Stack all pollutants and order the rows by year and month.
long_df = pd.concat(tmp_list).sort_values(**YEAR_MONTH_SORT)

# long_df.to_csv("podatki/long_df.csv", index=False)

# long_df.to_csv("podatki/long_df.csv", index=False)

In [9]:
# Scoring input: the wide frame without the station column.
full_df = df_wide.drop(labels="Postaja", axis="columns")

In [10]:
# ScoringSystem is imported from the project-local scoring_system module.
# NOTE(review): coverage_matrices is consumed below as a mapping of pollutant
# name to an object with .to_csv(); the exact contents are not visible here.
scoring_system = ScoringSystem(full_df)

pollutant_presence_df = scoring_system.coverage_score()
coverage_matrices = scoring_system.create_coverage_matrix(pollutant_presence_df)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (mirrors what dict_to_csv does for its outputs).
coverage_dir = Path("coverage")
coverage_dir.mkdir(parents=True, exist_ok=True)

for pollutant, scores in coverage_matrices.items():
    scores.to_csv(coverage_dir / f"scores_{pollutant}.csv")