In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from pathlib import Path
import json
import locale
from IPython.display import display
from datetime import datetime
import numpy as np

from helpers import renamePostaja, simplifyColumn,lowerSumniki
from scoring_system import ScoringSystem
from functools import reduce

Step 1: Build a DataFrame for each air pollutant
- read each CSV file
- load it into a DataFrame
- convert it to long format

In [25]:
# Pollutant names; each one is used as a directory name under "podatki/" and
# as the value-column name of the frames built from it.
NEVARNE_SNOVI = ["SO2", "PM10", "NOX", "NO2"]
NEVARNE_SNOVI_p = ["SO2"]
# Month column labels in calendar order; a label's position in this list
# defines its sort order.
MESECI = ["Jan", "Feb", "Mar", "Apr", "Maj", "Jun", "Jul", "Avg", "Sep", "Okt", "Nov", "Dec"]

# Keyword arguments for ``DataFrame.sort_values(**YEAR_MONTH_SORT)``: order by
# "Leto", then by calendar month. The month number is looked up in MESECI
# rather than parsed with strptime('%b'), so the ordering does not depend on
# the process locale. Unlike strptime, the lookup is case-sensitive. An
# unknown label still raises ValueError.
YEAR_MONTH_SORT = {
    "by": ["Leto", "Mesec"],
    "key": lambda col: (
        col if col.name != "Mesec" else col.map(lambda mesec: MESECI.index(mesec) + 1)
    ),
}

In [26]:
def extend_dataframe(repo: dict[str, pd.DataFrame], snov: str, to_add: pd.DataFrame):
    """Append ``to_add`` to the frame stored under ``repo[snov]``.

    If ``snov`` is not yet a key, ``to_add`` itself is stored. Otherwise the
    stored frame is replaced by the concatenation of the old frame and
    ``to_add`` with a freshly numbered index. ``repo`` is modified in place.
    """
    if snov in repo:
        repo[snov] = pd.concat([repo[snov], to_add], ignore_index=True)
    else:
        repo[snov] = to_add


def dict_to_csv(repo: dict[str, pd.DataFrame], form: str, cas: str):
    """Write every frame in ``repo`` to ``podatki/df_{form}/{cas}/{snov}.csv``.

    Args:
        repo: Mapping of pollutant name to the DataFrame to save.
        form: Inserted into the directory name as ``df_{form}``.
        cas: Name of the sub-directory below ``df_{form}``.

    The target directory (including missing parents) is created when needed.
    Files are written without the index column and each saved path is printed.
    """
    path_dir = Path(f"podatki/df_{form}/{cas}")
    # Create the directory up front: a missing directory used to be created
    # *instead of* writing, so the first call silently saved nothing.
    path_dir.mkdir(parents=True, exist_ok=True)

    for snov, df in repo.items():
        filepath = path_dir / f"{snov}.csv"
        df.to_csv(filepath, index=False)
        print(f"saved: {filepath}")


def load_air_pollutants(nevarne_snovi: list[str], cas: str) -> dict[str, pd.DataFrame]:
    """Read all files under ``podatki/{snov}/{cas}`` into one frame per pollutant.

    Args:
        nevarne_snovi: Pollutant names; each is used as a directory name and as
            the key of the returned dict.
        cas: Sub-directory to read. ``"mesecne"`` frames are melted to long
            format (one row per station, year and month). ``"presezki"`` frames
            drop rows where both exceedance counts are 0 and get
            pollutant-prefixed column names. For any other value the frame is
            appended as read.

    Returns:
        Dict mapping each pollutant to the concatenation of its per-file frames.
        A pollutant whose directory contains no entries gets no key.
    """
    pd_dict = {}

    for snov in nevarne_snovi:
        dir_path = Path(f"podatki/{snov}/{cas}")

        # Sorted so the files are appended in a deterministic order.
        for file_path in sorted(dir_path.iterdir()):
            # The file name without its extension becomes the "Leto" (year) value.
            leto = file_path.stem

            df = pd.read_csv(file_path, na_values=["", ",", " ", "/"])

            # cleanup (helpers imported from the project's `helpers` module)
            df = renamePostaja(df)
            df = simplifyColumn(df, "Postaja", True)

            # insert year column
            df.insert(1, "Leto", leto)

            if cas == "mesecne":
                # wide to long conversion: one row per (Postaja, Leto, Mesec)
                df = df.melt(
                    id_vars=["Postaja", "Leto"],
                    value_vars=MESECI,
                    var_name="Mesec",
                    value_name=snov,
                )
            elif cas == "presezki":
                no_exceedance = (df["Ura presežek"] == 0) & (df["Dan presežek"] == 0)
                # Chain filter and rename so the rename yields a new frame; an
                # in-place rename on the filtered slice raises
                # SettingWithCopyWarning.
                df = df.loc[~no_exceedance].rename(
                    columns={
                        "Ura presežek": f"{snov} - Urnih presežkov",
                        "Dan presežek": f"{snov} - Dnevnih presežkov",
                    }
                )

            # concat the dataframe
            extend_dataframe(pd_dict, snov, df)

    return pd_dict


# Per-station frames: the "mesecne" data for every pollutant in NEVARNE_SNOVI
# and the "presezki" data for the pollutants in NEVARNE_SNOVI_p.
mesecne_postaje_dict = load_air_pollutants(nevarne_snovi=NEVARNE_SNOVI, cas="mesecne")
presezki_postaje_dict = load_air_pollutants(nevarne_snovi=NEVARNE_SNOVI_p, cas="presezki")
# dict_to_csv(mesecne_postaje_dict, "postaje", "mesecne")



Step 2: Map stations to regions

In [28]:
# returns the postaja->regija mapper as a Series
def postaja_regija_map(slovar_path: str) -> pd.Series:
    """Build a station -> region lookup Series from a JSON file.

    The JSON document is expected to be an object whose keys are region names
    and whose values are lists of station names.

    Args:
        slovar_path: Path to the JSON file.

    Returns:
        Series named "Regija". The index holds station names passed through
        ``lowerSumniki`` and the values are region names. If several stations
        end up with the same index label, only the first is kept.
    """
    # The context manager closes the handle (it used to be leaked). UTF-8 is
    # set explicitly so non-ASCII names decode the same on every platform.
    with open(slovar_path, encoding="utf-8") as slovar_file:
        postaje_po_regijah = json.load(slovar_file)

    # invert {regija: [postaja, ...]} into {postaja: regija}
    regija_dict = {
        postaja: regija
        for regija, postaje in postaje_po_regijah.items()
        for postaja in postaje
    }
    # create series for mapper; cleanup names, drop duplicates
    regija_series = pd.Series(regija_dict, name="Regija")
    regija_series.index = regija_series.index.map(lowerSumniki)
    regija_series = regija_series[~regija_series.index.duplicated()]
    return regija_series


# report stations that have no entry in the mapping
def check_missing_mapping(postaje_dict: dict[str, pd.DataFrame], map_regija: pd.Series):
    """Print, per pollutant, the "Postaja" values missing from ``map_regija``'s index.

    Nothing is printed for a pollutant whose stations are all present in the
    index. The function returns ``None``.
    """
    known = set(map_regija.index.unique())
    for snov, df in postaje_dict.items():
        unmapped = set(df["Postaja"].unique()) - known
        if unmapped:
            print(f"{snov}: {unmapped}")


# Build the station -> region lookup, then print the stations of either data
# set that have no entry in it.
map_regija = postaja_regija_map(slovar_path="podatki/slovar.json")

check_missing_mapping(postaje_dict=presezki_postaje_dict, map_regija=map_regija)
check_missing_mapping(postaje_dict=mesecne_postaje_dict, map_regija=map_regija)


In [33]:
def add_regije_df(
    postaje_dict: dict[str, pd.DataFrame], mapper: pd.Series
) -> dict[str, pd.DataFrame]:
    """Return copies of the given frames with a "Regija" column added.

    For every frame, the "Postaja" values are looked up in ``mapper`` and the
    result is inserted as the second column. Stations without a mapping get a
    missing value. The input frames are left unmodified.
    """

    def _with_regija(postaje_df: pd.DataFrame) -> pd.DataFrame:
        # Copy first so that insert() does not touch the caller's frame.
        out = postaje_df.copy()
        regije = out["Postaja"].map(mapper)
        out.insert(loc=1, column="Regija", value=regije)
        return out

    return {snov: _with_regija(df) for snov, df in postaje_dict.items()}

# Step 3: parameters for the calculation
# Add the "Regija" column to the "mesecne" and to the "presezki" frames.
mesecne_regije_dict = add_regije_df(mesecne_postaje_dict, map_regija)
# dict_to_csv(mesecne_regije_dict,"regije","mesecne")

presezki_regije_dict = add_regije_df(presezki_postaje_dict, map_regija)

In [34]:
# Switch the whole process to the Slovenian locale. This affects
# locale-dependent formatting and parsing (e.g. strptime's '%b') and raises
# locale.Error on systems where "sl_SI.utf8" is not installed.
locale.setlocale(locale.LC_ALL, "sl_SI.utf8")

# Work on copies so the frames held in mesecne_regije_dict stay untouched.
PM10, SO2, NO2, NOX = (
    mesecne_regije_dict[snov].copy() for snov in ("PM10", "SO2", "NO2", "NOX")
)

tmp_list = [PM10, SO2, NO2, NOX]


def _merge_on_keys(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    """Outer-join two pollutant frames on station, region, year and month."""
    return left.merge(right, on=["Postaja", "Regija", "Leto", "Mesec"], how="outer")


# One row per (Postaja, Regija, Leto, Mesec) with one column per pollutant.
df_wide = reduce(_merge_on_keys, tmp_list).sort_values(**YEAR_MONTH_SORT)
df_wide.to_csv("podatki/wide_df.csv", index=False)

In [35]:
def _as_long(df: pd.DataFrame, snov: str) -> pd.DataFrame:
    """Rename the ``snov`` column to "Vrednost" and insert a "Snov" column at position 4."""
    long_part = df.rename(columns={snov: "Vrednost"})
    long_part.insert(loc=4, column="Snov", value=snov)
    return long_part


PM10_long = _as_long(PM10, "PM10")
SO2_long = _as_long(SO2, "SO2")
NO2_long = _as_long(NO2, "NO2")
NOX_long = _as_long(NOX, "NOX")

tmp_list = [PM10_long, SO2_long, NO2_long, NOX_long]

# Stack the per-pollutant frames, order them by year and month, and save.
long_df = pd.concat(tmp_list).sort_values(**YEAR_MONTH_SORT)

long_df.to_csv("podatki/long_df.csv", index=False)

In [36]:
# Same rows as df_wide without the "Postaja" column; passed to ScoringSystem below.
full_df = df_wide.drop(columns=["Postaja"])


In [37]:
scoring_system = ScoringSystem(full_df)

pollutant_presence_df = scoring_system.coverage_score()
coverage_matrices = scoring_system.create_coverage_matrix(pollutant_presence_df)

# to_csv does not create missing directories, so make sure the output
# directory exists before writing one file per pollutant.
coverage_dir = Path("coverage")
coverage_dir.mkdir(parents=True, exist_ok=True)

for pollutant, scores in coverage_matrices.items():
    scores.to_csv(coverage_dir / f"scores_{pollutant}.csv")

