In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from pathlib import Path
import json
import locale
from IPython.display import display
from pandas.tseries.offsets import MonthEnd

from helpers import rename_postaja, strip_postaja, lower_sumniki, MESECI,SNOV_FILTER
from functools import reduce

Step 1: Build a DataFrame for each air pollutant

- read each CSV file
- load it into a DataFrame
- convert it into long format


In [4]:
# Pollutants processed by this notebook. Each entry is passed as `snov` to
# load_pollutant_data, where it names a sub-directory of the data folder and
# becomes the name of the value column.
NEVARNE_SNOVI = ["SO2", "PM10", "NO2"]


def presezki_mapper(snov: str, columns: list[str]):
    """Build a column-rename mapping prefixed with the pollutant name.

    Args:
        snov: Pollutant name placed in front of each new column label.
        columns: Column names that are actually available.

    Returns:
        A dict from the original column name ("Ura presežek" and/or
        "Dan presežek") to its pollutant-specific label. Only names that occur
        in ``columns`` are included.
    """
    candidates = (
        ("Ura presežek", f"{snov} - Urnih presežkov"),
        ("Dan presežek", f"{snov} - Dnevnih presežkov"),
    )

    mapping = {}
    for old_name, new_name in candidates:
        if old_name in columns:
            mapping[old_name] = new_name
    return mapping

In [5]:
# Switch the process-wide locale (all categories) to Slovenian, UTF-8.
# NOTE(review): presumably required so that the "%b" month format used in
# load_pollutant_data matches the month names in MESECI — confirm.
locale.setlocale(locale.LC_ALL, "sl_SI.utf8")


def load_pollutant_data(
    snov: str, cas: str, base_dir: Path = Path("podatki")
) -> "pd.DataFrame | None":
    """Load all files of one pollutant/resolution into a single long-format frame.

    Every file found directly in ``base_dir / snov / cas`` is read as CSV
    (with "/" treated as a missing value). The file stem is used as the year,
    the month columns listed in ``MESECI`` are melted into rows, and a
    month-end ``Datum`` timestamp is built from year and month.

    Args:
        snov: Pollutant name; names the sub-directory and the value column.
        cas: Resolution sub-directory (this notebook passes "mesecne",
            "dnevne" and "urne").
        base_dir: Root directory of the data files.

    Returns:
        A DataFrame with the columns ``Postaja``, ``Datum`` and ``<snov>``,
        or ``None`` when the directory is missing or contains no files.
    """

    def load_and_transform_df(file: Path) -> pd.DataFrame:
        """Read one yearly file and reshape it to (Postaja, Datum, <snov>)."""
        df = pd.read_csv(file, na_values=["/"])
        df = rename_postaja(df)
        df = strip_postaja(df)

        # The file name (without extension) carries the year of the data.
        leto = file.stem
        df.insert(1, "Leto", leto)
        df = df.melt(
            id_vars=["Postaja", "Leto"],
            value_vars=MESECI,
            var_name="Mesec",
            value_name=snov,
        )
        # "%b" is parsed according to the active locale; MonthEnd(0) rolls the
        # parsed date forward to the last day of its month.
        df.insert(
            loc=1,
            column="Datum",
            value=pd.to_datetime(df["Leto"] + "-" + df["Mesec"], format="%Y-%b")
            + MonthEnd(0),
        )

        return df.drop(columns=["Leto", "Mesec"])

    dir_path = base_dir / snov / cas

    # is_dir() also rejects a plain file at this path, which iterdir() cannot walk.
    if not dir_path.is_dir():
        return None

    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which read_csv
    # cannot open.
    file_list = sorted(f for f in dir_path.iterdir() if f.is_file())

    # pd.concat raises on an empty list; treat "no files" like "no directory".
    if not file_list:
        return None

    return pd.concat(
        [load_and_transform_df(file) for file in file_list], ignore_index=True
    )


# TO DO: fix this for further down
def dict_to_csv(repo: dict[str, pd.DataFrame], form: str, cas: str):
    """Write every frame in ``repo`` to ``podatki/df_{form}/{cas}/{snov}.csv``.

    Args:
        repo: Mapping from pollutant name to the DataFrame to save; the key
            becomes the CSV file name.
        form: Inserted into the ``df_{form}`` directory name.
        cas: Name of the sub-directory below ``df_{form}``.

    Each file is written without the index and its path is printed afterwards.
    """
    # The target directory is the same for every entry.
    path_dir = Path(f"podatki/df_{form}/{cas}")

    for snov, df in repo.items():
        # parents=True creates the intermediate df_{form} directory as well;
        # exist_ok=True makes repeated calls a no-op.
        path_dir.mkdir(parents=True, exist_ok=True)

        filepath = path_dir / f"{snov}.csv"
        df.to_csv(filepath, index=False)
        print(f"saved: {filepath}")


# Load every pollutant at each of the three resolutions. Pollutants for which
# load_pollutant_data returns None are left out of the resulting list.
mesecne_meritve = [
    meritve
    for meritve in (load_pollutant_data(snov, "mesecne") for snov in NEVARNE_SNOVI)
    if meritve is not None
]
dnevne_meritve = [
    meritve
    for meritve in (load_pollutant_data(snov, "dnevne") for snov in NEVARNE_SNOVI)
    if meritve is not None
]
urne_meritve = [
    meritve
    for meritve in (load_pollutant_data(snov, "urne") for snov in NEVARNE_SNOVI)
    if meritve is not None
]

In [6]:
def postaja_regija_map(slovar_path: str) -> pd.Series:
    """Build a station -> region lookup from a JSON dictionary file.

    The JSON document is read as a mapping from region name to a collection
    of station names and is inverted into one entry per station.

    Args:
        slovar_path: Path of the JSON file.

    Returns:
        A Series named "Regija" whose index holds the station names after
        ``lower_sumniki`` has been applied and whose values are region names.
        When several stations collapse to the same index label, only the
        first one is kept.
    """
    # Context manager closes the file; JSON is read explicitly as UTF-8 so the
    # result does not depend on the platform's default encoding.
    with open(slovar_path, encoding="utf-8") as slovar_file:
        regije = json.load(slovar_file)

    # Invert {regija: [postaja, ...]} into {postaja: regija}.
    postaja_to_regija = {
        postaja: regija
        for regija, postaje in regije.items()
        for postaja in postaje
    }

    regija_series = pd.Series(postaja_to_regija, name="Regija")
    # Normalise the station names, then drop labels that became duplicates.
    regija_series.index = regija_series.index.map(lower_sumniki)
    return regija_series[~regija_series.index.duplicated(keep="first")]


def check_missing_mapping(list_df: list[pd.DataFrame], map_regija: pd.Series):
    """Print the stations that have no entry in the region mapping.

    Args:
        list_df: Frames that each contain a "Postaja" column.
        map_regija: Series whose index holds the known station names.

    The distinct station names of all frames are collected; those missing
    from ``map_regija.index`` are printed as a Series. Nothing is printed when
    every station is mapped.
    """
    collected = []
    for frame in list_df:
        collected.extend(frame["Postaja"].unique())

    stations = pd.Series(collected).drop_duplicates()
    unmapped = stations[~stations.isin(map_regija.index)]

    if not unmapped.empty:
        print(unmapped)


def assign_region_df(df: pd.DataFrame, mapper: pd.Series) -> pd.DataFrame:
    """Return a copy of ``df`` with a "Regija" column added at position 1.

    Args:
        df: Frame containing a "Postaja" column.
        mapper: Lookup applied to "Postaja" via ``Series.map``.

    Returns:
        A new frame; the input frame is not modified. Stations without an
        entry in ``mapper`` get a missing value.
    """
    regions = df["Postaja"].map(mapper).values
    with_region = df.copy()
    with_region.insert(1, "Regija", regions)
    return with_region


# Station -> region lookup built from the JSON dictionary.
map_regija = postaja_regija_map("podatki/slovar.json")

# Report stations that have no region assigned.
check_missing_mapping(mesecne_meritve, map_regija)
check_missing_mapping(urne_meritve, map_regija)
check_missing_mapping(dnevne_meritve, map_regija)

# Add the "Regija" column to every loaded frame.
list_mesecne = [assign_region_df(meritve, map_regija) for meritve in mesecne_meritve]
list_dnevne = [assign_region_df(meritve, map_regija) for meritve in dnevne_meritve]
list_urne = [assign_region_df(meritve, map_regija) for meritve in urne_meritve]

In [15]:
# Combine the per-pollutant frames of each resolution into one wide frame.
# The outer join on station, region and date keeps rows that are present in
# only one of the frames.
df_mesecne = reduce(
    lambda merged, nxt: merged.merge(
        nxt, on=["Postaja", "Regija", "Datum"], how="outer"
    ),
    list_mesecne,
).sort_values(by=["Datum", "Regija"])

df_dnevne = reduce(
    lambda merged, nxt: merged.merge(
        nxt, on=["Postaja", "Regija", "Datum"], how="outer"
    ),
    list_dnevne,
).sort_values(by=["Datum", "Regija"])

# The hourly frame is kept in merge order (no sorting is applied).
df_urne = reduce(
    lambda merged, nxt: merged.merge(
        nxt, on=["Postaja", "Regija", "Datum"], how="outer"
    ),
    list_urne,
)

# Persist the three merged frames without their index.
m_path = "podatki/df_mesecne.csv"
df_mesecne.to_csv(m_path, index=False)
print(f"saved file: {m_path}")

d_path = "podatki/df_dnevne.csv"
df_dnevne.to_csv(d_path, index=False)
print(f"saved file: {d_path}")

u_path = "podatki/df_urne.csv"
df_urne.to_csv(u_path, index=False)
print(f"saved file: {u_path}")

# Cell output: monthly rows dated after "2003", ordered by date.
df_mesecne.loc[df_mesecne["Datum"] > "2003"].sort_values(by="Datum")



saved file: podatki/df_mesecne.csv
saved file: podatki/df_dnevne.csv
saved file: podatki/df_urne.csv


Unnamed: 0,Postaja,Regija,Datum,SO2,PM10,NO2
5808,nova gorica,Goriška,2003-01-31,7.0,38.0,34.0
9756,zagorje,Zasavska,2003-01-31,23.0,64.0,
8436,trbovlje,Zasavska,2003-01-31,21.0,58.0,38.0
6840,ravenska vas,Zasavska,2003-01-31,50.0,,
3012,kum,Zasavska,2003-01-31,,,
...,...,...,...,...,...,...
4931,medvode,Osrednjeslovenska,2023-12-31,,28.0,
9683,zadobrova,Osrednjeslovenska,2023-12-31,4.0,26.0,27.0
4691,mb tezno,Podravska,2023-12-31,,29.0,28.0
4847,mb vrbanski,Podravska,2023-12-31,,13.0,13.0


In [13]:
def load_presezki(snov:str, base_dir: Path = Path("podatki")):
    """Load all files from ``base_dir / snov / "presezki"`` into one frame.

    Args:
        snov: Pollutant name; names the sub-directory below ``base_dir``.
        base_dir: Root directory of the data files.

    Returns:
        The concatenation (with a fresh index) of all files, read in sorted
        path order. Each file is passed through ``rename_postaja`` and
        ``strip_postaja`` and gets a "Leto" column at position 1 holding the
        file stem.
    """

    def read_presezek(file: Path):
        """Read one file and tag its rows with the year taken from the name."""
        frame = strip_postaja(rename_postaja(pd.read_csv(file)))
        frame.insert(1, "Leto", file.stem)
        return frame

    yearly_files = list((base_dir / snov / "presezki").iterdir())
    yearly_files.sort()

    return pd.concat(
        [read_presezek(path) for path in yearly_files], ignore_index=True
    )

# For each pollutant: load its "presezki" files, attach the region, keep only
# the regions listed in SNOV_FILTER, and show the per-region mean sorted in
# descending order.

# SO2, ordered by the "Ura" column.
SO2_presezki = assign_region_df(load_presezki("SO2"), map_regija).drop(
    columns=["Postaja", "Leto"]
)
so2_regije = SNOV_FILTER["SO2"]["regije"]
SO2_presezki = SO2_presezki.loc[SO2_presezki["Regija"].isin(so2_regije)]
display(
    SO2_presezki.groupby("Regija")
    .mean()
    .reset_index()
    .sort_values(by="Ura", ascending=False)
)

# PM10, ordered by the "Dan" column.
PM10_presezki = assign_region_df(load_presezki("PM10"), map_regija).drop(
    columns=["Postaja", "Leto"]
)
pm10_regije = SNOV_FILTER["PM10"]["regije"]
PM10_presezki = PM10_presezki.loc[PM10_presezki["Regija"].isin(pm10_regije)]
display(
    PM10_presezki.groupby("Regija")
    .mean()
    .reset_index()
    .sort_values(by="Dan", ascending=False)
)

# NO2, ordered by the "Ura" column.
NO2_presezki = assign_region_df(load_presezki("NO2"), map_regija).drop(
    columns=["Postaja", "Leto"]
)
no2_regije = SNOV_FILTER["NO2"]["regije"]
NO2_presezki = NO2_presezki.loc[NO2_presezki["Regija"].isin(no2_regije)]
display(
    NO2_presezki.groupby("Regija")
    .mean()
    .reset_index()
    .sort_values(by="Ura", ascending=False)
)




Unnamed: 0,Regija,Ura,Dan
4,Zasavska,5.138614,0.722772
2,Posavska,5.043478,0.913043
3,Savinjska,1.162338,0.038961
0,Koroška,0.222222,0.0
1,Osrednjeslovenska,0.0,0.0


Unnamed: 0,Regija,Dan
3,Pomurska,36.64
6,Zasavska,34.545455
2,Podravska,29.214286
1,Osrednjeslovenska,27.949367
5,Savinjska,23.776471
0,Goriška,13.745763
4,Posavska,9.315789


Unnamed: 0,Regija,Ura
1,Osrednjeslovenska,0.186047
2,Podravska,0.032258
0,Goriška,0.0
3,Pomurska,0.0
4,Savinjska,0.0
5,Zasavska,0.0
