In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
import pandas as pd
from pathlib import Path
import json
import locale
from IPython.display import display
from pandas.tseries.offsets import MonthEnd

from helpers import rename_postaja, strip_postaja, lower_sumniki, MESECI
from functools import reduce

Step 1: Build a DataFrame for each air pollutant

- read each yearly CSV file
- load it into a DataFrame
- convert it into long format (one row per station and month)


In [35]:
# Pollutants handled by this notebook; each name is also used as a directory
# name when load_pollutant_data builds base_dir / snov / cas.
NEVARNE_SNOVI = ["SO2", "PM10", "NO2"]


def presezki_mapper(snov: str, columns: list[str]):
    """Build the rename mapping for the exceedance columns that are present.

    Args:
        snov: Pollutant name that prefixes the new column labels.
        columns: Column names to look the exceedance columns up in.

    Returns:
        A dict from the original column name to its pollutant-specific label,
        holding only the entries whose original name occurs in ``columns``.
        The hourly entry always precedes the daily one.
    """
    renames = {}

    # Hourly exceedances
    if "Ura presežek" in columns:
        renames["Ura presežek"] = f"{snov} - Urnih presežkov"

    # Daily exceedances
    if "Dan presežek" in columns:
        renames["Dan presežek"] = f"{snov} - Dnevnih presežkov"

    return renames

In [36]:
# Switch the whole process to the Slovenian locale. Presumably required so that
# the "%b" month abbreviations parsed in load_pollutant_data match the names in
# MESECI — TODO confirm against helpers.
# NOTE(review): locale.setlocale raises locale.Error when "sl_SI.utf8" is not
# installed on the machine running the notebook.
locale.setlocale(locale.LC_ALL, "sl_SI.utf8")


def load_pollutant_data(
    snov: str, cas: str, base_dir: Path = Path("podatki")
) -> "pd.DataFrame | None":
    """Load all files for one pollutant and time resolution into one long frame.

    Every regular, non-hidden file in ``base_dir / snov / cas`` is read as CSV
    (``"/"`` is treated as a missing value). The file stem is used as the
    year, the month columns listed in ``MESECI`` are melted into rows, and
    each row gets a ``Datum`` column holding the last day of its month.

    Args:
        snov: Pollutant name; used as a sub-directory name and as the name of
            the value column in the result.
        cas: Sub-directory of the pollutant directory to read (the notebook
            passes "mesecne", "dnevne" and "urne").
        base_dir: Root directory that contains one directory per pollutant.

    Returns:
        A frame with the columns ``Postaja``, ``Datum`` and ``snov``, built
        from all files in sorted path order, or ``None`` when the directory
        does not exist or holds no data files.
    """

    def load_and_transform_df(file: Path) -> pd.DataFrame:
        """Read one file and reshape it to one row per input row and month."""
        df = pd.read_csv(file, na_values=["/"])
        # Project helpers; presumably they normalise the station column so it
        # is available as "Postaja" — TODO confirm against helpers.
        df = rename_postaja(df)
        df = strip_postaja(df)

        # The file name without its extension carries the year.
        df.insert(1, "Leto", file.stem)
        df = df.melt(
            id_vars=["Postaja", "Leto"],
            value_vars=MESECI,
            var_name="Mesec",
            value_name=snov,
        )

        # "%b" is parsed according to the active locale. MonthEnd(0) rolls the
        # parsed first-of-month timestamp forward to the end of that month.
        datum = pd.to_datetime(
            df["Leto"] + "-" + df["Mesec"], format="%Y-%b"
        ) + MonthEnd(0)
        df.insert(loc=1, column="Datum", value=datum)

        return df.drop(columns=["Leto", "Mesec"])

    dir_path = base_dir / snov / cas
    if not dir_path.is_dir():
        return None

    # Skip sub-directories and hidden entries (e.g. ".ipynb_checkpoints",
    # ".DS_Store") that would otherwise be handed to read_csv.
    file_list = sorted(
        entry
        for entry in dir_path.iterdir()
        if entry.is_file() and not entry.name.startswith(".")
    )
    if not file_list:
        # pd.concat raises on an empty list; treat "no data" like "no directory".
        return None

    return pd.concat(
        [load_and_transform_df(file) for file in file_list], ignore_index=True
    )


# TO DO: fix this for further down
def dict_to_csv(repo: dict[str, pd.DataFrame], form: str, cas: str):
    """Write every frame in ``repo`` to ``podatki/df_{form}/{cas}/{snov}.csv``.

    The target directory, including any missing parent directories, is created
    on demand. Nothing is created or written when ``repo`` is empty.

    Args:
        repo: Mapping of pollutant name to the frame to save; the key becomes
            the file name.
        form: Inserted into the ``df_{form}`` directory name.
        cas: Name of the sub-directory below ``df_{form}``.
    """
    if not repo:
        return

    path_dir = Path(f"podatki/df_{form}/{cas}")
    # parents=True: the intermediate "df_{form}" directory may not exist yet.
    path_dir.mkdir(parents=True, exist_ok=True)

    for snov, df in repo.items():
        filepath = path_dir / f"{snov}.csv"
        df.to_csv(filepath, index=False)
        print(f"saved: {filepath}")


# Load every pollutant at each time resolution; loads that returned None are
# left out of the resulting lists.
mesecne_meritve = [
    tabela
    for tabela in (load_pollutant_data(snov, "mesecne") for snov in NEVARNE_SNOVI)
    if tabela is not None
]
dnevne_meritve = [
    tabela
    for tabela in (load_pollutant_data(snov, "dnevne") for snov in NEVARNE_SNOVI)
    if tabela is not None
]
urne_meritve = [
    tabela
    for tabela in (load_pollutant_data(snov, "urne") for snov in NEVARNE_SNOVI)
    if tabela is not None
]

In [37]:
def postaja_regija_map(slovar_path: str) -> pd.Series:
    """Build a station -> region lookup from a JSON dictionary file.

    The JSON file is expected to map each region name to a list of station
    names. The mapping is inverted so that every station points to its region.

    Args:
        slovar_path: Path of the JSON file, read as UTF-8.

    Returns:
        A Series named "Regija" whose index holds the station names after they
        were passed through ``lower_sumniki`` and whose values are the region
        names. When several stations collapse to the same index label, only
        the first one is kept.
    """
    # Explicit encoding and a context manager: the file is closed
    # deterministically and non-ASCII station names do not depend on the
    # platform's default encoding.
    with open(slovar_path, encoding="utf-8") as slovar_file:
        regije = json.load(slovar_file)

    # Invert region -> [stations] into station -> region.
    postaja_v_regijo = {
        postaja: regija for regija, postaje in regije.items() for postaja in postaje
    }

    regija_series = pd.Series(postaja_v_regijo, name="Regija")
    # Project helper; presumably lower-cases names and normalises š/č/ž —
    # TODO confirm against helpers.
    regija_series.index = regija_series.index.map(lower_sumniki)
    # Normalisation can make two labels identical; keep the first occurrence.
    regija_series = regija_series[~regija_series.index.duplicated(keep="first")]
    return regija_series


# check and print if postaja exists without mapping
def check_missing_mapping(list_df: list[pd.DataFrame], map_regija: pd.Series):
    """Print the stations that have no entry in the region mapping.

    The unique "Postaja" values of every frame are collected in frame order,
    de-duplicated, and compared with the index of ``map_regija``. Stations
    that are not in that index are printed; nothing is printed when every
    station is covered.
    """
    station_names = []
    for frame in list_df:
        station_names.extend(frame["Postaja"].unique())

    stations = pd.Series(station_names).drop_duplicates()
    has_mapping = stations.isin(map_regija.index)
    unmapped = stations[~has_mapping]

    if len(unmapped) > 0:
        print(unmapped)


def assign_region_df(df: pd.DataFrame, mapper: pd.Series) -> pd.DataFrame:
    """Return a copy of ``df`` with a "Regija" column inserted at position 1.

    The region of each row is looked up from its "Postaja" value via
    ``mapper``; stations without an entry get a missing value. The input
    frame is left unchanged.
    """
    regions = df["Postaja"].map(mapper).values
    with_region = df.copy()
    with_region.insert(1, "Regija", regions)
    return with_region


# Station -> region lookup shared by all three resolutions
map_regija = postaja_regija_map("podatki/slovar.json")

# Check for missing mappings (prints the stations that have none)
check_missing_mapping(mesecne_meritve, map_regija)
check_missing_mapping(urne_meritve, map_regija)
check_missing_mapping(dnevne_meritve, map_regija)

# Add the region column to every loaded frame
list_mesecne = [assign_region_df(meritev, map_regija) for meritev in mesecne_meritve]
list_dnevne = [assign_region_df(meritev, map_regija) for meritev in dnevne_meritve]
list_urne = [assign_region_df(meritev, map_regija) for meritev in urne_meritve]

In [38]:
def _merge_pollutants(frames: list[pd.DataFrame]) -> pd.DataFrame:
    """Outer-merge per-pollutant frames on station, region and date, then sort.

    Args:
        frames: One frame per pollutant, each with the columns "Postaja",
            "Regija" and "Datum" plus its own value column.

    Returns:
        The merged frame sorted by "Datum" and then "Regija".

    Raises:
        ValueError: If ``frames`` is empty, i.e. no pollutant data was loaded.
    """
    if not frames:
        # reduce() without an initial value fails with an opaque TypeError.
        raise ValueError("no pollutant frames to merge")

    merged = reduce(
        lambda left, right: pd.merge(
            left, right, on=["Postaja", "Regija", "Datum"], how="outer"
        ),
        frames,
    )
    return merged.sort_values(by=["Datum", "Regija"])


# All three frames are sorted by date and region so that the exported CSV
# files share one row order.
df_mesecne = _merge_pollutants(list_mesecne)
df_dnevne = _merge_pollutants(list_dnevne)
df_urne = _merge_pollutants(list_urne)

m_path = "podatki/df_mesecne.csv"
df_mesecne.to_csv(m_path, index=False)
print(f"saved file: {m_path}")

d_path = "podatki/df_dnevne.csv"
df_dnevne.to_csv(d_path, index=False)
print(f"saved file: {d_path}")

u_path = "podatki/df_urne.csv"
df_urne.to_csv(u_path, index=False)
print(f"saved file: {u_path}")



saved file: podatki/df_mesecne.csv
saved file: podatki/df_dnevne.csv
saved file: podatki/df_urne.csv
