In [156]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `helpers` module) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [157]:
import pandas as pd
from pathlib import Path
import json
import locale
from IPython.display import display
from datetime import datetime

from helpers import renamePostaja, simplifyColumn,lowerSumniki

Step 1: Build a DataFrame for each air pollutant
- read each CSV file
- load it into a DataFrame
- convert it from wide to long format

In [158]:
# (pollutant name, first year of data) pairs; the name doubles as the
# sub-directory under podatki/ and as the value column name after melting.
NEVARNE_SNOVI = [("PM10", 1999), ("SO2", 1997), ("NOX", 1997), ("NO2", 1997)]
# Slovenian month abbreviations in calendar order; used as the wide-format
# month column names when melting.
MESECI = ["Jan", "Feb", "Mar", "Apr", "Maj", "Jun", "Jul", "Avg", "Sep", "Okt", "Nov", "Dec"]

# Month label -> calendar month number (1-12), derived from MESECI so the two
# can never drift apart.
_MESEC_NUMBER = {mesec: stevilka for stevilka, mesec in enumerate(MESECI, start=1)}


def _year_month_key(col):
    """Sort key for ``DataFrame.sort_values``: order 'Mesec' by calendar month.

    pandas calls the key once per sort column. Every column except 'Mesec' is
    returned unchanged; 'Mesec' labels are translated to month numbers via
    MESECI. The lookup deliberately avoids ``datetime.strptime(x, '%b')``,
    which is locale-dependent and rejects the Slovenian abbreviations
    ("Maj", "Avg", "Okt") unless a matching locale is active.

    Raises:
        KeyError: if a 'Mesec' value is not one of the labels in MESECI.
    """
    if col.name != "Mesec":
        return col
    return col.map(lambda mesec: _MESEC_NUMBER[mesec])


# Keyword arguments for chronological sorting:
#   df.sort_values(**YEAR_MONTH_SORT)
YEAR_MONTH_SORT = {
    "by": ["Leto", "Mesec"],
    "key": _year_month_key,
}

In [163]:
def extend_dataframe(repo: dict[str, pd.DataFrame], snov: str, to_add: pd.DataFrame):
    """Append ``to_add`` to the frame stored under ``snov`` in ``repo``.

    ``repo`` is modified in place and nothing is returned.

    Args:
        repo: Mapping from key to accumulated DataFrame.
        snov: Key whose frame should be extended.
        to_add: Rows to append.
    """
    if snov not in repo:
        # First frame for this key: store the object as-is (no copy, index untouched).
        repo[snov] = to_add
        return

    # Key already present: stack the new rows underneath and renumber the index.
    repo[snov] = pd.concat([repo[snov], to_add], ignore_index=True)


def dict_to_csv(repo: dict[str, pd.DataFrame], form: str):
    """Write every DataFrame in ``repo`` to ``podatki/df_{form}/{key}.csv``.

    The target directory (including missing parents) is created once up front,
    so every frame is written even on the first run. Previously the frame that
    triggered the directory creation was silently skipped. Files are written
    without the index column, and one "saved: <path>" line is printed per file.

    Args:
        repo: Mapping from file stem (pollutant name) to the frame to save.
        form: Suffix of the output directory name (``df_{form}``).
    """
    path_dir = Path(f"podatki/df_{form}")
    # parents=True: do not fail when podatki/ itself is missing;
    # exist_ok=True: re-running the cell is safe.
    path_dir.mkdir(parents=True, exist_ok=True)

    for snov, df in repo.items():
        filepath = path_dir / f"{snov}.csv"
        df.to_csv(filepath, index=False)
        print(f"saved: {filepath}")


def load_air_pollutants(
    nevarne_snovi: list[tuple[str, int]],
) -> dict[str, pd.DataFrame]:
    """Read every file under ``podatki/<snov>/`` into one long-format frame per pollutant.

    For each ``(snov, zac_leto)`` pair, the entries of ``podatki/<snov>`` are
    processed in sorted order. The i-th entry is labelled with year
    ``zac_leto + i``, so the directory is assumed to hold exactly one file per
    consecutive year and nothing else — TODO confirm against the data folder.

    Args:
        nevarne_snovi: (pollutant name, first year) pairs.

    Returns:
        Mapping from pollutant name to a frame with columns
        ``Postaja``, ``Leto``, ``Mesec`` and ``<snov>``.
    """
    frames_by_snov = {}

    for snov, zac_leto in nevarne_snovi:
        csv_paths = sorted(Path(f"podatki/{snov}").iterdir())

        for offset, csv_path in enumerate(csv_paths):
            leto = zac_leto + offset

            # "", ",", " " and "/" are treated as missing measurements.
            wide = pd.read_csv(csv_path, na_values=["", ",", " ", "/"])

            # Project helpers from `helpers`; presumably they normalise the
            # station column name and its values — see that module.
            wide = simplifyColumn(renamePostaja(wide), "Postaja", True)

            # Year goes in as the second column, right after the station.
            wide.insert(1, "Leto", leto)

            # One row per (station, year, month) instead of one column per month.
            long = wide.melt(
                id_vars=["Postaja", "Leto"],
                value_vars=MESECI,
                var_name="Mesec",
                value_name=snov,
            )

            extend_dataframe(frames_by_snov, snov, long)

    return frames_by_snov


# Build the long-format frames for every pollutant listed in NEVARNE_SNOVI,
# then save them as CSV files under podatki/df_postaje/.
postaje_dict = load_air_pollutants(NEVARNE_SNOVI)
dict_to_csv(postaje_dict, "postaje")

saved: podatki/df_postaje/PM10.csv
saved: podatki/df_postaje/SO2.csv
saved: podatki/df_postaje/NOX.csv
saved: podatki/df_postaje/NO2.csv


Step 2: Map stations to regions

In [164]:
# Returns the postaja -> regija mapping as a Series (index: postaja, values: regija).
def postaja_regija_map(slovar_path: str) -> pd.Series:
    """Load the region dictionary and invert it into a station -> region Series.

    The JSON file is expected to map each region name to a list of station
    names. The file is opened explicitly as UTF-8 inside a ``with`` block, so
    the handle is always closed and non-ASCII names (e.g. "Goriška") decode
    the same way on every platform.

    Args:
        slovar_path: Path to the JSON file.

    Returns:
        Series named "Regija" whose index holds the station names after
        ``lowerSumniki`` has been applied. When several stations collapse to
        the same index label, only the first occurrence is kept.
    """
    with open(slovar_path, encoding="utf-8") as slovar_file:
        regije = json.load(slovar_file)

    # Invert {regija: [postaja, ...]} into {postaja: regija}.
    postaja_to_regija = {
        postaja: regija
        for regija, postaje in regije.items()
        for postaja in postaje
    }

    regija_series = pd.Series(postaja_to_regija, name="Regija")
    # Normalise the station names with the project helper, then drop repeats
    # so the Series can be used as an unambiguous mapper.
    regija_series.index = regija_series.index.map(lowerSumniki)
    regija_series = regija_series[~regija_series.index.duplicated()]
    return regija_series


# Report, per pollutant, the stations that have no entry in the region mapping.
def check_missing_mapping(postaje_dict: dict[str, pd.DataFrame], map_regija: pd.Series):
    """Print the stations in each frame that are absent from ``map_regija``'s index.

    One line of the form ``<snov>: {<stations>}`` is printed for every
    pollutant with at least one unmapped station; nothing is printed for
    pollutants that are fully covered. Returns nothing.

    Args:
        postaje_dict: Pollutant name -> frame with a "Postaja" column.
        map_regija: Series indexed by station name.
    """
    known = set(map_regija.index.unique())

    for snov, df in postaje_dict.items():
        unmapped = set(df["Postaja"].unique()) - known
        if not unmapped:
            continue
        print(f"{snov}: {unmapped}")


# Load the station -> region mapping from podatki/slovar.json and print any
# stations in postaje_dict that the mapping does not cover.
map_regija = postaja_regija_map("podatki/slovar.json")
check_missing_mapping(postaje_dict, map_regija)

In [None]:
def add_regije_df(postaje_dict: dict[str, pd.DataFrame], mapper: pd.Series)->dict[str, pd.DataFrame]:
    """Return copies of the per-pollutant frames with a "Regija" column added.

    Each frame is copied, so the frames in ``postaje_dict`` are left untouched.
    The new column is inserted as the second column and filled by looking up
    every "Postaja" value in ``mapper``; stations missing from ``mapper`` get
    NaN.

    Args:
        postaje_dict: Pollutant name -> frame with a "Postaja" column.
        mapper: Series indexed by station name whose values are region names.

    Returns:
        Pollutant name -> copy of the input frame with "Regija" inserted.
    """
    regije_dict = {}
    for snov, df in postaje_dict.items():
        regija_df = df.copy()
        # Use the `mapper` argument (not a notebook-level global) so the
        # function works with whatever mapping the caller passes in.
        regija_df.insert(loc=1, column="Regija", value=regija_df["Postaja"].map(mapper))
        regije_dict[snov] = regija_df
    return regije_dict






Unnamed: 0,Postaja,Regija,Leto,Mesec,PM10
0,ljubljana f.,Osrednjeslovenska,1999,Jan,43.0
1,celje,Savinjska,1999,Jan,47.0
2,trbovlje,Zasavska,1999,Jan,52.0
3,eis celje,Savinjska,1999,Jan,50.0
4,ljubljana f.,Osrednjeslovenska,1999,Feb,46.0
...,...,...,...,...,...
6355,miklavz,Podravska,2023,Dec,
6356,spuhlja,Podravska,2023,Dec,34.0
6357,ruse,Podravska,2023,Dec,13.0
6358,morsko,Goriška,2023,Dec,20.0


In [None]:
# Show the NOX frame.
# NOTE(review): `regije_dict` is not assigned at notebook scope anywhere in the
# visible cells; presumably it should hold the result of
# add_regije_df(postaje_dict, map_regija) — confirm, otherwise this raises
# NameError on a fresh kernel.
regije_dict["NOX"]

In [None]:
# Pull each pollutant's frame out of regije_dict under its own name.
PM10 = regije_dict["PM10"]
SO2 = regije_dict["SO2"]
NOX = regije_dict["NOX"]
NO2 = regije_dict["NO2"]

# Empty placeholder frame; nothing in this cell fills it.
station_count = pd.DataFrame()

# Number of non-null PM10 values per (Regija, Leto, Mesec) group,
# flattened back into ordinary columns with the tally in "Count".
PM10_counts = (
    PM10.groupby(["Regija", "Leto", "Mesec"])["PM10"]
    .count()
    .reset_index(name="Count")
)



In [None]:
# Display the per-(Regija, Leto, Mesec) PM10 counts as the cell output.
PM10_counts

