In [None]:
import json
import math
import re
import unicodedata
from collections import defaultdict
from pathlib import Path


def preprocess_tourist_data(input_path="tourist_data.json",
                            output_path="tourist_data_grouped.json"):
    """Group flat tourist rows by month and municipality and save them as JSON.

    The input file must contain a JSON array of objects. Each object is read
    through the keys ``"mesec"`` (month), ``"občine"`` (municipality),
    ``"država"`` (country), ``"meritve"`` (copied through unchanged) and the
    optional ``"data"`` (numeric value).

    The output is a list with one entry per month, in first-seen order::

        {"month": <mesec>,
         "municipalities": {
             <normalized municipality key>: {
                 "countries": [{"name": ..., "data": {"meritve": ..., "data": ...}}, ...],
                 "display_name": <capitalized municipality>}}}

    Args:
        input_path: Path of the flat JSON file to read.
        output_path: Path of the grouped JSON file to write (overwritten).

    Raises:
        KeyError: If a row lacks "mesec", "občine", "država" or "meritve".
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    with input_path.open(encoding="utf-8") as f:
        data = json.load(f)

    def normalize_value(v):
        """Return ``v`` as a finite float, or 0 for placeholders and bad values."""
        if v in (None, "", "-", '"-"'):
            return 0
        try:
            number = float(v)
        except (ValueError, TypeError, OverflowError):
            return 0
        # NaN / +-Infinity would be serialized as bare NaN / Infinity tokens,
        # which are not valid JSON and break strict parsers such as JSON.parse.
        return number if math.isfinite(number) else 0

    def normalize_country(name):
        """Drop a leading numeric prefix such as ``"1.2 "`` and capitalize."""
        cleaned = re.sub(r"^\d+(\.\d+)?\s*", "", name).strip()
        return cleaned.capitalize()

    def normalize_name(s):
        """Return a lowercase, diacritic-free lookup key ("" for non-strings).

        Intended to mirror the consumer's JS ``normalize("NFD")`` + diacritic
        stripping + lowercasing.
        """
        if not isinstance(s, str):
            return ""
        decomposed = unicodedata.normalize("NFD", s)
        # After NFD decomposition, accents are separate combining marks ("Mn").
        stripped = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
        return stripped.strip().lower()

    # month -> raw municipality name -> rows; dicts keep first-seen order.
    grouped = defaultdict(lambda: defaultdict(list))
    for row in data:
        grouped[row["mesec"]][row["občine"]].append(row)

    result = []
    for month, municipalities in grouped.items():
        by_key = {}
        for municipality, rows in municipalities.items():
            countries = [
                {
                    "name": normalize_country(r["država"]),
                    "data": {
                        "meritve": r["meritve"],
                        "data": normalize_value(r.get("data")),
                    },
                }
                for r in rows
            ]
            # Different raw spellings can normalize to the same key; merge
            # their rows instead of letting the later spelling overwrite the
            # earlier one. The first spelling seen provides the display name.
            entry = by_key.setdefault(
                normalize_name(municipality),
                {"countries": [], "display_name": municipality.capitalize()},
            )
            entry["countries"].extend(countries)
        result.append({"month": month, "municipalities": by_key})

    with output_path.open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"Saved grouped data to {output_path}")


# Run the preprocessing only when this code executes as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    preprocess_tourist_data()


In [2]:
import json
# Re-serialize the grouped data without indentation or separator spaces to
# produce a smaller file. Context managers close both handles, so the output
# is flushed to disk deterministically.
with open("tourist_data_grouped.json", encoding="utf-8") as src:
    data = json.load(src)

# ensure_ascii=False keeps non-ASCII characters as UTF-8 instead of expanding
# each one to a six-byte \uXXXX escape, matching how the source file was written.
with open("tourist_data_grouped_smaller.json", "w", encoding="utf-8") as dst:
    json.dump(data, dst, ensure_ascii=False, separators=(",", ":"))
