In [None]:
import json
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
from preprocessing.law_reference_parser import parse_reference
from pprint import pprint

In [None]:
def flatten(xss):
    """Concatenate the elements of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of the elements is kept.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

In [None]:
def load_all_decisions(data_dir="../data/raw") -> pd.DataFrame:
    """Load every decision item found under ``data_dir`` into one DataFrame.

    The directory is searched recursively for files matching ``page*.json``.
    Each file is expected to hold a JSON object whose ``"items"`` entry is a
    list of records; a file without that key contributes no rows.

    Files are processed in sorted path order so that the row order of the
    result does not depend on the order in which the file system happens to
    list them.

    Args:
        data_dir: Root directory to search.

    Returns:
        A DataFrame with one row per item, in file order. It is empty when no
        file could be loaded.

    Note:
        Loading is best-effort: a file that cannot be read or parsed is
        reported on stdout and skipped instead of aborting the whole load.
    """
    base_path = Path(data_dir)
    # Materialising the sorted list also lets tqdm know the total up front.
    json_files = sorted(base_path.rglob("page*.json"))
    records = []

    for json_file in tqdm(json_files, desc="Loading JSON files"):
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            records.extend(data.get("items", []))
        except Exception as e:
            print(f"Failed to load {json_file}: {e}")

    return pd.DataFrame(records)


In [None]:
# Load every decision found under the default data directory into one DataFrame.
df_decisions = load_all_decisions()

In [None]:
# Preview the first 10 rows of the loaded frame.
df_decisions.head(10)

In [None]:
# (rows, columns) of the frame as loaded, before any filtering.
df_decisions.shape

In [None]:
# Persist the freshly loaded, still unfiltered frame as a single CSV (this
# runs before any of the date cleaning further down).
# NOTE(review): columns holding lists (e.g. zminenaUstanoveni, klicovaSlova)
# are presumably written as their string repr - confirm this is acceptable
# for whoever reads the CSV back.
df_decisions.to_csv("../data/raw/court_decisions.csv", index=False)

In [None]:
# Give both date columns a proper datetime dtype; values that cannot be
# parsed become NaT instead of raising.
for date_col in ("datumVydani", "datumZverejneni"):
    df_decisions[date_col] = pd.to_datetime(df_decisions[date_col], errors="coerce")


In [None]:
# The non-null counts printed by info() below show that some rows have no
# value in the datumVydani column (unparseable dates were coerced to NaT by
# the conversion above).
df_decisions.info()

In [None]:
# Discard the rows (roughly 141) whose datumVydani is NaT, then re-check the
# column summary.
df_decisions = df_decisions.dropna(subset=["datumVydani"])
df_decisions.info()

In [None]:
# Roughly 284 rows carry dates that are clearly typos, so only rows where both
# dates fall inside 2015-2025 are kept. Rows with a missing date fail the
# check as well, because between() treats NA values as False.
df_decisions = df_decisions[
    df_decisions["datumVydani"].between("2015-01-01", "2025-12-31")
    & df_decisions["datumZverejneni"].between("2015-01-01", "2025-12-31")
]

In [None]:
# (rows, columns) remaining after the date-based filtering above.
df_decisions.shape

In [None]:
# Earliest, then latest, value of each date column after filtering.
print(
    df_decisions[["datumVydani", "datumZverejneni"]].min(),
    df_decisions[["datumVydani", "datumZverejneni"]].max(),
    sep="\n",
)

In [None]:
# The release date seems unreliable: some decisions carry impossible dates.
# This warrants further investigation.

In [None]:
# Distinct values of the "soud" (court) column.
list(df_decisions["soud"].unique())

## Law references

In [None]:
# How many unique law refs are there?
def extract_unique_refs(df: pd.DataFrame) -> set:
    """Return the set of distinct law references cited across all decisions.

    Each cell of the ``zminenaUstanoveni`` column is iterated as a collection
    of references. Rows where the cell is missing (``None``/``NaN``) are
    skipped instead of raising ``TypeError``.

    Args:
        df: Decisions frame containing a ``zminenaUstanoveni`` column.

    Returns:
        Set of every distinct reference found; empty if there are none.
    """
    # dropna() removes only missing cells; list-valued cells (including empty
    # lists) are kept and flattened one level.
    return {ref for refs in df["zminenaUstanoveni"].dropna() for ref in refs}

In [None]:
# Sort the distinct references so the list has the same order on every run;
# the iteration order of a set of strings can change between kernel sessions.
# key=str keeps the sort from failing should the values not all be strings.
all_refs = sorted(extract_unique_refs(df_decisions), key=str)
print(f"Total distinct refs: {len(all_refs)}")

In [None]:
# Preview only the first few references: the full list can be very long and
# would flood the cell output (its total size is printed in the cell above).
all_refs[:20]

In [None]:
# Pick one reference at random and show it next to the parser's output.
ref = random.choice(all_refs)

print("Original reference", ref, "", "Inferred dictionary:", sep="\n")
pprint(parse_reference(ref))

## Keywords

In [None]:
# How many unique keywords are there?
def extract_unique_keywords(df: pd.DataFrame) -> set:
    """Return the set of distinct keywords attached to the decisions.

    Each cell of the ``klicovaSlova`` column is iterated as a collection of
    keywords. Rows where the cell is missing (``None``/``NaN``) are skipped
    instead of raising ``TypeError``.

    Args:
        df: Decisions frame containing a ``klicovaSlova`` column.

    Returns:
        Set of every distinct keyword found; empty if there are none.
    """
    # dropna() removes only missing cells; list-valued cells (including empty
    # lists) are kept and flattened one level.
    return {kword for kwords in df["klicovaSlova"].dropna() for kword in kwords}

In [None]:
# Collect the distinct keywords (kept as a set) and report how many there are.
all_keywords = extract_unique_keywords(df_decisions)
print(f"Total distinct keywords: {len(all_keywords)}")

In [None]:
# Preview a small, stably ordered sample instead of dumping the entire set
# into the cell output (the total count is printed in the cell above).
sorted(all_keywords, key=str)[:20]

In [None]:
# Number of decisions handled by each court, busiest court first.
(
    df_decisions
    .groupby("soud")
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Count decisions per calendar day. The timestamps are normalised to midnight
# first so that a time-of-day component, if present, cannot split a single day
# into several buckets.
daily_counts = (
    df_decisions["datumZverejneni"]
    .dt.normalize()
    .value_counts()
    .sort_index()
)

# Plot on an explicit figure/axes pair so the labels are bound to this chart.
fig, ax = plt.subplots(figsize=(15, 5))
daily_counts.plot(ax=ax)
ax.set_title("Number of Judicial Decisions per Day")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Decisions")
fig.tight_layout()
plt.show()


In [None]:
# Bucket each decision by the year-month of its publication date
df_decisions["month"] = df_decisions["datumZverejneni"].dt.to_period("M")

# Number of decisions in each month, in chronological order
monthly_counts = df_decisions["month"].value_counts().sort_index()

# Bar chart of the monthly totals
ax = monthly_counts.plot.bar(figsize=(15, 5))
ax.set_title("Number of Judicial Decisions per Month")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Decisions")
plt.tight_layout()
plt.show()
