In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from config import PROCESSED_CSV_PATH

In [2]:
# Load the processed dataset, parsing the three date columns as datetimes.
df = pd.read_csv(
    PROCESSED_CSV_PATH,
    parse_dates=["date_start", "date_decision", "date_end"],
)

# Preview the first rows to confirm which columns are available
df.head()


Unnamed: 0,soud,autor,predmetRizeni,klicovaSlova,zminenaUstanoveni,parsed_jednaciCislo,date_start,date_decision,date_end,days_to_decision
0,Obvodní soud pro Prahu 1,JUDr. Edita Votočková,o zaplacení 100 000 Kč s příslušenstvím,"['smlouva příkazní', 'odstoupení od smlouvy']","['§ 2002 z. č. 89/2012 Sb.', '§ 2430 z. č. 89/...","[11, 'C', 76, 2024, 51]",2024-05-03,2024-12-04,2025-04-15,215
1,Obvodní soud pro Prahu 3,"JUDr. Markéta Písaříková, Ph.D.",o 11 220 Kč s příslušenstvím,['pojištění odpovědnosti za škodu'],"['§ 6 vyhl. č. 177/1996 Sb.', '§ 14b vyhl. č. ...","[17, 'C', 31, 2025, 31]",2024-11-26,2025-03-14,2025-04-23,108
2,Okresní soud v Příbrami,JUDr. Josef Pelcner,zaplacení 12 585 Kč s příslušenstvím,['peněžité plnění'],"['§ 142 z. č. 99/1963 Sb.', '§ 1 z. č. 168/199...","[14, 'C', 6, 2025, 19]",2024-10-18,2025-01-22,2025-03-12,96
3,Okresní soud v Ostravě,Mgr. Radana Vilčová,o zaplacení 18 985 Kč s příslušenstvím,['dodávky energie'],"['§ 1 vyhl. č. 254/2015 Sb.', '§ 2 vyhl. č. 25...","[62, 'C', 457, 2024, 17]",2024-04-12,2025-02-11,2025-03-14,305
4,Okresní soud v Příbrami,JUDr. Josef Pelcner,"zaplacení 17 828,01 Kč s příslušenstvím",['smlouva o úvěru'],"['§ 160 z. č. 99/1963 Sb.', '§ 2048 z. č. 89/2...","[14, 'C', 187, 2024, 30]",2024-10-05,2024-12-30,2025-03-15,86


In [3]:
# Summary statistics (count, mean, min/max, quartiles, std) for the date columns
# and the days_to_decision target.
df.describe()

Unnamed: 0,date_start,date_decision,date_end,days_to_decision
count,462424,462424,462424,462424.0
mean,2021-12-18 06:14:46.189297664,2022-07-03 17:40:14.864280320,2022-08-15 02:10:56.685639168,197.476026
min,2004-12-31 00:00:00,2006-03-21 00:00:00,2002-04-28 00:00:00,1.0
25%,2021-01-25 00:00:00,2021-08-11 00:00:00,2021-09-17 00:00:00,114.75
50%,2021-10-26 00:00:00,2022-04-27 00:00:00,2022-06-06 00:00:00,155.0
75%,2022-08-22 00:00:00,2023-02-23 00:00:00,2023-04-13 00:00:00,223.0
max,2025-04-11 00:00:00,2025-05-28 00:00:00,2025-12-30 00:00:00,1824.0
std,,,,161.196589


In [4]:
# Predictors are restricted to the court ("soud") and the judge ("autor");
# the regression target is the days_to_decision column.
feature_columns = ["soud", "autor"]
X = df[feature_columns]
y = df["days_to_decision"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.metrics import mean_absolute_error

def _group_mean_predictions(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    cols: list[str],
) -> pd.Series:
    """Look up, for every test row, the training mean of its ``cols`` group.

    The result is always indexed like ``X_test``; rows whose key (combination)
    does not occur in the training data are NaN.
    """
    means = y_train.groupby([X_train[col] for col in cols]).mean()

    if len(cols) == 1:
        keys = X_test[cols[0]]
    else:
        # Multi-column keys are tuples looked up against the MultiIndex of
        # `means`. They must carry X_test's index: a bare list would get a
        # fresh RangeIndex and later label-aligned fillna calls would pair
        # values with the wrong rows.
        keys = pd.Series(
            list(zip(*(X_test[col] for col in cols))),
            index=X_test.index,
            dtype=object,
        )

    return keys.map(means)


def grouped_mean_baseline(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    group_cols: list[str] = None,
    fallback_cols: list[list[str]] = None,
    global_fallback: bool = True
) -> tuple[pd.Series, float]:
    """
    Predict ``y_test`` with group-wise means of ``y_train`` and score with MAE.

    Parameters
    ----------
    X_train, y_train : training features and target; groups are formed from
        the ``X_train`` columns and averaged over ``y_train``.
    X_test, y_test : evaluation features and target. ``y_test`` is compared
        with the predictions row by row, so both must be in the same order.
    group_cols : columns defining the primary groups. ``None`` or an empty
        list means "predict the global training mean for every row".
    fallback_cols : ordered list of alternative column sets. Test rows whose
        primary group was not seen in training are filled from the first
        fallback grouping that knows them.
    global_fallback : if True, rows still unresolved after all fallbacks get
        the global training mean.

    Returns
    -------
    (preds, mae) : the predictions and their mean absolute error. ``preds`` is
        indexed like ``y_test`` when no ``group_cols`` are given and like
        ``X_test`` otherwise.

    Notes
    -----
    With ``global_fallback=False`` unresolved rows stay NaN and are passed to
    ``mean_absolute_error`` unchanged (presumably rejected there -- verify
    against the scikit-learn version in use).
    """
    group_cols = group_cols or []
    fallback_cols = fallback_cols or []

    global_mean = y_train.mean()

    # No grouping requested: constant prediction.
    if not group_cols:
        preds = pd.Series([global_mean] * len(y_test), index=y_test.index)
        mae = mean_absolute_error(y_test, preds)
        return preds, mae

    # Step 1: primary group mean
    preds = _group_mean_predictions(X_train, y_train, X_test, group_cols)

    # Step 2: fallbacks, applied in order. Every lookup shares X_test's index,
    # so fillna's label alignment matches each gap with its own row.
    for fallback in fallback_cols:
        preds = preds.fillna(
            _group_mean_predictions(X_train, y_train, X_test, fallback)
        )

    # Step 3: global fallback
    if global_fallback:
        preds = preds.fillna(global_mean)

    mae = mean_absolute_error(y_test, preds)
    return preds, mae


In [6]:
# Positional arguments shared by every baseline evaluation in this cell
baseline_inputs = (X_train, y_train, X_test, y_test)

# Global mean baseline (no grouping columns)
_, mae_global = grouped_mean_baseline(*baseline_inputs)

# Baselines grouped by a single column: judge, then court
_, mae_judge = grouped_mean_baseline(*baseline_inputs, group_cols=["autor"])
_, mae_court = grouped_mean_baseline(*baseline_inputs, group_cols=["soud"])

# Court+judge baseline with judge-only and court-only groupings as fallbacks
_, mae_group = grouped_mean_baseline(
    *baseline_inputs,
    group_cols=["soud", "autor"],
    fallback_cols=[["autor"], ["soud"]],
)

In [7]:
# Report the MAE of each baseline. Only the first one is a global mean; the
# others are grouped baselines and are labelled as such.
print(f"Global Mean Baseline MAE: {mae_global:.2f} days")
print(f"Grouped autor baseline MAE: {mae_judge:.2f} days")
print(f"Grouped soud baseline MAE: {mae_court:.2f} days")
print(f"Grouped soud+autor baseline MAE: {mae_group:.2f} days")



Global Mean Baseline MAE: 94.50 days
Grouped autor baseline MAE: 78.54 days
Global soud baseline MAE: 82.13 days
Global soud+autor baseline MAE: 78.44 days


# Preliminary testing

In [8]:
# Preprocessing: integer-encode both categorical features. Categories that were
# not seen during fitting are encoded as -1 instead of raising an error.
categorical_cols = ["soud", "autor"]
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
preprocessor = ColumnTransformer(
    transformers=[("cat", ordinal_encoder, categorical_cols)]
)

# Model: the encoded features feed a seeded 100-tree random forest
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", forest),
    ]
)

# Fit on the training split and score on the held-out split
model.fit(X_train, y_train)
y_pred_model = model.predict(X_test)
mae_model = mean_absolute_error(y_test, y_pred_model)
print(f"Random Forest Model MAE: {mae_model:.2f}")

Random Forest Model MAE: 78.44
