# Velocity

In [1]:
import pandas as pd
import numpy as np
from typing import Tuple

In [2]:
# Load data
# The default path is the location of the export on the original author's
# machine. Set the MOUSE_EVENTS_CSV environment variable to read the file from
# somewhere else without editing this cell.
import os

CSV_PATH = os.environ.get(
    "MOUSE_EVENTS_CSV",
    "/Users/marlenerueschoff/Documents/Uni/UzK Master/Masterarbeit/Experiment/masterthesis_experiment/Data/mouse_events_rows (1).csv",
)
df_mouse = pd.read_csv(CSV_PATH)

In [3]:
GROUPING = ("participant_id",)


def compute_velocity_from_df(df_mouse: pd.DataFrame,
                             group_cols=GROUPING) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute step-wise cursor velocity and per-group summary metrics.

    Rows are sorted by ``group_cols`` and ``t_ms``; consecutive rows inside a
    group form one "step". Steps whose time difference is missing, zero or
    negative (including the first row of every group) are masked with NaN and
    excluded from the aggregation.

    Parameters
    ----------
    df_mouse : DataFrame with at least ``participant_id``, ``t_ms``, ``x``,
        ``y`` and every column named in ``group_cols``. It is not modified.
    group_cols : column name, or sequence of column names, to group by.

    Returns
    -------
    (metrics, df_steps)
        ``metrics`` has one row per group with ``total_distance_px``,
        ``max_speed_px_s``, ``min_speed_px_s``, ``mean_speed_px_s``,
        ``std_speed_px_s``, ``mean_abs_dx``, ``mean_abs_dy`` and ``n_steps``.
        ``df_steps`` is the cleaned, sorted input plus the columns ``dx``,
        ``dy``, ``dt_ms``, ``dist_px``, ``dt_s`` and ``speed_px_s``.

    Raises
    ------
    ValueError
        If a required column is missing from ``df_mouse``.
    """
    # A bare column name would otherwise be split into characters by list().
    if isinstance(group_cols, str):
        group_cols = (group_cols,)
    keys = list(group_cols)

    required = {"participant_id", "t_ms", "x", "y"} | set(keys)
    missing = required - set(df_mouse.columns)
    if missing:
        raise ValueError(f"df_mouse is missing required columns: {missing}")

    # Work on a copy so the caller's frame is left untouched
    df = df_mouse.copy()

    # Ensure numeric types; unparseable values become NaN and are dropped below
    for col in ("t_ms", "x", "y"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Drop unusable rows
    df = df.dropna(subset=["participant_id", "t_ms", "x", "y"])

    # Sort within groups by time
    df = df.sort_values(keys + ["t_ms"])

    # Per-group deltas, vectorised. Using diff() on the grouped columns avoids
    # groupby.apply, whose handling of the grouping columns is deprecated and
    # would otherwise strip them from the step table.
    by_group = df.groupby(keys, dropna=False, sort=False)
    dx = by_group["x"].diff()
    dy = by_group["y"].diff()
    dt_ms = by_group["t_ms"].diff()
    dist_px = np.sqrt(dx**2 + dy**2)
    dt_s = dt_ms / 1000.0

    # Guard against zero/negative/NaN intervals (dt_ms itself is left as-is)
    bad = ~np.isfinite(dt_s) | (dt_s <= 0)
    df_steps = df.assign(
        dx=dx.mask(bad),
        dy=dy.mask(bad),
        dt_ms=dt_ms,
        dist_px=dist_px.mask(bad),
        dt_s=dt_s.mask(bad),
    )
    df_steps["speed_px_s"] = df_steps["dist_px"] / df_steps["dt_s"]

    # Keep only valid step rows for aggregation (the first in each group is NaN)
    valid = df_steps.dropna(subset=["dist_px", "dt_s", "speed_px_s"])

    if valid.empty:
        # Return empty results with expected columns if nothing to compute
        empty_cols = keys + [
            "total_distance_px", "max_speed_px_s", "min_speed_px_s",
            "mean_speed_px_s", "std_speed_px_s", "mean_abs_dx", "mean_abs_dy", "n_steps",
        ]
        return pd.DataFrame(columns=empty_cols), df_steps

    # Pre-compute absolute deltas so every aggregation is a built-in reducer
    valid = valid.assign(abs_dx=valid["dx"].abs(), abs_dy=valid["dy"].abs())

    metrics = (
        valid.groupby(keys, dropna=False)
        .agg(
            total_distance_px=("dist_px", "sum"),
            max_speed_px_s=("speed_px_s", "max"),
            min_speed_px_s=("speed_px_s", "min"),
            mean_speed_px_s=("speed_px_s", "mean"),
            std_speed_px_s=("speed_px_s", "std"),
            mean_abs_dx=("abs_dx", "mean"),
            mean_abs_dy=("abs_dy", "mean"),
            n_steps=("speed_px_s", "count"),
        )
        .reset_index()
    )

    # For groups with only one valid step, std is NaN → set to 0.0
    metrics["std_speed_px_s"] = metrics["std_speed_px_s"].fillna(0.0)

    return metrics, df_steps

# ---- Run the velocity analysis on the df_mouse that is already loaded ----
metrics, df_mouse_with_steps = compute_velocity_from_df(
    df_mouse=df_mouse,
    group_cols=GROUPING,
)

# Show the first rows of the per-group metrics table
print(metrics.head(5))

                         participant_id  total_distance_px  max_speed_px_s  \
0  03dd908a-1650-408d-85b7-1c3403451336       43682.859652    8.220487e+05   
1  389c538b-aad3-4787-8030-9adc552993ba       89481.979903    1.018440e+06   
2  4aa053ae-bb0b-402a-904a-df94b514460c      161745.269043    1.023145e+06   
3  53d7dc68-c8ed-4348-bf2c-5d14cd2cae71        2563.422756    8.520669e+03   
4  5bf4b1db-be05-4519-96e7-f4db6467f02a        4633.916208    9.618992e+03   

   min_speed_px_s  mean_speed_px_s  std_speed_px_s  mean_abs_dx  mean_abs_dy  \
0             0.0    143894.529416   243842.619798   182.870056   146.423729   
1             0.0     80500.238250   182951.358954    97.983752    72.534712   
2             0.0    114460.415493   203979.815220   245.568182    61.849026   
3             0.0       817.631061     1970.874908    28.101266    11.291139   
4             0.0       419.467083     1030.996727    11.228261     9.456522   

   n_steps  
0      177  
1      677  
2      616 

  .apply(_deltas)


# Submovements

Hyperparameters
ROLL_WIN_SAMPLES = Size of the centered moving average applied to the speed series before finding peaks 
-> Typical: 3–5 (with ~40 ms sampling, that’s ~120–200 ms smoothing).
-> Effect:
    Larger → smoother curve, fewer peaks.
    Smaller → noisier curve, more peaks.

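MIN_PROMINENCE_PX_S = How much higher (px/s) a peak must be than the higher of its two immediately neighbouring samples in the smoothed speed series (a local stand-in for true prominence, which would be measured against the surrounding valleys).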
-> Start around the 20th–40th percentile of non-zero speeds.
-> Effect:
    Larger → suppresses small/flat bumps → fewer, more salient peaks.
    Smaller → allows small bumps → more peaks, including noise.

MIN_PEAK_HEIGHT_PX_S = absolute minimum peak height, px/s
-> Typical: Just above your noise floor, e.g., median(speed when moving slowly) or P10–P20 of all positive speeds.
-> Effect:
    Larger → discards slow, possibly jitter-based peaks.
    Smaller → admits slow “peaks” → risk of false positives during tiny moves.

MIN_PEAK_DISTANCE_MS = Minimum elapsed time between accepted peaks
-> Typical: 200 ms (aligns with your pause notion); range 120–300 ms.
-> Effect:
    Larger → a peak that follows an accepted peak too closely is discarded (the earlier peak is kept) → fewer but longer submovements.
    Smaller → allows rapid successive peaks → more submovements.

In [None]:
# Peak-detection hyperparameters (tune on your data)
ROLL_WIN_SAMPLES = 3         # smoothing window in samples; ~120 ms if ~40 ms sampling
MIN_PROMINENCE_PX_S = 50.0   # minimum rise of a peak above its neighbours, px/s
MIN_PEAK_HEIGHT_PX_S = 50.0  # absolute minimum peak height, px/s
MIN_PEAK_DISTANCE_MS = 200   # refractory period between accepted peaks, ms

# ============================
# Helpers
# ============================

def _moving_avg(a, win):
    """
    Centered moving average of ``a`` over ``win`` samples.

    Windows are truncated at the edges and NaN samples are skipped, so a
    position yields NaN only when its whole window is NaN.

    Always returns a float64 ndarray. For ``win <= 1`` no smoothing is applied
    and the values are returned as-is (a float64 ndarray input is returned
    without copying).
    """
    values = np.asarray(a, dtype="float64")
    if win <= 1:
        return values
    return pd.Series(values).rolling(win, center=True, min_periods=1).mean().to_numpy()

def _compute_speed_px_s_aligned(g: pd.DataFrame) -> np.ndarray:
    """
    Return the step speed in px/s, one value per row of ``g``.

    Each value is the distance to the previous row divided by the elapsed
    time. The first row, and any row whose time difference is missing, zero or
    negative, is NaN; the division is only evaluated where the time
    difference is usable, so no divide-by-zero occurs.
    """
    step_x = g["x"].diff().to_numpy(dtype="float64")
    step_y = g["y"].diff().to_numpy(dtype="float64")
    step_s = (g["t_ms"].diff() / 1000.0).to_numpy(dtype="float64")

    # Euclidean length of each step
    path_len = np.sqrt(step_x * step_x + step_y * step_y)

    # Only divide where the interval is finite and strictly positive;
    # everything else keeps the NaN the output was pre-filled with.
    usable = np.isfinite(step_s) & (step_s > 0)
    speed = np.full_like(step_s, np.nan, dtype="float64")
    np.divide(path_len, step_s, out=speed, where=usable)
    return speed

def _find_speed_peaks(v_smooth, t_ms, min_prominence, min_height, min_peak_distance_ms):
    """
    Return the indices of accepted local maxima in ``v_smooth``.

    An interior sample is accepted when it is finite, strictly greater than
    both immediate neighbours (a non-finite neighbour counts as -inf), at
    least ``min_height``, exceeds the higher neighbour by at least
    ``min_prominence``, and lies at least ``min_peak_distance_ms`` after the
    previously accepted peak according to ``t_ms``. Candidates are visited
    left to right, so of two peaks that are too close the earlier one wins.
    """
    def _or_floor(value):
        # Non-finite neighbours never block a candidate
        return value if np.isfinite(value) else -np.inf

    accepted = []
    previous_peak_t = -np.inf

    for idx in range(1, len(v_smooth) - 1):
        centre = v_smooth[idx]
        if not np.isfinite(centre) or centre < min_height:
            continue

        higher_neighbour = max(_or_floor(v_smooth[idx - 1]), _or_floor(v_smooth[idx + 1]))
        if centre <= higher_neighbour:
            continue  # not a strict local maximum
        if centre - higher_neighbour < min_prominence:
            continue
        if (t_ms[idx] - previous_peak_t) < min_peak_distance_ms:
            continue  # still inside the refractory window

        accepted.append(idx)
        previous_peak_t = t_ms[idx]

    return np.array(accepted, dtype=int)

def _count_submovements_speed_peaks(g,
                                    roll_win=ROLL_WIN_SAMPLES,
                                    min_prom=MIN_PROMINENCE_PX_S,
                                    min_height=MIN_PEAK_HEIGHT_PX_S,
                                    min_dist_ms=MIN_PEAK_DISTANCE_MS):
    """
    Count submovements in one group of samples as peaks of the smoothed speed.

    ``g`` must contain ``t_ms``, ``x`` and ``y``; it is not modified. Values
    are coerced to numeric, unusable rows are dropped and the rest is ordered
    by time. If ``g`` carries a ``speed_px_s`` column with at least one
    non-missing value it is reused, otherwise speed is recomputed from the
    positions.

    Returns a one-row DataFrame with ``submovements``, ``trial_duration_ms``
    (last minus first timestamp) and ``submovements_per_s`` (NaN when the
    duration is not positive). Fewer than three usable rows yield zero
    submovements.

    NOTE: the default thresholds are bound when this function is defined;
    reassigning the module-level constants later does not change them.
    """
    g = g.copy()

    # Coerce BEFORE sorting: sorting string timestamps would order them
    # lexicographically ("1000" < "200") and scramble the time series.
    for col in ("t_ms", "x", "y"):
        g[col] = pd.to_numeric(g[col], errors="coerce")
    g = (g.dropna(subset=["t_ms", "x", "y"])
          .sort_values("t_ms", kind="stable")   # stable: ties keep input order
          .reset_index(drop=True))

    if len(g) < 3:
        # Too few samples for an interior peak
        dur = float(g["t_ms"].iloc[-1] - g["t_ms"].iloc[0]) if len(g) else np.nan
        rate = 0.0 if (np.isfinite(dur) and dur > 0) else np.nan
        return pd.DataFrame({"submovements": [0],
                             "trial_duration_ms": [dur],
                             "submovements_per_s": [rate]})

    # Reuse existing speed if present; otherwise compute robustly
    if "speed_px_s" in g.columns and g["speed_px_s"].notna().any():
        v = g["speed_px_s"].to_numpy(dtype="float64")
        # Mask +/-inf from earlier steps. np.where builds a new array, so this
        # also works when to_numpy() hands back a read-only view.
        v = np.where(np.isfinite(v), v, np.nan)
    else:
        v = _compute_speed_px_s_aligned(g)

    t_ms = g["t_ms"].to_numpy(dtype="float64")
    v_s = _moving_avg(v, roll_win)
    peaks = _find_speed_peaks(v_s, t_ms,
                              min_prominence=min_prom,
                              min_height=min_height,
                              min_peak_distance_ms=min_dist_ms)
    submoves = int(peaks.size)
    dur = float(t_ms[-1] - t_ms[0])
    rate = submoves / (dur / 1000.0) if (np.isfinite(dur) and dur > 0) else np.nan

    return pd.DataFrame({"submovements": [submoves],
                         "trial_duration_ms": [dur],
                         "submovements_per_s": [rate]})

def submovements_per_trial_speed_peaks(df_mouse, group_cols=GROUPING):
    """
    Count submovements separately for every group defined by ``group_cols``.

    Returns a DataFrame with one row per group: the ``group_cols`` columns
    followed by ``submovements``, ``trial_duration_ms`` and
    ``submovements_per_s``. An input without any groups yields an empty frame
    with those columns.

    Raises
    ------
    KeyError
        If ``df_mouse`` lacks ``t_ms``, ``x``, ``y`` or a grouping column.
    """
    # A bare column name would otherwise be split into characters
    if isinstance(group_cols, str):
        group_cols = (group_cols,)
    keys = list(group_cols)

    needed = keys + [c for c in ("t_ms", "x", "y") if c not in keys]
    missing = [c for c in needed if c not in df_mouse.columns]
    if missing:
        raise KeyError(f"df_mouse is missing required columns: {missing}")

    result_cols = ["submovements", "trial_duration_ms", "submovements_per_s"]

    # Iterate the groups explicitly instead of groupby.apply: the group key is
    # attached to each result row here, so it cannot get lost, and errors
    # raised while processing a group propagate unchanged.
    pieces = []
    for key, trial in df_mouse.groupby(keys, dropna=False):
        key = key if isinstance(key, tuple) else (key,)
        row = _count_submovements_speed_peaks(trial)
        for pos, (col, val) in enumerate(zip(keys, key)):
            row.insert(pos, col, val)
        pieces.append(row)

    if not pieces:
        return pd.DataFrame(columns=keys + result_cols)
    return pd.concat(pieces, ignore_index=True)

def submovements_per_participant_speed_peaks(df_mouse, group_cols=GROUPING):
    """
    Compute per-group submovement counts and roll them up per participant.

    The participant column is ``participant_id`` when it is part of
    ``group_cols``, otherwise the first grouping column.

    Returns
    -------
    (per_trial, agg)
        ``per_trial`` is the result of ``submovements_per_trial_speed_peaks``.
        ``agg`` has one row per participant with ``total_submovements``,
        ``total_trial_time_ms`` and ``submovements_per_s``; the rate is NaN
        when the summed duration is not positive.
    """
    # A bare column name would otherwise make group_cols[0] a single character
    if isinstance(group_cols, str):
        group_cols = (group_cols,)

    per_trial = submovements_per_trial_speed_peaks(df_mouse, group_cols=group_cols)
    part_col = "participant_id" if "participant_id" in group_cols else group_cols[0]

    # dropna=False keeps rows with a missing participant key instead of
    # silently dropping them from the roll-up.
    agg = (per_trial.groupby(part_col, as_index=False, dropna=False)
                    .agg(total_submovements=("submovements", "sum"),
                         total_trial_time_ms=("trial_duration_ms", "sum")))

    # Avoid inf / 0-by-0 results for participants without any recorded duration
    total_s = agg["total_trial_time_ms"] / 1000.0
    agg["submovements_per_s"] = (agg["total_submovements"] / total_s).where(total_s > 0)
    return per_trial, agg



# Preview the per-trial submovement table.
# NOTE(review): `per_trial` is not assigned anywhere in the visible source;
# presumably it is the first value returned by
# submovements_per_participant_speed_peaks(...) — confirm that call runs first.
print("Per-trial (first 10 rows):")
print(per_trial.head(10))



Per-trial (first 10 rows):


NameError: name 'agg' is not defined