# Motor progression in PPMI – MDS‑UPDRS Part III
---
**Run date:** {{ 2025‑05‑05 }}  
**PPMI data:** {{ data freeze 2025‑03‑21 }}  
**Commit:** {{ abc1234 }}

## Objectives
1. Quantify baseline group differences (PD vs HC).
2. Estimate individual‑level progression rates (random‑slope mixed model).
3. Identify the most influential motor items via factor analysis & SHAP.


In [1]:
# ⬛ Cell 1 – Config & imports (only the CONFIG block changed)
# ───────────────────────────────────────────────────────────
import pathlib, warnings, datetime, json, re
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from factor_analyzer import FactorAnalyzer
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import shap

warnings.filterwarnings("ignore")
plt.rcParams["figure.dpi"] = 120

CONFIG = {
    "data_path" : pathlib.Path("/Users/larsheijnen/Thesis/data/motor/MDS-UPDRS_Part_III_21Mar2025.csv"),
    "id_col"    : "PATNO",
    "time_col"  : "EVENT_ID",
    "group_col" : "COHORT",
    # Leave score_cols =None – we will auto‑detect them in Cell 2a
    "score_cols": None,
    # Regex pattern for Part III item columns (edit to fit your header names)
    "score_pattern": r"^(NP3_|P3_)\d+$",
    "min_items_per_visit": 25,
}

In [2]:
# ⬛ Cell 2a – Auto‑detect Part III item columns
# ─────────────────────────────────────────────
df_raw = pd.read_csv(CONFIG["data_path"])
print(f"{df_raw.shape[0]:,} rows × {df_raw.shape[1]} columns loaded")

# Identify columns whose names match the regex pattern
item_cols = [c for c in df_raw.columns if re.match(CONFIG["score_pattern"], c)]
assert item_cols, "No columns matched CONFIG['score_pattern']; tweak the regex."
CONFIG["score_cols"] = item_cols
print(f"Detected {len(item_cols)} Part III item columns.")

32,346 rows × 63 columns loaded


AssertionError: No columns matched CONFIG['score_pattern']; tweak the regex.

In [None]:
# Cell 3 – Data overview
# ──────────────────────
display(df.head())

print("\nUnique subjects :", df[CONFIG["id_col"]].nunique())
print("Visits / subject:", df.groupby(CONFIG["id_col"])[CONFIG["time_col"]].nunique().describe().round(2))

# Simple data dictionary
ddict = (
    df.dtypes.to_frame("dtype")
      .assign(n_missing=df.isna().sum(),
              pct_missing=lambda d: d["n_missing"].div(len(df)).round(3))
)
display(ddict.head(15))

In [None]:
# Cell 4 – Missingness visualisation
# ──────────────────────────────────
msno.matrix(df[CONFIG["score_cols"]], figsize=(9,4))
plt.title("Missingness pattern – MDS‑UPDRS Part III items")
plt.show()

In [None]:
# Cell 5 – Data cleaning & helpers
# ────────────────────────────────
def clean_part3(df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    """Keep valid score range, drop visits with excessive missingness, cast dtypes."""
    score_df = df.copy()
    # Clamp impossible values
    for col in cfg["score_cols"]:
        score_df.loc[(score_df[col] < 0) | (score_df[col] > 4), col] = np.nan
    # Drop visits w/ too many NaNs
    mask = score_df[cfg["score_cols"]].isna().sum(axis=1) <= (34 - cfg["min_items_per_visit"])
    score_df = score_df.loc[mask].reset_index(drop=True)
    return score_df

df = clean_part3(df, CONFIG)
print("After cleaning:", df.shape)