In [2]:
# === NN Risk Summary: Definitions ===
import json, os, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# ----------------------------
# Utilities
# ----------------------------
def safe_write_json(obj, path):
    """Write JSON safely even if no directory portion exists."""
    dir_ = os.path.dirname(path)
    if dir_:
        os.makedirs(dir_, exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)

def set_seeds(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# ----------------------------
# Feature engineering
# ----------------------------
def prepare_features(df: pd.DataFrame):
    """
    Build robust numeric features from the patent events JSON:
      - Dates -> calendar features
      - linked_reviewers -> count + top-10 one-hots
      - reviewer_risk_contributions -> mean/sum/max aggregates
      - Categorical basics (patent_type, technology_domain, TC, source) -> one-hot
    Returns:
      X (numeric DataFrame), y_cont (continuous target), df_full (unaltered copy)
    """
    df = df.copy()

    # Ensure expected columns exist (as NaN if missing)
    base_cols = ["patent_type", "technology_domain", "TC", "source"]
    for col in base_cols:
        if col not in df.columns:
            df[col] = np.nan

    # Dates -> numeric calendar features
    if "event_date" in df.columns:
        dt = pd.to_datetime(df["event_date"], errors="coerce")
        df["event_year"] = dt.dt.year
        df["event_month"] = dt.dt.month
        df["event_dayofweek"] = dt.dt.dayofweek
        df["event_is_month_start"] = dt.dt.is_month_start.astype("float")
        df["event_is_month_end"] = dt.dt.is_month_end.astype("float")

    # linked_reviewers -> count + one-hot of top-10 reviewers
    if "linked_reviewers" in df.columns:
        df["num_reviewers"] = df["linked_reviewers"].apply(
            lambda x: len(x) if isinstance(x, (list, tuple)) else 0
        )
        all_rev = []
        for v in df["linked_reviewers"]:
            if isinstance(v, (list, tuple)):
                all_rev.extend(v)
        top_reviewers = pd.Series(all_rev).value_counts().head(10).index.tolist() if len(all_rev) else []
        for rid in top_reviewers:
            df[f"reviewer_{rid}"] = df["linked_reviewers"].apply(
                lambda x: 1.0 if isinstance(x, (list, tuple)) and rid in x else 0.0
            )

    # reviewer_risk_contributions -> numeric aggregates
    def _agg_reviewer_metrics(lst, key, fn):
        if not isinstance(lst, list) or len(lst) == 0:
            return np.nan
        vals = [d.get(key) for d in lst if isinstance(d, dict) and pd.notna(d.get(key))]
        if not vals:
            return np.nan
        if fn == "mean":
            return float(np.mean(vals))
        if fn == "sum":
            return float(np.sum(vals))
        if fn == "max":
            return float(np.max(vals))
        return np.nan

    if "reviewer_risk_contributions" in df.columns:
        for k in ["current_rate", "past_rate", "consistency_score", "risk_contribution"]:
            df[f"rrc_{k}_mean"] = df["reviewer_risk_contributions"].apply(lambda x: _agg_reviewer_metrics(x, k, "mean"))
            df[f"rrc_{k}_sum"]  = df["reviewer_risk_contributions"].apply(lambda x: _agg_reviewer_metrics(x, k, "sum"))
            df[f"rrc_{k}_max"]  = df["reviewer_risk_contributions"].apply(lambda x: _agg_reviewer_metrics(x, k, "max"))

    # Target: composite_risk_score (continuous in [0,1])
    if "composite_risk_score" not in df.columns:
        raise ValueError("Missing composite_risk_score in data.")
    y_cont = df["composite_risk_score"].astype(float)

    # Build feature frame
    num_cols = [
        "event_year", "event_month", "event_dayofweek", "event_is_month_start", "event_is_month_end",
        "num_reviewers"
    ]
    num_cols += [c for c in df.columns if c.startswith("reviewer_")]
    num_cols += [c for c in df.columns if c.startswith("rrc_")]

    cat_cols = [c for c in base_cols if c in df.columns]
    existing_num_cols = [c for c in num_cols if c in df.columns]

    X = df[existing_num_cols + cat_cols].copy()

    # One-hot for categoricals (drop_first to reduce collinearity; include NaN category)
    X = pd.get_dummies(X, columns=cat_cols, dummy_na=True, drop_first=True)

    # Strictly numeric
    X = X.apply(pd.to_numeric, errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan)

    # Drop columns that are entirely NaN
    all_nan_cols = [c for c in X.columns if X[c].isna().all()]
    if all_nan_cols:
        X = X.drop(columns=all_nan_cols)

    return X, y_cont, df  # df is the full original for summarization

# ----------------------------
# Model (Regressor)
# ----------------------------
class RiskRegressor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)  # predict composite_risk_score âˆˆ [0,1]
        )
    def forward(self, x):
        return self.net(x)

def train_risk_regressor(X_train_df, y_train_cont, X_val_df, y_val_cont, epochs=120, lr=1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Regressor] Using device: {device}")

    # Safety / impute / align
    for _df in [X_train_df, X_val_df]:
        _df.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_train_df = X_train_df.fillna(X_train_df.median(numeric_only=True))
    X_val_df   = X_val_df.fillna(X_val_df.median(numeric_only=True))

    all_nan_cols = list(set(
        [c for c in X_train_df.columns if X_train_df[c].isna().all()] +
        [c for c in X_val_df.columns   if X_val_df[c].isna().all()]
    ))
    if all_nan_cols:
        X_train_df.drop(columns=all_nan_cols, inplace=True, errors="ignore")
        X_val_df.drop(columns=all_nan_cols, inplace=True, errors="ignore")

    # Align columns between train and val
    X_val_df = X_val_df.reindex(columns=X_train_df.columns, fill_value=0)

    X_train = torch.as_tensor(X_train_df.values.astype(np.float32), device=device)
    X_val   = torch.as_tensor(X_val_df.values.astype(np.float32),   device=device)
    y_train = torch.as_tensor(np.asarray(y_train_cont, dtype=np.float32).reshape(-1, 1), device=device)
    y_val   = torch.as_tensor(np.asarray(y_val_cont,   dtype=np.float32).reshape(-1, 1), device=device)

    model = RiskRegressor(input_size=X_train_df.shape[1]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val = float("inf")
    best_state = None

    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        pred = model(X_train)
        loss = criterion(pred, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 20 == 0:
            model.eval()
            with torch.no_grad():
                vpred = model(X_val)
                vloss = criterion(vpred, y_val).item()
            print(f"[Regressor] Epoch {epoch:03d}/{epochs} | Train MSE: {loss.item():.5f} | Val MSE: {vloss:.5f}")
            if vloss < best_val:
                best_val = vloss
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    print("[Regressor] Training completed. Best Val MSE:", best_val)
    return model, X_train_df.columns.tolist()

def predict_patent_risk(model, X_df, feature_order):
    """Return clamped predictions in [0,1] for each row in X_df using feature_order."""
    X_use = X_df.reindex(columns=feature_order, fill_value=0).copy()
    X_use.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_use = X_use.fillna(X_use.median(numeric_only=True))
    device = next(model.parameters()).device
    with torch.no_grad():
        pred = model(torch.as_tensor(X_use.values.astype(np.float32), device=device)).cpu().numpy().ravel()
    pred = np.clip(pred, 0.0, 1.0)
    return pred

# ----------------------------
# Summaries (Model-driven)
# ----------------------------
def summarize_from_model_predictions(df_full: pd.DataFrame, y_pred: np.ndarray, top_k=5, high_cut=0.66):
    """
    Build the requested JSON using ONLY model predictions.
    TC overall risk = weighted combination of mean predicted risk, pct of 'high' predicted,
                      and reviewer-linked mean predicted risk.
    """
    dfx = df_full.copy()
    dfx["pred_score"] = y_pred
    dfx["pred_is_high"] = dfx["pred_score"] > high_cut

    # Reviewer appearances (prefer reviewer_risk_contributions; fallback to linked_reviewers)
    rows = []
    for _, r in dfx.iterrows():
        rlist = r.get("reviewer_risk_contributions", [])
        rid_list = []
        if isinstance(rlist, list) and len(rlist) > 0:
            rid_list = [rr.get("reviewer_id") for rr in rlist if isinstance(rr, dict)]
        elif isinstance(r.get("linked_reviewers"), list):
            rid_list = r.get("linked_reviewers")
        for rid in rid_list:
            rows.append({
                "reviewer_id": rid,
                "TC": r.get("TC"),
                "pred_score": r.get("pred_score", np.nan)
            })
    rev_df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=["reviewer_id", "TC", "pred_score"])

    # TC-level aggregates (model-driven)
    tc_grp = dfx.groupby("TC", dropna=False).agg(
        mean_pred=("pred_score", "mean"),
        pct_high_pred=("pred_is_high", "mean"),
        n_events=("patent_id", "count")
    )
    if not rev_df.empty:
        tc_rev = rev_df.groupby("TC", dropna=False)["pred_score"].mean().rename("reviewer_pred_in_tc")
        tc_grp = tc_grp.join(tc_rev, how="left")
    else:
        tc_grp["reviewer_pred_in_tc"] = np.nan
    tc_grp[["reviewer_pred_in_tc"]] = tc_grp[["reviewer_pred_in_tc"]].fillna(0.0)

    # Transparent weights (tune later or learn)
    w_mean, w_pct, w_rev = 0.7, 0.2, 0.1
    tc_grp["overall_tc_risk"] = (
        w_mean * tc_grp["mean_pred"] +
        w_pct  * tc_grp["pct_high_pred"] +
        w_rev  * tc_grp["reviewer_pred_in_tc"]
    )

    tc_ranked = tc_grp.sort_values("overall_tc_risk", ascending=False).reset_index()

    # Build TC items with top reviewers (ranked by avg pred score in that TC)
    out_tcs = []
    for _, row in tc_ranked.head(top_k).iterrows():
        tc = row["TC"]
        if not rev_df.empty and pd.notna(tc):
            r_in_tc = rev_df[rev_df["TC"] == tc].groupby("reviewer_id")["pred_score"].mean().reset_index()
            r_in_tc = r_in_tc.sort_values("pred_score", ascending=False).head(5)
            top_reviewers_json = [
                {"reviewer_id": rid, "reviewer_overall_model": float(score)}
                for rid, score in zip(r_in_tc["reviewer_id"], r_in_tc["pred_score"])
            ]
        else:
            top_reviewers_json = []

        out_tcs.append({
            "TC": None if pd.isna(tc) else tc,
            "overall_risk": float(row["overall_tc_risk"]),
            "decided_based_on": {
                "mean_predicted_patent_risk": float(row["mean_pred"]),
                "pct_high_predicted_patents": float(row["pct_high_pred"]),
                "reviewer_pred_in_tc": float(row["reviewer_pred_in_tc"]),
                "weights": {"mean_pred": 0.7, "pct_high_pred": 0.2, "reviewer_pred": 0.1},
                "n_events": int(row["n_events"]),
            },
            "top_reviewers_in_tc": top_reviewers_json
        })

    # Reviewer-level summary
    out_reviewers = []
    if not rev_df.empty:
        rsum = rev_df.groupby("reviewer_id").agg(
            reviewer_overall_model=("pred_score", "mean"),
            n_events=("pred_score", "count")
        ).reset_index().sort_values("reviewer_overall_model", ascending=False)
        for _, r in rsum.head(top_k).iterrows():
            out_reviewers.append({
                "reviewer_id": r["reviewer_id"],
                "reviewer_overall": float(r["reviewer_overall_model"]),
                "decided_based_on": {
                    "mean_predicted_patent_risk": float(r["reviewer_overall_model"]),
                    "n_events": int(r["n_events"])
                }
            })

    return {
        "highest_risk_tcs": out_tcs,
        "highest_risk_reviewers": out_reviewers
    }


In [None]:
# === NN Risk Summary: Run Workflow ===

# ---- Configuration ----
args = {
    "input": "patent_events.json",            # path to your JSON file
    "out": "model_summary_high_risk.json",    # output file path
    "top_k": 5,                               # number of top TCs/reviewers to report
    "epochs": 120,                            # training epochs
    "lr": 1e-3,                               # learning rate
    "high_cut": 0.35,                         # threshold for "predicted high" patent
    "seed": 42                                # random seed
}

# ---- Pipeline ----
set_seeds(args["seed"])

# Load data
with open(args["input"], "r") as f:
    data = json.load(f)
df = pd.DataFrame(data)

# Prepare features/target
X, y_cont, df_full = prepare_features(df)

# Align y with X rows (in case of prior filtering); reset indices
y_cont = y_cont.iloc[X.index].reset_index(drop=True)
X = X.reset_index(drop=True)

# Train/val split (deterministic 80/20 with the given seed)
n = len(X)
if n < 5:
    raise ValueError(f"Not enough rows to train a model (found {n}).")
idx = np.arange(n)
rng = np.random.default_rng(args["seed"])
rng.shuffle(idx)
split = int(0.8 * n)
tr_idx, va_idx = idx[:split], idx[split:]
X_train_df, X_val_df = X.iloc[tr_idx].copy(), X.iloc[va_idx].copy()
y_train_cont, y_val_cont = y_cont.iloc[tr_idx].copy(), y_cont.iloc[va_idx].copy()

# Train regressor
model_reg, feat_order_reg = train_risk_regressor(
    X_train_df, y_train_cont, X_val_df, y_val_cont,
    epochs=args["epochs"], lr=args["lr"]
)

# Predict on ALL rows (current snapshot)
y_pred_all = predict_patent_risk(model_reg, X, feat_order_reg)

# Build the model-driven summary
summary_json = summarize_from_model_predictions(
    df_full, y_pred_all, top_k=args["top_k"], high_cut=args["high_cut"]
)

# Save + display
safe_write_json(summary_json, args["out"])
print("\n=== Model-driven High-Risk Summary ===")
print(json.dumps(summary_json, indent=2))
print(f"\nSaved summary to: {args['out']}")


[Regressor] Using device: cpu
[Regressor] Epoch 020/120 | Train MSE: 61.07678 | Val MSE: 12.12106
[Regressor] Epoch 040/120 | Train MSE: 16.66831 | Val MSE: 0.83267
[Regressor] Epoch 060/120 | Train MSE: 10.77893 | Val MSE: 0.02783
[Regressor] Epoch 080/120 | Train MSE: 5.49794 | Val MSE: 0.15558
[Regressor] Epoch 100/120 | Train MSE: 3.53892 | Val MSE: 0.07307
[Regressor] Epoch 120/120 | Train MSE: 2.66921 | Val MSE: 0.06928
[Regressor] Training completed. Best Val MSE: 0.027831001207232475

=== Model-driven High-Risk Summary ===
{
  "highest_risk_tcs": [
    {
      "TC": "2190",
      "overall_risk": 0.11810915110824265,
      "decided_based_on": {
        "mean_predicted_patent_risk": 0.14774490892887115,
        "pct_high_predicted_patents": 0.0,
        "reviewer_pred_in_tc": 0.14687716348148952,
        "weights": {
          "mean_pred": 0.7,
          "pct_high_pred": 0.2,
          "reviewer_pred": 0.1
        },
        "n_events": 56
      },
      "top_reviewers_in_tc": [
