# Generalized Linear Models (GLMs) for Regression

Fit **Poisson** regression for daily conversions (counts) and **Gamma** regression for fulfillment cost (positive continuous).

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PoissonRegressor, GammaRegressor
from sklearn.model_selection import TimeSeriesSplit

!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Regression_Forecasting/reg_for_utils.py
import reg_for_utils as utils
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Regression_Forecasting/marketing_daily.csv"
warnings.filterwarnings("ignore")

df = pd.read_csv(csv_path, parse_dates=["date"]).sort_values("date")
train, test = utils.time_train_test_split(df, "date", test_days=90)

X_cols = ["search_spend","social_spend","display_spend","promo","price_index","temp_F","rain","is_weekend"]
X_train, X_test = train[X_cols], test[X_cols]

# Poisson (counts)
# Target: daily conversions, cast to float.
y_train_p = train["conversions"].astype(float)
y_test_p = test["conversions"].astype(float)

# Preprocessing: standardize the continuous drivers and one-hot encode the flag
# columns (drop="if_binary" keeps a single column for two-level features).
pre_p = ColumnTransformer(
    transformers=[
        (
            "num",
            StandardScaler(),
            ["search_spend", "social_spend", "display_spend", "price_index", "temp_F"],
        ),
        ("cat", OneHotEncoder(drop="if_binary"), ["promo", "rain", "is_weekend"]),
    ]
)

# Preprocess + L2-penalized Poisson GLM, fitted on the training window.
pois = Pipeline(steps=[("pre", pre_p), ("est", PoissonRegressor(alpha=0.1, max_iter=2000))])
pois.fit(X_train, y_train_p)

# Floor predictions at zero before scoring on the test window.
pred_p = np.maximum(0, pois.predict(X_test))
print("Poisson — MAE:", utils.mae(y_test_p, pred_p), "RMSE:", utils.rmse(y_test_p, pred_p))

# Gamma (positive continuous)
from sklearn.base import clone

# Floor the cost target at 1e-3 so every value handed to the Gamma model is strictly positive.
y_train_g = train["fulfillment_cost"].clip(lower=1e-3)
y_test_g = test["fulfillment_cost"].clip(lower=1e-3)

# Give the Gamma pipeline its own unfitted copy of the preprocessor.
# Assigning pre_p directly would put the *same* ColumnTransformer instance in both
# pipelines, so fitting one pipeline would refit the transformer used by the other.
pre_g = clone(pre_p)
gamma = Pipeline([("pre", pre_g), ("est", GammaRegressor(alpha=0.1, max_iter=4000))])
gamma.fit(X_train, y_train_g)

# Floor predictions at the same 1e-3 used for the target before scoring.
pred_g = np.maximum(1e-3, gamma.predict(X_test))
print("Gamma — MAE:", utils.mae(y_test_g, pred_g), "RMSE:", utils.rmse(y_test_g, pred_g))

> Note: GLMs model the **mean** via a link function and assume a distribution (Poisson/Gamma). We report MAE/RMSE for simplicity in this notebook.

### (Optional) Diagnostics
Diagnostics helper functions

In [None]:
# Common helpers for GLM diagnostics
import numpy as np, pandas as pd, matplotlib.pyplot as plt, warnings
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error

def rmse(y, yhat):
    """Return the root mean squared error between ``y`` and ``yhat`` as a Python float."""
    mse = mean_squared_error(y, yhat)
    return float(np.sqrt(mse))

def _final_est(pipe):
    """Return the step named "est" when ``pipe`` is a Pipeline; otherwise return ``pipe`` unchanged."""
    if not isinstance(pipe, Pipeline):
        return pipe
    return pipe.named_steps["est"]

def _preproc(pipe):
    """Return the step named "pre" when ``pipe`` is a Pipeline; otherwise return ``None``."""
    if not isinstance(pipe, Pipeline):
        return None
    return pipe.named_steps["pre"]

def fitted_mu(pipe, X):
    """Return the model's predicted mean (natural scale, mu) as a float NumPy array.

    ``pipe`` is anything exposing ``predict(X)``; its output is converted with
    ``np.asarray(..., dtype=float)``.
    """
    predictions = pipe.predict(X)
    return np.asarray(predictions, dtype=float)

def fitted_eta(pipe, X):
    """Return the linear predictor on the link scale: eta = X_t @ coef_ + intercept_.

    ``X_t`` is ``X`` passed through the pipeline's "pre" step when there is one;
    for a bare estimator the raw ``X.values`` are used instead.
    """
    estimator = _final_est(pipe)
    preprocessor = _preproc(pipe)
    if preprocessor is None:
        design = X.values
    else:
        design = preprocessor.transform(X)
    linear_part = design @ estimator.coef_
    return np.asarray(linear_part + estimator.intercept_, dtype=float)

def pearson_residuals(y, mu, family):
    """Return Pearson residuals ``(y - mu) / sqrt(V(mu))``.

    The variance function is ``V(mu) = mu`` for ``family="poisson"`` and
    ``V(mu) = mu**2`` for ``family="gamma"``. The variance is floored at 1e-12
    before the square root so the denominator is never zero.

    Raises:
        ValueError: if ``family`` is neither "poisson" nor "gamma".
    """
    if family not in ("poisson", "gamma"):
        raise ValueError("family must be 'poisson' or 'gamma'")

    variance = mu if family == "poisson" else mu**2
    with np.errstate(divide="ignore", invalid="ignore"):
        scale = np.sqrt(np.maximum(variance, 1e-12))
        return (y - mu) / scale

# Convenience: work on copies of the test-set frames so the diagnostics leave the originals untouched.
X_te, y_te_p, y_te_g = X_test.copy(), y_test_p.copy(), y_test_g.copy()

# Per model: fitted mean (natural scale) and linear predictor (log-link scale).
mu_p, eta_p = fitted_mu(pois, X_te), fitted_eta(pois, X_te)      # Poisson
mu_g, eta_g = fitted_mu(gamma, X_te), fitted_eta(gamma, X_te)    # Gamma

# Headline test-set errors, computed from the unclipped model means.
print(
    f"[Poisson]  MAE={mean_absolute_error(y_te_p, mu_p):.3f} | RMSE={rmse(y_te_p, mu_p):.3f}"
)
print(
    f"[Gamma]    MAE={mean_absolute_error(y_te_g, mu_g):.3f} | RMSE={rmse(y_te_g, mu_g):.3f}"
)

### Residual plots (natural scale & link scale)
- Natural-scale residuals should be centered around 0 without patterns.
- On the link scale, Pearson residuals should look like white noise across η; strong patterns suggest mis-specification or missing interactions.

In [None]:
# Residuals vs Fitted (natural & link scales)
def plot_residuals_glm(y, mu, eta, family, title_prefix):
    """Show two residual scatter plots for a fitted GLM.

    1. Response residuals ``y - mu`` against the fitted mean ``mu`` (natural scale).
    2. Pearson residuals against the linear predictor ``eta`` (link scale).

    ``family`` is forwarded to ``pearson_residuals``; ``title_prefix`` starts
    each plot title. Each figure is displayed with ``plt.show()``; nothing is returned.
    """

    def _scatter_about_zero(x, resid, xlabel, ylabel, title):
        # One scatter with a horizontal zero reference line.
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.scatter(x, resid, alpha=0.5)
        ax.axhline(0, color="k", lw=1)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        plt.tight_layout()
        plt.show()

    # Natural scale: raw response residuals against the fitted mean.
    _scatter_about_zero(
        mu,
        y - mu,
        "Fitted μ (natural scale)",
        "Response residual (y − μ)",
        f"{title_prefix} — Residuals vs μ (natural)",
    )

    # Link scale: variance-standardized residuals against the linear predictor.
    _scatter_about_zero(
        eta,
        pearson_residuals(y, mu, family),
        "Linear predictor η (link scale)",
        "Pearson residual",
        f"{title_prefix} — Pearson residuals vs η (link)",
    )

# Residual plots for the Poisson model (test set)
plot_residuals_glm(
    y=y_te_p.values,
    mu=mu_p,
    eta=eta_p,
    family="poisson",
    title_prefix="Poisson",
)
# Residual plots for the Gamma model (test set)
plot_residuals_glm(
    y=y_te_g.values,
    mu=mu_g,
    eta=eta_g,
    family="gamma",
    title_prefix="Gamma",
)

### Overdispersion check (Poisson only)
- φ̂ ≈ 1 → Poisson variance ≈ mean (assumption OK).
- φ̂ > 1 → overdispersion; consider adding features, using Negative Binomial, or robust SEs.
- In the plot, points far above the dashed line indicate overdispersion in those ranges of μ.

In [None]:
# Overdispersion diagnostics for Poisson
y = y_te_p.values
mu = mu_p
n = len(y)

# Approx. degrees of freedom = n − p, where p = number of fitted coefficients + 1 for the intercept.
# NOTE(review): y and mu are held-out test data here, so subtracting the training
# parameter count is only a rough convention; read φ̂ as an indicator, not an estimate.
p_params = _final_est(pois).coef_.size + 1
df_val = max(n - p_params, 1)  # guard against a zero or negative denominator

# Pearson chi-square / df_val (φ_hat): >1 indicates overdispersion, <1 underdispersion.
# μ is floored at 1e-12 so a zero fitted mean cannot divide by zero.
phi_hat = np.sum(((y - mu)**2) / np.maximum(mu, 1e-12)) / df_val
print(f"Poisson overdispersion factor (Pearson χ² / df_val): {phi_hat:.3f}  (≈1 is ideal; >1 overdispersion)")

# Mean–variance relationship: aggregate by quantile bins of the fitted mean
# (between 3 and 10 bins depending on n; duplicate bin edges are dropped).
bins = pd.qcut(mu, q=min(10, max(3, n//50)), duplicates="drop")
# observed=True: group only on bins that actually contain rows. The key is categorical,
# and relying on the implicit default for categorical keys is deprecated in recent pandas.
df_val_mv = (
    pd.DataFrame({"y": y, "mu": mu, "bin": bins})
    .groupby("bin", observed=True)
    .agg(mean_mu=("mu", "mean"), var_y=("y", "var"), mean_y=("y", "mean"))
    .reset_index(drop=True)
)

# Scatter of within-bin variance against within-bin fitted mean, with the
# Poisson reference line Var = Mean drawn across the observed range.
mean_lo, mean_hi = df_val_mv["mean_mu"].min(), df_val_mv["mean_mu"].max()
fig, ax = plt.subplots(figsize=(6,5))
ax.scatter(df_val_mv["mean_mu"], df_val_mv["var_y"], alpha=0.8)
ax.plot([mean_lo, mean_hi], [mean_lo, mean_hi], linestyle="--", label="Var = Mean (Poisson)")
ax.set_xlabel("Mean count (by bin)")
ax.set_ylabel("Observed variance (by bin)")
ax.set_title("Poisson mean–variance check")
ax.legend()
plt.tight_layout(); plt.show()

### Zero diagnostics
- If observed zeros ≫ expected zeros, your count process may be zero-inflated (consider Zero-Inflated Poisson/NB or adding a separate “structural zero” model).
- Gamma targets should be strictly positive; zeros imply a different model (e.g., Tweedie or two-part model).

In [None]:
# Poisson: for each row P(Y = 0 | μ) = exp(-μ); averaging over rows gives the
# model-implied zero rate, which is compared with the observed share of zeros.
obs_zero_rate_p = float(np.mean(y_te_p.values == 0))
exp_zero_rate_p = float(np.mean(np.exp(-mu_p)))
print(f"[Poisson] Observed zero rate = {obs_zero_rate_p:.3f} | Expected (model) zero rate = {exp_zero_rate_p:.3f}")

# Side-by-side bars on a fixed 0–1 axis so the two rates are directly comparable.
zero_rates = {
    "Observed zeros": obs_zero_rate_p,
    "Expected zeros (Poisson)": exp_zero_rate_p,
}
fig, ax = plt.subplots(figsize=(6,4))
ax.bar(list(zero_rates), list(zero_rates.values()))
ax.set_ylim(0, 1)
ax.set_title("Zero-rate comparison — Poisson")
plt.tight_layout()
plt.show()

# Gamma: the target should be strictly positive. Warn if it is not.
# Count on the RAW test target: y_te_g was clipped to >= 1e-3 when it was built,
# so checking the clipped series could never find a non-positive value.
zeros_in_gamma_target = int(np.sum(test["fulfillment_cost"].values <= 0))
print(f"[Gamma] Non-positive values in target (should be none): {zeros_in_gamma_target}")
if zeros_in_gamma_target > 0:
    # Warnings are silenced globally earlier in the notebook; re-enable them just
    # for this statement so the diagnostic is actually displayed.
    with warnings.catch_warnings():
        warnings.simplefilter("always")
        warnings.warn("Gamma regression expects strictly positive targets. Consider a shifted target or a different family.")

### Offset sanity (Poisson only; optional)
- With a log link, adding log(exposure) as an offset should multiplicatively scale μ by exposure. The quick check below validates your offset wiring.

In [None]:
# If you have an exposure column (e.g., 'impressions', 'sessions'), set it here.
OFFSET_COL = None  # e.g., "sessions"  (log-offset used)

if OFFSET_COL is not None and OFFSET_COL in X_te.columns:
    # Poisson with log link: log(μ) = Xβ + log(exposure).
    # sklearn takes no offset argument, so the linear predictor is computed first
    # and the offset is added on the link scale at prediction time.
    eta_no_offset = fitted_eta(pois, X_te)
    exposure = X_te[OFFSET_COL].values.astype(float)
    log_exposure = np.log(np.clip(exposure, 1e-12, None))  # floor avoids log(0)
    eta_with_offset = eta_no_offset + log_exposure

    mu_with_offset = np.exp(eta_with_offset)
    # Sanity: shifting the offset by +log(2) should double μ.
    # NOTE(review): the ratio is exp(log 2) by construction, so this only confirms
    # the arithmetic wiring, not that the fitted model uses exposure correctly.
    mu_with_offset_x2 = np.exp(eta_with_offset + np.log(2.0))

    ratio = np.median(mu_with_offset_x2 / np.maximum(mu_with_offset, 1e-12))
    print(f"Offset sanity: median(μ_with_offset * 2 / μ_with_offset) ≈ {ratio:.2f} (expect ~2.00)")
else:
    print("No OFFSET_COL provided or not in X_test; skipping offset sanity.")

### Segment errors (promo / rain / is_weekend)
- Consistent positive/negative residuals in segments (promo/rain/weekend) indicate systematic bias → add interactions, segment-specific terms, or re-specify model.

In [None]:
# Segment errors for Poisson & Gamma (by flags)
seg_cols = ["promo", "rain", "is_weekend"]

# One row per test observation: date, the segment flags, and for each model the
# observed target next to its fitted mean. Dates are looked up in df by X_te's index labels.
eval_df = pd.DataFrame({
    "date": df.loc[X_te.index, "date"].values,
    **{flag: X_te[flag].values for flag in seg_cols},
    "y_count": y_te_p.values,
    "mu_count": mu_p,
    "y_cost": y_te_g.values,
    "mu_cost": mu_g,
})

# Response residuals (observed minus fitted) for each model.
eval_df = eval_df.assign(
    resid_count=lambda d: d["y_count"] - d["mu_count"],
    resid_cost=lambda d: d["y_cost"] - d["mu_cost"],
)

# For every segment flag, draw one boxplot of residuals per flag level:
# first for the Poisson (count) model, then for the Gamma (cost) model.
for seg in seg_cols:
    if seg not in eval_df.columns:
        continue

    # Levels of the flag, computed once and reused for grouping and tick labels.
    levels = sorted(eval_df[seg].unique())
    tick_text = [str(v) for v in levels]

    for resid_col, ylabel, model_name in (
        ("resid_count", "Count residual (y − μ)", "Poisson"),
        ("resid_cost", "Cost residual (y − μ)", "Gamma"),
    ):
        samples = [eval_df.loc[eval_df[seg] == v, resid_col] for v in levels]
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.boxplot(samples)
        # Label the boxes after drawing: boxplot's `labels=` keyword is deprecated in
        # newer Matplotlib, while set_xticklabels works on old and new versions alike.
        ax.set_xticklabels(tick_text)
        ax.axhline(0, color="k", lw=1)
        ax.set_xlabel(seg)
        ax.set_ylabel(ylabel)
        ax.set_title(f"{model_name} residuals by {seg}")
        plt.tight_layout()
        plt.show()

# Optional: residuals over time to spot drift.
# For each model, plot the raw residual series with an 8-row rolling mean on top.
for tgt, col in (("Poisson", "resid_count"), ("Gamma", "resid_cost")):
    ts = (
        eval_df.dropna(subset=["date"])
        .sort_values("date")
        .assign(roll_mean_resid=lambda d: d[col].rolling(window=8, min_periods=1).mean())
    )

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(ts["date"], ts[col], alpha=0.25, label="residual")
    ax.plot(ts["date"], ts["roll_mean_resid"], lw=2, label="rolling mean (w=8)")
    ax.axhline(0, color="k", lw=1)
    ax.set(title=f"{tgt} residuals over time", xlabel="date", ylabel="residual")
    ax.legend()
    plt.tight_layout()
    plt.show()