## Step 6 – Two approaches: Logistic regression & K-means

This notebook:
- **Logistic regression**: Predict interface version (A vs B) from session-level behavioral features.
- **K-means clustering**: Group sessions by behavior (unsupervised).

For each approach, we evaluate **model performance**, **feature importance** (reported as a ranking of the features), and **failure cases**.

Deliverables: **results summary**, **feature importance ranking**.

### 1. Load or build session-level dataset

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, silhouette_score, silhouette_samples,
)

BASE = Path("..")

# Prefer the precomputed session-level metrics; otherwise derive them from the raw event logs.
session_path = BASE / "session_level_metrics.csv"
if session_path.exists():
    session_level = pd.read_csv(session_path)
else:
    logs = pd.read_csv(BASE / "logs.csv")
    sessions = pd.read_csv(BASE / "sessions.csv")
    logs["timestamp"] = pd.to_datetime(logs["timestamp"])
    logs = logs.sort_values(["session_id", "timestamp"])

    # Previous event within the same session; the first event of a session has no predecessor.
    logs["prev_timestamp"] = logs.groupby("session_id")["timestamp"].shift(1)
    logs["gap_sec"] = (logs["timestamp"] - logs["prev_timestamp"]).dt.total_seconds()
    logs["prev_action"] = logs.groupby("session_id")["action_type"].shift(1)
    logs["prev_element"] = logs.groupby("session_id")["element_id"].shift(1)
    # A repetition is the same action on the same element as the immediately preceding event.
    logs["is_repetition"] = (logs["action_type"] == logs["prev_action"]) & (logs["element_id"] == logs["prev_element"])
    logs["action_element"] = logs["action_type"].astype(str) + "_" + logs["element_id"].astype(str)

    # One groupby reused for every per-session aggregate; each result is indexed by sorted
    # session_id, which is what makes the positional `.values` below line up.
    by_session = logs.groupby("session_id")
    session_duration = (by_session["timestamp"].max() - by_session["timestamp"].min()).dt.total_seconds()
    n_actions = by_session.size()
    # A zero-length session would divide by zero; leave NaN so it is imputed further down.
    duration_min = (session_duration / 60).replace(0, np.nan)
    session_level = pd.DataFrame({
        "session_id": n_actions.index,
        "usage_intensity": n_actions.values,
        "session_duration_sec": session_duration.values,
        "actions_per_minute": (n_actions / duration_min).values,
        "error_proportion": by_session["error_flag"].mean().values,
        "hesitation_time_sec": by_session["gap_sec"].mean().values,
        "action_diversity": by_session["action_element"].nunique().values,
        "repetition_index": by_session["is_repetition"].mean().values,
    })
    # De-duplicate session metadata so the left merge cannot fan out into repeated sessions.
    sessions_clean = sessions.drop_duplicates(subset=["session_id"])
    session_level = session_level.merge(sessions_clean, on="session_id", how="left")

feature_cols = ["usage_intensity", "error_proportion", "hesitation_time_sec", "action_diversity", "repetition_index"]
if "actions_per_minute" in session_level.columns:
    feature_cols.append("actions_per_minute")

df = session_level.copy()
# Map labels explicitly (B=1, A=0). Any other value -- including NaN left behind by the
# left merge -- maps to NaN and is dropped below, instead of being silently counted as A.
df["target"] = df["interface_version"].map({"A": 0, "B": 1})
n_unlabeled = int(df["target"].isna().sum())

# Median-impute missing feature values.
# NOTE(review): medians are computed over all sessions, i.e. before the train/test split,
# so held-out rows influence the imputed values -- confirm this is acceptable.
for c in feature_cols:
    if c in df.columns and df[c].isna().any():
        df[c] = df[c].fillna(df[c].median())

df = df.dropna(subset=feature_cols + ["target"]).reset_index(drop=True)
df["target"] = df["target"].astype(int)  # safe now: unlabeled rows are gone
if n_unlabeled:
    print(f"Dropped {n_unlabeled} sessions whose interface_version is not 'A' or 'B'")
print(f"Sessions with complete features and target: {len(df)}")
df[feature_cols + ["interface_version", "target"]].head()

### 2. Train/test split and scaling

In [None]:
X = df[feature_cols]
y = df["target"]

# Stratify so both splits keep the A/B proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Scaler for the supervised model: fitted on the training split only, then applied to the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# K-means uses every session, so it gets its own scaler. Re-fitting `scaler` here would
# overwrite the train-only statistics and leave it fitted on test rows as well.
scaler_full = StandardScaler()
X_s = scaler_full.fit_transform(X)  # full data for K-means
print("Train size:", len(X_train), "Test size:", len(X_test))

### 3. Approach 1: Logistic regression

In [None]:
# Fit the classifier on the standardized training split and score the held-out split.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_s, y_train)
y_pred_lr = lr.predict(X_test_s)
y_proba_lr = lr.predict_proba(X_test_s)[:, 1]  # probability of class 1 (interface B)

acc = accuracy_score(y_test, y_pred_lr)
# zero_division=0 makes these metrics return 0 rather than warn when a denominator is empty.
prec, rec, f1 = (
    metric(y_test, y_pred_lr, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
# ROC-AUC needs both classes in y_test; otherwise fall back to the chance-level value.
if len(np.unique(y_test)) > 1:
    auc = roc_auc_score(y_test, y_proba_lr)
else:
    auc = 0.5

report_lines = [
    "Logistic regression – Performance (test set):",
    f"  Accuracy:  {acc:.4f}",
    f"  Precision: {prec:.4f}",
    f"  Recall:   {rec:.4f}",
    f"  F1:       {f1:.4f}",
    f"  ROC-AUC:  {auc:.4f}",
]
print("\n".join(report_lines))
print("\nConfusion matrix (test):")
print(confusion_matrix(y_test, y_pred_lr))

In [None]:
# Rank features by the magnitude of their logistic-regression coefficient.
# Magnitudes are comparable across features because the model was fit on standardized inputs.
coef = (
    pd.DataFrame({"feature": feature_cols, "coefficient": lr.coef_[0]})
    .assign(abs_coef=lambda table: np.abs(table["coefficient"]))
    .sort_values("abs_coef", ascending=False)
    .reset_index(drop=True)
)
coef["rank"] = range(1, len(coef) + 1)  # 1 = largest absolute coefficient
print("Logistic regression – Feature importance (coefficients on standardized features):")
coef[["rank", "feature", "coefficient", "abs_coef"]]

In [None]:
# Logistic-regression failure cases: test sessions whose predicted interface differs from the true one.
test_idx = X_test.index
df_test = df.loc[test_idx].copy()
# y_pred_lr is in X_test row order, which df.loc[test_idx] preserves, so positional assignment lines up.
df_test["predicted"] = y_pred_lr
df_test["predicted_label"] = df_test["predicted"].map({0: "A", 1: "B"})
df_test["correct"] = df_test["target"].eq(df_test["predicted"])
failures_lr = df_test.loc[~df_test["correct"]].copy()

print(f"Failure cases (misclassified): {len(failures_lr)} out of {len(df_test)} test sessions")
if len(failures_lr) == 0:
    print("No misclassifications.")
else:
    shown_features = [name for name in feature_cols if name in failures_lr.columns]
    cols_show = ["session_id", "interface_version", "predicted_label"] + shown_features
    display(failures_lr[cols_show].head(10))

### 4. Approach 2: K-means clustering

In [None]:
# Cluster all sessions on the standardized features.
k = 2  # match A/B; can try 3–4 and compare silhouette
km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_s)
labels_km = km.labels_

# Silhouette: the overall mean, plus per-session values.
sil_avg = silhouette_score(X_s, labels_km)
sil_samples = silhouette_samples(X_s, labels_km)

cluster_sizes = pd.Series(labels_km).value_counts().sort_index()

print(f"K-means (k={k}) – Performance:")
print(f"  Inertia: {km.inertia_:.2f}")
print(f"  Silhouette score (avg): {sil_avg:.4f}")
print("\nCluster sizes:")
print(cluster_sizes)

In [None]:
# K-means feature ranking: a feature whose centroid values are far apart (in standardized
# units) contributes more to separating the clusters.
centroids = pd.DataFrame(km.cluster_centers_, columns=feature_cols)
centroid_values = centroids[feature_cols]
centroid_range = centroid_values.max() - centroid_values.min()  # per-feature spread across centroids
centroids["cluster"] = range(k)

feat_importance_km = pd.DataFrame(
    {"feature": feature_cols, "range_across_centroids": centroid_range.values}
)
feat_importance_km = feat_importance_km.sort_values("range_across_centroids", ascending=False)
feat_importance_km["rank"] = range(1, len(feat_importance_km) + 1)  # 1 = widest spread
print("K-means – Feature importance ranking (range of centroid values; larger = more separation):")
feat_importance_km

In [None]:
# K-means "failure cases": the sessions with the lowest silhouette values, treated here as
# the most ambiguous cluster assignments.
df["cluster"] = labels_km
df["silhouette"] = sil_samples

# Report about 10% of the sessions, but never fewer than 1 or more than 10.
ten_percent = int(0.1 * len(df))
n_fail = min(10, max(1, ten_percent))
report_cols = ["session_id", "cluster", "silhouette"] + feature_cols
failures_km = df.nsmallest(n_fail, "silhouette")[report_cols]

print(f"Failure cases (lowest silhouette, ambiguous assignment): top {n_fail} sessions")
display(failures_km)

### 5. Results summary

In [None]:
# One row per approach. Metrics that do not apply to an approach are left as NaN:
# classification metrics for K-means, clustering metrics for logistic regression.
results_summary = pd.DataFrame([
    {
        "Approach": "Logistic regression",
        "Task": "Predict interface_version (A vs B)",
        "Accuracy": round(acc, 4),
        "F1": round(f1, 4),
        "ROC-AUC": round(auc, 4),
        "Failure_cases": len(failures_lr),
        "Failure_note": "Misclassified test sessions",
        "Silhouette": np.nan,
        "Inertia": np.nan,
    },
    {
        # Derive the label from k so the table stays truthful if k is changed in the clustering cell.
        "Approach": f"K-means (k={k})",
        "Task": "Cluster sessions by behavior",
        "Accuracy": np.nan,
        "F1": np.nan,
        "ROC-AUC": np.nan,
        "Failure_cases": n_fail,
        "Failure_note": "Lowest silhouette (ambiguous)",
        "Silhouette": round(sil_avg, 4),
        "Inertia": round(km.inertia_, 2),
    },
])
results_summary

### 6. Feature importance ranking (combined)

In [None]:
# Side-by-side feature ranking from both approaches, one row per feature.
lr_rank = coef[["feature", "rank", "coefficient"]].rename(columns={"rank": "LR_rank", "coefficient": "LR_coef"})
km_rank = feat_importance_km[["feature", "rank", "range_across_centroids"]].rename(
    columns={"rank": "KMeans_rank", "range_across_centroids": "KMeans_centroid_range"}
)
# An outer merge returns rows ordered alphabetically by the join key, which hides the ranking.
# Re-sort by the logistic-regression rank so the table reads top-down as a ranking.
importance_ranking = (
    lr_rank.merge(km_rank, on="feature", how="outer")
    .sort_values("LR_rank")
    .reset_index(drop=True)
)
importance_ranking