In [5]:
!pip install --quiet scikit-learn



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# !pip install -q scikit-learn

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# --- Load data ---
df = pd.read_csv("Silver/events_clean.csv")

# --- Build a numeric Series of counts called `events` ---
if "EVENTS" in df.columns:
    events = pd.to_numeric(df["EVENTS"], errors="coerce")
else:
    org_col_candidates = [
        c
        for c in df.columns
        if c.upper() in {"ORGANIZATION_ID", "ORG_ID", "ACCOUNT_ID", "CUSTOMER_ID"}
    ]
    if not org_col_candidates:
        raise ValueError(
            "Could not find an organization-id column or an 'EVENTS' column."
        )
    org_col = org_col_candidates[0]
    events = df.groupby(org_col, dropna=False).size().rename("EVENTS").astype("Int64")

# clean up: drop NaNs/negatives, convert to int
events = events.dropna()
events = events[events >= 0].astype(int)

# --- Log-KMeans on counts (handles heavy skew) ---
X = np.log1p(events.to_numpy().reshape(-1, 1))
kmeans = KMeans(n_clusters=3, random_state=42, n_init=50)
labels = kmeans.fit_predict(X)

# order clusters by raw-scale centers => weak < engaged < very engaged
centers_raw = np.expm1(kmeans.cluster_centers_.ravel())
order = np.argsort(centers_raw)
label_map = {order[0]: "Weakly engaged", order[1]: "Engaged", order[2]: "Very engaged"}

seg = pd.Series(labels, index=events.index).map(label_map)

# --- Cut points on RAW counts ---
weak_max = int(events[seg == "Weakly engaged"].max())
engaged_max = int(events[seg == "Engaged"].max())

thresholds = {
    "Weakly engaged": f"EVENTS ≤ {weak_max}",
    "Engaged": f"{weak_max+1} ≤ EVENTS ≤ {engaged_max}",
    "Very engaged": f"EVENTS > {engaged_max}",
}

print("Cluster centers (raw scale):", np.round(centers_raw, 2))
print("Thresholds:", thresholds)

# Optional: save per-org segments
out = pd.DataFrame({"EVENTS": events, "SEGMENT": seg})
out.to_csv("engagement_segments.csv", index=True)

Cluster centers (raw scale): [321.66   2.44  20.24]
Thresholds: {'Weakly engaged': 'EVENTS ≤ 7', 'Engaged': '8 ≤ EVENTS ≤ 78', 'Very engaged': 'EVENTS > 78'}
