#### **Overview**

**Steps:**
1. Select clustering features
2. Clean + impute missing values
3. Scale numeric columns
4. One-hot encode categorical columns
5. Vectorize topics
6. Combine features
7. Apply clustering (KMeans / GMM / HDBSCAN)

In [1]:
pip install hdbscan scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
import hdbscan

In [3]:
# Cluster the interaction records in two stages: HDBSCAN first (also flags noise
# as -1), then a Gaussian Mixture fitted on the non-noise rows for hard labels
# and soft membership probabilities. Results are written back onto `df_copy`.
#
# NOTE: `df_copy` is not created in this cell. It must already exist in the
# kernel (an earlier cell has to load/prepare it), otherwise this cell raises
# NameError on a fresh "Restart & Run All".

# ---- select features ----
num_features = [
    "interaction_duration_seconds",
    "speech_sentiment_score",
    "customer_satisfaction_score",
    "call_hour",
]

cat_features = [
    "call_direction",
    "call_channel",
    "call_dayofweek",
    "issue_resolved",
    "follow_up_required",
]

# Single source of truth for the number of mixture components; it drives the
# GMM itself, the width of the probability matrix and the output column names.
N_GMM_COMPONENTS = 4

df_model = df_copy[num_features + cat_features].copy()

# ---- preprocessing ----
# NOTE(review): no imputation is performed here -- confirm missing values are
# handled before this cell.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ],
    # Always return a dense array: GaussianMixture does not accept sparse input
    # and the boolean row masking below expects a NumPy array.
    sparse_threshold=0,
)

X = preprocessor.fit_transform(df_model)

# ---- HDBSCAN base clustering ----
hdb = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric="euclidean",
    prediction_data=True,
).fit(X)

df_copy["hdbscan_cluster"] = hdb.labels_

# ---- Gaussian Mixture refinement ----
# HDBSCAN labels noise points as -1; only the remaining rows are used for GMM.
valid_idx = df_copy["hdbscan_cluster"] != -1
# Plain NumPy mask for positional indexing into the NumPy arrays (X, probs).
valid_mask = valid_idx.to_numpy()
X_valid = X[valid_mask]

if X_valid.shape[0] < N_GMM_COMPONENTS:
    raise ValueError(
        f"Only {X_valid.shape[0]} non-noise rows after HDBSCAN; "
        f"need at least {N_GMM_COMPONENTS} to fit the Gaussian Mixture."
    )

gmm = GaussianMixture(
    n_components=N_GMM_COMPONENTS,
    covariance_type="full",
    random_state=42,
).fit(X_valid)

# Noise rows keep the sentinel label -1.
df_copy["gmm_cluster"] = -1
df_copy.loc[valid_idx, "gmm_cluster"] = gmm.predict(X_valid)

# ---- SOFT CLUSTER PROBABILITY ----
# Noise rows have no mixture membership, so their probabilities stay NaN.
probs = np.full((len(df_copy), N_GMM_COMPONENTS), np.nan)
probs[valid_mask] = gmm.predict_proba(X_valid)

for i in range(N_GMM_COMPONENTS):
    df_copy[f"gmm_prob_{i}"] = probs[:, i]

NameError: name 'df_copy' is not defined