#### **Overview**

**Steps:**
1. Select clustering features
2. Clean + impute missing values
3. Scale numeric columns
4. One-hot encode categorical columns
5. Vectorize topics
6. Combine features
7. Apply clustering (KMeans / GMM / HDBSCAN)

In [8]:
# pip install hdbscan scikit-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
import hdbscan

In [9]:
df_copy = pd.read_csv(r"C:\Users\pc\Desktop\Pro_Jets\CC EDA&ML\EDA notebook\df_copy.csv")
df_copy.head()

Unnamed: 0,interaction_id,customer_id,agent_id,interaction_datetime,interaction_duration_seconds,call_direction,call_channel,call_status,customer_satisfaction_score,speech_sentiment_score,...,issue_resolved,follow_up_required,follow_up_due_date,language,customer_feedback_text,agent_notes,call_hour,call_dayofweek,csat_band,hour
0,INT00001,CUST00001,AGT0001,2024-01-01 09:10:13,415,inbound,phone,completed,4.7,0.82,...,True,False,,en,"Thank you for your help, great service.",Customer called regarding password reset. Issu...,9,Monday,4-5,9
1,INT00002,CUST00002,AGT0002,2024-01-01 11:24:50,23,outbound,phone,dropped,,,...,False,False,,en,,Call dropped instantly. No customer response.,11,Monday,,11
2,INT00003,CUST00003,AGT0003,2024-01-01 13:32:05,198,inbound,chat,completed,4.1,0.63,...,True,False,,es,"Gracias, todo bien.",Customer requested recent statements. Provided...,13,Monday,4-5,13
3,INT00004,CUST00001,AGT0004,2024-01-02 10:45:14,37,inbound,phone,abandoned,0.0,-0.95,...,False,False,,en,,Caller disconnected before an agent could answer.,10,Tuesday,,10
4,INT00005,CUST00004,AGT0001,2024-01-02 15:08:55,720,inbound,phone,completed,4.5,0.91,...,True,False,,en,Resolved my issue quickly.,Customer reported card decline online. Walked ...,15,Tuesday,4-5,15


#### **Clustering Pipeline**

In [11]:
# ---- select features ----
num_features = [
    "interaction_duration_seconds",
    "speech_sentiment_score",
    "customer_satisfaction_score",
    "call_hour"
]

cat_features = [
    "call_direction",
    "call_channel",
    "call_dayofweek",
    "issue_resolved",
    "follow_up_required"
]

df_model = df_copy[num_features + cat_features].copy()

In [12]:
# ---- preprocessing ----
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ]
)

X = preprocessor.fit_transform(df_model)

In [13]:
# ---- HDBSCAN base clustering ----
hdb = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric="euclidean",
    prediction_data=True
).fit(X)

df_copy["hdbscan_cluster"] = hdb.labels_

ValueError: Input contains NaN.

In [14]:
# ---- Gaussian Mixture refinement ----
valid_idx = df_copy["hdbscan_cluster"] != -1
X_valid = X[valid_idx]

gmm = GaussianMixture(
    n_components=4,
    covariance_type="full",
    random_state=42
).fit(X_valid)

df_copy["gmm_cluster"] = -1
df_copy.loc[valid_idx, "gmm_cluster"] = gmm.predict(X_valid)

# ---- SOFT CLUSTER PROBABILITY ----
probs = np.full((len(df_copy), 4), np.nan)
probs[valid_idx] = gmm.predict_proba(X_valid)

for i in range(4):
    df_copy[f"gmm_prob_{i}"] = probs[:, i]

KeyError: 'hdbscan_cluster'