# Extract Clustering Features and Add to Data

In [3]:
import os
import sys
from pathlib import Path

import pandas as pd

# Directory put at the front of sys.path before importing `weschatbot`
# (presumably the local checkout that contains that package — confirm).
# Set the WESCHATBOT_ROOT environment variable to run this notebook on another
# machine; when it is unset the original author's path is used unchanged.
PROJECT_ROOT = os.environ.get('WESCHATBOT_ROOT', '/Users/luungoc/Westaco/Chatbot')
sys.path.insert(0, PROJECT_ROOT)

# Project helpers used by the KMeans / HDBSCAN cells below.
from weschatbot.ambiguity.add_features_to_data import (
    extract_kmeans_features,
    extract_hdbscan_features,
    add_features_and_save,
)

## Load Data

In [4]:
# Input parquet file. Set WESCHATBOT_DATA_PATH to point at a different copy;
# when the variable is unset the original hard-coded location is used, so the
# behaviour on the author's machine is unchanged.
data_path = os.environ.get(
    'WESCHATBOT_DATA_PATH',
    '/Users/luungoc/Westaco/Train_dataset/messages_202512292119_chunks.parquet',
)

# All generated parquet files go into this directory (relative to the
# notebook's working directory); it is created on first run.
output_dir = Path('./output_with_features')
output_dir.mkdir(exist_ok=True)

# `df` is the frame every later cell passes to the feature extractors.
df = pd.read_parquet(data_path)
df.head(2)

Unnamed: 0,question_id,question,step,content,score,entropy,cluster,cluster_label,silhouette,decision,vector,elbow_idx,elbow_value,steepness,steepness_norm,steepness_pos_weight,steepness_combined,normalized_entropy,confidence
0,9,TC-02 Who is responsible for providing the Wes...,Final,#### 2. **Definitii**\n\nPentru scopul acestor...,0.632993,2.889979,0.0,Cluster-0,0.268934,answer_direct,"[0.006683349609375, -0.035369873046875, -0.010...",9,0.545869,0.087124,0.137638,0.749947,0.2601,0.849695,0.402625
1,9,TC-02 Who is responsible for providing the Wes...,Final,#### 2. **Definitii**\n\nPentru scopul acestor...,0.631873,2.889979,0.0,Cluster-0,0.268934,answer_direct,"[0.005290985107421875, -0.03546142578125, -0.0...",9,0.545869,0.087124,0.137638,0.749947,0.2601,0.849695,0.402625


## Run Full Dataset - KMeans

In [5]:
# Values of k to sweep. Shorten this list (e.g. to [2]) for a quick smoke run.
k_values = [2, 3, 4, 5]

for k in k_values:
    print(f"Processing KMeans k={k}")

    # Features for this k; the helper is project code, so only its size is
    # reported here.
    features = extract_kmeans_features(df, k)
    print(f"Extracted {len(features)} questions")

    # One output file per k, named after the setting. The helper receives the
    # original frame, the features and the target path (it is expected to
    # write the file — see its implementation).
    output_path = output_dir.joinpath(f'kmeans_k_{k}_with_features.parquet')
    result = add_features_and_save(df, features, output_path)

    print(f"Saved: {output_path}")

Processing KMeans k=2
Extracted 477 questions
Saved: output_with_features/kmeans_k_2_with_features.parquet
Processing KMeans k=3
Extracted 473 questions
Saved: output_with_features/kmeans_k_3_with_features.parquet
Processing KMeans k=4
Extracted 470 questions
Saved: output_with_features/kmeans_k_4_with_features.parquet
Processing KMeans k=5
Extracted 469 questions
Saved: output_with_features/kmeans_k_5_with_features.parquet


## Run Full Dataset - HDBSCAN

In [6]:
# HDBSCAN settings to sweep; every (min_cluster_size, min_samples) pair is run.
# Shorten both lists (e.g. to [2]) for a quick smoke run.
min_cluster_sizes = [2, 3, 5]
min_samples_list = [2, 3, 5]

# Full grid, ordered with min_cluster_size as the outer (slow) axis.
hdbscan_grid = [
    (cluster_size, samples)
    for cluster_size in min_cluster_sizes
    for samples in min_samples_list
]

for mcs, ms in hdbscan_grid:
    # Features for this setting; the helper is project code, so only its size
    # is reported here.
    features = extract_hdbscan_features(df, mcs, ms)
    print(f"Extracted {len(features)} questions")

    # One output file per setting, named after both parameters.
    output_path = output_dir.joinpath(f'hdbscan_mcs_{mcs}_ms_{ms}_with_features.parquet')
    result = add_features_and_save(df, features, output_path)

    print(f"Saved: {output_path}")

Extracted 473 questions
Saved: output_with_features/hdbscan_mcs_2_ms_2_with_features.parquet
Extracted 473 questions
Saved: output_with_features/hdbscan_mcs_2_ms_3_with_features.parquet
Extracted 473 questions
Saved: output_with_features/hdbscan_mcs_2_ms_5_with_features.parquet
Extracted 469 questions
Saved: output_with_features/hdbscan_mcs_3_ms_2_with_features.parquet
Extracted 469 questions
Saved: output_with_features/hdbscan_mcs_3_ms_3_with_features.parquet
Extracted 469 questions
Saved: output_with_features/hdbscan_mcs_3_ms_5_with_features.parquet
Extracted 451 questions
Saved: output_with_features/hdbscan_mcs_5_ms_2_with_features.parquet
Extracted 451 questions
Saved: output_with_features/hdbscan_mcs_5_ms_3_with_features.parquet
Extracted 451 questions
Saved: output_with_features/hdbscan_mcs_5_ms_5_with_features.parquet


## Check Output Files

In [7]:
# Collect every parquet file currently in the output directory, in name order.
# Note this also picks up files left over from earlier runs.
output_files = list(output_dir.glob('*.parquet'))
output_files.sort()

print(f"Total files: {len(output_files)}\n")
for f in output_files:
    # Names are left-aligned in a 50-character column.
    print(f"{f.name:50s}")

Total files: 13

hdbscan_mcs_2_ms_2_with_features.parquet          
hdbscan_mcs_2_ms_3_with_features.parquet          
hdbscan_mcs_2_ms_5_with_features.parquet          
hdbscan_mcs_3_ms_2_with_features.parquet          
hdbscan_mcs_3_ms_3_with_features.parquet          
hdbscan_mcs_3_ms_5_with_features.parquet          
hdbscan_mcs_5_ms_2_with_features.parquet          
hdbscan_mcs_5_ms_3_with_features.parquet          
hdbscan_mcs_5_ms_5_with_features.parquet          
kmeans_k_2_with_features.parquet                  
kmeans_k_3_with_features.parquet                  
kmeans_k_4_with_features.parquet                  
kmeans_k_5_with_features.parquet                  
