In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import AgglomerativeClustering
from gower import gower_matrix

# Read the preprocessed CDC survey table. The Diabetes_012 label column is kept
# at this stage because the sampling step that follows selects rows by class.
df_original = pd.read_csv("CDC_Diabetes_Dataset_prep.csv")

# Feature groups by measurement type (used later for scaling and per-cluster summaries)
numerical_cols = ['BMI', 'MentHlth', 'PhysHlth']
binary_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
]
ordinal_cols = ['Age', 'Education', 'Income', 'GenHlth']

# Report the class balance of the full table, ordered by class code
print(
    "Original data distribution:",
    df_original['Diabetes_012'].value_counts().sort_index(),
    sep="\n",
)
print(f"Total samples: {df_original.shape[0]}")



Original data distribution:
Diabetes_012
0.0    188251
1.0      4629
2.0     35028
Name: count, dtype: int64
Total samples: 227908


In [43]:
# Class-based down-sampling with a fixed seed
np.random.seed(42)

# Prediabetes (class 1): every row is kept
class_1_samples = df_original.loc[df_original['Diabetes_012'] == 1]

# No diabetes (class 0): draw 4629 rows
class_0_samples = df_original.loc[df_original['Diabetes_012'] == 0].sample(n=4629, random_state=42)

# Diabetes (class 2): draw 4269 rows
# NOTE(review): 4269 may be a transposition of 4629 (the class 0 draw above) — confirm intent
class_2_samples = df_original.loc[df_original['Diabetes_012'] == 2].sample(n=4269, random_state=42)

# Stack the three class samples, shuffle the rows, and renumber the index from 0
df_sampled = (
    pd.concat([class_0_samples, class_1_samples, class_2_samples], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the class balance of the sample, ordered by class code
print("\nSampled data distribution:")
print(df_sampled['Diabetes_012'].value_counts().sort_index())
print(f"Total sampled: {df_sampled.shape[0]}")




Sampled data distribution:
Diabetes_012
0.0    4629
1.0    4629
2.0    4269
Name: count, dtype: int64
Total sampled: 13527


In [44]:
# Clustering is unsupervised: remove the Diabetes_012 label so only features remain
df_sample_features = df_sampled.drop("Diabetes_012", axis=1)



In [45]:
# Robust-scale the numerical columns on a copy, so df_sample_features keeps the
# original units for the per-cluster summaries printed later.
scaler = RobustScaler()
df_scaled = df_sample_features.copy()
df_scaled[numerical_cols] = scaler.fit(df_scaled[numerical_cols]).transform(df_scaled[numerical_cols])

print(f"\nSampled data shape: {df_scaled.shape}")




Sampled data shape: (13527, 21)


In [46]:
print("\nCalculating Gower distance matrix on sample...")

# Build the distance matrix from the feature columns only. A later cell adds
# 'Cluster' and 'True_Label' to df_scaled in place, so passing df_scaled as-is
# would leak the previous labels into the distances if this cell is re-run.
feature_cols = list(df_sample_features.columns)
gower_dist = gower_matrix(df_scaled[feature_cols])

print("Gower distance matrix shape:", gower_dist.shape)

# Single source of truth for the cluster count, so the status message cannot
# drift from the value actually passed to the estimator.
n_clusters_requested = 4
print(f"\nRunning Agglomerative Clustering with {n_clusters_requested} clusters...")

# Average-linkage agglomerative clustering on the precomputed Gower distances
agg_clustering = AgglomerativeClustering(
    n_clusters=n_clusters_requested,
    metric='precomputed',
    linkage='average'
)

# One integer cluster label per sampled row, in the row order of gower_dist
clusters = agg_clustering.fit_predict(gower_dist)




Calculating Gower distance matrix on sample...
Gower distance matrix shape: (13527, 13527)

Running Agglomerative Clustering with 3 clusters...


In [47]:
# Attach the cluster assignment and the true class to each sampled row.
# Both are assigned positionally (plain arrays), not aligned on index labels.
df_scaled['Cluster'] = clusters
df_scaled['True_Label'] = df_sampled['Diabetes_012'].to_numpy()



In [48]:
# Size of each cluster as a count and as a share of all sampled rows
unique, counts = np.unique(clusters, return_counts=True)
n_clusters = len(unique)
n_points = len(clusters)

print("\nFinal Results:")
print(f"Number of clusters: {n_clusters}")
print("\nCluster distribution:")
for cluster_id, count in zip(unique, counts):
    share = count / n_points * 100
    print(f"  Cluster {cluster_id}: {count} points ({share:.2f}%)")




Final Results:
Number of clusters: 4

Cluster distribution:
  Cluster 0: 32 points (0.24%)
  Cluster 1: 11 points (0.08%)
  Cluster 2: 13265 points (98.06%)
  Cluster 3: 219 points (1.62%)


In [49]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def _print_banner(title):
    """Print a blank line, then `title` framed by two 60-character '=' rules."""
    rule = '=' * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


# Display names for the Diabetes_012 codes stored in True_Label
label_names = {0: 'No Diabetes', 1: 'Prediabetes', 2: 'Diabetes'}

# Counts of each true label per cluster, with row and column totals
_print_banner("Cluster vs True Label Distribution:")
crosstab = pd.crosstab(df_scaled['Cluster'], df_scaled['True_Label'], margins=True)
print(crosstab)

# The same table as row percentages (each cluster row sums to 100)
_print_banner("Percentage Distribution (Row-wise: Within Each Cluster):")
crosstab_pct = pd.crosstab(df_scaled['Cluster'], df_scaled['True_Label'], normalize='index') * 100
print(crosstab_pct.round(2))

# Per-cluster profile, reported on the unscaled feature values
_print_banner("Cluster Characteristics:")
for cluster_id in np.unique(clusters):
    members = df_scaled.loc[df_scaled['Cluster'] == cluster_id]
    # Same rows, looked up in the unscaled frame so means are in original units
    members_unscaled = df_sample_features.loc[members.index]
    cluster_size = len(members)

    print(f"\nCluster {cluster_id} (n={cluster_size}):")
    print("-" * 40)

    # How the true classes are spread inside this cluster
    print("True label distribution:")
    label_counts = members['True_Label'].value_counts().sort_index()
    for label, n_rows in label_counts.items():
        print(f"  {label_names[label]} ({label}): {n_rows} ({n_rows/cluster_size*100:.1f}%)")

    # Mean of each numerical feature
    print("\nNumerical features (mean):")
    for col in numerical_cols:
        col_mean = members_unscaled[col].mean()
        print(f"  {col}: {col_mean:.2f}")

    # Mean of a 0/1 column times 100 is the percentage of rows equal to 1
    print("\nBinary features (% positive):")
    for col in binary_cols:
        pct_positive = members_unscaled[col].mean() * 100
        print(f"  {col}: {pct_positive:.1f}%")

    # Most frequent level (first one on ties) and mean of each ordinal feature
    print("\nOrdinal features:")
    for col in ordinal_cols:
        values = members_unscaled[col]
        modes = values.mode()
        mode_val = 'N/A' if modes.empty else modes.iloc[0]
        print(f"  {col}: mode={mode_val}, mean={values.mean():.2f}")

# Evaluation metrics
_print_banner("Clustering Evaluation Metrics:")

# Note: Calinski-Harabasz and Davies-Bouldin don't work with precomputed distances,
# so only the silhouette score is computed from the Gower matrix.
silhouette_avg = silhouette_score(gower_dist, clusters, metric='precomputed')
print(f"Silhouette Score: {silhouette_avg:.4f} (higher is better, range: -1 to 1)")

_print_banner("Clustering complete!")

# Persist the scaled features together with the Cluster and True_Label columns
df_scaled.to_csv('clustered_results_hierarchical_gower.csv', index=False)
print("\nResults saved to 'clustered_results_hierarchical_gower.csv'")


Cluster vs True Label Distribution:
True_Label   0.0   1.0   2.0    All
Cluster                            
0             10    13     9     32
1              3     4     4     11
2           4587  4539  4139  13265
3             29    73   117    219
All         4629  4629  4269  13527

Percentage Distribution (Row-wise: Within Each Cluster):
True_Label    0.0    1.0    2.0
Cluster                        
0           31.25  40.62  28.12
1           27.27  36.36  36.36
2           34.58  34.22  31.20
3           13.24  33.33  53.42

Cluster Characteristics:

Cluster 0 (n=32):
----------------------------------------
True label distribution:
  No Diabetes (0.0): 10 (31.2%)
  Prediabetes (1.0): 13 (40.6%)
  Diabetes (2.0): 9 (28.1%)

Numerical features (mean):
  BMI: 30.38
  MentHlth: 20.59
  PhysHlth: 20.00

Binary features (% positive):
  HighBP: 90.6%
  HighChol: 68.8%
  CholCheck: 71.9%
  Smoker: 84.4%
  Stroke: 12.5%
  HeartDiseaseorAttack: 25.0%
  PhysActivity: 31.2%
  Fruits: 25.

In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import DBSCAN
from gower import gower_matrix

# Read the preprocessed CDC survey table. The Diabetes_012 label column is kept
# at this stage because the sampling step that follows selects rows by class.
df_original = pd.read_csv("CDC_Diabetes_Dataset_prep.csv")

# Feature groups by measurement type (used later for scaling and per-cluster summaries)
numerical_cols = ['BMI', 'MentHlth', 'PhysHlth']
binary_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
]
ordinal_cols = ['Age', 'Education', 'Income', 'GenHlth']

# Report the class balance of the full table, ordered by class code
print(
    "Original data distribution:",
    df_original['Diabetes_012'].value_counts().sort_index(),
    sep="\n",
)
print(f"Total samples: {df_original.shape[0]}")

# Class-based down-sampling with a fixed seed
np.random.seed(42)

# Prediabetes (class 1): every row is kept
class_1_samples = df_original.loc[df_original['Diabetes_012'] == 1]

# No diabetes (class 0): draw 9000 rows
class_0_samples = df_original.loc[df_original['Diabetes_012'] == 0].sample(n=9000, random_state=42)

# Diabetes (class 2): draw 9000 rows
class_2_samples = df_original.loc[df_original['Diabetes_012'] == 2].sample(n=9000, random_state=42)

# Stack the three class samples, shuffle the rows, and renumber the index from 0
df_sampled = (
    pd.concat([class_0_samples, class_1_samples, class_2_samples], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the class balance of the sample, ordered by class code
print("\nSampled data distribution:")
print(df_sampled['Diabetes_012'].value_counts().sort_index())
print(f"Total sampled: {df_sampled.shape[0]}")

# Clustering is unsupervised: remove the Diabetes_012 label so only features remain
df_sample_features = df_sampled.drop("Diabetes_012", axis=1)



Original data distribution:
Diabetes_012
0.0    188251
1.0      4629
2.0     35028
Name: count, dtype: int64
Total samples: 227908

Sampled data distribution:
Diabetes_012
0.0    9000
1.0    4629
2.0    9000
Name: count, dtype: int64
Total sampled: 22629


In [71]:
# Robust-scale the numerical columns on a copy, so df_sample_features keeps the
# original units for the per-cluster summaries printed later.
scaler = RobustScaler()
df_scaled = df_sample_features.copy()
df_scaled[numerical_cols] = scaler.fit(df_scaled[numerical_cols]).transform(df_scaled[numerical_cols])

print(f"\nSampled data shape: {df_scaled.shape}")



Sampled data shape: (22629, 21)


In [72]:
print("\nCalculating Gower distance matrix on sample...")

# Build the distance matrix from the feature columns only. A later cell adds
# 'Cluster' and 'True_Label' to df_scaled in place, so passing df_scaled as-is
# would leak the previous labels into the distances if this cell is re-run.
feature_cols = list(df_sample_features.columns)
gower_dist = gower_matrix(df_scaled[feature_cols])

print("Gower distance matrix shape:", gower_dist.shape)

# Grid search over eps: keep the value whose cluster count is closest to the target.
target_n_clusters = 3
print(f"\nRunning DBSCAN to find optimal parameters for ~{target_n_clusters} clusters...")

eps_values = [0.1, 0.2, 0.3, 0.4, 0.5]
min_samples = 50

best_eps = None
best_n_clusters = 0
best_diff = float('inf')

for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    clusters = dbscan.fit_predict(gower_dist)

    # Label -1 marks noise points and is not counted as a cluster
    noise_mask = clusters == -1
    n_noise = int(noise_mask.sum())
    n_clusters = len(np.unique(clusters[~noise_mask]))

    diff = abs(n_clusters - target_n_clusters)

    print(f"eps={eps}: {n_clusters} clusters, {n_noise} noise points ({n_noise/len(clusters)*100:.1f}%)")

    # Strict '<' keeps the first (smallest) eps on ties; all-noise runs are never selected.
    # NOTE(review): ties are not broken by noise fraction — confirm this is intended.
    if diff < best_diff and n_clusters > 0:
        best_diff = diff
        best_eps = eps
        best_n_clusters = n_clusters

# Fail here with a clear message instead of passing eps=None to DBSCAN in the next cell
if best_eps is None:
    raise ValueError(
        f"No eps in {eps_values} produced any cluster with min_samples={min_samples}; "
        "widen the eps grid or lower min_samples."
    )

print(f"\n{'='*60}")
print(f"Best eps value: {best_eps} (produces {best_n_clusters} clusters)")
print(f"{'='*60}\n")




Calculating Gower distance matrix on sample...
Gower distance matrix shape: (22629, 22629)

Running DBSCAN to find optimal parameters for ~3 clusters...
eps=0.1: 1 clusters, 1957 noise points (8.6%)
eps=0.2: 1 clusters, 1 noise points (0.0%)
eps=0.3: 1 clusters, 0 noise points (0.0%)
eps=0.4: 1 clusters, 0 noise points (0.0%)
eps=0.5: 1 clusters, 0 noise points (0.0%)

Best eps value: 0.1 (produces 1 clusters)



In [None]:
# Refit DBSCAN with the selected eps and keep the resulting label per row
dbscan_final = DBSCAN(eps=best_eps, min_samples=min_samples, metric='precomputed')
dbscan_final.fit(gower_dist)
clusters = dbscan_final.labels_



In [None]:
# Attach the cluster assignment and the true class to each sampled row.
# Both are assigned positionally (plain arrays), not aligned on index labels.
df_scaled['Cluster'] = clusters
df_scaled['True_Label'] = df_sampled['Diabetes_012'].to_numpy()



In [None]:
# Summarise the DBSCAN labelling; label -1 is reported as noise, not as a cluster
unique, counts = np.unique(clusters, return_counts=True)
n_points = len(clusters)
n_noise = int(np.count_nonzero(clusters == -1))
n_clusters = len(unique) - (1 if n_noise > 0 else 0)

print("Final Results:")
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise} ({n_noise/n_points*100:.2f}%)")
print("\nCluster distribution:")
for cluster_id, count in zip(unique, counts):
    group_name = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"
    share = count / n_points * 100
    print(f"  {group_name}: {count} points ({share:.2f}%)")



In [None]:
rule = '=' * 60

# Display names for the Diabetes_012 codes stored in True_Label
label_names = {0: 'No Diabetes', 1: 'Prediabetes', 2: 'Diabetes'}

# Counts of each true label per cluster label, with row and column totals
print(f"\n{rule}")
print("Cluster vs True Label Distribution:")
print(rule)
crosstab = pd.crosstab(df_scaled['Cluster'], df_scaled['True_Label'], margins=True)
print(crosstab)

# Per-cluster profile on the unscaled feature values; the noise label (-1) is skipped
print(f"\n{rule}")
print("Cluster Characteristics:")
print(rule)

real_cluster_ids = [cid for cid in np.unique(clusters) if cid != -1]
for cluster_id in real_cluster_ids:
    members = df_scaled.loc[df_scaled['Cluster'] == cluster_id]
    # Same rows, looked up in the unscaled frame so means are in original units
    members_unscaled = df_sample_features.loc[members.index]
    cluster_size = len(members)

    print(f"\nCluster {cluster_id} (n={cluster_size}):")
    print("-" * 40)

    # How the true classes are spread inside this cluster
    print("True label distribution:")
    label_counts = members['True_Label'].value_counts().sort_index()
    for label, n_rows in label_counts.items():
        print(f"  {label_names[label]} ({label}): {n_rows} ({n_rows/cluster_size*100:.1f}%)")

    # Mean of each numerical feature
    print("\nNumerical features (mean):")
    for col in numerical_cols:
        col_mean = members_unscaled[col].mean()
        print(f"  {col}: {col_mean:.2f}")

    # Mean of a 0/1 column times 100 is the percentage of rows equal to 1
    print("\nBinary features (% positive):")
    for col in binary_cols:
        pct_positive = members_unscaled[col].mean() * 100
        print(f"  {col}: {pct_positive:.1f}%")

    # Most frequent level (first one on ties) and mean of each ordinal feature
    print("\nOrdinal features:")
    for col in ordinal_cols:
        values = members_unscaled[col]
        modes = values.mode()
        mode_val = 'N/A' if modes.empty else modes.iloc[0]
        print(f"  {col}: mode={mode_val}, mean={values.mean():.2f}")

print("\n" + rule)
print("Clustering complete!")
print(rule)

# Persist the scaled features together with the Cluster and True_Label columns
df_scaled.to_csv('clustered_results_dbscan_gower.csv', index=False)
print("\nResults saved to 'clustered_results_dbscan_gower.csv'")