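This notebook:
1. Loads the CCI-integrated patient dataset.
2. Filters it to patients with a diabetes condition.
3. Runs the VAE and TabNet models on that subset via their `input_file` parameter.
4. Performs clustering analysis and visualization (t-SNE, UMAP) on the model outputs, using the same procedure as before.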

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
from scipy.stats import f_oneway, kruskal, shapiro, levene

# Project root is taken to be the parent of the current working directory
# (assumes the notebook is run from a direct subfolder of the project — TODO confirm).
project_dir = os.path.abspath("..")
# Input and output data files used by this notebook live under <project>/Data.
data_dir = os.path.join(project_dir, "Data")
# Put the project root on the import path so the project-level modules
# imported right after this (vae_model, tabnet_model) can be found.
sys.path.append(project_dir)

from vae_model import main as vae_main
from tabnet_model import main as tabnet_main

### Load CCI data

In [2]:
# Locate the pickled patient table that carries the CCI-based health index.
cci_file = 'patient_data_with_health_index_cci.pkl'
cci_path = os.path.join(data_dir, cci_file)

# Fail fast, pointing at the notebook that is expected to generate the file.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
if os.path.exists(cci_path):
    patient_data = pd.read_pickle(cci_path)
else:
    raise FileNotFoundError("CCI file not found. Run 01_full_dataset_with_cci first.")

### Filter to diabetes patients

In [3]:
# Read only the two columns needed to identify diabetes patients.
# The file is resolved against data_dir (instead of a machine-specific absolute
# path) so the notebook runs on any checkout of the project.
conditions = pd.read_csv(
    os.path.join(data_dir, 'conditions.csv'),
    usecols=['PATIENT', 'DESCRIPTION'],
)

# Case-insensitive substring match on the condition description.
# na=False makes rows with a missing description count as "no match"; without it
# the mask contains NaN and boolean indexing raises.
is_diabetes = conditions['DESCRIPTION'].str.contains('diabetes', case=False, na=False)
diabetes_patients = conditions.loc[is_diabetes, 'PATIENT'].unique()

# Keep only patients with at least one diabetes condition and persist the subset
# next to the other data files so the models can load it by file name.
subset_data = patient_data[patient_data['Id'].isin(diabetes_patients)].copy()
subset_file = 'patient_data_with_health_index_cci_diabetes.pkl'
subset_data.to_pickle(os.path.join(data_dir, subset_file))

### Run models on diabetes subset

In [None]:
# Invoke both model entry points on the diabetes subset file: VAE first, then TabNet.
for run_model in (vae_main, tabnet_main):
    run_model(input_file=subset_file)

### Analysis

In [None]:
from sklearn.neighbors import NearestNeighbors

# Load latent features and predictions, keeping only Ids present in both files
latent_features = pd.read_csv('latent_features_vae.csv')
tabnet_predictions = pd.read_csv('tabnet_predictions.csv')
data_merged = pd.merge(latent_features, tabnet_predictions, on='Id', how='inner')

# Cluster on every column except the identifier and the predicted index,
# standardised to zero mean / unit variance.
X = data_merged.drop(['Id', 'Predicted_Health_Index'], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate cluster counts shared by K-Means and agglomerative clustering (2..9)
cluster_range = range(2, 10)

# --- K-Means: silhouette per candidate k, then refit with the best-scoring k ---
sil_kmeans = [
    silhouette_score(
        X_scaled,
        KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled),
    )
    for k in cluster_range
]
optimal_k = cluster_range[np.argmax(sil_kmeans)]
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(X_scaled)
kmeans_labels = kmeans.labels_

# --- Agglomerative: same silhouette-driven selection ---
sil_agg = [
    silhouette_score(
        X_scaled,
        AgglomerativeClustering(n_clusters=k).fit_predict(X_scaled),
    )
    for k in cluster_range
]
optimal_agg = cluster_range[np.argmax(sil_agg)]
agg = AgglomerativeClustering(n_clusters=optimal_agg)
agg_labels = agg.fit_predict(X_scaled)

# --- DBSCAN: eps from the sorted k-distance curve ---
# Take the last of the `neighbors` distance columns for every point, sort the
# values, and use the one at the 90% position as eps.
neighbors = 5
nbrs = NearestNeighbors(n_neighbors=neighbors)
nbrs.fit(X_scaled)
distances, indices = nbrs.kneighbors(X_scaled)
distances = np.sort(distances[:, neighbors - 1], axis=0)
epsilon = distances[int(0.9 * len(distances))]
db = DBSCAN(eps=epsilon, min_samples=5).fit(X_scaled)
dbscan_labels = db.labels_

def cluster_scores(X, labels):
    """Return (silhouette, Calinski-Harabasz, Davies-Bouldin) for a labelling of X.

    When `labels` holds at most one distinct value the metrics are not
    computed and a triple of NaN is returned instead.
    """
    # Guard clause: nothing to score with fewer than two distinct labels.
    if len(set(labels)) <= 1:
        return np.nan, np.nan, np.nan

    return (
        silhouette_score(X, labels),
        calinski_harabasz_score(X, labels),
        davies_bouldin_score(X, labels),
    )

# Score the final labelling of each method. cluster_scores already yields NaNs
# when a labelling has fewer than two distinct labels, which covers the DBSCAN
# single-label case without a separate check here.
sil_km, ch_km, db_km = cluster_scores(X_scaled, kmeans_labels)
sil_a, ch_a, db_a = cluster_scores(X_scaled, agg_labels)
sil_db, ch_db, db_db = cluster_scores(X_scaled, dbscan_labels)

# One row per method, in the fixed order KMeans / Agglomerative / DBSCAN.
validation_df = pd.DataFrame(
    [
        ('KMeans', sil_km, ch_km, db_km),
        ('Agglomerative', sil_a, ch_a, db_a),
        ('DBSCAN', sil_db, ch_db, db_db),
    ],
    columns=['Method', 'Silhouette', 'CH', 'DB'],
)

# Rank 1 is best: Silhouette and CH are ranked descending, DB ascending.
for metric_col, rank_col, rank_ascending in (
    ('Silhouette', 'Sil_rank', False),
    ('CH', 'CH_rank', False),
    ('DB', 'DB_rank', True),
):
    validation_df[rank_col] = validation_df[metric_col].rank(ascending=rank_ascending)

rank_cols = ['Sil_rank', 'CH_rank', 'DB_rank']
validation_df['Avg_rank'] = validation_df[rank_cols].mean(axis=1)

def get_n_clusters(labels):
    """Count the distinct cluster labels, ignoring the noise label -1.

    DBSCAN marks noise points with -1; that label is not counted as a cluster.
    """
    return len(set(labels) - {-1})

# Number of clusters found by each method (DBSCAN noise is not counted).
kmeans_n = get_n_clusters(kmeans_labels)
agg_n = get_n_clusters(agg_labels)
dbscan_n = get_n_clusters(dbscan_labels)

# Same row order as validation_df: KMeans, Agglomerative, DBSCAN.
validation_df['n_clusters'] = [kmeans_n, agg_n, dbscan_n]

# Rank by n_clusters descending (more clusters = better): the method with the
# most clusters gets Cluster_rank 1, consistent with the metric ranks where 1 is best.
validation_df['Cluster_rank'] = validation_df['n_clusters'].rank(ascending=False)

# Combine cluster preference with the metric ranks.
# `weight` controls how much the cluster count influences the final selection.
# Lower New_Avg_rank wins (idxmin below), so Cluster_rank must be ADDED:
# more clusters -> smaller Cluster_rank -> smaller penalty -> better.
# (Subtracting it, as before, favoured the method with the fewest clusters.)
weight = 0.5
validation_df['New_Avg_rank'] = validation_df['Avg_rank'] + weight * validation_df['Cluster_rank']

# idxmin skips NaN rows, so a method whose metrics could not be computed is never chosen.
best_method = validation_df.loc[validation_df['New_Avg_rank'].idxmin(), 'Method']

# Pick the label vector of the winning method; anything that is not KMeans or
# Agglomerative falls through to the DBSCAN labels.
labels_by_method = {'KMeans': kmeans_labels, 'Agglomerative': agg_labels}
final_labels = labels_by_method.get(best_method, dbscan_labels)

data_merged['Cluster'] = final_labels

# Order clusters by their mean predicted health index (ascending) and number
# them 1..n in that order to obtain the severity index.
cluster_map = (
    data_merged
    .groupby('Cluster')['Predicted_Health_Index']
    .mean()
    .sort_values()
    .reset_index()
)
cluster_map['Severity_Index'] = range(1, len(cluster_map) + 1)
mapping = dict(zip(cluster_map['Cluster'], cluster_map['Severity_Index']))
data_merged['Severity_Index'] = data_merged['Cluster'].map(mapping)

def _show_severity_scatter(coords, title):
    """Scatter a 2D embedding coloured by Severity_Index and display it."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=coords[:, 0],
        y=coords[:, 1],
        hue=data_merged['Severity_Index'],
        palette='viridis',
        ax=ax,
    )
    ax.set_title(title)
    plt.show()


# 2D t-SNE projection of the scaled features
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(X_scaled)
_show_severity_scatter(tsne_results, f't-SNE visualization ({best_method})')

# 2D UMAP projection of the same features
reducer = umap.UMAP(n_components=2, random_state=42)
umap_results = reducer.fit_transform(X_scaled)
_show_severity_scatter(umap_results, f'UMAP visualization ({best_method})')

# Test whether Predicted_Health_Index differs between clusters.
# Decision rule: one-way ANOVA only if every cluster looks normal (Shapiro-Wilk)
# and variances look equal (Levene); otherwise Kruskal-Wallis.
clusters = data_merged['Cluster'].unique()
groups = [
    data_merged.loc[data_merged['Cluster'] == c, 'Predicted_Health_Index']
    for c in clusters
]

# Shapiro-Wilk needs at least 3 observations; depending on the SciPy version a
# smaller sample either raises or yields NaN. Record NaN for such clusters
# instead of calling the test, and treat NaN as "normality not established".
normality_pvals = [
    shapiro(grp)[1] if len(grp) >= 3 else np.nan
    for grp in groups
]
use_anova = not any(np.isnan(p) or p < 0.05 for p in normality_pvals)

if use_anova:
    # Normality holds everywhere; ANOVA additionally needs homogeneous variances.
    lv_stat, lv_p = levene(*groups)
    use_anova = not (lv_p < 0.05)

if use_anova:
    f_stat, f_p = f_oneway(*groups)
    print(f"ANOVA: F={f_stat}, p={f_p}")
else:
    kw_stat, kw_p = kruskal(*groups)
    print(f"Kruskal-Wallis: H={kw_stat}, p={kw_p}")

print("Analysis complete. Full dataset with CCI integrated.")

In [None]:
###########################################################
# ---- 3D Embedding & Clustering with Expanded Metrics ----
###########################################################
from mpl_toolkits.mplot3d import Axes3D  # for 3D plotting
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# 1) Build 3D embeddings of the scaled features with both t-SNE and UMAP,
#    then select the one used for the rest of this cell.

# 3D t-SNE
tsne_3d = TSNE(n_components=3, random_state=42)
tsne_3d_results = tsne_3d.fit_transform(X_scaled)

# 3D UMAP
reducer_3d = umap.UMAP(n_components=3, random_state=42)
umap_3d_results = reducer_3d.fit_transform(X_scaled)

# Change the key to 't-SNE' to cluster on the t-SNE embedding instead.
embeddings_3d = {'t-SNE': tsne_3d_results, 'UMAP': umap_3d_results}
embedding_3d = embeddings_3d['UMAP']

# 2) Run K-Means for every candidate cluster count in the 3D space.
#    range(6, 10) covers k = 6..9; use range(6, 11) to include k = 10.
cluster_range_3d = range(6, 10)
kmeans_3d_metrics = []

for k_3d in cluster_range_3d:
    kmeans_3d = KMeans(n_clusters=k_3d, random_state=42)
    labels_3d = kmeans_3d.fit_predict(embedding_3d)

    # Three internal validation metrics, all computed on the 3D embedding
    kmeans_3d_metrics.append({
        'n_clusters': k_3d,
        'silhouette': silhouette_score(embedding_3d, labels_3d),
        'calinski_harabasz': calinski_harabasz_score(embedding_3d, labels_3d),
        'davies_bouldin': davies_bouldin_score(embedding_3d, labels_3d)
    })

# 3) Convert metrics to a DataFrame for ranking
df_3d_eval = pd.DataFrame(kmeans_3d_metrics)

# Rank 1 is best: higher silhouette/CH is better => descending rank;
# lower DB is better => ascending rank.
df_3d_eval['sil_rank'] = df_3d_eval['silhouette'].rank(ascending=False)
df_3d_eval['ch_rank'] = df_3d_eval['calinski_harabasz'].rank(ascending=False)
df_3d_eval['db_rank'] = df_3d_eval['davies_bouldin'].rank(ascending=True)

# Average rank across all three
df_3d_eval['avg_rank'] = df_3d_eval[['sil_rank', 'ch_rank', 'db_rank']].mean(axis=1)

# Pick best cluster count based on minimal avg_rank; cast to a plain Python int
# so it is passed on and printed as an ordinary integer.
best_idx_3d = df_3d_eval['avg_rank'].idxmin()
best_k_3d = int(df_3d_eval.loc[best_idx_3d, 'n_clusters'])

# Header states the range that was actually evaluated rather than a guess.
print(f"==== 3D K-Means Clustering Metrics ({min(cluster_range_3d)}..{max(cluster_range_3d)}) ====")
print(df_3d_eval[['n_clusters','silhouette','calinski_harabasz','davies_bouldin','avg_rank']])
print(f"\nChosen #clusters for 3D embedding: {best_k_3d}\n")

# 4) Refit K-Means on the 3D embedding with the selected cluster count
kmeans_final_3d = KMeans(n_clusters=best_k_3d, random_state=42)
final_labels_3d = kmeans_final_3d.fit_predict(embedding_3d)

# 5) Store the 3D cluster assignment alongside the merged data
data_merged['Cluster_3D'] = final_labels_3d

# Severity_Index_3D: clusters numbered 1..n by ascending mean predicted health index
mean_index_by_cluster_3d = data_merged.groupby('Cluster_3D')['Predicted_Health_Index'].mean()
cluster_map_3d = mean_index_by_cluster_3d.sort_values().reset_index()
cluster_map_3d['Severity_Index_3D'] = range(1, len(cluster_map_3d) + 1)
mapping_3d = dict(zip(cluster_map_3d['Cluster_3D'], cluster_map_3d['Severity_Index_3D']))
data_merged['Severity_Index_3D'] = data_merged['Cluster_3D'].map(mapping_3d)

# 6) 3D scatter plot, points coloured by Severity_Index_3D
fig, ax = plt.subplots(figsize=(10, 7), subplot_kw={'projection': '3d'})

dim1, dim2, dim3 = embedding_3d[:, 0], embedding_3d[:, 1], embedding_3d[:, 2]
scatter_3d = ax.scatter(
    dim1,
    dim2,
    dim3,
    c=data_merged['Severity_Index_3D'],
    cmap='viridis',
    alpha=0.7,
)

# Name the embedding in the title by checking which result array was selected
if embedding_3d is umap_3d_results:
    embedding_name = 'UMAP'
else:
    embedding_name = 't-SNE'
ax.set_title(f"3D Clustering (K={best_k_3d}) - {embedding_name}")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.set_zlabel("Dimension 3")

# Colour bar keyed to the severity index
cbar = fig.colorbar(scatter_3d, ax=ax, fraction=0.03, pad=0.09)
cbar.set_label("Severity_Index_3D")

plt.show()

# 7) Report the validation metrics of the final 3D clustering, reusing
#    cluster_scores on the 3D embedding and its labels.
sil_3d_final, ch_3d_final, db_3d_final = cluster_scores(embedding_3d, final_labels_3d)
summary_3d = (
    f"Final 3D KMeans (K={best_k_3d}): silhouette={sil_3d_final:.3f}, "
    f"CH={ch_3d_final:.1f}, DB={db_3d_final:.3f}"
)
print(summary_3d)

print("---- 3D clustering with expanded metrics complete ----")
