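Open question: should the generalized Dunn index GDI43 be used instead of GDI33, so that it is less closely related to the Silhouette index? This would replace the average-linkage between-cluster distance with the distance between cluster centroids.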

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import Parallel, delayed # for parallelization
from tqdm import tqdm
from itertools import product

# Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Evaluation
from kneed import KneeLocator
from sklearn.neighbors import BallTree

# Visualization
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

# Preparation
## Data

In [None]:
# Load the imputed data
data2004_i = pd.read_parquet("data/data2004_i.parquet")

# Survey items used in the analysis. Every item is selected twice below:
# once with the `_n` suffix (numeric outcomes) and once with the `_f`
# suffix (categorical outcomes).
survey_items = [
    # Q2
    'clseusa',  # not used: 'clsetown', 'clsestat', 'clsenoam'
    # Q3
    'ambornin', 'amcit', 'amlived', 'amenglsh',
    'amchrstn', 'amgovt', 'amfeel',  # not used: 'amancstr'
    # Q4
    'amcitizn', 'amshamed', 'belikeus', 'ambetter', 'ifwrong',  # not used: 'amsports', 'lessprd'
    # Q5
    'proudsss', 'proudgrp', 'proudpol', 'prouddem', 'proudeco',
    'proudspt', 'proudart', 'proudhis', 'proudmil', 'proudsci']

# Dataset with numeric outcomes
data_n = data2004_i[[f'{item}_n' for item in survey_items]]

# Dataset with categorical outcomes
data_f = data2004_i[[f'{item}_f' for item in survey_items]]

# Dataset with controls
controls = data2004_i[[
    'sex', 'race_f', 'born_usa', 'party_fs', 'religstr_f',
    'reltrad_f', 'region_f']]

## Scaling and normalizing
# Min-max scaling of the numeric outcomes to [-1, 1]
scaler = MinMaxScaler(feature_range=(-1,1))
data_n_scaled = scaler.fit_transform(data_n)

# Standardization of the numeric outcomes
normalizer = StandardScaler()
data_n_norm = normalizer.fit_transform(data_n)

## CVI

In [None]:
# Names of the cluster validity indices (CVI) compared in this notebook.
# Each name is also used as a score column of the fitted-model tables.
CVI = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']

## Parameters

In [None]:
# Upper bound of the cluster-number ranges used in the model grids
max_clust = 8
# Number of parallel jobs passed to joblib (`n_jobs`)
max_threads = 8

# 1. Fit models

In [None]:
from model_fit import do_StepMix, do_kmeans, do_AHC, do_hdbscan

## Latent

In [None]:
# Encoding
def _encode_labels(column):
    """Return the integer label codes of one categorical column."""
    return LabelEncoder().fit_transform(column)

# Categorical outcomes: one independent label encoding per column
data_f_oh = data_f.apply(_encode_labels)
# Controls: dummy coding
controls_dum = pd.get_dummies(controls)

In [None]:
# Model specifications: every combination of measurement type and
# covariate option
msrt = ['categorical', 'continuous']
covar = ['without', 'with']
latent_params = list(product(msrt, covar))

# Latent models are fitted from a single class up to `max_clust`
clust_range = range(1, max_clust+1)

# Materialised as a list: a bare `product` iterator is exhausted after one
# pass, so re-running the fitting cell would silently fit nothing, and tqdm
# could not display the total number of fits.
latent_grid = list(product(clust_range, latent_params))

In [None]:
# Fit every latent specification in parallel. Categorical specifications
# receive the label-encoded outcomes, all others the numeric outcomes.
results = Parallel(n_jobs=max_threads)(
    delayed(do_StepMix)(
        data_f_oh if measurement == 'categorical' else data_n,
        n_classes, measurement, covariates)
    for n_classes, (measurement, covariates)
    in tqdm(latent_grid, desc='Fitting latent models')
)

# One row per fitted model
latent_all = pd.DataFrame(results)

In [None]:
# Parameters
# clust_range = range(1, max_clust+1)

In [None]:
# Fit models without covariates
# cat_results_ncv = Parallel(n_jobs=max_threads)(delayed(do_StepMix)(data_f_oh, n, 'categorical', 'without') for n in clust_range)
# num_results_ncv = Parallel(n_jobs=max_threads)(delayed(do_StepMix)(data_n, n, 'continuous', 'without') for n in clust_range)

In [None]:
# Fit models with covariates
# cat_results_cv = Parallel(n_jobs=max_threads)(delayed(do_StepMix)(data_f_oh, n, 'categorical', 'with') for n in clust_range)
# num_results_cv = Parallel(n_jobs=max_threads)(delayed(do_StepMix)(data_n, n, 'continuous', 'with') for n in clust_range)

In [None]:
# Aggregate results
# latent_all = pd.concat([pd.DataFrame(cat_results_ncv),
#                         pd.DataFrame(num_results_ncv),
#                         pd.DataFrame(cat_results_cv),
#                         pd.DataFrame(num_results_cv)]).reset_index(drop=True)

## k-means

In [None]:
# Model specifications: every (dist, link) pair is passed to `do_kmeans`
dist = ['euclidean', 'manhattan', 'chebyshev']
link = ['mean', 'median', 'medoid']
kmeans_params = list(product(dist, link))

# k-means models are fitted from 2 clusters up to `max_clust`
clust_range = range(2, max_clust+1)

# Materialised as a list: a bare `product` iterator is exhausted after one
# pass, so re-running the fitting cell would silently fit nothing, and tqdm
# could not display the total number of fits.
kmeans_grid = list(product(clust_range, kmeans_params))

In [None]:
# Fit every k-means specification in parallel on the numeric outcomes
results = Parallel(n_jobs=max_threads)(
    delayed(do_kmeans)(data_n, n_clusters, metric, center)
    for n_clusters, (metric, center)
    in tqdm(kmeans_grid, desc='Fitting KMeans models')
)

# One row per fitted model
kmeans_all = pd.DataFrame(results)

## AHC

In [None]:
# Model specifications: every distance/linkage pair, plus ward linkage,
# which is only paired with the euclidean distance
distances = ['manhattan', 'euclidean', 'chebyshev', 'hamming']
linkages = ['single', 'average', 'complete']
ahc_params = [*product(distances, linkages), ('euclidean', 'ward')]

clust_range = range(1, max_clust+1)

# Materialised as a list: a bare `product` iterator is exhausted after one
# pass, so re-running the fitting cell would silently fit nothing, and tqdm
# could not display the total number of fits.
ahc_grid = list(product(clust_range, ahc_params))

In [None]:
# Fit every AHC specification in parallel on the numeric outcomes
results = Parallel(n_jobs=max_threads)(
    delayed(do_AHC)(data_n, n_clusters, metric, linkage)
    for n_clusters, (metric, linkage)
    in tqdm(ahc_grid, desc='Fitting AHC models')
)

# One row per fitted model
ahc_all = pd.DataFrame(results)

## HDBSCAN

In [None]:
# Hyper-parameter values: every (distance, min_cluster_size, min_samples)
# triple is passed to `do_hdbscan`
distances = ['manhattan', 'euclidean', 'chebyshev', 'mahalanobis', 'hamming']
min_cluster_sizes = range(2, 21)
min_samples_range = range(1, 21)

# Materialised as a list: a bare `product` iterator is exhausted after one
# pass, so re-running the fitting cell would silently fit nothing, and tqdm
# could not display the total number of fits.
hdb_params = list(product(distances, min_cluster_sizes, min_samples_range))

In [None]:
# Fit every HDBSCAN specification in parallel on the numeric outcomes
results = Parallel(n_jobs=max_threads)(
    delayed(do_hdbscan)(data_n, metric, min_size, min_samp)
    for metric, min_size, min_samp
    in tqdm(hdb_params, desc='Fitting HDBSCAN models')
)

# One row per fitted model
hdbscan_all = pd.DataFrame(results)

## Aggregate results and compare CVI

In [None]:
# Stack the results of the four model families into one table with a fresh index
all_models = pd.concat(
    [latent_all, kmeans_all, ahc_all, hdbscan_all],
    ignore_index=True,
)

In [None]:
# Scores of the four validity indices for every fitted model
all_CVI = all_models.loc[:, ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']]

# Rank (Spearman) correlations between the indices
correlations = all_CVI.corr(method='spearman')

# Annotated heatmap on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    correlations,
    annot=True, fmt=".2f",
    cmap="coolwarm", cbar=True,
    square=True, linewidths=0.5,
    vmin=-1, vmax=1,
    ax=ax,
)
plt.show()

# 2. Select models

## Gap statistic for latent models, k-means and AHC

In [None]:
# Restrict the table to the model families assessed with the gap statistic
# (HDBSCAN results are left out here)
all_models = pd.concat(
    [latent_all, kmeans_all, ahc_all],
    ignore_index=True,
)

In [None]:
def dict_to_strg(d):
    """Format a dict as 'key = value' pairs joined by ', ', in insertion order."""
    pairs = []
    for name, setting in d.items():
        pairs.append(f"{name} = {setting}")
    return ', '.join(pairs)

# Generate reference data from a uniform distribution
def gen_ref_data(data, rng=None):
    """Draw a reference dataset uniformly within the column ranges of `data`.

    Parameters
    ----------
    data : array or DataFrame
        Observed data; must provide `min(axis=0)`, `max(axis=0)` and `shape`.
    rng : numpy.random.Generator, optional
        Random generator used for the draws. When omitted, NumPy's global
        random state is used, exactly as before; passing a seeded generator
        makes the reference data reproducible.

    Returns
    -------
    numpy.ndarray
        Array with the shape of `data`; each column is sampled uniformly
        between the observed minimum and maximum of that column.
    """
    sampler = np.random if rng is None else rng

    # Per-column bounds, converted to plain arrays so that they broadcast
    # against `size` whatever the input container is.
    lower = np.asarray(data.min(axis=0))
    upper = np.asarray(data.max(axis=0))

    return sampler.uniform(low=lower, high=upper, size=data.shape)

# Create empty df to store results
def create_empty_df(indices):
    """Return an empty, typed table for the gap-statistic results.

    The columns are 'model' and 'params' (object), 'n_clust' (int64), then
    one float64 '<index>_gs' column per validity index followed by one
    float64 '<index>_s' column per validity index.
    """
    schema = {'model': 'object', 'params': 'object', 'n_clust': 'int64'}
    id_cols = list(schema)

    # Gap columns first, then the s columns, each in the order of `indices`
    stat_cols = [f'{index}{suffix}' for suffix in ('_gs', '_s') for index in indices]
    for col in stat_cols:
        if col not in id_cols:
            schema[col] = 'float64'

    return pd.DataFrame(columns=id_cols + stat_cols).astype(schema)

### Step 1: compute the gap statistic

In [None]:
# Compute the Gap Statistic
def compute_gap_statistic(data, indices, iters, model, params):
    """Compute a gap statistic and its tolerance `s` for each validity index.

    For every number of clusters `n` (from 1 for 'latent' models, from 2
    otherwise, up to the global `max_clust`), the model is refitted on `iters`
    uniform reference datasets. The validity indices obtained on the
    reference data are compared, on the log scale, with those of the model
    fitted on the observed data, which are read from the global `all_models`
    table.

    Parameters
    ----------
    data : array or DataFrame
        Observed data; only used to draw the reference datasets.
    indices : list of str
        Names of the validity indices. Each must be a key of the fit results
        and a column of `all_models`.
    iters : int
        Number of reference datasets drawn for each `n`.
    model : {'latent', 'kmeans', 'AHC'}
        Model family to refit.
    params : dict
        Keyword arguments forwarded to the fitting function. Their
        'key = value' string form identifies the configuration in
        `all_models`.

    Returns
    -------
    DataFrame
        One row per `n`, with the columns 'model', 'params', 'n_clust' and,
        for each index, '<index>_gs' (gap) and '<index>_s' (tolerance).

    Raises
    ------
    ValueError
        If `model` is not one of the supported families.
    IndexError
        If `all_models` holds no fitted model for a requested `n`.
    """
    fitters = {'latent': do_StepMix, 'kmeans': do_kmeans, 'AHC': do_AHC}
    if model not in fitters:
        raise ValueError(f"unknown model '{model}', expected one of {list(fitters)}")
    fit = fitters[model]

    str_params = dict_to_strg(params)

    # Latent models are also assessed with a single class
    n_min = 1 if model == 'latent' else 2

    rows = []
    for n in range(n_min, max_clust+1):

        # Retrieve the scores of the assessed model first, so that a missing
        # entry is reported before any costly refit
        mod_scores = all_models.loc[(all_models['model'] == model) &
                                    (all_models['params'] == str_params) &
                                    (all_models['n_clust'] == n)]
        if mod_scores.empty:
            raise IndexError(
                f"no fitted '{model}' model with params '{str_params}' "
                f"and n_clust = {n} in all_models")

        # Fit the model on random datasets (one result per dataset)
        # NOTE(review): the reference data are continuous uniform draws even
        # when `params` requests a categorical latent model - confirm that
        # do_StepMix handles this as intended.
        rand_scores_all = pd.DataFrame(
            [fit(gen_ref_data(data), n, **params) for _ in range(iters)])

        # Calculate the Gap statistic and s value for each validity index
        row = {'model': model, 'params': str_params, 'n_clust': n}
        for index in indices:
            rand_ind = rand_scores_all[index]
            mod_ind = mod_scores[index].values[0]

            # Rescale the Silhouette index on [0,1] to avoid errors when it is negative
            if index == 'silhouette':
                rand_ind = (rand_ind + 1) / 2
                mod_ind = (mod_ind + 1) / 2

            # The reference term is the mean of the log scores (not the log
            # of their mean), as in the definition of the gap statistic; `s`
            # below is the spread of these same log scores.
            log_rand = np.log(rand_ind)
            row[f'{index}_gs'] = np.mean(log_rand) - np.log(mod_ind)
            row[f'{index}_s'] = np.std(log_rand) * np.sqrt(1 + (1 / iters))

        rows.append(row)

    # Assemble the results once, with the column order and dtypes of the
    # empty template
    gap_values = create_empty_df(indices)
    if not rows:
        return gap_values

    return pd.DataFrame(rows, columns=gap_values.columns).astype(gap_values.dtypes.to_dict())

In [None]:
# Define parameters grid
models = ['latent', 'kmeans', 'AHC']

# Parameter combinations fitted for each model family
params = {
    'latent': latent_params,
    'kmeans': kmeans_params,
    'AHC': ahc_params
}

# Names given to the values of each combination; they become the keyword
# arguments forwarded to the fitting functions
param_names = {
    'latent': ['msrt', 'covar'],
    'kmeans': ['dist', 'link'],
    'AHC': ['dist', 'link']
}

# One (model, {name: value}) entry per configuration, family by family
grid = [
    (family, dict(zip(param_names[family], combination)))
    for family, combinations in params.items()
    for combination in combinations
]

In [None]:
# Compute gap values for all models
# One parallel job per (model, configuration); each job returns a table
# with one row per number of clusters.
results = Parallel(n_jobs=max_threads)(
    delayed(compute_gap_statistic)(data_n, CVI, iters=5, model=family, params=settings)
    for family, settings in tqdm(grid, desc = 'Bootstrapping CVIs')
)

# Stack the per-configuration tables with a fresh index
gap_values = pd.concat(results, ignore_index=True)

### Step 2: identify the optimal number of clusters for each model-config

In [None]:
# Select the optimal number of clusters
def get_best_gap(model, params, index):
    """Return the number of clusters selected by the gap criterion.

    Reads the rows of the global `gap_values` table for the given model and
    parameter dict. For each pair of consecutive rows it computes
    GS(k) - GS(k+1) + s(k+1) and keeps the rows where this value is
    non-negative. The returned number of clusters is the one with the
    smallest such value, or the string 'none' when no row qualifies.
    """
    is_config = ((gap_values['model'] == model) &
                 (gap_values['params'] == dict_to_strg(params)))
    config_rows = gap_values[is_config].reset_index(drop=True)

    gap = config_rows[f'{index}_gs'].to_numpy()
    s = config_rows[f'{index}_s'].to_numpy()

    # GS(k) - GS(k+1) + s(k+1) for every row but the last one
    stats = gap[:-1] - gap[1:] + s[1:]

    # Rows such that GS(k) >= GS(k+1) - s(k+1)
    eligible = np.flatnonzero(stats >= 0)
    if eligible.size == 0:
        return 'none'

    # Row positions are turned into cluster numbers by adding the smallest
    # number of clusters of this configuration
    best_pos = eligible[np.argmin(stats[eligible])]
    return int(best_pos + config_rows['n_clust'].min())

In [None]:
# Create df to store results
# Columns: model identifiers, the raw CVI scores, and one flag column per
# CVI and selection method (abs, elbow, gap).
cols = (['model', 'params', 'n_clust']
        + list(CVI)
        + [f'{index}_abs' for index in CVI]
        + [f'{index}_elbow' for index in CVI]
        + [f'{index}_gap' for index in CVI])

candidate_models = pd.DataFrame(columns=cols)

candidate_models['model'] = candidate_models['model'].astype('object')
candidate_models['params'] = candidate_models['params'].astype('object')

# The CVI scores are real-valued, so they belong with the float columns;
# casting them to int64 (as was done before) gave them the wrong dtype.
float_cols = [col for col in cols if col not in ['model', 'params', 'n_clust']]
candidate_models[float_cols] = candidate_models[float_cols].astype('float64')

# Only the number of clusters is an integer
int_cols = ['n_clust']
candidate_models[int_cols] = candidate_models[int_cols].astype('int64')

In [None]:
# Find best n
# For every configuration and validity index, flag the number of clusters
# selected by the gap criterion in `candidate_models`.
for model, config in grid:
    # Configurations are identified by their string form, as in `all_models`
    # and `gap_values`; storing and comparing the raw dict would rely on
    # pandas comparing each cell with a dict.
    str_params = dict_to_strg(config)

    for index in CVI:
        best_n = get_best_gap(model, config, index)

        # Nothing to record when no best value has been identified
        if best_n == 'none':
            continue

        row_id = ((candidate_models['model'] == model) &
                  (candidate_models['params'] == str_params) &
                  (candidate_models['n_clust'] == best_n))

        # The candidate is already listed: only add the flag for this index
        if row_id.any():
            candidate_models.loc[row_id, f'{index}_gap'] = 1
            continue

        # Otherwise create the row from the scores of the fitted model
        model_id = ((all_models['model'] == model) &
                    (all_models['params'] == str_params) &
                    (all_models['n_clust'] == best_n))
        fitted = all_models.loc[model_id]

        new_row = {
            'model': model,
            'params': str_params,
            'n_clust': best_n,
        }
        for col in ['min_clust_size', 'max_clust_size', 'silhouette',
                    'calinski_harabasz', 'davies_bouldin', 'dunn']:
            new_row[col] = fitted[col].values[0]
        new_row[f'{index}_gap'] = 1

        candidate_models = pd.concat([candidate_models, pd.DataFrame([new_row])],
                                     ignore_index=True)

### Step 3: identify the best model for each class

In [None]:
# For each validity index, keep the best gap-selected candidate of every
# model family
CVI_results = {}

for index in CVI:
    flagged = candidate_models[candidate_models[f'{index}_gap'] == 1]

    # Davies-Bouldin is sorted in ascending order, the other indices in
    # descending order
    ascending = index == 'davies_bouldin'

    winners = []
    for model in models:
        family_rows = flagged[flagged['model'] == model]
        if not family_rows.empty:
            winners.append(family_rows.sort_values(index, ascending=ascending).iloc[0])

    CVI_results[index] = winners

In [None]:
# One table of selected models per validity index
best_sil, best_ch, best_db, best_dunn = (
    pd.DataFrame(CVI_results[index])
    for index in ('silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn')
)

# Min/max for HDBSCAN

In [None]:
def _top_hdbscan(index, ascending):
    """Return the first row of `hdbscan_all` after sorting it by `index`."""
    return hdbscan_all.sort_values(index, ascending=ascending).head(1)

# Append the best-scoring HDBSCAN model to each table (Davies-Bouldin sorted
# in ascending order, the other indices in descending order)
best_sil = pd.concat([best_sil, _top_hdbscan('silhouette', False)], axis=0)
best_ch = pd.concat([best_ch, _top_hdbscan('calinski_harabasz', False)], axis=0)
best_db = pd.concat([best_db, _top_hdbscan('davies_bouldin', True)], axis=0)
best_dunn = pd.concat([best_dunn, _top_hdbscan('dunn', False)], axis=0)

In [None]:
def _drop_selection_flags(table):
    """Return `table` without the columns ending in 'elbow', 'abs' or 'gap'."""
    flag_cols = [col for col in table.columns if col.endswith(('elbow', 'abs', 'gap'))]
    return table.drop(columns=flag_cols)

# Remove the selection-flag columns from every table
best_sil = _drop_selection_flags(best_sil)
best_ch = _drop_selection_flags(best_ch)
best_db = _drop_selection_flags(best_db)
best_dunn = _drop_selection_flags(best_dunn)

## Elbow for AIC / BIC?

# 3. Results

In [None]:
from model_plot import plot_clusters, plot_cluster_profiles

## Silhouette

In [None]:
# Selected models ranked by decreasing silhouette; the other indices are hidden
(best_sil
 .drop(columns=['calinski_harabasz', 'davies_bouldin', 'dunn'])
 .sort_values('silhouette', ascending=False, ignore_index=True))

In [None]:
# refit best model

In [None]:
# plot clusters

In [None]:
# plot response variables

## Calinski-Harabasz

In [None]:
# Selected models ranked by decreasing Calinski-Harabasz score; the other indices are hidden
(best_ch
 .drop(columns=['silhouette', 'davies_bouldin', 'dunn'])
 .sort_values('calinski_harabasz', ascending=False, ignore_index=True))

## Davies-Bouldin

In [None]:
# Selected models ranked by increasing Davies-Bouldin score; the other indices are hidden
(best_db
 .drop(columns=['silhouette', 'calinski_harabasz', 'dunn'])
 .sort_values('davies_bouldin', ascending=True, ignore_index=True))

## Generalized Dunn 33

In [None]:
# Selected models ranked by decreasing Dunn score; the other indices are hidden
(best_dunn
 .drop(columns=['silhouette', 'calinski_harabasz', 'davies_bouldin'])
 .sort_values('dunn', ascending=False, ignore_index=True))