In [None]:
import ast
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import warnings

# Parallelization and monitoring
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm

# Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# Fitting
from stepmix.stepmix import StepMix
from src.model_fit import build_latent_model, do_StepMix, do_kmeans, do_AHC, do_hdbscan

# Selection
from kneed import KneeLocator
from src.model_select import bootstrap_gap, compute_gap, get_gap, baseline_chi2, bootstrap_chi2

# Statistical tests
from scipy.stats import chi2
from src.hopkins import hopkins
from stepmix.bootstrap import blrt_sweep

# Visualization
from src.model_plot import plot_clusters, plot_cluster_profiles

# Preparation
## Data

In [None]:
# Survey items retained for the analysis, grouped by questionnaire block.
# Items noted as "left out" are not part of the clustering variables.
var_list = [
    # Q2 (left out: 'clsetown', 'clsestat', 'clsenoam')
    'clseusa',
    # Q3 (left out: 'amancstr')
    'ambornin', 'amcit', 'amlived', 'amenglsh',
    'amchrstn', 'amgovt', 'amfeel',
    # Q4 (left out: 'amsports', 'lessprd')
    'amcitizn', 'amshamed', 'belikeus', 'ambetter', 'ifwrong',
    # Q5
    'proudsss', 'proudgrp', 'proudpol', 'prouddem', 'proudeco',
    'proudspt', 'proudart', 'proudhis', 'proudmil', 'proudsci',
]

# Column names of the "_f" and "_n" variants of every retained item.
var_list_f = [f"{name}_f" for name in var_list]
var_list_n = [f"{name}_n" for name in var_list]

In [None]:
# Load imputed data (the file path is relative to the current working directory)
data2004_i = pd.read_parquet("data/data2004_i.parquet") 

In [None]:
# --- Categorical outcomes ("_f" columns) ---
data_f = data2004_i[var_list_f]

## Label encoding: every column is encoded independently
data_f_lb = data_f.apply(lambda col: LabelEncoder().fit_transform(col))

## One-hot encoding (for BVR calculation):
## one 0/1 indicator per (variable, observed code), named "<variable>_<code>"
columns = [
    (data_f_lb[col] == val).astype(int).rename(f'{col}_{val}')
    for col in data_f_lb.columns
    for val in data_f_lb[col].unique()
]
data_f_oh = pd.concat(columns, axis=1)

# --- Numeric outcomes ("_n" columns) ---
data_n = data2004_i[var_list_n]

## Scaling and normalizing / not used
# scaler = MinMaxScaler(feature_range=(-1,1))
# data_n_scaled = scaler.fit_transform(data_n)
# normalizer = StandardScaler()
# data_n_norm = normalizer.fit_transform(data_n)

# --- Controls, dummy-coded to serve as covariates of the latent models ---
control_vars = ['sex', 'race_f', 'born_usa', 'party_fs', 'religstr_f', 'reltrad_f', 'region_f']
controls = data2004_i[control_vars]
controls_dum = pd.get_dummies(controls)

## Parameters
*For the Silhouette and Dunn indices, the Manhattan distance is used.*

In [None]:
# Names of the cluster validity indices (CVI) compared throughout the notebook
CVI = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']
# Upper bound of the number of clusters/classes explored by the grids below
max_clust = 33
# Passed to joblib as n_jobs; -1 asks joblib to use all available CPUs
max_threads = -1

# 1. Fit models

## 1.1. Latent

*With the [StepMix package](https://github.com/Labo-Lacourse/stepmix?tab=readme-ov-file).*

*The methods used are **categorical** (multinomial) for LCA and **gaussian_tied** for LPA (where all Gaussian components share the same general covariance matrix). The default gaussian_diag (where each Gaussian component has its own diagonal covariance matrix) as well as gaussian_full (where each gaussian component has its own general covariance matrix) encountered severe convergence issues and produced highly unstable results. They also showed a tendency to overfit, as they yielded much higher log-likelihoods (LL) compared to the other models and proved very sensitive to scaling.*

*Models with covariates are fitted through the 1 step approach, where the EM algorithm is run on both the measurement and structural models. Overall, 5 initializations with kmeans++ and slightly relaxed convergence thresholds (abs_tol = rel_tol = 1e-4) proved enough to get consistent results.*

In [None]:
# Measurement models ('categorical' for LCA, 'gaussian_tied' for LPA),
# each one fitted without and with covariates.
msrt = ['categorical', 'gaussian_tied']
covar = ['without', 'with']
latent_params = list(product(msrt, covar))

# One task per (number of classes, configuration).
# The grid is materialised as a list: a bare product() iterator is exhausted
# after the first run of the fitting cell (a re-run would silently fit
# nothing) and does not let tqdm display the total number of tasks.
clust_range = range(1, max_clust+1)
latent_grid = list(product(clust_range, latent_params))

In [None]:
time1 = time.time()

# One delayed call per grid entry; the inputs depend on the configuration:
# label-encoded data (plus the one-hot copy) for categorical models,
# numeric data otherwise, and the dummy-coded controls only 'with' covariates.
latent_jobs = (
    delayed(do_StepMix)(
        data_f_lb if msrt == 'categorical' else data_n,
        controls_dum if covar == 'with' else None,
        data_f_oh if msrt == 'categorical' else None,
        n,
        msrt,
        covar,
    )
    for n, (msrt, covar) in tqdm(latent_grid, desc='Fitting latent models')
)
results = Parallel(n_jobs=max_threads)(latent_jobs)

time2 = time.time()

latent_all = pd.DataFrame(results)
print(f"Time to fit latent models: {time2-time1:.2f} seconds")

In [None]:
def _latent_subset(msrt, covar):
    """Rows of `latent_all` fitted with the given configuration, re-indexed from 0."""
    selected = latent_all['params'] == {'msrt': msrt, 'covar': covar}
    return latent_all[selected].reset_index(drop=True)


def _pct_change(new, ref):
    """Relative change (in %) of `new` with respect to `ref`."""
    return 100 * ((new / ref) - 1)


LCA_nocov = _latent_subset('categorical', 'without')
LCA_covs = _latent_subset('categorical', 'with')
LPA_nocov = _latent_subset('gaussian_tied', 'without')
LPA_covs = _latent_subset('gaussian_tied', 'with')

# Effect of adding covariates, within each measurement model
improv_covs_LCA = _pct_change(LCA_covs['sabic'], LCA_nocov['sabic'])
improv_covs_LPA = _pct_change(LPA_covs['sabic'], LPA_nocov['sabic'])
# Effect of switching from categorical to Gaussian, without / with covariates
improv_gaussian_nocov = _pct_change(LPA_nocov['sabic'], LCA_nocov['sabic'])
improv_gaussian_covs = _pct_change(LPA_covs['sabic'], LCA_covs['sabic'])

In [None]:
# Relative SABIC change (in %) by number of classes.
# The "Clusters" column is rebuilt from max_clust instead of being read from
# the global `clust_range`: later cells rebind that name to a range starting
# at 2, so re-running this cell afterwards would fail on a length mismatch.
improv = pd.DataFrame({
    "Clusters": list(range(1, max_clust+1)),
    "LCA": improv_covs_LCA.round(1),
    "LPA": improv_covs_LPA.round(1),
    "no covar.": improv_gaussian_nocov.round(1),
    "covars.": improv_gaussian_covs.round(1)
}).set_index('Clusters')

# Two-level header: what was changed / for which models
improv.columns = pd.MultiIndex.from_tuples([
    ('Covariates', 'LCA'), 
    ('Covariates', 'LPA'), 
    ('Gaussian', 'no covar.'), 
    ('Gaussian', 'covars.')
])

print("Evolution of SABIC (in %) brought by:")
print("- introducing covariates in LCA and LPA models")
print("- switching from LCA to LPA for models without and with covariates")
improv

*The inclusion of covariates proves detrimental, as it increases the SABIC for both categorical and continuous models. While it slightly improves fit, it also introduces many additional free parameters, increasing the risk of overfitting. Moreover, covariates significantly increase computation time, so they will be excluded from subsequent analyses.*

*Switching from categorical to continuous models has mixed effects. It negatively impacts SABIC, which decreases steadily as the number of classes increases. This is due to differences in the number of free parameters: for Gaussian tied models, the parameter count starts higher but grows more slowly with additional classes. In terms of entropy, Gaussian models yield lower values up to five classes but higher values beyond that. Since their usefulness depends on the final number of classes chosen, they will be retained for further analysis.*

## 1.2. k-means

*With a custom implementation, as scikit-learn does not allow changing the linkage function.*

In [None]:
# Distance / centre-update ("linkage") combinations tried for k-means
dist = ['euclidean', 'manhattan', 'chebyshev']
link = ['mean', 'median', 'medoid']
kmeans_params = list(product(dist, link))

# One task per (number of clusters, configuration), starting at 2 clusters.
# Materialised as a list so that the fitting cell can be re-run (a product()
# iterator is exhausted after one pass) and tqdm knows the total.
clust_range = range(2, max_clust+1)
kmeans_grid = list(product(clust_range, kmeans_params))

In [None]:
time1 = time.time()

# One delayed k-means fit per (n, distance, linkage) entry of the grid
kmeans_jobs = (
    delayed(do_kmeans)(data_n, n, dist, link)
    for n, (dist, link) in tqdm(kmeans_grid, desc='Fitting KMeans models')
)
results = Parallel(n_jobs=max_threads)(kmeans_jobs)

time2 = time.time()
print(f"Time to fit k-means models: {time2-time1:.2f} seconds")

kmeans_all = pd.DataFrame(results)

## 1.3. AHC

In [None]:
# Every distance is crossed with the three generic linkages;
# Ward linkage is added for the Euclidean distance only.
distances = ['manhattan', 'euclidean', 'chebyshev', 'hamming']
linkages = ['single', 'average', 'complete']
ahc_params = [*product(distances, linkages), ('euclidean', 'ward')]

# One task per (number of clusters, configuration).
# Materialised as a list so that the fitting cell can be re-run (a product()
# iterator is exhausted after one pass) and tqdm knows the total.
clust_range = range(1, max_clust+1)
ahc_grid = list(product(clust_range, ahc_params))

In [None]:
time1 = time.time()

# One delayed AHC fit per (n, distance, linkage) entry of the grid
ahc_jobs = (
    delayed(do_AHC)(data_n, n, dist, link)
    for n, (dist, link) in tqdm(ahc_grid, desc='Fitting AHC models')
)
results = Parallel(n_jobs=max_threads)(ahc_jobs)

time2 = time.time()
print(f"Time to fit AHC models: {time2-time1:.2f} seconds")

ahc_all = pd.DataFrame(results)

## 1.4. HDBSCAN

In [None]:
# HDBSCAN is tuned over the distance, the minimum cluster size and the
# minimum number of samples (no number of clusters is imposed).
distances = ['manhattan', 'euclidean', 'chebyshev', 'mahalanobis', 'hamming']
min_cluster_sizes = range(2, 21)
min_samples_range = range(1, 21)
# Materialised as a list so that the fitting cell can be re-run (a product()
# iterator is exhausted after one pass) and tqdm knows the total.
hdb_params = list(product(distances, min_cluster_sizes, min_samples_range))

In [None]:
time1 = time.time()

# One delayed HDBSCAN fit per (distance, min cluster size, min samples) triple
hdbscan_jobs = (
    delayed(do_hdbscan)(data_n, dist, min_clust, min_smpl)
    for dist, min_clust, min_smpl in tqdm(hdb_params, desc='Fitting HDBSCAN models')
)
results = Parallel(n_jobs=max_threads)(hdbscan_jobs)

time2 = time.time()
print(f"Time to fit HDBSCAN models: {time2-time1:.2f} seconds")

hdbscan_all = pd.DataFrame(results)

## 1.5. Aggregate results and compare CVI

In [None]:
# Stack the result tables of the four model families and save them to disk
model_tables = [latent_all, kmeans_all, ahc_all, hdbscan_all]
all_models = pd.concat(model_tables, ignore_index=True)
all_models.to_csv("output/models/all_models.csv", index=False)

In [None]:
# Display names of the indices (the keys are the column names in all_models)
labels = {
    'silhouette': 'Silhouette',
    'calinski_harabasz': 'Calinski-Harabasz',
    'davies_bouldin': 'Davies-Bouldin',
    'dunn': 'Dunn 43'
}
all_CVI = all_models[list(labels)]

# Rank (Spearman) correlation between the indices across all fitted models
correlations = (
    all_CVI
    .corr(method='spearman')
    .rename(index=labels, columns=labels)
)

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", cbar=True,
            square=True, linewidths=0.5, vmin=-1, vmax=1, ax=ax)
plt.show()

*The correlation between CVIs is generally low. Therefore, all will be retained for subsequent analyses.*

# 2. Select models
## 2.1. Fit criteria for latent models

In [None]:
# Work on a copy: the test results computed below are written into latent_stats,
# leaving latent_all as produced by the fitting step
latent_stats = latent_all.copy()

### 2.1.1. Absolute value for AIC / BIC / SABIC / entropy

In [None]:
# Best latent model over all configurations and numbers of classes:
# lowest value for the information criteria, highest value for relative entropy.
# Each result is a single row (pandas Series) of latent_all.
min_aic = latent_all.sort_values('aic', ascending=True).iloc[0]
min_bic = latent_all.sort_values('bic', ascending=True).iloc[0]
min_sabic = latent_all.sort_values('sabic', ascending=True).iloc[0]
max_entropy = latent_all.sort_values('relative_entropy', ascending=False).iloc[0]

### 2.1.2. Elbow method for AIC / BIC / SABIC / entropy

In [None]:
def elbow_method(df, val_index):
    """Return the rows of `df` located at the elbow of `val_index` over `n_clust`.

    Rows with a missing `val_index` are ignored. The elbow is searched on a
    concave increasing curve for 'relative_entropy' and on a convex decreasing
    curve for any other criterion. When KneeLocator finds no knee
    (`knee` is None) no row matches and an empty DataFrame is returned.
    """
    valid = df.dropna(subset=[val_index])

    if val_index == 'relative_entropy':
        shape = {'curve': 'concave', 'direction': 'increasing'}
    else:
        shape = {'curve': 'convex', 'direction': 'decreasing'}

    locator = KneeLocator(valid['n_clust'], valid[val_index], **shape)
    at_knee = valid["n_clust"] == locator.knee
    return valid[at_knee]

def best_elbow_model(index):
    """Pick the best elbow model for criterion `index` across latent configurations.

    The elbow of `index` is located separately for each of the four
    (measurement model, covariates) configurations found in `latent_stats`;
    the elbow models are then ranked on `index` and the best one is returned.

    Parameters
    ----------
    index : str
        Column of `latent_stats` to use ('aic', 'bic', 'sabic' or
        'relative_entropy').

    Returns
    -------
    pandas.Series or None
        Row of the best elbow model, or None when no configuration has an elbow.
    """
    elbow_rows = []

    for msrt, covar in product(['categorical', 'gaussian_tied'], ['without', 'with']):
        mask = (latent_stats['params'] == {'msrt': msrt, 'covar': covar})
        elbow_res = elbow_method(latent_stats[mask], index)
        # elbow_method signals "no elbow" with an empty frame
        if elbow_res is not None and not elbow_res.empty:
            elbow_rows.append(elbow_res)

    if not elbow_rows:
        return None

    # Concatenate once instead of growing a DataFrame inside the loop
    candidate_models = pd.concat(elbow_rows, ignore_index=True)

    # Relative entropy is maximised (see max_entropy), every other criterion is
    # minimised; ranking everything in ascending order picked the *worst*
    # entropy candidate.
    higher_is_better = (index == 'relative_entropy')
    return candidate_models.sort_values(index, ascending=not higher_is_better).iloc[0]

In [None]:
# Best elbow model for each criterion (None when no elbow could be located)
elbow_aic = best_elbow_model('aic')
elbow_bic = best_elbow_model('bic')
elbow_sabic = best_elbow_model('sabic')
elbow_entropy = best_elbow_model('relative_entropy')

### 2.1.3. Statistical tests for log-likelihood
*LRT - not advisable for comparing models with $C$ and $C-1$ classes, as the resulting test statistic does not converge towards a $\chi^2$ distribution under the null hypothesis.*

In [None]:
def LRT(models):
    """Likelihood-ratio statistics of each model against the previous row.

    Parameters
    ----------
    models : pandas.DataFrame
        Models of one configuration, ordered by number of classes, with the
        columns 'LL' and 'df'.

    Returns
    -------
    pandas.DataFrame
        Indexed like `models`, with the columns '-2LL' (-2 times the change in
        'LL' between consecutive rows), '-2LL_df' (change in 'df'),
        'LRT_pval' (upper-tail chi-square probability) and '-2LL_red_%'
        (reduction of 'LL' relative to the first row, in %). The first row has
        no predecessor, so its differences and p-value are NaN.
    """
    # NOTE(review): with this sign, the statistic is positive only when 'LL'
    # decreases from one row to the next - confirm the sign convention of the
    # 'LL' and 'df' columns returned by do_StepMix.
    _2LL_stat = - 2 * (models['LL'].diff())
    _2LL_df = models['df'].diff()

    # Survival function rather than 1 - cdf: for large statistics the cdf
    # rounds to exactly 1 and the p-value would collapse to 0.
    _2LL_p_val = chi2.sf(_2LL_stat, _2LL_df)

    # L2 reduction, relative to the first model of the sequence
    _2LL_red = 1 - (models['LL'] / models['LL'].iloc[0])

    results = pd.DataFrame({
        '-2LL': _2LL_stat,
        '-2LL_df': _2LL_df,
        'LRT_pval': _2LL_p_val,
        '-2LL_red_%': 100 * _2LL_red
    }, index=models.index)

    return results

In [None]:
# Run the LRT within each configuration and store its columns in latent_stats
for msrt, covar in product(['categorical', 'gaussian_tied'], ['without', 'with']):
    mask = (latent_stats['params'] == {'msrt': msrt, 'covar': covar})
    models = latent_stats[mask]
    lrt_results = LRT(models)
    lrt_columns = ['-2LL', '-2LL_df', 'LRT_pval', '-2LL_red_%']
    latent_stats.loc[mask, lrt_columns] = lrt_results.values

*BLRT - not implemented in StepMix for models with covariates.*

In [None]:
# Number of bootstrap repetitions passed to blrt_sweep
iters = 100

time1 = time.time()
# BLRT is run for the models without covariates only (see the note above)
for msrt in ['categorical', 'gaussian_tied']:
    latent_mod = build_latent_model(n = None, msrt = msrt, covar = 'without')
    
    # FutureWarnings raised while bootstrapping are silenced for this block only
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        
        blrt = blrt_sweep(
            latent_mod,
            data_f_lb if msrt == 'categorical' else data_n,
            low=1,
            high=max_clust,
            n_repetitions=iters)
        
        # Add a row for the 1-class model
        # (it has no smaller model to be tested against, hence the NaN p-value)
        blrt = pd.concat([pd.DataFrame({'p': [np.nan]}), blrt]).reset_index(drop=True)
        
    # Store the p-values in the rows of the matching configuration.
    # NOTE(review): assumes blrt has exactly one row per number of classes, in the
    # same order as the rows of latent_stats selected by mask - confirm.
    mask = (latent_stats['params'] == {'msrt': msrt, 'covar': 'without'})
    latent_stats.loc[mask, ['BLRT_pval']] = blrt.values
    
time2 = time.time()

In [None]:
# Wall-clock duration of the BLRT cell above (time1 / time2 are set there)
print(f"Time to bootstrap LRT: {time2-time1:.2f} seconds")

*BVRT - requires a different bootstrapping procedure than the BLRT, with samples generated under an alternative assumption.*

In [None]:
# Number of bootstrap samples per number of classes
B = 100

# Only the categorical model without covariates is tested
covar = 'without'
config = {'msrt': 'categorical', 'covar': covar}
# Kept under its own name so that the `controls` DataFrame built in the data
# preparation cell is not overwritten.
bvr_controls = controls_dum if covar == 'with' else None

time1 = time.time()
for n in range(1, max_clust+1):
    # Reference statistic of the n-class model, and the data passed on to the bootstrap
    ref_l2, ref_data = baseline_chi2(
        data = data_f_lb,
        bvr_data = data_f_oh,
        n = n,
        covar = covar,
        controls = bvr_controls)
    
    btsp_results = Parallel(n_jobs=max_threads)(
        delayed(bootstrap_chi2)(ref_data, bvr_controls, n, covar, b)
        for b in tqdm(range(1, B+1), desc = f"Bootstrapping Chi2 (n={n})"))
    
    # Parallel returns a plain list: convert it before the element-wise comparison
    s = (np.asarray(btsp_results) > ref_l2).sum()
    # Bootstrap p-value as a proportion in [0, 1]. It was previously stored in
    # percent, which made every value exceed the 0.05 threshold applied by
    # select_pval in the results section.
    btsp_chi2_pval = (s+1) / (B+1)
    
    row_id = ((latent_stats['params'] == config) & (latent_stats['n_clust'] == n))
    latent_stats.loc[row_id, 'btsp_chi2_pval'] = btsp_chi2_pval

time2 = time.time()

In [None]:
# Wall-clock duration of the bootstrapped Chi2 cell above (time1 / time2 are set there)
print(f"Time to bootstrap Chi2: {time2-time1:.2f} seconds")

## 2.2. Gap statistic for latent models / kmeans / AHC

In [None]:
# Model families covered by the gap statistic (HDBSCAN is handled separately)
gap_model_tables = [latent_all, kmeans_all, ahc_all]
all_models = pd.concat(gap_model_tables, ignore_index=True)

In [None]:
# Ensure params are a dictionary to avoid errors afterwards
# (only the first row is inspected; string values are parsed with
# ast.literal_eval, which evaluates Python literals only)
if isinstance(all_models['params'].iloc[0], str):
    all_models['params'] = all_models['params'].apply(ast.literal_eval)

*Step 1: compute the gap statistic for each model-config*

In [None]:
# Latent models are bootstrapped without covariates only
latent_params = [
    ('categorical', 'without'),
    ('gaussian_tied', 'without'),
]

# Configurations to bootstrap, by model family
params = {
    'kmeans': kmeans_params,
    'AHC': ahc_params,
    'latent': latent_params,
}

# Meaning of the tuple positions of each configuration
param_names = {
    'kmeans': ['dist', 'link'],
    'AHC': ['dist', 'link'],
    'latent': ['msrt', 'covar'],
}

models = ['kmeans', 'AHC', 'latent']

In [None]:
# Bootstrap
iters = 100

# One task per (model family, configuration, number of clusters, repetition).
# Latent models start at 1 class, the other families at 2 clusters.
bootstrap_grid = [
    (model, dict(zip(param_names[model], param_values)), n_val, n_iter)
    for model in models
    for param_values in params[model]
    for n_val in range(1 if model == 'latent' else 2, max_clust+1)
    for n_iter in range(iters)
]

time1 = time.time()
gap_jobs = (
    delayed(bootstrap_gap)(
        # Categorical latent models use the label-encoded data (and its
        # one-hot copy); every other model uses the numeric data
        data = data_f_lb if model == 'latent' and config.get('msrt') == 'categorical' else data_n,
        controls = controls_dum if model == 'latent' and config.get('covar') == 'with' else None,
        bvr_data = data_f_oh if model == 'latent' and config.get('msrt') == 'categorical' else None,
        n = n,
        model = model,
        params = config,
        iter_num = iter_num)
    for model, config, n, iter_num in tqdm(bootstrap_grid, desc='Bootstrapping CVIs')
)
results = Parallel(n_jobs=max_threads)(gap_jobs)
time2 = time.time()

bootstrap_results = pd.concat(results).reset_index(drop=True)
print(f"Time to compute gap statistics: {time2-time1:.2f} seconds")

In [None]:
# Compute gap values
# One (model family, configuration dict) pair per bootstrapped configuration
model_grid = []
for model in models:
    for param_values in params[model]:
        model_grid.append((model, dict(zip(param_names[model], param_values))))

gap_frames = []
for model, config in model_grid:
    # Bootstrap replicates belonging to this configuration
    rows_id = ((bootstrap_results['model'] == model) & (bootstrap_results['params'] == config))
    gap_frames.append(
        compute_gap(bootstrap_results[rows_id], all_models, model, config, CVI)
    )

gap_values = pd.concat(gap_frames, ignore_index=True)

*Step 2: identify the optimal number of clusters for each model-config*

In [None]:
# Create df to store results
cols = ['model', 'params', 'n_clust'] + \
       [index for index in CVI] + \
       [f'{index}_gap' for index in CVI]

# Column dtypes: the CVI values and the gap flags are floating point (the
# flags are NaN when unset), only n_clust is an integer. The CVI columns were
# previously cast to int64.
int_cols = ['n_clust']
float_cols = [col for col in cols if col not in ['model', 'params', 'n_clust']]
dtypes = {col: 'object' for col in ['model', 'params']}
dtypes.update({col: 'int64' for col in int_cols})
dtypes.update({col: 'float64' for col in float_cols})

# Empty, fully typed table (one empty Series per column)
candidate_models = pd.DataFrame({col: pd.Series(dtype=dtypes[col]) for col in cols})

In [None]:
# Find best n
# Candidate rows are collected in a dict keyed by (model, configuration, n)
# and turned into a DataFrame once, instead of concatenating one-row frames
# inside the loop.
candidate_cols = (['model', 'params', 'n_clust']
                  + list(CVI)
                  + [f'{index}_gap' for index in CVI]
                  + ['min_clust_size', 'max_clust_size'])
selected = {}

for model, config in model_grid:
    for index in CVI:
        best_n = get_gap(gap_values, model, config, index)

        # get_gap returns the string 'none' when no best value is identified
        if best_n == 'none':
            continue

        key = (model, tuple(sorted(config.items())), best_n)

        # First time this model is selected: copy its descriptive values
        if key not in selected:
            model_id = ((all_models['model'] == model) &
                        (all_models['params'] == config) &
                        (all_models['n_clust'] == best_n))
            fitted = all_models.loc[model_id]
            if fitted.empty:
                raise IndexError(
                    f"No fitted model in all_models for {model} {config} with n_clust={best_n}")
            fitted = fitted.iloc[0]

            row = {'model': model, 'params': config, 'n_clust': best_n}
            for col in ['min_clust_size', 'max_clust_size',
                        'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']:
                row[col] = fitted[col]
            selected[key] = row

        # Flag the model as selected by the gap statistic of this index
        selected[key][f'{index}_gap'] = 1

# Flags that were never set stay NaN
candidate_models = pd.DataFrame(list(selected.values()), columns=candidate_cols)

*Step 3: for each CVI, identify the best model within each model family (k-means, AHC, latent)*

In [None]:
# For every index, keep the best gap-selected model of each model family
CVI_results = {}

for index in CVI:
    flagged = candidate_models[candidate_models[f'{index}_gap'] == 1]
    # Davies-Bouldin is minimised, the other indices are maximised
    lower_is_better = (index == 'davies_bouldin')

    winners = []
    for model in models:
        sub_df = flagged[flagged['model'] == model]
        if not sub_df.empty:
            best_mod = sub_df.sort_values(index, ascending=lower_is_better).iloc[0]
            winners.append(best_mod)

    CVI_results[index] = winners

In [None]:
# One table of per-family winners for each index
best_sil, best_ch, best_db, best_dunn = (
    pd.DataFrame(CVI_results[index])
    for index in ('silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn')
)

## 2.3. Min/max for HDBSCAN

In [None]:
def _with_best_hdbscan(best, index, ascending):
    """Append to `best` the HDBSCAN model ranked first on `index`."""
    top_hdbscan = hdbscan_all.sort_values(index, ascending=ascending).iloc[0:1]
    return pd.concat([best, top_hdbscan], axis=0)


# Davies-Bouldin is minimised, the other indices are maximised
best_sil = _with_best_hdbscan(best_sil, 'silhouette', ascending=False)
best_ch = _with_best_hdbscan(best_ch, 'calinski_harabasz', ascending=False)
best_db = _with_best_hdbscan(best_db, 'davies_bouldin', ascending=True)
best_dunn = _with_best_hdbscan(best_dunn, 'dunn', ascending=False)

In [None]:
def _drop_selection_flags(df):
    """Return `df` without the columns whose name ends with 'elbow', 'abs' or 'gap'."""
    flag_cols = [col for col in df.columns if col.endswith(('elbow', 'abs', 'gap'))]
    return df.drop(columns=flag_cols)


best_sil = _drop_selection_flags(best_sil)
best_ch = _drop_selection_flags(best_ch)
best_db = _drop_selection_flags(best_db)
best_dunn = _drop_selection_flags(best_dunn)

# 3. Results

In [None]:
def disp_params(d):
    """Describe a latent configuration dict (keys 'msrt' and 'covar') in words."""
    return "{} {} covariates".format(d['msrt'], d['covar'])

def refit_best_model(df):
    """Refit the model described by the row labelled 0 of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of selected models with the columns 'model', 'params' (dict of
        keyword arguments for the fitting function) and 'n_clust'; only the
        row with index label 0 is used.

    Returns
    -------
    Whatever the fitting function returns when called with ``refit=True``
    (used downstream as the predicted cluster assignment).

    Raises
    ------
    ValueError
        If the 'model' value is not one of 'latent', 'kmeans', 'AHC', 'HDBSCAN'.
    """
    model = df.loc[0, 'model']
    config = df.loc[0, 'params']
    n_clust = int(df.loc[0, 'n_clust'])

    if model == 'latent':
        results = do_StepMix(
            data_f_lb if config['msrt'] == 'categorical' else data_n,
            # Read the covariate setting from the model's own configuration;
            # the global `covar` holds whatever value the last cell left in it.
            controls_dum if config['covar'] == 'with' else None,
            data_f_oh if config['msrt'] == 'categorical' else None,                
            n_clust, 
            refit = True, 
            **config)

    elif model == 'kmeans':
        results = do_kmeans(data_n, n_clust, refit = True, **config)
    
    elif model == 'AHC':
        results = do_AHC(data_n, n_clust, refit = True, **config)
    
    elif model == 'HDBSCAN':
        # HDBSCAN takes no number of clusters
        results = do_hdbscan(data_n, refit = True, **config)

    else:
        raise ValueError(f"Unknown model type: {model!r}")

    return results

## 3.1. Latent models / fit criteria
### 3.1.1. Absolute values

In [None]:
# (goal and criterion name, best row) pairs, reported in the same sentence pattern
for goal, best in [('minimizing AIC', min_aic),
                   ('minimizing BIC', min_bic),
                   ('minimizing SABIC', min_sabic),
                   ('maximizing entropy', max_entropy)]:
    print(f"Model {goal} is {disp_params(best['params'])} and {best['n_clust']} clusters.")

### 3.1.2. Elbow method

In [None]:
# Each criterion is reported on its own line; None means no elbow was found
print("Best model according to the Elbow method applied to...")

if elbow_aic is None:
    print("- AIC is None")
else:
    print(f"- AIC is {disp_params(elbow_aic['params'])} and {elbow_aic['n_clust']} clusters.")

if elbow_bic is None:
    print("- BIC is None")
else:
    print(f"- BIC is {disp_params(elbow_bic['params'])} and {elbow_bic['n_clust']} clusters.")

if elbow_sabic is None:
    print("- SABIC is None")
else:
    print(f"- SABIC is {disp_params(elbow_sabic['params'])} and {elbow_sabic['n_clust']} clusters.")

if elbow_entropy is None:
    print("- Entropy is None")
else:
    print(f"- Relative entropy is {disp_params(elbow_entropy['params'])} and {elbow_entropy['n_clust']} clusters")

### 3.1.3. Statistical tests

In [None]:
# Models without covariates, with all the test results added in section 2.1
is_lca = latent_stats['params'] == {'msrt': 'categorical', 'covar': 'without'}
is_lpa = latent_stats['params'] == {'msrt': 'gaussian_tied', 'covar': 'without'}

LCA = latent_stats[is_lca].reset_index(drop=True)
LPA = latent_stats[is_lpa].reset_index(drop=True)

In [None]:
def select_pval(df, crit, gap=0, threshold=0.05):
    """Number of clusters of the first model whose p-value exceeds `threshold`.

    `gap` is added to that number (e.g. -1 to select the model preceding the
    first non-significant one). Missing p-values never exceed the threshold.
    When no row qualifies, an explanatory string is returned instead of a
    number.
    """
    non_significant = df[df[crit] > threshold]
    if not non_significant.empty:
        return non_significant.iloc[0]['n_clust'] + gap
    return f"None (up to max_clust = {max_clust})"

In [None]:
# LRT - select the last model to be significantly different from the previous one
# (i.e. the model just before the first non-significant comparison)
print("Optimal number of clusters according to LRT:")
for label, table in (('LCA', LCA), ('LPA', LPA)):
    print(f"- {select_pval(table, 'LRT_pval', -1)} for {label}")

In [None]:
# BLRT - select the last model to be significantly different from the previous one
# (i.e. the model just before the first non-significant comparison)
print("Optimal number of clusters according to BLRT:")
for label, table in (('LCA', LCA), ('LPA', LPA)):
    print(f"- {select_pval(table, 'BLRT_pval', -1)} for {label}")

In [None]:
# Chi2 - select the first model to become non-significant
# (i.e., where the local independence assumption cannot be rejected)
print("Optimal number of clusters according to raw Chi2:")
for label, table in (('LCA', LCA), ('LPA', LPA)):
    print(f"- {select_pval(table, 'chi2_pval')} for {label}")

In [None]:
# Bootstrapped Chi2 - select the first model to become non-significant
# (i.e., where the local independence assumption cannot be rejected)
print("Optimal number of clusters according to bootstrapped Chi2:")
for label, table in (('LCA', LCA), ('LPA', LPA)):
    print(f"- {select_pval(table, 'btsp_chi2_pval')} for {label}")

### 3.1.4. Models

In [None]:
# Columns reported for the latent models: identification, cluster sizes,
# validity indices, fit criteria and likelihood-based test results
cols = ['model', 'params', 'n_clust', 'min_clust_size', 'max_clust_size', 'silhouette', 'calinski_harabasz',
        'davies_bouldin', 'dunn', 'aic', 'bic', 'sabic','relative_entropy', 'classif_error', 'df', 'LL',
        '-2LL_red_%', 'LRT_pval',  'BLRT_pval']

In [None]:
# LCA table: common columns plus the two Chi2 p-values; saved, then displayed
# without the identification columns
LCA = LCA[cols + ['chi2_pval', 'btsp_chi2_pval']]
LCA.to_csv("output/models/LCA_models.csv", index=False)
LCA.drop(columns=['model', 'params'])

In [None]:
# LPA table: common columns only; saved, then displayed without the
# identification columns
LPA = LPA[cols]
LPA.to_csv("output/models/LPA_models.csv", index=False)
LPA.drop(columns=['model', 'params'])

In [None]:
# Number of classes retained by the BLRT for each measurement model
# (select_pval returns a string when no model qualifies)
best_f_blrt = select_pval(LCA, 'BLRT_pval', -1)   # LCA (categorical)
best_c_blrt = select_pval(LPA, 'BLRT_pval', -1)   # LPA (Gaussian)

lca_found = not isinstance(best_f_blrt, str)
lpa_found = not isinstance(best_c_blrt, str)

if not lca_found and not lpa_found:
    # Nothing to refit or plot: stop here with an explicit message instead of
    # failing below on an undefined blrt_model.
    raise RuntimeError("No best model according to BLRT")
elif not lca_found:
    blrt_model = LPA[LPA['n_clust'] == best_c_blrt]
elif not lpa_found:
    blrt_model = LCA[LCA['n_clust'] == best_f_blrt]
else:
    # Both candidates exist: keep the one with the lower SABIC.
    # Each table is looked up with its own number of classes (the two lookups
    # used to be swapped), and the LCA model is kept when its SABIC is lower
    # or equal (the comparison used to favour the higher SABIC).
    LCA_sabic = LCA[LCA['n_clust'] == best_f_blrt]['sabic'].iloc[0]
    LPA_sabic = LPA[LPA['n_clust'] == best_c_blrt]['sabic'].iloc[0]
    if LCA_sabic <= LPA_sabic:
        blrt_model = LCA[LCA['n_clust'] == best_f_blrt]
    else:
        blrt_model = LPA[LPA['n_clust'] == best_c_blrt]

blrt_model = blrt_model.reset_index(drop=True)
blrt_model.to_csv("output/models/best_latent.csv", index=False)
pred_clust_latent = refit_best_model(blrt_model)

In [None]:
# Plot the partition on the data the selected model was fitted on:
# label-encoded data for a categorical model, numeric data otherwise
plot_clusters(
    data_f_lb if blrt_model.loc[0, 'params'].get('msrt') == 'categorical' else data_n,
    pred_clust_latent,
    '2D PCA Projection of the Partition According from the best latent model according to the BLRT',
    'latent_clust')

In [None]:
# Cluster profiles are only drawn when the smallest cluster has more than 5 members
if np.unique(pred_clust_latent, return_counts=True)[1].min() > 5:
    plot_cluster_profiles(data_n, pred_clust_latent, feature_names = var_list, sd = 1, title = 'BLRT', filename = 'latent_patterns')

## 3.2. CVI

### 3.2.1. Silhouette

In [None]:
# Candidates ranked by silhouette (higher first), without the other indices
sil = (
    best_sil
    .sort_values('silhouette', ascending=False)
    .drop(columns=['calinski_harabasz', 'davies_bouldin', 'dunn'])
    .reset_index(drop=True)
)
sil.to_csv("output/models/best_sil.csv", index=False)
sil

In [None]:
# Refit the top-ranked model (row 0 of sil); the result is used below as the
# cluster assignment
pred_clust_sil = refit_best_model(sil)

In [None]:
# Only a categorical latent model is plotted on the label-encoded data
sil_is_categorical = (sil.loc[0, 'model'] == 'latent') and (sil.loc[0, 'params'].get('msrt') == 'categorical')
plot_clusters(
    data_f_lb if sil_is_categorical else data_n,
    pred_clust_sil,
    '2D PCA Projection of the Best Partition According to the Silhouette Index',
    'sil_clust',
)

In [None]:
# Cluster profiles are only drawn when the smallest cluster has more than 5 members
if np.unique(pred_clust_sil, return_counts=True)[1].min() > 5:
    plot_cluster_profiles(data_n, pred_clust_sil, feature_names = var_list, sd = 1, title = 'Silhouette', filename = 'sil_patterns')

### 3.2.2. Calinski-Harabasz

In [None]:
# Candidates ranked by Calinski-Harabasz (higher first), without the other indices
ch = (
    best_ch
    .sort_values('calinski_harabasz', ascending=False)
    .drop(columns=['silhouette', 'davies_bouldin', 'dunn'])
    .reset_index(drop=True)
)
ch.to_csv("output/models/best_ch.csv", index=False)
ch

In [None]:
# Refit the top-ranked model (row 0 of ch); the result is used below as the
# cluster assignment
pred_clust_ch = refit_best_model(ch)

In [None]:
# Only a categorical latent model is plotted on the label-encoded data
ch_is_categorical = (ch.loc[0, 'model'] == 'latent') and (ch.loc[0, 'params'].get('msrt') == 'categorical')
plot_clusters(
    data_f_lb if ch_is_categorical else data_n,
    pred_clust_ch,
    '2D PCA Projection of the Best Partition According to the Calinski-Harabasz Index',
    'ch_clust',
)

In [None]:
# Cluster profiles are only drawn when the smallest cluster has more than 5 members
if np.unique(pred_clust_ch, return_counts=True)[1].min() > 5:
    plot_cluster_profiles(data_n, pred_clust_ch, feature_names = var_list, sd = 1, title = 'Calinski-Harabaz', filename = 'ch_patterns')

### 3.2.3. Davies-Bouldin

In [None]:
# Candidates ranked by Davies-Bouldin (lower first), without the other indices
db = (
    best_db
    .sort_values('davies_bouldin', ascending=True)
    .drop(columns=['silhouette', 'calinski_harabasz', 'dunn'])
    .reset_index(drop=True)
)
db.to_csv("output/models/best_db.csv", index=False)
db

In [None]:
# Refit the top-ranked model (row 0 of db); the result is used below as the
# cluster assignment
pred_clust_db = refit_best_model(db)

In [None]:
# Only a categorical latent model is plotted on the label-encoded data
db_is_categorical = (db.loc[0, 'model'] == 'latent') and (db.loc[0, 'params'].get('msrt') == 'categorical')
plot_clusters(
    data_f_lb if db_is_categorical else data_n,
    pred_clust_db,
    '2D PCA Projection of the Best Partition According to the Davies-Bouldin Index',
    'db_clust',
)

In [None]:
# Cluster profiles are only drawn when the smallest cluster has more than 5 members
if np.unique(pred_clust_db, return_counts=True)[1].min() > 5:
    plot_cluster_profiles(data_n, pred_clust_db, feature_names = var_list, sd = 1, title = 'Davies-Bouldin', filename = 'db_patterns')

### 3.2.4. Generalized Dunn 43

In [None]:
# Candidates ranked by the Dunn index (higher first), without the other indices
gd = (
    best_dunn
    .sort_values('dunn', ascending=False)
    .drop(columns=['silhouette', 'calinski_harabasz', 'davies_bouldin'])
    .reset_index(drop=True)
)
gd.to_csv("output/models/best_gd.csv", index=False)
gd

In [None]:
# Refit the top-ranked model (row 0 of gd); the result is used below as the
# cluster assignment
pred_clust_gd = refit_best_model(gd)

In [None]:
# Only a categorical latent model is plotted on the label-encoded data
gd_is_categorical = (gd.loc[0, 'model'] == 'latent') and (gd.loc[0, 'params'].get('msrt') == 'categorical')
plot_clusters(
    data_f_lb if gd_is_categorical else data_n,
    pred_clust_gd,
    '2D PCA Projection of the Best Partition According to the Generalized Dunn Index',
    'gd_clust',
)

In [None]:
# Cluster profiles are only drawn when the smallest cluster has more than 5 members
if np.unique(pred_clust_gd, return_counts=True)[1].min() > 5:
    plot_cluster_profiles(data_n, pred_clust_gd, feature_names = var_list, sd = 1, title = 'Generalized Dunn', filename = 'gd_patterns')

# 4. Clusterability - Hopkins Statistic

*Function from the [pyclustertend package](https://pyclustertend.readthedocs.io/en/latest/_modules/pyclustertend/hopkins.html), which could not be installed because its dependencies are outdated.*

In [None]:
# Hopkins statistic on the retained items; the second argument is the number
# of rows of data_n
n_obs = data_n.shape[0]
hopkins_stat = hopkins(data_n.values, n_obs)
print(f"Hopkins stat on restricted data set: {hopkins_stat:.3f}")

In [None]:
# Same statistic with the six items left out of var_list added back
discarded_items = ['clsetown', 'clsestat', 'clsenoam', 'amancstr', 'amsports', 'lessprd']
full_var_list = var_list + discarded_items
full_var_list_n = [f"{var}_n" for var in full_var_list]
data_n_full = data2004_i[full_var_list_n]

# The row count is taken from data_n; data_n and data_n_full are both column
# subsets of data2004_i
n_rows = data_n.shape[0]
hopkins_stat = hopkins(data_n_full.values, n_rows)
print(f"Hopkins stat on full data set: {hopkins_stat:.3f}")

*The inclusion of questions discarded by the authors slightly improves clusterability.*