In [1]:
import ast
import numpy as np
import pandas as pd
import time
import warnings

from itertools import product
from joblib import Parallel, delayed

from scipy.stats import chi2
from stepmix.bootstrap import blrt_sweep
from stepmix.stepmix import StepMix
from tqdm import tqdm

from src.model_fit import do_StepMix
from src.model_select import blrt_sweep_custom

In [2]:
max_threads = -1
bootstrap_iters = 1000

In [3]:
var_list = [
    'clseusa', 'ambornin', 'amcit', 'amlived', 'amenglsh', 'amchrstn',
    'amgovt', 'amfeel', 'amcitizn', 'amshamed', 'belikeus', 'ambetter',
    'ifwrong', 'proudsss', 'proudgrp', 'proudpol', 'prouddem', 'proudeco',
    'proudspt', 'proudart', 'proudhis', 'proudmil', 'proudsci']

var_list_n = [var + "_n" for var in var_list]

# n = 830

In [4]:
data2004 = pd.read_parquet(f"data/data2004_830.parquet")
data_f = data2004[var_list_n] - 1 # reindexing to 0 (as expected by StepMix)

In [5]:
results = Parallel(n_jobs=max_threads)(
    delayed(do_StepMix)(
        data = data_f,
        controls = None,
        n = n_clust,
        msrt = 'categorical',
        covar = 'without',
        weights = None)
    for n_clust in tqdm(range(1,9), desc='Fitting latent models'))

replic_LCA = pd.DataFrame(results).drop(columns = ['model', 'params', 'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn'])
replic_LCA['l2_red'] = 100 * (replic_LCA['LL'].iloc[0] - replic_LCA['LL']) / replic_LCA['LL'].iloc[0]

Fitting latent models: 100%|██████████| 8/8 [00:00<00:00, 2257.73it/s]


In [6]:
replic_LCA.style

Unnamed: 0,n_clust,min_clust_size,max_clust_size,aic,bic,sabic,relative_entropy,classif_error,df,LL,l2_red
0,1,830,830,38299.429095,38648.814597,38911.202214,,0.0,755,-22.982789,-0.0
1,2,284,546,36240.905998,36944.398427,37472.71944,0.854287,0.043047,680,-21.652353,5.788835
2,3,191,335,35556.628027,36614.227384,37408.481793,0.83981,0.072163,605,-21.149776,7.975589
3,4,127,307,35117.919085,36529.62537,37589.813174,0.868203,0.071435,530,-20.795132,9.518674
4,5,112,287,35175.926405,36941.739617,38267.860818,0.863087,0.08919,455,-20.739715,9.759799
5,6,33,303,34941.068848,37060.988988,38653.043584,0.868544,0.090967,380,-20.507873,10.768562
6,7,33,224,34995.213761,37469.240828,39327.22882,0.874248,0.094893,305,-20.450129,11.019811
7,8,30,176,34878.32767,37706.461665,39830.383053,0.883706,0.092656,230,-20.289354,11.719355


## BLRT

In [7]:
opt_params = {
    'method': 'gradient',
    'intercept': True,
    'max_iter': 500}

latent_mod = StepMix(
    measurement = 'categorical',
    n_init = 5,
    abs_tol = 1e-4,
    rel_tol = 1e-4,
    init_params = 'kmeans',
    structural_params = opt_params,
    progress_bar = 0)

In [8]:
s_time = time.time()
BLRT = blrt_sweep_custom(
    latent_mod,
    data_f,
    low = 1,
    high = 8,
    n_repetitions = bootstrap_iters,
    n_jobs = max_threads)
e_time = time.time()


Bootstrapping estimator...


Bootstrap Repetitions    :  43%|████▎     | 431/1000 [01:01<00:55, 10.16it/s, max_LL=-1.74e+4, median_LL=-1.79e+4, min_LL=-1.84e+4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [02:14<00:00,  7.44it/s, max_LL=-1.74e+4, median_LL=-1.79e+4, min_LL=-1.84e+4]
Bootstrap Repetitions    :  37%|███▋      | 366/1000 [01:17<02:23,  4.43it/s, max_LL=-1.7e+4, median_LL=-1.74e+4, min_LL=-1.79e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :  18%|█▊        | 180/1000 [00:37<02:45,  4.94it/s, max_LL=-1.67e+4, median_LL=-1.71e+4, min_LL=-1.75e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :   0%|          | 0/1000 [00:00<?, ?it/s] 6.50it/s, max_LL=-1.7e+4, median_LL=-1.74e+4, min_LL=-1.8e+4]4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [02:56<00:00,  5.67it/s, max_LL=-1.7e+4, median_LL=-1.74e+4, min_LL=-1.8e+4]]
Bootstrap Repetitions    :  22%|██▏       | 218/1000 [01:07<05:19,  2.45it/s, max_LL=-1.64e+4, median_LL=-1.68e+4, min_LL=-1.72e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :   1%|          | 10/1000 [00:03<06:01,  2.74it/s, max_LL=-1.63e+4, median_LL=-1.66e+4, min_LL=-1.68e+4]]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [03:49<00:00,  4.35it/s, max_LL=-1.67e+4, median_LL=-1.71e+4, min_LL=-1.77e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [04:11<00:00,  3.98it/s, max_LL=-1.65e+4, median_LL=-1.7e+4, min_LL=-1.75e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [05:23<00:00,  3.09it/s, max_LL=-1.64e+4, median_LL=-1.68e+4, min_LL=-1.73e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [05:38<00:00,  2.95it/s, max_LL=-1.62e+4, median_LL=-1.66e+4, min_LL=-1.71e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [05:53<00:00,  2.83it/s, max_LL=-1.62e+4, median_LL=-1.67e+4, min_LL=-1.72e+4]


In [9]:
print(f"Total execution time: {(e_time - s_time) / 60:.2f} minutes")

Total execution time: 11.92 minutes


In [10]:
BLRT_res = pd.concat([pd.DataFrame({'p': [np.nan]}), BLRT]).reset_index(drop=True) # Add a row for the saturated model
BLRT_res["n clust"] = [f"{i+1} vs. {i} clust" for i in BLRT_res.index]
BLRT_res = BLRT_res.iloc[1:]
BLRT_res = BLRT_res[["n clust", "p"]]
BLRT_res.to_csv("output/models/BLRT_830.csv", index=False)

BLRT_res.style.hide(axis=0).format({"p": "{:.3f}"})

n clust,p
2 vs. 1 clust,0.559
3 vs. 2 clust,0.651
4 vs. 3 clust,0.057
5 vs. 4 clust,0.916
6 vs. 5 clust,0.731
7 vs. 6 clust,0.881
8 vs. 7 clust,0.689


In [11]:
if BLRT_res[BLRT_res['p'] > 0.05].empty:
    best_LCA = None
else:
    best_LCA = BLRT_res[BLRT_res['p'] > 0.05]
    best_LCA = best_LCA.index[0]

print(f"Optimal number of clusters for LCA without covariates and sample weights is {best_LCA} according to BLRT.")

Optimal number of clusters for LCA without covariates and sample weights is 1 according to BLRT.


# n = 1077

In [12]:
data2004 = pd.read_parquet(f"data/data2004_1077.parquet")
data_f = data2004[var_list_n] - 1 # reindexing to 0 (as expected by StepMix)

In [13]:
results = Parallel(n_jobs=max_threads)(
    delayed(do_StepMix)(
        data = data_f,
        controls = None,
        n = n_clust,
        msrt = 'categorical',
        covar = 'without',
        weights = None)
    for n_clust in tqdm(range(1,9), desc='Fitting latent models'))

replic_LCA = pd.DataFrame(results).drop(columns = ['model', 'params', 'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn'])
replic_LCA['l2_red'] = 100 * (replic_LCA['LL'].iloc[0] - replic_LCA['LL']) / replic_LCA['LL'].iloc[0]

Fitting latent models: 100%|██████████| 8/8 [00:00<00:00, 6818.62it/s]


In [14]:
replic_LCA.style

Unnamed: 0,n_clust,min_clust_size,max_clust_size,aic,bic,sabic,relative_entropy,classif_error,df,LL,l2_red
0,1,1077,1077,51154.814721,51523.477887,51805.102361,,0.0,1002,-23.680044,-0.0
1,2,381,696,48211.695403,48954.00367,49521.058354,0.8659,0.037523,927,-22.244055,6.06413
2,3,257,434,47237.53285,48353.486217,49205.971111,0.842141,0.071655,852,-21.72216,8.268075
3,4,183,407,46681.025488,48170.623956,49308.53906,0.862456,0.078173,777,-21.394162,9.653199
4,5,91,369,46230.041596,48093.285165,49516.63048,0.882576,0.073703,702,-21.115154,10.831441
5,6,147,214,46303.967702,48540.856372,50249.631896,0.855875,0.099772,627,-21.079836,10.980586
6,7,83,230,46076.892515,48687.426286,50681.632021,0.860263,0.10645,552,-20.904778,11.71985
7,8,74,237,46021.242465,49005.421337,51285.057281,0.87202,0.103059,477,-20.809305,12.123032


## BLRT

In [15]:
s_time = time.time()
BLRT = blrt_sweep_custom(
    latent_mod,
    data_f,
    low = 1,
    high = 8,
    n_repetitions = bootstrap_iters,
    n_jobs = max_threads)
e_time = time.time()


Bootstrapping estimator...


Bootstrap Repetitions    :   0%|          | 0/1000 [00:00<?, ?it/s] 4.21it/s, max_LL=-2.32e+4, median_LL=-2.39e+4, min_LL=-2.44e+4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [02:16<00:00,  7.31it/s, max_LL=-2.32e+4, median_LL=-2.39e+4, min_LL=-2.45e+4]
Bootstrap Repetitions    :   0%|          | 1/1000 [00:00<02:14,  7.40it/s, max_LL=-2.29e+4, median_LL=-2.29e+4, min_LL=-2.29e+4]4]


Bootstrapping estimator...


Bootstrap Repetitions    :   8%|▊         | 76/1000 [00:11<02:11,  7.00it/s, max_LL=-2.23e+4, median_LL=-2.28e+4, min_LL=-2.32e+4]]


Bootstrapping estimator...


Bootstrap Repetitions    :  76%|███████▌  | 762/1000 [01:38<00:28,  8.33it/s, max_LL=-2.25e+4, median_LL=-2.33e+4, min_LL=-2.38e+4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [02:07<00:00,  7.83it/s, max_LL=-2.25e+4, median_LL=-2.33e+4, min_LL=-2.38e+4]
Bootstrap Repetitions    :  44%|████▍     | 445/1000 [01:26<01:38,  5.66it/s, max_LL=-2.21e+4, median_LL=-2.26e+4, min_LL=-2.31e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :  14%|█▍        | 144/1000 [00:39<04:11,  3.41it/s, max_LL=-2.18e+4, median_LL=-2.23e+4, min_LL=-2.27e+4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [02:29<00:00,  6.70it/s, max_LL=-2.22e+4, median_LL=-2.29e+4, min_LL=-2.34e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [03:14<00:00,  5.14it/s, max_LL=-2.21e+4, median_LL=-2.26e+4, min_LL=-2.31e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [04:05<00:00,  4.07it/s, max_LL=-2.19e+4, median_LL=-2.24e+4, min_LL=-2.29e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [04:47<00:00,  3.48it/s, max_LL=-2.17e+4, median_LL=-2.23e+4, min_LL=-2.28e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [05:07<00:00,  3.25it/s, max_LL=-2.16e+4, median_LL=-2.21e+4, min_LL=-2.26e+4]


In [16]:
print(f"Total execution time: {(e_time - s_time) / 60:.2f} minutes")

Total execution time: 10.96 minutes


In [17]:
BLRT_res = pd.concat([pd.DataFrame({'p': [np.nan]}), BLRT]).reset_index(drop=True) # Add a row for the saturated model
BLRT_res["n clust"] = [f"{i+1} vs. {i} clust" for i in BLRT_res.index]
BLRT_res = BLRT_res.iloc[1:]
BLRT_res = BLRT_res[["n clust", "p"]]
BLRT_res.to_csv("output/models/BLRT_1077.csv", index=False)

BLRT_res.style.hide(axis=0).format({"p": "{:.3f}"})

n clust,p
2 vs. 1 clust,0.937
3 vs. 2 clust,0.513
4 vs. 3 clust,0.943
5 vs. 4 clust,0.211
6 vs. 5 clust,0.821
7 vs. 6 clust,0.832
8 vs. 7 clust,0.677


In [18]:
if BLRT_res[BLRT_res['p'] > 0.05].empty:
    best_LCA = None
else:
    best_LCA = BLRT_res[BLRT_res['p'] > 0.05]
    best_LCA = best_LCA.index[0]

print(f"Optimal number of clusters for LCA without covariates and sample weights is {best_LCA} according to BLRT.")

Optimal number of clusters for LCA without covariates and sample weights is 1 according to BLRT.


# n = 1215

In [19]:
data2004 = pd.read_parquet(f"data/data2004_1215.parquet")
data_f = data2004[var_list_n] - 1 # reindexing to 0 (as expected by StepMix)

In [20]:
results = Parallel(n_jobs=max_threads)(
    delayed(do_StepMix)(
        data = data_f,
        controls = None,
        n = n_clust,
        msrt = 'categorical',
        covar = 'without',
        weights = None)
    for n_clust in tqdm(range(1,9), desc='Fitting latent models'))

replic_LCA = pd.DataFrame(results).drop(columns = ['model', 'params', 'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn'])
replic_LCA['l2_red'] = 100 * (replic_LCA['LL'].iloc[0] - replic_LCA['LL']) / replic_LCA['LL'].iloc[0]

Fitting latent models: 100%|██████████| 8/8 [00:00<00:00, 5215.17it/s]


In [21]:
replic_LCA.style

Unnamed: 0,n_clust,min_clust_size,max_clust_size,aic,bic,sabic,relative_entropy,classif_error,df,LL,l2_red
0,1,1215,1215,61663.981832,62133.411773,62494.612077,,0.0,1121,-25.300404,-0.0
1,2,423,792,58163.507661,59107.470042,59833.79674,0.875307,0.034978,1028,-23.783336,5.996219
2,3,242,563,56972.090243,58390.585064,59482.038157,0.860825,0.062306,935,-23.216498,8.236651
3,4,196,443,56363.51123,58256.538491,59713.117978,0.862468,0.076808,842,-22.889511,9.52907
4,5,92,442,55714.306849,58081.86655,59903.572432,0.888762,0.070038,749,-22.545805,10.887568
5,6,110,388,55642.346116,58484.438257,60671.270534,0.884397,0.078146,656,-22.439649,11.307153
6,7,49,370,55690.248503,59006.873084,61558.831755,0.911771,0.066198,563,-22.382818,11.531775
7,8,58,341,55766.394932,59557.551953,62474.637018,0.911941,0.07414,470,-22.337611,11.710457


## BLRT

In [22]:
s_time = time.time()
BLRT = blrt_sweep_custom(
    latent_mod,
    data_f,
    low = 1,
    high = 8,
    n_repetitions = bootstrap_iters,
    n_jobs = max_threads)
e_time = time.time()


Bootstrapping estimator...


Bootstrap Repetitions    :   0%|          | 1/1000 [00:00<02:36,  6.39it/s, max_LL=-2.8e+4, median_LL=-2.8e+4, min_LL=-2.8e+4]e+4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [01:41<00:00,  9.88it/s, max_LL=-2.79e+4, median_LL=-2.88e+4, min_LL=-2.96e+4]
Bootstrap Repetitions    :  34%|███▍      | 342/1000 [00:56<01:41,  6.47it/s, max_LL=-2.73e+4, median_LL=-2.81e+4, min_LL=-2.88e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :  14%|█▍        | 140/1000 [00:25<02:50,  5.05it/s, max_LL=-2.68e+4, median_LL=-2.76e+4, min_LL=-2.83e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :  16%|█▌        | 162/1000 [00:34<02:40,  5.21it/s, max_LL=-2.66e+4, median_LL=-2.73e+4, min_LL=-2.79e+4]


Bootstrapping estimator...


Bootstrap Repetitions    :   0%|          | 0/1000 [00:00<?, ?it/s] 5.86it/s, max_LL=-2.73e+4, median_LL=-2.81e+4, min_LL=-2.89e+4]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [02:39<00:00,  6.28it/s, max_LL=-2.73e+4, median_LL=-2.81e+4, min_LL=-2.89e+4]
Bootstrap Repetitions    :  28%|██▊       | 275/1000 [01:12<04:29,  2.69it/s, max_LL=-2.63e+4, median_LL=-2.7e+4, min_LL=-2.77e+4]]


Bootstrapping estimator...


Bootstrap Repetitions    : 100%|██████████| 1000/1000 [03:04<00:00,  5.43it/s, max_LL=-2.68e+4, median_LL=-2.76e+4, min_LL=-2.84e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, max_LL=-2.64e+4, median_LL=-2.73e+4, min_LL=-2.8e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [04:17<00:00,  3.88it/s, max_LL=-2.63e+4, median_LL=-2.7e+4, min_LL=-2.79e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [04:47<00:00,  3.48it/s, max_LL=-2.6e+4, median_LL=-2.69e+4, min_LL=-2.76e+4]
Bootstrap Repetitions    : 100%|██████████| 1000/1000 [05:25<00:00,  3.07it/s, max_LL=-2.58e+4, median_LL=-2.67e+4, min_LL=-2.74e+4]


In [23]:
print(f"Total execution time: {(e_time - s_time) / 60:.2f} minutes")

Total execution time: 10.21 minutes


In [24]:
BLRT_res = pd.concat([pd.DataFrame({'p': [np.nan]}), BLRT]).reset_index(drop=True) # Add a row for the saturated model
BLRT_res["n clust"] = [f"{i+1} vs. {i} clust" for i in BLRT_res.index]
BLRT_res = BLRT_res.iloc[1:]
BLRT_res = BLRT_res[["n clust", "p"]]
BLRT_res.to_csv("output/models/BLRT_1215.csv", index=False)

BLRT_res.style.hide(axis=0).format({"p": "{:.3f}"})

n clust,p
2 vs. 1 clust,0.339
3 vs. 2 clust,0.813
4 vs. 3 clust,0.265
5 vs. 4 clust,0.187
6 vs. 5 clust,0.921
7 vs. 6 clust,0.687
8 vs. 7 clust,0.666


In [25]:
if BLRT_res[BLRT_res['p'] > 0.05].empty:
    best_LCA = None
else:
    best_LCA = BLRT_res[BLRT_res['p'] > 0.05]
    best_LCA = best_LCA.index[0]

print(f"Optimal number of clusters for LCA without covariates and sample weights is {best_LCA} according to BLRT.")

Optimal number of clusters for LCA without covariates and sample weights is 1 according to BLRT.
