In [None]:
import ast
import numpy as np
import pandas as pd
import time
import warnings

from itertools import product
from joblib import Parallel, delayed

from scipy.stats import chi2
from stepmix.bootstrap import blrt_sweep
from stepmix.stepmix import StepMix
from tqdm import tqdm

from src.model_fit import do_StepMix
from src.model_select import blrt_sweep_custom

In [None]:
# Run-wide settings, read by every model-fitting and bootstrap cell below:
#   max_threads     - number of parallel workers passed as n_jobs
#   bootstrap_iters - number of bootstrap repetitions per BLRT comparison
max_threads, bootstrap_iters = 8, 500

In [None]:
# Base names of the 23 indicator variables used in the latent class models.
var_list = [
    'clseusa',
    'ambornin',
    'amcit',
    'amlived',
    'amenglsh',
    'amchrstn',
    'amgovt',
    'amfeel',
    'amcitizn',
    'amshamed',
    'belikeus',
    'ambetter',
    'ifwrong',
    'proudsss',
    'proudgrp',
    'proudpol',
    'prouddem',
    'proudeco',
    'proudspt',
    'proudart',
    'proudhis',
    'proudmil',
    'proudsci',
]

# Column names actually selected from the data: each base name plus "_n".
var_list_n = [f"{name}_n" for name in var_list]

# n = 830

In [None]:
# Load the n = 830 sample file and keep only the "_n" indicator columns.
data2004 = pd.read_parquet("data/data2004_830.parquet")

# Shift every value down by one so categories start at 0 (as expected by StepMix).
data_f = data2004.loc[:, var_list_n].sub(1)

In [None]:
# Fit one model per candidate class count (1 through 8) in parallel; every job
# receives the same indicator data and settings. The progress bar wraps the
# job generator, so it tracks jobs being handed to joblib.
fit_jobs = (
    delayed(do_StepMix)(
        data=data_f,
        controls=None,
        n=n_clust,
        msrt='categorical',
        covar='without',
        weights=None,
    )
    for n_clust in tqdm(range(1, 9), desc='Fitting latent models')
)
results = Parallel(n_jobs=max_threads)(fit_jobs)

# Tabulate the per-model results and drop the columns not shown in the summary.
unused_columns = ['model', 'params', 'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']
replic_LCA = pd.DataFrame(results).drop(columns=unused_columns)

# Percentage change of LL relative to the first row (the 1-class fit).
# NOTE(review): the column is named 'l2_red' but is computed from 'LL' -
# confirm what do_StepMix stores under 'LL'.
baseline_LL = replic_LCA['LL'].iloc[0]
replic_LCA['l2_red'] = 100 * (baseline_LL - replic_LCA['LL']) / baseline_LL

In [None]:
# Render the fit summary for the n = 830 sample as the cell output.
replic_LCA.style

## BLRT

In [None]:
# Options forwarded to StepMix through its structural_params argument.
opt_params = dict(method='gradient', intercept=True, max_iter=500)

# Base estimator handed to the BLRT sweeps below: categorical measurement
# model, 5 k-means-initialised starts, and the progress bar switched off.
# NOTE(review): no random_state is passed here - confirm whether run-to-run
# variation in the fits is acceptable.
latent_mod = StepMix(
    measurement='categorical', n_init=5,
    abs_tol=1e-4, rel_tol=1e-4,
    init_params='kmeans',
    structural_params=opt_params,
    progress_bar=0,
)

In [None]:
# Settings for the sweep: compare class counts from 1 up to 8, using the
# notebook-wide bootstrap repetition count and worker count.
sweep_options = dict(
    low=1,
    high=8,
    n_repetitions=bootstrap_iters,
    n_jobs=max_threads,
)

# Time the whole sweep with wall-clock timestamps taken right around the call.
s_time = time.time()
BLRT = blrt_sweep_custom(latent_mod, data_f, **sweep_options)
e_time = time.time()

In [None]:
# Report how long the BLRT sweep took, converted from seconds to minutes.
elapsed_minutes = (e_time - s_time) / 60
print(f"Total execution time: {elapsed_minutes:.2f} minutes")

In [None]:
# Prepend a placeholder row (p = NaN) so that, after the index is reset, the
# row at index i can be labelled as the comparison "(i+1) vs. i" classes.
# Assumes BLRT holds one row per consecutive comparison, starting at
# 2 vs. 1 - TODO confirm against blrt_sweep_custom.
BLRT_res = pd.concat([pd.DataFrame({'p': [np.nan]}), BLRT]).reset_index(drop=True)
BLRT_res["n clust"] = [f"{i+1} vs. {i} clust" for i in BLRT_res.index]

# Drop the placeholder again and keep only the label and the p-value.
BLRT_res = BLRT_res.iloc[1:][["n clust", "p"]]

# Export to a sample-specific file: later sections of this notebook also write
# to output/models/BLRT_simplex.csv, which would otherwise overwrite the
# n = 830 results on a full run.
BLRT_res.to_csv("output/models/BLRT_simplex_830.csv", index=False)

BLRT_res.style.hide(axis=0).format({"p": "{:.3f}"})

In [None]:
# Rows of the BLRT table whose p-value is above the 0.05 threshold.
non_significant = BLRT_res.loc[BLRT_res['p'] > 0.05]

# Report the index label of the first such row, or None when there is none.
best_LCA = None if non_significant.empty else non_significant.index[0]

print(f"Optimal number of clusters for LCA without covariates and sample weights is {best_LCA} according to BLRT.")

# n = 1077

In [None]:
# Load the n = 1077 sample file and keep only the "_n" indicator columns.
data2004 = pd.read_parquet("data/data2004_1077.parquet")

# Shift every value down by one so categories start at 0 (as expected by StepMix).
data_f = data2004.loc[:, var_list_n].sub(1)

In [None]:
# Fit one model per candidate class count (1 through 8) in parallel; every job
# receives the same indicator data and settings. The progress bar wraps the
# job generator, so it tracks jobs being handed to joblib.
fit_jobs = (
    delayed(do_StepMix)(
        data=data_f,
        controls=None,
        n=n_clust,
        msrt='categorical',
        covar='without',
        weights=None,
    )
    for n_clust in tqdm(range(1, 9), desc='Fitting latent models')
)
results = Parallel(n_jobs=max_threads)(fit_jobs)

# Tabulate the per-model results and drop the columns not shown in the summary.
unused_columns = ['model', 'params', 'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']
replic_LCA = pd.DataFrame(results).drop(columns=unused_columns)

# Percentage change of LL relative to the first row (the 1-class fit).
# NOTE(review): the column is named 'l2_red' but is computed from 'LL' -
# confirm what do_StepMix stores under 'LL'.
baseline_LL = replic_LCA['LL'].iloc[0]
replic_LCA['l2_red'] = 100 * (baseline_LL - replic_LCA['LL']) / baseline_LL

In [None]:
# Render the fit summary for the n = 1077 sample as the cell output.
replic_LCA.style

## BLRT

In [None]:
# Settings for the sweep: compare class counts from 1 up to 8, using the
# notebook-wide bootstrap repetition count and worker count.
sweep_options = dict(
    low=1,
    high=8,
    n_repetitions=bootstrap_iters,
    n_jobs=max_threads,
)

# Time the whole sweep with wall-clock timestamps taken right around the call.
s_time = time.time()
BLRT = blrt_sweep_custom(latent_mod, data_f, **sweep_options)
e_time = time.time()

In [None]:
# Report how long the BLRT sweep took, converted from seconds to minutes.
elapsed_minutes = (e_time - s_time) / 60
print(f"Total execution time: {elapsed_minutes:.2f} minutes")

In [None]:
# Prepend a placeholder row (p = NaN) so that, after the index is reset, the
# row at index i can be labelled as the comparison "(i+1) vs. i" classes.
# Assumes BLRT holds one row per consecutive comparison, starting at
# 2 vs. 1 - TODO confirm against blrt_sweep_custom.
BLRT_res = pd.concat([pd.DataFrame({'p': [np.nan]}), BLRT]).reset_index(drop=True)
BLRT_res["n clust"] = [f"{i+1} vs. {i} clust" for i in BLRT_res.index]

# Drop the placeholder again and keep only the label and the p-value.
BLRT_res = BLRT_res.iloc[1:][["n clust", "p"]]

# Export to a sample-specific file: a later section of this notebook also
# writes to output/models/BLRT_simplex.csv, which would otherwise overwrite
# the n = 1077 results on a full run.
BLRT_res.to_csv("output/models/BLRT_simplex_1077.csv", index=False)

BLRT_res.style.hide(axis=0).format({"p": "{:.3f}"})

In [None]:
# Rows of the BLRT table whose p-value is above the 0.05 threshold.
non_significant = BLRT_res.loc[BLRT_res['p'] > 0.05]

# Report the index label of the first such row, or None when there is none.
best_LCA = None if non_significant.empty else non_significant.index[0]

print(f"Optimal number of clusters for LCA without covariates and sample weights is {best_LCA} according to BLRT.")

# n = 1215

In [None]:
# Load the n = 1215 sample file and keep only the "_n" indicator columns.
data2004 = pd.read_parquet("data/data2004_1215.parquet")

# Shift every value down by one so categories start at 0 (as expected by StepMix).
data_f = data2004.loc[:, var_list_n].sub(1)

In [None]:
# Fit one model per candidate class count (1 through 8) in parallel; every job
# receives the same indicator data and settings. The progress bar wraps the
# job generator, so it tracks jobs being handed to joblib.
fit_jobs = (
    delayed(do_StepMix)(
        data=data_f,
        controls=None,
        n=n_clust,
        msrt='categorical',
        covar='without',
        weights=None,
    )
    for n_clust in tqdm(range(1, 9), desc='Fitting latent models')
)
results = Parallel(n_jobs=max_threads)(fit_jobs)

# Tabulate the per-model results and drop the columns not shown in the summary.
unused_columns = ['model', 'params', 'silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']
replic_LCA = pd.DataFrame(results).drop(columns=unused_columns)

# Percentage change of LL relative to the first row (the 1-class fit).
# NOTE(review): the column is named 'l2_red' but is computed from 'LL' -
# confirm what do_StepMix stores under 'LL'.
baseline_LL = replic_LCA['LL'].iloc[0]
replic_LCA['l2_red'] = 100 * (baseline_LL - replic_LCA['LL']) / baseline_LL

In [None]:
# Render the fit summary for the n = 1215 sample as the cell output.
replic_LCA.style

## BLRT

In [None]:
# Settings for the sweep: compare class counts from 1 up to 8, using the
# notebook-wide bootstrap repetition count and worker count.
sweep_options = dict(
    low=1,
    high=8,
    n_repetitions=bootstrap_iters,
    n_jobs=max_threads,
)

# Time the whole sweep with wall-clock timestamps taken right around the call.
s_time = time.time()
BLRT = blrt_sweep_custom(latent_mod, data_f, **sweep_options)
e_time = time.time()

In [None]:
# Report how long the BLRT sweep took, converted from seconds to minutes.
elapsed_minutes = (e_time - s_time) / 60
print(f"Total execution time: {elapsed_minutes:.2f} minutes")

In [None]:
# Prepend a placeholder row (p = NaN) so that, after the index is reset, the
# row at index k can be labelled as the comparison "(k+1) vs. k" classes.
placeholder_row = pd.DataFrame({'p': [np.nan]})
BLRT_res = pd.concat([placeholder_row, BLRT]).reset_index(drop=True)
BLRT_res["n clust"] = [f"{k+1} vs. {k} clust" for k in BLRT_res.index]

# Drop the placeholder again, keep the label and p-value, and export the table.
BLRT_res = BLRT_res.iloc[1:][["n clust", "p"]]
BLRT_res.to_csv("output/models/BLRT_simplex.csv", index=False)

# Show the table without the index, with p-values at three decimals.
BLRT_res.style.hide(axis=0).format({"p": "{:.3f}"})

In [None]:
# Rows of the BLRT table whose p-value is above the 0.05 threshold.
non_significant = BLRT_res.loc[BLRT_res['p'] > 0.05]

# Report the index label of the first such row, or None when there is none.
best_LCA = None if non_significant.empty else non_significant.index[0]

print(f"Optimal number of clusters for LCA without covariates and sample weights is {best_LCA} according to BLRT.")