In [None]:
import ast
import numpy as np
import pandas as pd
import time
import warnings

from itertools import product
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from src.model_fit import do_StepMix, do_kmeans, do_AHC, do_hdbscan
from src.model_select import bootstrap_gap, compute_gap, get_gap

In [None]:
# Wall-clock start of the run; the last cell subtracts it to report total execution time.
time0 = time.time()

In [None]:
# Names of the validity indices: passed to compute_gap / get_gap below and
# reused as column names of the results tables.
CVI = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn']
# Number of joblib workers given to every Parallel(...) call in this notebook.
max_threads = 8

# Sample selector: the data-loading cell looks for '830', then '1077', in this
# string and falls back to the 1215 file otherwise. Also used in output file names.
approach = 'replic_830' # replic_830 / replic_1077 / own_1215

# Upper bound (inclusive) of the number of clusters in the latent, k-means and AHC grids.
max_clust = 16
# Number of bootstrap iterations per (model, config, n) in the gap procedure.
gap_iters = 500

# Data

In [None]:
# Outcome variables, grouped by questionnaire block (q2 to q5).
q2 = ['clseusa']
q3 = ['ambornin', 'amcit', 'amlived', 'amenglsh',
      'amchrstn', 'amgovt', 'amfeel']
q4 = ['amcitizn', 'amshamed', 'belikeus', 'ambetter', 'ifwrong']
q5 = ['proudsss', 'proudgrp', 'proudpol', 'prouddem', 'proudeco',
      'proudspt','proudart', 'proudhis', 'proudmil', 'proudsci']

# if 'own' in approach:
#    q2 = q2 + ['clsetown', 'clsestat', 'clsenoam']
#    q3 = q3 + ['amancstr']
#    q4 = q4 + ['amsports', 'lessprd']

# Per-block column names carrying the "_n" suffix (the numeric columns read
# from the dataset further down).
q2_n = [f"{var}_n" for var in q2]
q3_n = [f"{var}_n" for var in q3]
q4_n = [f"{var}_n" for var in q4]
q5_n = [f"{var}_n" for var in q5]

# Full outcome list, plus its "_f" and "_n" suffixed variants.
var_list = q2 + q3 + q4 + q5
var_list_f = [f"{var}_f" for var in var_list]
var_list_n = [f"{var}_n" for var in var_list]

# Control variables. Each name appears exactly once: repeating 'age_n' and
# 'lnrealinc2004_n' would make data2004[ctrl_list] return duplicated columns.
ctrl_list = ['party_f', 'race_f', 'educ_f', 'region_f', 'reltrad_f',
             'religstr_f', 'born_usa_f', 'sex_f', 'age_n',
             'lnrealinc2004_n']

In [None]:
# Pick the parquet file for the sample named in `approach`: '830' is checked
# first, then '1077'; any other value loads the 1215 file.
sample_tag = next((tag for tag in ('830', '1077') if tag in approach), '1215')
data2004 = pd.read_parquet(f"data/data2004_{sample_tag}.parquet")

# Dataset with numeric outcomes, standardised column by column
scaler = StandardScaler()
outcomes_raw = data2004[var_list_n]
data_n = pd.DataFrame(
    scaler.fit_transform(outcomes_raw),
    columns=outcomes_raw.columns,
    index=outcomes_raw.index)

# Dataset with categorical outcomes: same columns, shifted by one so that the
# codes start at 0 (as expected by StepMix)
data_f = outcomes_raw.sub(1)

# Dataset with controls (same as the authors), plus its dummy-coded version
controls = data2004[ctrl_list]
controls_dum = pd.get_dummies(controls)

# Sample weights
weights = data2004['wgt']

In [None]:
# Alternate: dataframe with weighted numeric outcomes
# The weights multiply the already-standardised columns (weighting before
# scaling would defeat the purpose).

# Weight of each questionnaire block = its share of the outcome variables
n_vars = len(var_list)
q2_wgt = len(q2) / n_vars
q3_wgt = len(q3) / n_vars
q4_wgt = len(q4) / n_vars
q5_wgt = len(q5) / n_vars

block_weights = (
    (q2_n, q2_wgt),
    (q3_n, q3_wgt),
    (q4_n, q4_wgt),
    (q5_n, q5_wgt))

# One column per outcome, in block order, each scaled by its block weight
data_n_w = pd.DataFrame({
    column: data_n[column] * block_wgt
    for block_columns, block_wgt in block_weights
    for column in block_columns})

# 1. Fit models

## 1.1. Latent

Only base models are fitted, because missing values (NAs), covariates, and sample weights are not compatible with the gap procedure.

In [None]:
# Parameter grid of the latent models: every (measurement, covariate) option
# crossed with every number of classes from 1 to max_clust.
# msrt = ['categorical', 'continuous']
msrt = ['categorical']
covar = ['without']
latent_params = list(product(msrt, covar))
clust_range = range(1, max_clust+1)
# Materialised as a list: a bare product() is a one-shot iterator, so re-running
# the fitting cell would silently iterate over nothing. A list also has a
# length, which lets tqdm display the total.
latent_grid = list(product(clust_range, latent_params))

In [None]:
# One StepMix fit per (number of classes, measurement, covariate) combination.
# The loop variables have their own names so they cannot be confused with the
# module-level `msrt` / `covar` option lists.
latent_jobs = (
    delayed(do_StepMix)(
        data_f if 'categorical' in msrt_type else data_n,
        controls_dum if covar_opt == 'with' else None,
        n_classes,
        msrt_type,
        covar_opt)
    for n_classes, (msrt_type, covar_opt)
    in tqdm(latent_grid, desc='Fitting latent models'))

results = Parallel(n_jobs=max_threads)(latent_jobs)

latent_all = pd.DataFrame(results)

In [None]:
# Convert params to legacy format (without infos on NAs and weights):
# every params dict is rebuilt without the 'NAs' and 'wgt' entries.
legacy_excluded = ['NAs', 'wgt']
latent_all['params'] = latent_all['params'].apply(
    lambda cfg: {key: cfg[key] for key in cfg if key not in legacy_excluded})

## 1.2. k-means

Fitted with a custom implementation, because scikit-learn does not allow changing the linkage function.

In [None]:
# Parameter grid of the k-means models: every (distance, linkage) pair crossed
# with every number of clusters from 2 to max_clust.
dist = ['euclidean', 'manhattan', 'chebyshev']
link = ['mean', 'median', 'medoid']
kmeans_params = list(product(dist, link))

clust_range = range(2, max_clust+1)
# Materialised as a list: a bare product() is a one-shot iterator, so re-running
# the fitting cell would silently iterate over nothing. A list also has a
# length, which lets tqdm display the total.
kmeans_grid = list(product(clust_range, kmeans_params))

In [None]:
# One k-means fit per (number of clusters, distance, linkage) combination,
# always on the standardised numeric outcomes.
kmeans_jobs = (
    delayed(do_kmeans)(data_n, n_clusters, metric, center_fn)
    for n_clusters, (metric, center_fn)
    in tqdm(kmeans_grid, desc='Fitting KMeans models'))

results = Parallel(n_jobs=max_threads)(kmeans_jobs)

kmeans_all = pd.DataFrame(results)

## 1.3. AHC

In [None]:
# Parameter grid of the AHC models: every (distance, linkage) pair, plus Ward
# linkage (listed with the euclidean distance only), crossed with every number
# of clusters from 1 to max_clust.
distances = ['manhattan', 'euclidean', 'chebyshev', 'hamming']
linkages = ['single', 'average', 'complete']
ahc_params = [*product(distances, linkages), ('euclidean', 'ward')]

clust_range = range(1, max_clust+1)
# Materialised as a list: a bare product() is a one-shot iterator, so re-running
# the fitting cell would silently iterate over nothing. A list also has a
# length, which lets tqdm display the total.
ahc_grid = list(product(clust_range, ahc_params))

In [None]:
# One AHC fit per (number of clusters, distance, linkage) combination,
# always on the standardised numeric outcomes.
ahc_jobs = (
    delayed(do_AHC)(data_n, n_clusters, metric, linkage)
    for n_clusters, (metric, linkage)
    in tqdm(ahc_grid, desc='Fitting AHC models'))

results = Parallel(n_jobs=max_threads)(ahc_jobs)

ahc_all = pd.DataFrame(results)

## 1.4. HDBSCAN

In [None]:
# Parameter grid of the HDBSCAN models: every distance crossed with every
# minimum cluster size (2 to 15) and every minimum number of samples (1 to 15).
distances = ['manhattan', 'euclidean', 'chebyshev', 'mahalanobis', 'hamming']
min_cluster_sizes = range(2, 16)
min_samples_range = range(1, 16)
# Materialised as a list: a bare product() is a one-shot iterator, so re-running
# the fitting cell would silently iterate over nothing. A list also has a
# length, which lets tqdm display the total.
hdb_params = list(product(distances, min_cluster_sizes, min_samples_range))

In [None]:
# One HDBSCAN fit per (distance, min cluster size, min samples) combination,
# always on the standardised numeric outcomes.
hdbscan_jobs = (
    delayed(do_hdbscan)(data_n, metric, min_size, min_pts)
    for metric, min_size, min_pts
    in tqdm(hdb_params, desc='Fitting HDBSCAN models'))

results = Parallel(n_jobs=max_threads)(hdbscan_jobs)

hdbscan_all = pd.DataFrame(results)

## 1.5. Aggregate results

In [None]:
# Stack the results of the four model families under a fresh 0..n-1 index
# and write the full table to disk.
fitted_tables = [latent_all, kmeans_all, ahc_all, hdbscan_all]
all_models = pd.concat(fitted_tables, ignore_index=True)
all_models.to_csv(f"output/models/all_models_{approach}.csv", index=False)

# 2. Gap statistics for latent models, kmeans and AHC

In [None]:
# Rebind all_models to the three families that go through the gap procedure
# (the HDBSCAN results are left out from here on).
all_models = pd.concat([latent_all, kmeans_all, ahc_all], ignore_index=True)

## Step 1: compute the gap statistic for each model-config

In [None]:
# Parameter combinations of each model family, as built in the fitting section
params = {
    'kmeans': kmeans_params,
    'AHC': ahc_params,
    'latent': latent_params}

# Names given to the positions of each parameter tuple
param_names = {
    'kmeans': ['dist', 'link'],
    'AHC': ['dist', 'link'],
    'latent': ['msrt', 'covar']}

models = ['kmeans', 'AHC', 'latent']

# model_grid:     one (model, config) entry per parameter combination
# bootstrap_grid: one (model, config, n, iteration) entry per bootstrap run
model_grid = []
bootstrap_grid = []

for model in models:
    names = param_names[model]
    # Latent models are bootstrapped from 1 class, the other models from 2 clusters
    first_n = 1 if model == 'latent' else 2

    for param_values in params[model]:
        model_grid.append((model, dict(zip(names, param_values))))

        for n_val in range(first_n, max_clust+1):
            for n_iter in range(gap_iters):
                # Every entry gets its own config dict
                bootstrap_grid.append(
                    (model, dict(zip(names, param_values)), n_val, n_iter))

In [None]:
def _bootstrap_jobs():
    """Lazily yield one delayed bootstrap_gap call per entry of bootstrap_grid."""
    for model, config, n, iter_num in tqdm(bootstrap_grid, desc='Bootstrapping CVIs'):
        is_latent = model == 'latent'
        # Only latent models use the categorical data or the controls;
        # every other model works on the standardised numeric outcomes.
        use_categorical = is_latent and 'categorical' in config.get('msrt')
        use_controls = is_latent and config.get('covar') == 'with'

        yield delayed(bootstrap_gap)(
            data = data_f if use_categorical else data_n,
            controls = controls_dum if use_controls else None,
            n = n,
            model = model,
            params = config,
            iter_num = iter_num)


results = Parallel(n_jobs=max_threads)(_bootstrap_jobs())
bootstrap_results = pd.concat(results).reset_index(drop=True)

In [None]:
# Rebuild every params dict without the 'NAs' and 'wgt' entries, so that the
# dicts can be compared with the configs of model_grid.
dropped_keys = ['NAs', 'wgt']
bootstrap_results['params'] = bootstrap_results['params'].apply(
    lambda cfg: {key: cfg[key] for key in cfg if key not in dropped_keys})

In [None]:
def _gap_for_config(model, config):
    """Run compute_gap on the bootstrap rows that belong to one (model, config) pair."""
    is_match = ((bootstrap_results['model'] == model) &
                (bootstrap_results['params'] == config))
    return compute_gap(bootstrap_results[is_match], all_models, model, config, CVI)


# One block of gap statistics per model-config, stacked into a single table
gap_values = pd.concat(
    [_gap_for_config(model, config) for model, config in model_grid],
    ignore_index=True)

## Step 2: identify the optimal number of clusters for each model-config

In [None]:
# Create df to store results: identifiers, number of clusters, one column per
# validity index and one "<index>_gap" flag column per validity index.
gap_cols = [f'{index}_gap' for index in CVI]
cols = ['model', 'params', 'n_clust'] + list(CVI) + gap_cols

# The validity indices are real-valued and the gap flags hold 1 or NaN, so both
# groups are float columns; only the number of clusters is an integer.
# (The indices used to be cast to int64, which is the wrong dtype for them.)
float_cols = list(CVI) + gap_cols
int_cols = ['n_clust']

candidate_models = pd.DataFrame(columns=cols).astype({
    'model': 'object',
    'params': 'object',
    **{col: 'float64' for col in float_cols},
    **{col: 'int64' for col in int_cols}})

In [None]:
# Find best n
# For every model-config and every validity index, ask get_gap for the optimal
# number of clusters and flag it in the "<index>_gap" column of the matching
# (model, config, n_clust) row.
#
# Rows are collected in a dict and the frame is built once at the end: this
# avoids growing the DataFrame with pd.concat inside the loop and rescanning it
# with boolean masks at every step, and makes the cell safe to re-run.
selected = {}

for model, config in model_grid:
    for index in CVI:
        best_n = get_gap(gap_values, model, config, index)

        # get_gap returns the string 'none' when no best value has been identified
        if best_n == 'none':
            continue

        # Same identity as before: model name, config content and number of clusters
        row_key = (model, frozenset(config.items()), best_n)

        # First index selecting this row: copy the fitted model's statistics
        if row_key not in selected:
            model_id = ((all_models['model'] == model) &
                        (all_models['params'] == config) &
                        (all_models['n_clust'] == best_n))
            matches = all_models.index[model_id]

            if len(matches) == 0:
                raise IndexError(
                    f"No fitted model for {model} {config} with n_clust={best_n}")

            fitted = matches[0]

            selected[row_key] = {
                'model': model,
                'params': config,
                'n_clust': best_n,
                'min_clust_size': all_models.at[fitted, 'min_clust_size'],
                'max_clust_size': all_models.at[fitted, 'max_clust_size'],
                'silhouette': all_models.at[fitted, 'silhouette'],
                'calinski_harabasz': all_models.at[fitted, 'calinski_harabasz'],
                'davies_bouldin': all_models.at[fitted, 'davies_bouldin'],
                'dunn': all_models.at[fitted, 'dunn'],
                # Flags start as missing and are set to 1 below
                **{f'{cvi}_gap': np.nan for cvi in CVI}}

        selected[row_key][f'{index}_gap'] = 1

# Keep the column order of the template created above, followed by the two
# cluster-size columns (dict.fromkeys drops repeats if the cell is re-run).
column_order = list(dict.fromkeys(
    [*candidate_models.columns, 'min_clust_size', 'max_clust_size']))
candidate_models = pd.DataFrame(list(selected.values()), columns=column_order)

In [None]:
# Save the selected models: one row per (model, config, n_clust) that at least
# one validity index picked as optimal.
candidate_models.to_csv(f"output/models/candidate_models_{approach}.csv", index=False)

In [None]:
# Report the wall-clock duration of the whole run, in minutes
elapsed_min = (time.time() - time0) / 60
print(f"Total execution time: {elapsed_min:.2f} minutes")