In [1]:
import numpy as np
import pandas as pd

from scipy.optimize import linear_sum_assignment
from src.model_fit import do_StepMix

# Preparation
## Data

In [2]:
var_list = [
    'clseusa', 'ambornin', 'amcit', 'amlived', 'amenglsh', 'amchrstn',
    'amgovt', 'amfeel', 'amcitizn', 'amshamed', 'belikeus', 'ambetter',
    'ifwrong', 'proudsss', 'proudgrp', 'proudpol', 'prouddem', 'proudeco',
    'proudspt', 'proudart', 'proudhis', 'proudmil', 'proudsci']

var_list_f = [var + "_f" for var in var_list]
var_list_n = [var + "_n" for var in var_list]

ctrl_list = [
    'party_f', 'race_f', 'educ_f', 'region_f', 'reltrad_f', 'religstr_f', 
    'born_usa_f', 'sex_f', 'age_n', 'lnrealinc2004_n']

In [3]:
data2004 = pd.read_parquet(f"data/data2004_830.parquet")

# Dataset with categorical outcomes and reindexing to 0 (as expected by StepMix)
data_f = data2004[var_list_n] - 1

# Dataset with controls (same as the authors)
controls = data2004[ctrl_list]
controls_dum = pd.get_dummies(controls)

# Sample weights
weights = data2004['wgt']

In [4]:
# Same options in LatentGold and StepMix (save for ordinal)
SM_pred_clust = do_StepMix(
    data = data_f,
    controls = controls_dum,
    n = 4,
    msrt = 'categorical_nan',
    covar = 'with',
    weights = weights,
    refit = True)

step_mix_results = data2004[var_list_n + ['age_n']].copy()
step_mix_results['SM_pred_clust'] = pd.DataFrame(SM_pred_clust) + 1

In [5]:
# With options used in the main approach
SM_pred_clust = do_StepMix(
    data = data_f,
    controls = None,
    n = 4,
    msrt = 'categorical',
    covar = 'without',
    weights = None,
    refit = True)

step_mix_results = data2004[var_list_n + ['age_n']].copy()
step_mix_results['SM_pred_clust'] = pd.DataFrame(SM_pred_clust) + 1

In [6]:
latent_gold_results = pd.read_csv("data/4C N830 wgt cov.csv", sep=';')

latent_gold_results.rename(columns={
    'ind_amshame': 'ind_amshamedr',
    'cov_age': 'age_n',
    'Modal': 'LG_pred_clust'}, inplace=True)
var_list_stata = ["ind_" + var + "r" for var in var_list]
latent_gold_results = latent_gold_results[var_list_stata + ['age_n', 'LG_pred_clust', 'Cluster1', 'Cluster2', 'Cluster3', 'Cluster4']]
latent_gold_results.columns = var_list_n + list(latent_gold_results.columns[23:])

In [7]:
merged_results = pd.merge(
    step_mix_results,
    latent_gold_results,
    on=var_list_n + ['age_n'],
    how='inner')

In [8]:
# Step 1: Create a confusion matrix
conf_matrix = pd.crosstab(merged_results['LG_pred_clust'], merged_results['SM_pred_clust'])

# Step 2: Convert the confusion matrix to a cost matrix (negative for maximization)
cost_matrix = -conf_matrix.values

# Step 3: Solve the assignment problem
row_ind, col_ind = linear_sum_assignment(cost_matrix)

# Step 4: Create the mapping
mapping = dict(zip(conf_matrix.index[row_ind], conf_matrix.columns[col_ind]))
print("Most likely cluster mapping from LG_pred_clust to SM_pred_clust:")
print(mapping)

# Step 5: Apply mapping and assess overlap
merged_results['LG_mapped'] = merged_results['LG_pred_clust'].map(mapping)

# Step 6: Calculate overlap (e.g., accuracy)
overlap = (merged_results['LG_mapped'] == merged_results['SM_pred_clust']).mean()
print(f'Proportion of overlapping assignments after mapping: {overlap:.2%}')

Most likely cluster mapping from LG_pred_clust to SM_pred_clust:
{1: 1, 2: 3, 3: 2, 4: 4}
Proportion of overlapping assignments after mapping: 90.36%


In [9]:
merged_results['classif_error_LG'] = merged_results.apply(
    lambda row: row[f'Cluster{row["LG_pred_clust"]}'], axis=1
)
merged_results['classif_error_LG'] = merged_results['classif_error_LG'].str.replace(',', '.').astype(float)
merged_results['agreement'] = merged_results['LG_mapped'] == merged_results['LG_pred_clust']
print("LatentGold classification error depending on agreement with StepMix")
print(merged_results.groupby('agreement')['classif_error_LG'].mean())
print("")
print("LatentGold classification error depending on clusters")
print(merged_results.groupby('LG_pred_clust')['classif_error_LG'].mean())

LatentGold classification error depending on agreement with StepMix
agreement
False    0.934288
True     0.928488
Name: classif_error_LG, dtype: float64

LatentGold classification error depending on clusters
LG_pred_clust
1    0.921275
2    0.937321
3    0.930859
4    0.945407
Name: classif_error_LG, dtype: float64
