In [48]:
from itertools import combinations
from itertools import combinations_with_replacement

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

## Generate data

In [49]:
from random import randint

# The literal list below replaces this commented-out random draw, so the seeds
# are identical on every run of the notebook:
# random_states = [randint(0, 100) for x in range(100)]

random_states = [69, 11, 97, 19, 72, 38, 78, 36, 16, 3, 53, 76, 74, 78, 8, 99,
                 45, 23, 58, 68, 1, 40, 52, 51, 95, 73, 64, 84, 38, 39, 82, 60,
                 11, 97, 12, 10, 91, 44, 46, 65, 51, 59, 26, 94, 26, 40, 99, 8,
                 47, 27, 42, 1, 87, 7, 98, 31, 9, 27, 44, 42, 24, 18, 69, 100,
                 85, 46, 33, 16, 47, 70, 9, 13, 73, 53, 87, 51, 72, 22, 93, 61,
                 33, 92, 50, 20, 65, 67, 74, 9, 75, 0, 61, 10, 99, 30, 52, 80,
                 10, 2, 83, 58]

### Configuration and input verification

In [50]:
# Named experiment configurations. Each entry holds the data-generation
# parameters plus one settings dict per preprocessing/clustering approach.
CONFIG = {
    'easy': {
             'n_samples': 2000,
             'n_clusters': 3,
             # Features
             'n_numeric_features': 5,
             'n_categorical_features': 5,
             # One cardinality per categorical feature.
             'categorical_cardinalities': [6, 6, 6, 6, 6],
             'n_multival_features': 5,
             # One tuple per multi-valued feature; each tuple holds one
             # sub-vocabulary length per cluster.
             'multival_vocab_lens': [(3, 3, 3),  # How many vocab items
                                           (3, 3, 3),  # are associated to a
                                           (3, 3, 3),  # cluster.
                                           (3, 3, 3),
                                           (3, 3, 3)],
             # Difficulty params
             'separability': 3.0,
             # Number of vocabulary items shared by each pair of clusters.
             'multival_intersections': 1,
             'noise': 0.01,
             # Either n_clusters or n_clusters - 1 weights are accepted.
             'class_weights': [0.33, 0.33],
             # Approach Settings
             'approach_settings': {
                'naive': {
                    'gamma': None  # By frequency of appearance
                },
                'one-hot': {
                    'gamma': None,
                    'max_dummies': 100
                },
                'one-hot-pca': {
                    'gamma': None,
                    'reduced_dimensions': 0.25
                },
                'extended': {
                    # gamma_c + gamma_m must sum to less than 1.0.
                    'gamma_c': 0.33,
                    'gamma_m': 0.33,
                    'theta': 0.001
                }
             }
            }
}

In [51]:
# Expected type of every required top-level configuration key. validate_config
# checks each configuration value against this mapping with isinstance().
PARAM_GUIDE = {
    'n_samples': int,
    'n_clusters': int,
    # Numeric features
    'n_numeric_features': int,
    # Categorical features
    'n_categorical_features': int,
    'categorical_cardinalities': list,
    # Multi-valued features
    'n_multival_features': int,
    'multival_vocab_lens': list,
    # Difficulty params
    'separability': float,
    'multival_intersections': int,
    'noise': float,
    'class_weights': list,
    # Approach settings
    'approach_settings': dict
}

In [52]:
config_dict = CONFIG['easy']

def validate_config(config_dict, param_guide):
    """Validate structure and types of provided configuration.

    Parameters
    ----------
    config_dict : dict
        A single experiment configuration (for example ``CONFIG['easy']``).
    param_guide : dict
        Maps every required top-level key to the type its value must have.

    Returns
    -------
    dict
        ``config_dict`` itself, unchanged, when every check passes.

    Raises
    ------
    KeyError
        If a key listed in ``param_guide`` is missing from ``config_dict``.
    ValueError
        If a value has the wrong type, a list has the wrong length, a
        categorical cardinality is lower than ``n_clusters``, or the
        extended K-Prototypes gammas sum to 1.0 or more.
    """
    # Presence and type checks for every required top-level key.
    for parm, expected_type in param_guide.items():
        if parm not in config_dict:
            raise KeyError(f"Missing key {parm} in the configuration dict.")
        if not isinstance(config_dict[parm], expected_type):
            raise ValueError(
                f"Parameter {parm} should be of type "
                f"{expected_type.__name__}, got "
                f"{type(config_dict[parm]).__name__}.")

    n_clusters = config_dict['n_clusters']

    # Length checks
    n_weights = len(config_dict['class_weights'])
    if n_weights not in (n_clusters, n_clusters - 1):
        raise ValueError("A number of class_weights equal to the "
                         "number of clusters or the number of "
                         "clusters minus one must be provided. List "
                         "had size "
                         f"{n_weights}.")

    n_cardinalities = len(config_dict['categorical_cardinalities'])
    if n_cardinalities != config_dict['n_categorical_features']:
        raise ValueError("A cardinality must be provided for every "
                         "categorical attribute. "
                         f"{n_cardinalities}"
                         " cardinalities were provided for "
                         f"{config_dict['n_categorical_features']}.")

    n_vocab_lens = len(config_dict['multival_vocab_lens'])
    if n_vocab_lens != config_dict['n_multival_features']:
        # The count is read from 'multival_vocab_lens', the key that is
        # actually validated, so this branch reports a ValueError rather
        # than failing with a KeyError while building the message.
        raise ValueError("A vocabulary length must be provided for every "
                         "multi-valued attribute. "
                         f"{n_vocab_lens} "
                         "vocabulary length list-likes were provided for "
                         f"{config_dict['n_multival_features']} "
                         "attributes.")

    for card in config_dict['categorical_cardinalities']:
        if card < n_clusters:
            raise ValueError("Categorical attribute cardinalities cannot "
                             "be lower than n_clusters.")

    for card_tuple in config_dict['multival_vocab_lens']:
        if len(card_tuple) != n_clusters:
            raise ValueError("A sub-vocabulary length must be provided for"
                             f" each cluster. {card_tuple} was provided "
                             "without the required length "
                             f"{n_clusters}")

    extended = config_dict['approach_settings']['extended']
    actual_sum = extended['gamma_m'] + extended['gamma_c']
    if actual_sum >= 1.0:
        raise ValueError("Extended K-Prototypes gamma values should sum "
                         "up to less than 1.0. The values provided sum up "
                         f"to {actual_sum}.")

    return config_dict


config_dict = validate_config(config_dict=config_dict, param_guide=PARAM_GUIDE)
config_dict

{'n_samples': 2000,
 'n_clusters': 3,
 'n_numeric_features': 5,
 'n_categorical_features': 5,
 'categorical_cardinalities': [6, 6, 6, 6, 6],
 'n_multival_features': 5,
 'multival_vocab_lens': [(3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3)],
 'separability': 3.0,
 'multival_intersections': 1,
 'noise': 0.01,
 'class_weights': [0.33, 0.33],
 'approach_settings': {'naive': {'gamma': None},
  'one-hot': {'gamma': None, 'max_dummies': 100},
  'one-hot-pca': {'gamma': None, 'reduced_dimensions': 0.25},
  'extended': {'gamma_c': 0.33, 'gamma_m': 0.33, 'theta': 0.001}}}

Done!

### Generate the data

In [53]:
# Draw the numeric features (xnum) and the ground-truth labels (y_true).
# Every numeric feature is informative (no redundant or repeated features)
# and each class is generated as a single cluster.
xnum, y_true = make_classification(
    n_samples=config_dict['n_samples'],
    n_features=config_dict['n_numeric_features'],
    n_informative=config_dict['n_numeric_features'],
    n_redundant=0,
    n_repeated=0,
    n_classes=config_dict['n_clusters'],
    n_clusters_per_class=1,
    weights=config_dict['class_weights'],
    flip_y=config_dict['noise'],
    class_sep=config_dict['separability'],
    random_state=random_states[0]  # first of the frozen seeds
    )

In [54]:
# Creation of categories
def assign_categorical_features(class_labels: np.ndarray,
                                cardinalities: list,
                                random_state):
    """Create categorical attributes whose levels depend on the class label.

    For every cardinality, the levels ``0 .. card - 1`` are split into
    contiguous, non-overlapping blocks of ``card // n_clusters`` levels, one
    block per cluster. The ``card % n_clusters`` left-over levels are each
    given to a cluster drawn from ``class_labels`` (so a cluster is picked
    with probability proportional to its size). Every observation then
    receives one level drawn uniformly from the levels of its cluster.

    Parameters
    ----------
    class_labels : np.ndarray
        One-dimensional array with the cluster label of every observation.
    cardinalities : list
        Number of levels of each categorical attribute to create. Each value
        should be at least the number of distinct labels, otherwise some
        cluster may end up with no level to draw from.
    random_state
        Seed passed to ``np.random.default_rng``.

    Returns
    -------
    tuple
        ``(attributes, guides)`` where ``attributes`` is an int32 array with
        one column per cardinality and ``guides`` is a list holding, for each
        attribute, the dict mapping a cluster label to its list of levels.
    """
    clusters = np.unique(class_labels)
    n_clusters = len(clusters)
    random_generator = np.random.default_rng(seed=random_state)
    cluster_classes_keys = []
    categorical_attribute_arrays = []

    for card in cardinalities:
        levels_per_cluster, extra_levels = divmod(card, n_clusters)
        cluster_to_class = dict()
        curr_level = 0

        for clust in clusters:
            cluster_to_class[clust] = list(
                range(curr_level, curr_level + levels_per_cluster))
            # Advance by the block size (not by a fixed 2) so that blocks of
            # consecutive clusters never overlap, whatever the cardinality.
            curr_level += levels_per_cluster

        # Assign the extra levels at random
        for extra_level in range(extra_levels):
            clust = random_generator.choice(class_labels)
            cluster_to_class[clust].append(curr_level + extra_level)

        # For items belonging to a given label, assign it a level from the dict
        attribute = np.zeros(class_labels.shape[0], dtype=np.int32)
        for i_label, label in enumerate(class_labels):
            attribute[i_label] = random_generator.choice(cluster_to_class[label])

        categorical_attribute_arrays.append(attribute)
        cluster_classes_keys.append(cluster_to_class)

    return np.stack(categorical_attribute_arrays, axis=1), cluster_classes_keys

In [55]:
xcat, cat_guide = assign_categorical_features(class_labels=y_true,
                                   cardinalities=config_dict['categorical_cardinalities'],
                                   random_state=random_states[0])

xcat

array([[4, 4, 5, 4, 4],
       [3, 3, 3, 3, 3],
       [1, 1, 0, 1, 0],
       ...,
       [3, 3, 2, 3, 2],
       [0, 1, 0, 1, 1],
       [0, 0, 1, 1, 1]])

In [56]:
y_true

array([2, 1, 0, ..., 1, 0, 0])

In [57]:
cat_guide

[{0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]}]

In [58]:
xnum

array([[-2.60269058, -4.22476144,  3.45997033,  2.51034261,  2.93075784],
       [-0.1970824 , -3.10772864,  3.41599443,  6.02866969,  5.15555844],
       [-2.02381431,  3.14839166, -3.1550843 , -2.60587041, -1.76495351],
       ...,
       [ 3.89419075, -3.62263262,  3.72557832,  4.0990099 ,  1.46300835],
       [-2.25330148,  2.14794198, -3.15144734, -2.05743312, -2.78488292],
       [-2.42299465,  1.83934959, -2.3255219 , -4.04454782, -2.95657787]])

In [59]:
y_true

array([2, 1, 0, ..., 1, 0, 0])

In [60]:
config_dict

{'n_samples': 2000,
 'n_clusters': 3,
 'n_numeric_features': 5,
 'n_categorical_features': 5,
 'categorical_cardinalities': [6, 6, 6, 6, 6],
 'n_multival_features': 5,
 'multival_vocab_lens': [(3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3)],
 'separability': 3.0,
 'multival_intersections': 1,
 'noise': 0.01,
 'class_weights': [0.33, 0.33],
 'approach_settings': {'naive': {'gamma': None},
  'one-hot': {'gamma': None, 'max_dummies': 100},
  'one-hot-pca': {'gamma': None, 'reduced_dimensions': 0.25},
  'extended': {'gamma_c': 0.33, 'gamma_m': 0.33, 'theta': 0.001}}}

In [61]:
# Creation of multi-valued attributes
def assign_multi_valued_features(class_labels: np.ndarray,
                                 subvocab_lengths: list,
                                 level_of_intersection: int
                                 ) -> tuple[np.ndarray, list]:
    """
    Create multi-valued attributes from class labels, the lengths of the
    vocabulary subsets that are assigned to each label, and the degree to which
    pairwise clusters should have intersections.

    Vocabulary items are consecutive integers starting at 0. Each cluster
    first receives its own exclusive items; then every pair of distinct
    clusters receives ``level_of_intersection`` additional items that only
    those two clusters share. Every observation gets the complete vocabulary
    of its cluster, so there is no within-cluster variability.

    Arguments
    ---------
    class_labels:np.ndarray
        One-dimensional array with the cluster label of every observation.
        Labels are used to index each tuple of ``subvocab_lengths``.
    subvocab_lengths:list
        One list-like per multi-valued attribute; element ``c`` is the number
        of exclusive vocabulary items of cluster ``c``.
    level_of_intersection:int
        The level of intersection refers to the number of items in the
        vocabulary that are common to cluster pairs. The higher it is relative
        to the sub-vocabulary lengths, the lower the distance between clusters
        in relation to their multi-valued attributes will be.

    Returns
    -------
    tuple
        ``(attributes, guides)`` where ``attributes`` is an object array with
        one column per attribute whose cells are sets of vocabulary items,
        and ``guides`` is a list holding, for each attribute, the dict that
        maps a cluster label to its vocabulary set.
    """
    clusters = np.unique(class_labels)
    attribute_label_dicts = []
    multi_valued_attribute_arrays = []

    for subvocab in subvocab_lengths:   # Iterate over n_multival, implicitly
        # Next unused vocabulary item. A running counter (rather than the
        # maximum of the items seen so far) also works when every
        # sub-vocabulary length is 0.
        next_item = 0
        label_vocab_dict = dict()

        # Exclusive items of each cluster.
        for clust in clusters:
            block_end = next_item + subvocab[clust]
            label_vocab_dict[clust] = set(range(next_item, block_end))
            next_item = block_end

        # Add the pairwise intersections
        if level_of_intersection > 0:
            for first, second in combinations(clusters, 2):
                shared_items = range(next_item,
                                     next_item + level_of_intersection)
                label_vocab_dict[first].update(shared_items)
                label_vocab_dict[second].update(shared_items)
                next_item += level_of_intersection

        attribute_label_dicts.append(label_vocab_dict)

        # Now build, for each attribute, the array containing the observations
        attribute = np.zeros(class_labels.shape[0], dtype=np.object_)
        for i_label, label in enumerate(class_labels):
            # Each cell gets its own copy so that mutating one observation
            # cannot alter the others or the returned guide.
            attribute[i_label] = set(label_vocab_dict[label])

        multi_valued_attribute_arrays.append(attribute)

    return (np.stack(multi_valued_attribute_arrays, axis=1),
            attribute_label_dicts)
    

In [62]:
xmulti, multi_guide = assign_multi_valued_features(class_labels=y_true,
                                                   subvocab_lengths=config_dict['multival_vocab_lens'],
                                                   level_of_intersection=config_dict['multival_intersections'])


In [63]:
xmulti

array([[{6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11},
        {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}],
       [{3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
        {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
       [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}],
       ...,
       [{3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
        {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
       [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}],
       [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}]], dtype=object)

In [64]:
multi_guide

[{0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}}]

Done!

### Put Everything Together

In [65]:
all_attributes = [
    ('num', xnum),
    ('cat', xcat),
    ('multi', xmulti)
]

all_attributes

[('num',
  array([[-2.60269058, -4.22476144,  3.45997033,  2.51034261,  2.93075784],
         [-0.1970824 , -3.10772864,  3.41599443,  6.02866969,  5.15555844],
         [-2.02381431,  3.14839166, -3.1550843 , -2.60587041, -1.76495351],
         ...,
         [ 3.89419075, -3.62263262,  3.72557832,  4.0990099 ,  1.46300835],
         [-2.25330148,  2.14794198, -3.15144734, -2.05743312, -2.78488292],
         [-2.42299465,  1.83934959, -2.3255219 , -4.04454782, -2.95657787]])),
 ('cat',
  array([[4, 4, 5, 4, 4],
         [3, 3, 3, 3, 3],
         [1, 1, 0, 1, 0],
         ...,
         [3, 3, 2, 3, 2],
         [0, 1, 0, 1, 1],
         [0, 0, 1, 1, 1]])),
 ('multi',
  array([[{6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11},
          {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}],
         [{3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
          {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
         [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
          {0, 1, 2, 9, 10}, {0, 1,

In [66]:
def consolidate_attributes(xlist: list[tuple[str, np.ndarray]]):
    """Merge several 2-D attribute arrays into a single DataFrame.

    Parameters
    ----------
    xlist : list of (str, np.ndarray)
        Pairs of an attribute-type name and a two-dimensional array whose
        columns are the attributes of that type.

    Returns
    -------
    tuple
        ``(frame, positions)`` where ``frame`` has one column per input
        column, named ``'<type>_<k>'`` with ``k`` counting within the type,
        and ``positions`` maps each type name to the list of running column
        positions assigned to its columns, in input order.
    """
    columns = {}
    positions = {}
    next_position = 0

    for attr_type, attr_vals in xlist:
        n_columns = attr_vals.shape[1]

        for local_index in range(n_columns):
            columns[f'{attr_type}_{local_index}'] = attr_vals[:, local_index]

        # Positions keep counting across attribute types.
        positions[attr_type] = list(range(next_position,
                                          next_position + n_columns))
        next_position += n_columns

    return pd.DataFrame(columns), positions

In [67]:
joint_x, index_reference = consolidate_attributes(xlist=all_attributes)

joint_x

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,multi_0,multi_1,multi_2,multi_3,multi_4
0,-2.602691,-4.224761,3.459970,2.510343,2.930758,4,4,5,4,4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1,-0.197082,-3.107729,3.415994,6.028670,5.155558,3,3,3,3,3,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
2,-2.023814,3.148392,-3.155084,-2.605870,-1.764954,1,1,0,1,0,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"
3,1.997969,-3.428115,2.034194,5.085620,3.579017,3,2,2,3,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-3.827215,-3.512915,3.371360,0.135862,3.685980,5,4,4,4,5,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1996,3.654517,-1.556371,4.098848,4.240478,0.000666,2,3,3,2,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
1997,3.894191,-3.622633,3.725578,4.099010,1.463008,3,3,2,3,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
1998,-2.253301,2.147942,-3.151447,-2.057433,-2.784883,0,1,0,1,1,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"


In [68]:
index_reference['num']

[0, 1, 2, 3, 4]

Done!

## Preprocessing Approaches

### Option 1: Treat cell contents as single categories

In [69]:
# Option 1 treats every distinct set as one opaque category, so each
# multi-valued cell is replaced by its string representation.
option_1_df = joint_x.copy()

for multi_position in index_reference['multi']:
    stringified = option_1_df.iloc[:, multi_position].astype(str)
    option_1_df.iloc[:, multi_position] = stringified

In [70]:
option_1_df.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,multi_0,multi_1,multi_2,multi_3,multi_4
0,-2.602691,-4.224761,3.45997,2.510343,2.930758,4,4,5,4,4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1,-0.197082,-3.107729,3.415994,6.02867,5.155558,3,3,3,3,3,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
2,-2.023814,3.148392,-3.155084,-2.60587,-1.764954,1,1,0,1,0,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"
3,1.997969,-3.428115,2.034194,5.08562,3.579017,3,2,2,3,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"


Done!

### Option 2: One-Hot encode the vocabulary items

In [71]:
# Turn every set into a list so the cells can later be expanded into one
# column per item.
option_2_df = joint_x.copy()

for multi_position in index_reference['multi']:
    as_lists = option_2_df.iloc[:, multi_position].apply(list)
    option_2_df.iloc[:, multi_position] = as_lists

option_2_df.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,multi_0,multi_1,multi_2,multi_3,multi_4
0,-2.602691,-4.224761,3.45997,2.510343,2.930758,4,4,5,4,4,"[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]"
1,-0.197082,-3.107729,3.415994,6.02867,5.155558,3,3,3,3,3,"[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]"
2,-2.023814,3.148392,-3.155084,-2.60587,-1.764954,1,1,0,1,0,"[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]"
3,1.997969,-3.428115,2.034194,5.08562,3.579017,3,2,2,3,2,"[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]"
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,"[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]"


In [72]:
# Split the frame column-wise: the multi-valued columns will be one-hot
# encoded, every other column is carried over untouched.
unchanged_indexes = []
for position in range(option_2_df.shape[1]):
    if position not in index_reference['multi']:
        unchanged_indexes.append(position)

multi_val_df = option_2_df.iloc[:, index_reference['multi']]
df_as_is = option_2_df.iloc[:, unchanged_indexes]

multi_val_df.head()

Unnamed: 0,multi_0,multi_1,multi_2,multi_3,multi_4
0,"[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]"
1,"[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]"
2,"[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]","[0, 1, 2, 9, 10]"
3,"[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]","[3, 4, 5, 9, 11]"
4,"[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]","[6, 7, 8, 10, 11]"


In [73]:
df_as_is.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4
0,-2.602691,-4.224761,3.45997,2.510343,2.930758,4,4,5,4,4
1,-0.197082,-3.107729,3.415994,6.02867,5.155558,3,3,3,3,3
2,-2.023814,3.148392,-3.155084,-2.60587,-1.764954,1,1,0,1,0
3,1.997969,-3.428115,2.034194,5.08562,3.579017,3,2,2,3,2
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4


In [74]:
# Name the row index 'index' so that a groupby can refer to it by name.
# NOTE(review): the rename is in place on the Index object; frames sharing
# that Index object would presumably show the new name too - confirm.
multi_val_df.index.rename('index', inplace=True)

In [75]:
pd.get_dummies(multi_val_df.iloc[:, 0].apply(pd.Series).stack()).groupby('index', level=0).sum().columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [76]:
# One-hot encode every multi-valued attribute. Each list cell is expanded to
# one row per item, turned into indicator columns, and summed back to one row
# per observation.
columns_to_concat = [df_as_is]
out_cols = list(df_as_is.columns)

for icol in range(multi_val_df.shape[1]):
    # Encode attribute `icol` itself; encoding column 0 on every pass would
    # copy the first attribute's dummies under every attribute's name.
    item_rows = multi_val_df.iloc[:, icol].apply(pd.Series).stack()
    # Level 0 of the stacked index is the original row label.
    dummy_df = pd.get_dummies(item_rows).groupby(level=0).sum()
    out_cols += [f'multi_{icol}_{col}' for col in dummy_df.columns]
    columns_to_concat.append(dummy_df)

out_cols[-5:]

['multi_4_7', 'multi_4_8', 'multi_4_9', 'multi_4_10', 'multi_4_11']

In [77]:
# Join the untouched columns with the dummy blocks side by side, then apply
# the attribute-qualified names collected in out_cols.
combined_df = pd.concat(columns_to_concat, axis='columns')
combined_df.columns = out_cols
option_2_df = combined_df

option_2_df.head()

Unnamed: 0_level_0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,...,multi_4_2,multi_4_3,multi_4_4,multi_4_5,multi_4_6,multi_4_7,multi_4_8,multi_4_9,multi_4_10,multi_4_11
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.602691,-4.224761,3.45997,2.510343,2.930758,4,4,5,4,4,...,0,0,0,0,1,1,1,0,1,1
1,-0.197082,-3.107729,3.415994,6.02867,5.155558,3,3,3,3,3,...,0,1,1,1,0,0,0,1,0,1
2,-2.023814,3.148392,-3.155084,-2.60587,-1.764954,1,1,0,1,0,...,1,0,0,0,0,0,0,1,1,0
3,1.997969,-3.428115,2.034194,5.08562,3.579017,3,2,2,3,2,...,0,1,1,1,0,0,0,1,0,1
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,...,0,0,0,0,1,1,1,0,1,1


In [78]:
# Locate the columns by the name prefixes given when the frame was built
# ('cat_<k>' and 'multi_<attribute>_<item>') instead of hard-coded positions,
# and keep the positions in ascending order.
categorical_indexes = [icol for icol, col in enumerate(option_2_df.columns)
                       if str(col).startswith('cat_')]
dummy_indexes = [icol for icol, col in enumerate(option_2_df.columns)
                 if str(col).startswith('multi_')]

# Original categorical columns plus every dummy column.
new_categorical_indexes = sorted(categorical_indexes + dummy_indexes)
dummy_indexes[:5]

[10, 11, 12, 13, 14]

In [79]:
# Share of observations in which each dummy is set, most frequent first.
dummy_means = option_2_df.iloc[:, dummy_indexes].mean()
frequencies = pd.DataFrame(dummy_means, columns=['freq'])
frequencies = frequencies.sort_values('freq', ascending=False)

frequencies.head()

Unnamed: 0,freq
multi_4_11,0.67
multi_3_11,0.67
multi_2_11,0.67
multi_1_11,0.67
multi_0_11,0.67


In [80]:
frequencies.index[:100]

Index(['multi_4_11', 'multi_3_11', 'multi_2_11', 'multi_1_11', 'multi_0_11',
       'multi_1_10', 'multi_2_10', 'multi_3_10', 'multi_0_10', 'multi_4_10',
       'multi_1_9', 'multi_0_9', 'multi_4_9', 'multi_3_9', 'multi_2_9',
       'multi_2_7', 'multi_2_8', 'multi_4_7', 'multi_3_6', 'multi_4_6',
       'multi_3_8', 'multi_4_8', 'multi_3_7', 'multi_2_6', 'multi_0_7',
       'multi_0_8', 'multi_1_8', 'multi_1_7', 'multi_1_6', 'multi_0_6',
       'multi_0_5', 'multi_4_3', 'multi_4_4', 'multi_4_5', 'multi_3_5',
       'multi_3_4', 'multi_3_3', 'multi_1_3', 'multi_1_4', 'multi_1_5',
       'multi_0_3', 'multi_2_5', 'multi_2_4', 'multi_2_3', 'multi_0_4',
       'multi_4_1', 'multi_4_2', 'multi_0_2', 'multi_2_0', 'multi_4_0',
       'multi_1_0', 'multi_1_1', 'multi_1_2', 'multi_2_1', 'multi_3_2',
       'multi_3_1', 'multi_3_0', 'multi_0_1', 'multi_2_2', 'multi_0_0'],
      dtype='object')

In [81]:
def preserve_top_k_frequent_dummies(df_with_dummies,
                                    dummy_indexes,
                                    top_k):
    """Keep every non-dummy column plus the ``top_k`` most frequent dummies.

    Parameters
    ----------
    df_with_dummies : pd.DataFrame
        Frame that contains the dummy columns among others.
    dummy_indexes : list
        Positions of the dummy columns in ``df_with_dummies``.
    top_k : int
        Maximum number of dummy columns to keep.

    Returns
    -------
    pd.DataFrame
        The non-dummy columns in their original order, followed by the kept
        dummy columns ordered by decreasing column mean.
    """
    # The column mean is the dummy's frequency of appearance.
    dummy_means = df_with_dummies.iloc[:, dummy_indexes].mean()
    ranked_dummies = dummy_means.sort_values(ascending=False).index

    kept_columns = []
    for position, name in enumerate(df_with_dummies.columns):
        if position not in dummy_indexes:
            kept_columns.append(name)
    kept_columns.extend(ranked_dummies[:top_k])

    return df_with_dummies[kept_columns]

In [82]:
# Take the dummy budget from the configuration rather than a literal so the
# preview follows the 'one-hot' approach settings.
preserve_top_k_frequent_dummies(
    option_2_df,
    dummy_indexes=dummy_indexes,
    top_k=config_dict['approach_settings']['one-hot']['max_dummies'],
).head()

Unnamed: 0_level_0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,...,multi_1_0,multi_1_1,multi_1_2,multi_2_1,multi_3_2,multi_3_1,multi_3_0,multi_0_1,multi_2_2,multi_0_0
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.602691,-4.224761,3.45997,2.510343,2.930758,4,4,5,4,4,...,0,0,0,0,0,0,0,0,0,0
1,-0.197082,-3.107729,3.415994,6.02867,5.155558,3,3,3,3,3,...,0,0,0,0,0,0,0,0,0,0
2,-2.023814,3.148392,-3.155084,-2.60587,-1.764954,1,1,0,1,0,...,1,1,1,1,1,1,1,1,1,1
3,1.997969,-3.428115,2.034194,5.08562,3.579017,3,2,2,3,2,...,0,0,0,0,0,0,0,0,0,0
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,...,0,0,0,0,0,0,0,0,0,0


Done!

### Option 3: One-Hot and apply PCA

In [83]:
option_3_df = option_2_df.copy()

In [84]:
# Locate the columns by the name prefixes given when the frame was built
# ('cat_<k>' and 'multi_<attribute>_<item>') instead of hard-coded positions,
# and keep the positions in ascending order.
categorical_indexes = [icol for icol, col in enumerate(option_3_df.columns)
                       if str(col).startswith('cat_')]
dummy_indexes = [icol for icol, col in enumerate(option_3_df.columns)
                 if str(col).startswith('multi_')]

# Original categorical columns plus every dummy column.
new_categorical_indexes = sorted(categorical_indexes + dummy_indexes)
dummy_indexes[:5]

[10, 11, 12, 13, 14]

In [85]:
from sklearn.decomposition import PCA

In [86]:
# Keep the configured fraction of the dummy dimensions and seed the solver
# so the projection is the same on every run.
pca = PCA(
    n_components=round(
        len(dummy_indexes)
        * config_dict['approach_settings']['one-hot-pca']['reduced_dimensions']
    ),
    random_state=random_states[0],
)

In [87]:
option_3_df.iloc[:, dummy_indexes].head()

Unnamed: 0_level_0,multi_0_0,multi_0_1,multi_0_2,multi_0_3,multi_0_4,multi_0_5,multi_0_6,multi_0_7,multi_0_8,multi_0_9,...,multi_4_2,multi_4_3,multi_4_4,multi_4_5,multi_4_6,multi_4_7,multi_4_8,multi_4_9,multi_4_10,multi_4_11
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,1,1,1,0,1,1
1,0,0,0,1,1,1,0,0,0,1,...,0,1,1,1,0,0,0,1,0,1
2,1,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,1,0
3,0,0,0,1,1,1,0,0,0,1,...,0,1,1,1,0,0,0,1,0,1
4,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,1,1,1,0,1,1


In [88]:
# Reuse the row index of the source frame so that a later column-wise concat
# lines rows up by construction instead of relying on the default 0..n-1
# index matching the source labels.
pca_df = pd.DataFrame(pca.fit_transform(option_3_df.iloc[:, dummy_indexes]),
                      index=option_3_df.index)
pca_df.columns = [f'pca_{col}' for col in pca_df.columns]

pca_df.head()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14
0,3.6167,-0.08539,-3.536722e-15,1.122066e-16,1.6097080000000002e-27,3.761017e-29,1.889023e-30,1.1185180000000001e-31,4.541803e-32,-1.403002e-43,5.845756e-46,-1.320724e-45,2.376924e-46,-1.884234e-47,-7.096375999999999e-50
1,-1.934965,-3.11508,1.251536e-14,-1.697297e-16,-6.729479e-28,-1.8979840000000002e-29,7.04114e-30,3.366396e-31,-1.099938e-31,-4.059072e-44,3.727849e-45,-9.763144e-46,5.699142e-48,1.570727e-49,6.500764e-49
2,-1.78292,3.207648,9.825552e-15,-2.917431e-16,-4.104865000000001e-28,-3.952422e-30,4.2129329999999995e-30,-2.636029e-31,6.784604e-33,-1.166818e-43,-3.462285e-45,-8.393877e-46,3.762166e-46,3.798913e-47,6.724078e-49
3,-1.934965,-3.11508,-7.733755e-15,-4.6587820000000007e-17,5.261214e-28,-1.3604e-29,-3.5834479999999996e-30,1.560138e-32,-3.229774e-32,7.706555e-44,3.301257e-45,-9.63007e-46,1.743305e-46,3.026049e-47,1.699227e-48
4,3.6167,-0.08539,2.777936e-15,-6.113352e-17,6.6835750000000005e-28,-1.906224e-29,6.326317e-31,-1.5003200000000001e-31,9.220929e-33,2.7779010000000003e-43,5.556984e-45,-1.290559e-45,3.532495e-47,-1.850242e-47,1.0731989999999998e-48


In [89]:
# Replace the dummy columns with their principal components: keep every
# non-dummy column and append the PCA frame to its right.
other_icols = []
for icol in range(option_3_df.shape[1]):
    if icol in dummy_indexes:
        continue
    other_icols.append(icol)

non_dummy_part = option_3_df.iloc[:, other_icols]
option_3_df = pd.concat([non_dummy_part, pca_df], axis=1)

option_3_df.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,...,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14
0,-2.602691,-4.224761,3.45997,2.510343,2.930758,4,4,5,4,4,...,3.761017e-29,1.889023e-30,1.1185180000000001e-31,4.541803e-32,-1.403002e-43,5.845756e-46,-1.320724e-45,2.376924e-46,-1.884234e-47,-7.096375999999999e-50
1,-0.197082,-3.107729,3.415994,6.02867,5.155558,3,3,3,3,3,...,-1.8979840000000002e-29,7.04114e-30,3.366396e-31,-1.099938e-31,-4.059072e-44,3.727849e-45,-9.763144e-46,5.699142e-48,1.570727e-49,6.500764e-49
2,-2.023814,3.148392,-3.155084,-2.60587,-1.764954,1,1,0,1,0,...,-3.952422e-30,4.2129329999999995e-30,-2.636029e-31,6.784604e-33,-1.166818e-43,-3.462285e-45,-8.393877e-46,3.762166e-46,3.798913e-47,6.724078e-49
3,1.997969,-3.428115,2.034194,5.08562,3.579017,3,2,2,3,2,...,-1.3604e-29,-3.5834479999999996e-30,1.560138e-32,-3.229774e-32,7.706555e-44,3.301257e-45,-9.63007e-46,1.743305e-46,3.026049e-47,1.699227e-48
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,...,-1.906224e-29,6.326317e-31,-1.5003200000000001e-31,9.220929e-33,2.7779010000000003e-43,5.556984e-45,-1.290559e-45,3.532495e-47,-1.850242e-47,1.0731989999999998e-48


Done!

### Option 4: Apply Extended K-Prototypes

In [90]:
# No transformation necessary

Done!

## Apply K-Prototypes

In [91]:
import os
import sys
current_dir = os.getcwd()

# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.append(parent_dir)

from kmodes.kprototypes import KPrototypes

In [92]:
results_dict = dict()

### Option 1

In [94]:
# In option 1 the stringified multi-valued columns are clustered as plain
# categoricals, so both groups of positions are passed as categorical.
categorical_indexes = index_reference['cat'] + index_reference['multi']

In [95]:
# Seed the initialisation with the first frozen seed so the fitted partition
# (and the scores computed from it) can be reproduced.
kp = KPrototypes(n_clusters=config_dict['n_clusters'],
                 gamma=config_dict['approach_settings']['naive']['gamma'],
                 random_state=random_states[0])

kp.fit(option_1_df, categorical=categorical_indexes)

In [100]:
results_dict['option_1'] = {'gamma': kp.gamma,
                            'n_iter_': kp.n_iter_}

In [102]:
predicted_labels = kp.labels_

In [103]:
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [104]:
adjusted_mutual_info_score(labels_true=y_true, labels_pred=predicted_labels)

0.9586058530553813

In [105]:
adjusted_rand_score(labels_true=y_true, labels_pred=predicted_labels)

0.9791126643997697

Done!

## Apply Extended K-Prototypes

In [22]:
from kmodes.extended_kprototypes import ExtendedKPrototypes

In [23]:
config_dict

{'n_samples': 2000,
 'n_clusters': 3,
 'n_numeric_features': 5,
 'n_categorical_features': 5,
 'categorical_cardinalities': [6, 6, 6, 6, 6],
 'n_multival_features': 5,
 'multival_subvocab_lengths': [(3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3)],
 'separability': 3.0,
 'multival_intersections': 1,
 'noise': 0.01,
 'class_weights': [0.33, 0.33],
 'gamma_c': 0.33,
 'gamma_m': 0.33,
 'theta': 0.001}

In [24]:
# The extended-approach parameters live under
# config_dict['approach_settings']['extended'], not at the top level.
extended_settings = config_dict['approach_settings']['extended']

kp = ExtendedKPrototypes(n_clusters=config_dict['n_clusters'],
                         gamma_c=extended_settings['gamma_c'],
                         gamma_m=extended_settings['gamma_m'],
                         theta=extended_settings['theta'])

In [25]:
kp.fit(X=joint_x,
       categorical=index_reference['cat'],
       multi_valued=index_reference['multi'])

### Cluster Centroid Comparison

In [26]:
kp.cluster_centroids_

array([[3.0185861000790264, -3.0664419685857043, 3.0088145764917775,
        3.0260429790005916, 2.9907214055909104, 3, 3, 2, 3, 2,
        {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
        {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
       [-2.9811794209672797, -2.9764893380390816, 3.004605624486815,
        3.0095916438809307, 3.074282197970204, 5, 4, 5, 4, 5,
        {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11},
        {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}],
       [-3.0410068092084375, 2.966551008126802, -3.014000568918504,
        -3.005772394882082, -3.0427057444493957, 1, 1, 1, 0, 1,
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}]], dtype=object)

In [28]:
joint_x.iloc[:, index_reference['multi']]

Unnamed: 0,multi_0,multi_1,multi_2,multi_3,multi_4
0,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
2,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"
3,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
...,...,...,...,...,...
1995,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1996,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
1997,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
1998,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"


The algorithm recovered the multi-valued centroid values that we established when generating the data (which have no within-cluster variability).

### Partition Similarity Evaluation

In [31]:
y_pred = kp.predict(X=joint_x,
                    categorical=index_reference['cat'],
                    multi_valued=index_reference['multi'])

In [32]:
adjusted_mutual_info_score(labels_true=y_true, labels_pred=y_pred)

0.950279137401805

In [33]:
adjusted_rand_score(labels_true=y_true, labels_pred=y_pred)

0.973182055844923

Both scores are close to 1 (AMI ≈ 0.95, ARI ≈ 0.97), so the predicted partition agrees almost perfectly with the true labels.