In [1]:
import numpy as np
import pandas as pd
from itertools import combinations_with_replacement

from sklearn.datasets import make_classification

## Generate data

In [2]:
from random import randint

# Frozen list of 100 integer seeds so that re-running the notebook reuses the
# same random states.
# NOTE(review): presumably generated once with the commented-out expression
# below and then pasted in; re-running that expression would give new seeds.
# random_states = [randint(0, 100) for x in range(100)]

random_states = [69, 11, 97, 19, 72, 38, 78, 36, 16, 3, 53, 76, 74, 78, 8, 99,
                 45, 23, 58, 68, 1, 40, 52, 51, 95, 73, 64, 84, 38, 39, 82, 60,
                 11, 97, 12, 10, 91, 44, 46, 65, 51, 59, 26, 94, 26, 40, 99, 8,
                 47, 27, 42, 1, 87, 7, 98, 31, 9, 27, 44, 42, 24, 18, 69, 100,
                 85, 46, 33, 16, 47, 70, 9, 13, 73, 53, 87, 51, 72, 22, 93, 61,
                 33, 92, 50, 20, 65, 67, 74, 9, 75, 0, 61, 10, 99, 30, 52, 80,
                 10, 2, 83, 58]

### Configuration and input verification

In [3]:
# Named experiment presets. Each preset is a flat dict of generation,
# difficulty and clustering settings.
CONFIG = dict(
    easy=dict(
        n_samples=2000,
        n_clusters=3,
        # Feature counts (and per-feature sizes) for each attribute type
        n_numeric_features=5,
        n_categorical_features=5,
        categorical_cardinalities=[6, 6, 6, 6, 6],
        n_multival_features=5,
        # One tuple per multi-valued feature; each entry is how many
        # vocabulary items are associated to the corresponding cluster.
        multival_subvocab_lengths=[
            (3, 3, 3),
            (3, 3, 3),
            (3, 3, 3),
            (3, 3, 3),
            (3, 3, 3),
        ],
        # Difficulty params
        separability=3.0,
        multival_intersections=1,
        noise=0.01,
        class_weights=[0.33, 0.33],
        # Clustering algorithm settings
        gamma_c=0.33,
        gamma_m=0.33,
        theta=0.001,
    ),
)

In [4]:
# Expected Python type of the value stored under each configuration key.
PARAM_GUIDE = dict(
    n_samples=int,
    n_clusters=int,
    # Numeric features
    n_numeric_features=int,
    # Categorical features
    n_categorical_features=int,
    categorical_cardinalities=list,
    # Multi-valued features
    n_multival_features=int,
    multival_subvocab_lengths=list,
    # Difficulty params
    separability=float,
    multival_intersections=int,
    noise=float,
    class_weights=list,
    # Clustering algorithm settings
    gamma_c=float,
    gamma_m=float,
    theta=float,
)

In [5]:
# Select the preset that the rest of the notebook validates and uses.
config_dict = CONFIG['easy']

def validate_config(config_dict, param_guide=PARAM_GUIDE):
    """
    Check that a configuration dict is complete and internally consistent.

    Arguments
    ---------
    config_dict:dict
        The configuration to check.
    param_guide:dict
        Maps every required key to the type (or tuple of types) that the
        corresponding value in ``config_dict`` must be an instance of.

    Returns
    -------
    dict
        ``config_dict`` itself, unmodified, once every check has passed.

    Raises
    ------
    KeyError
        If a key listed in ``param_guide`` is absent from ``config_dict``
        (a short message is printed before the error is re-raised).
    ValueError
        If a value has the wrong type, if a list does not have the length
        implied by its companion count, if a categorical cardinality is below
        ``n_clusters``, or if ``gamma_m + gamma_c`` is not below 1.0.
    """
    # Presence and type checks
    for param, expected_type in param_guide.items():
        try:
            value = config_dict[param]
        except KeyError:
            print(f"Missing key {param} in the configuration dict.")
            raise
        if not isinstance(value, expected_type):
            # A tuple of types has no __name__, so fall back to its repr.
            expected_name = getattr(expected_type, '__name__', expected_type)
            raise ValueError(f"Parameter {param} should be of type "
                             f"{expected_name}, got {type(value).__name__}.")

    n_clusters = config_dict['n_clusters']

    # Length checks
    n_weights = len(config_dict['class_weights'])
    if n_weights not in (n_clusters, n_clusters - 1):
        raise ValueError("A number of class_weights equal to the number of "
                         "clusters or the number of clusters minus one "
                         "must be provided. List had size "
                         f"{n_weights}.")

    if len(config_dict['categorical_cardinalities']) != config_dict['n_categorical_features']:
        raise ValueError("A cardinality must be provided for every "
                         f"categorical attribute. {len(config_dict['categorical_cardinalities'])} cardinalities "
                         f"were provided for {config_dict['n_categorical_features']} attributes.")
    if len(config_dict['multival_subvocab_lengths']) != config_dict['n_multival_features']:
        raise ValueError("A vocabulary length must be provided for every "
                         f"multi-valued attribute. {len(config_dict['multival_subvocab_lengths'])} vocabulary length list-likes "
                         f"were provided for {config_dict['n_multival_features']} attributes.")

    # Every cluster needs at least one categorical level of its own.
    for card in config_dict['categorical_cardinalities']:
        if card < n_clusters:
            raise ValueError("Categorical attribute cardinalities cannot be lower than n_clusters.")

    for card_tuple in config_dict['multival_subvocab_lengths']:
        if len(card_tuple) != n_clusters:
            raise ValueError("A sub-vocabulary length must be provided for each cluster. "
                             f"{card_tuple} was provided without the required length {n_clusters}")

    gamma_total = config_dict['gamma_m'] + config_dict['gamma_c']
    if gamma_total >= 1.0:
        raise ValueError("Extended K-Prototypes gamma values should sum to "
                         "be less than 1.0. Provided values sum up to "
                         f"{gamma_total}.")

    return config_dict


# validate_config raises on a malformed preset and otherwise returns the very
# dict it was given, so this rebinding leaves config_dict unchanged.
config_dict = validate_config(config_dict=config_dict)
config_dict

{'n_samples': 2000,
 'n_clusters': 3,
 'n_numeric_features': 5,
 'n_categorical_features': 5,
 'categorical_cardinalities': [6, 6, 6, 6, 6],
 'n_multival_features': 5,
 'multival_subvocab_lengths': [(3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3)],
 'separability': 3.0,
 'multival_intersections': 1,
 'noise': 0.01,
 'class_weights': [0.33, 0.33],
 'gamma_c': 0.33,
 'gamma_m': 0.33,
 'theta': 0.001}

Done!

### Generate the data

In [6]:
# Draw the numeric features together with the ground-truth labels.
# All numeric features are informative (n_informative equals n_features, with
# no redundant or repeated columns) and each class is generated as a single
# cluster. The preset's 'noise' is passed as flip_y and its 'separability' as
# class_sep.
xnum, y_true = make_classification(
    n_samples=config_dict['n_samples'],
    n_features=config_dict['n_numeric_features'],
    n_informative=config_dict['n_numeric_features'],
    n_redundant=0,
    n_repeated=0,
    n_classes=config_dict['n_clusters'],
    n_clusters_per_class=1,
    weights=config_dict['class_weights'],
    flip_y=config_dict['noise'],
    class_sep=config_dict['separability'],
    random_state=random_states[0]
    )

In [7]:
# Creation of categories
def assign_categorical_features(class_labels:np.ndarray,
                                cardinalities:list,
                                random_state):
    """
    Create categorical attributes whose levels are tied to the class labels.

    For every attribute, the levels ``0 .. card-1`` are split into
    consecutive, non-overlapping ranges of ``card // n_clusters`` levels, one
    range per cluster (in sorted label order). The ``card % n_clusters``
    leftover levels are each handed to one randomly drawn cluster. Every
    observation then receives one level drawn at random from the levels of its
    own cluster.

    Arguments
    ---------
    class_labels:np.ndarray
        One-dimensional array with the cluster label of every observation.
    cardinalities:list
        Number of levels of each categorical attribute to create.
    random_state
        Seed handed to ``np.random.default_rng``.

    Returns
    -------
    tuple
        ``(attributes, guides)`` where ``attributes`` is an int32 array with
        one row per observation and one column per entry of
        ``cardinalities``, and ``guides`` is a list holding, per attribute,
        the dict mapping each cluster label to its list of levels.
    """
    clusters = np.unique(class_labels)
    n_clusters = len(clusters)
    random_generator = np.random.default_rng(seed=random_state)
    cluster_classes_keys = []
    categorical_attribute_arrays = []

    for card in cardinalities:
        levels_per_cluster = int(card // n_clusters)
        extra_levels = int(card % n_clusters)

        cluster_to_class = dict()
        curr_level = 0
        for clust in clusters:
            cluster_to_class[clust] = list(
                range(curr_level, curr_level + levels_per_cluster))
            # Advance by the block size so that the ranges of consecutive
            # clusters never overlap, whatever the cardinality.
            curr_level += levels_per_cluster

        # Assign the extra levels at random. The cluster is drawn from the
        # label array itself, so larger clusters are more likely to be picked.
        for extra_level in range(extra_levels):
            clust = random_generator.choice(class_labels)
            cluster_to_class[clust].append(curr_level + extra_level)

        # For items belonging to a given label, assign it a level from the dict
        attribute = np.zeros(class_labels.shape[0], dtype=np.int32)
        for i_label, label in enumerate(class_labels):
            attribute[i_label] = random_generator.choice(cluster_to_class[label])

        categorical_attribute_arrays.append(attribute)
        cluster_classes_keys.append(cluster_to_class)

    return np.stack(categorical_attribute_arrays, axis=1), cluster_classes_keys

In [8]:
# Derive the categorical columns from the true labels. cat_guide keeps, per
# attribute, the dict of levels that each cluster may take. The seed is the
# same random_states[0] that was passed to make_classification.
xcat, cat_guide = assign_categorical_features(class_labels=y_true,
                                   cardinalities=config_dict['categorical_cardinalities'],
                                   random_state=random_states[0])

xcat

array([[4, 4, 5, 4, 4],
       [3, 3, 3, 3, 3],
       [1, 1, 0, 1, 0],
       ...,
       [3, 3, 2, 3, 2],
       [0, 1, 0, 1, 1],
       [0, 0, 1, 1, 1]])

In [9]:
y_true

array([2, 1, 0, ..., 1, 0, 0])

In [10]:
cat_guide

[{0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]},
 {0: [0, 1], 1: [2, 3], 2: [4, 5]}]

In [11]:
xnum

array([[-2.60269058, -4.22476144,  3.45997033,  2.51034261,  2.93075784],
       [-0.1970824 , -3.10772864,  3.41599443,  6.02866969,  5.15555844],
       [-2.02381431,  3.14839166, -3.1550843 , -2.60587041, -1.76495351],
       ...,
       [ 3.89419075, -3.62263262,  3.72557832,  4.0990099 ,  1.46300835],
       [-2.25330148,  2.14794198, -3.15144734, -2.05743312, -2.78488292],
       [-2.42299465,  1.83934959, -2.3255219 , -4.04454782, -2.95657787]])

In [12]:
y_true

array([2, 1, 0, ..., 1, 0, 0])

In [13]:
config_dict

{'n_samples': 2000,
 'n_clusters': 3,
 'n_numeric_features': 5,
 'n_categorical_features': 5,
 'categorical_cardinalities': [6, 6, 6, 6, 6],
 'n_multival_features': 5,
 'multival_subvocab_lengths': [(3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3)],
 'separability': 3.0,
 'multival_intersections': 1,
 'noise': 0.01,
 'class_weights': [0.33, 0.33],
 'gamma_c': 0.33,
 'gamma_m': 0.33,
 'theta': 0.001}

In [14]:
# Creation of multi-valued attributes
def assign_multi_valued_features(class_labels:np.ndarray,
                                 subvocab_lengths:list,
                                 level_of_intersection:int) -> tuple[np.ndarray, list]:
    """
    Create multi-valued attributes from class labels, the lengths of the
    vocabulary subsets that are assigned to each label, and the degree to which
    pairwise clusters should have intersections.

    Vocabulary items are consecutive integers starting at 0. For each
    attribute, every cluster first receives its own private items; then every
    pair of distinct clusters receives ``level_of_intersection`` further items
    that are shared by the two clusters of the pair.

    Arguments
    ---------
    class_labels:np.ndarray
        One-dimensional array with the cluster label of every observation.
        Labels are used to index into each entry of ``subvocab_lengths``.
    subvocab_lengths:list
        One sequence per attribute to create; element ``k`` of a sequence is
        the number of private vocabulary items given to the cluster labelled
        ``k``.
    level_of_intersection:int
        The level of intersection refers to the number of items in the
        vocabulary that are common to cluster pairs. The higher it is relative
        to the sub-vocabulary lengths, the lower the distance between clusters
        in relation to their multi-valued attributes will be.

    Returns
    -------
    tuple
        ``(attributes, guides)`` where ``attributes`` is an object array with
        one row per observation and one column per attribute, each cell
        holding the set of items of the observation's cluster, and ``guides``
        is a list holding, per attribute, the dict mapping each cluster label
        to that set.
    """
    clusters = np.unique(class_labels)
    # Every unordered pair of distinct clusters, in a fixed order.
    cluster_pairs = [clust_comb for clust_comb in
                     combinations_with_replacement(clusters, 2)
                     if clust_comb[0] != clust_comb[1]]
    attribute_label_dicts = []
    multi_valued_attribute_arrays = []

    for subvocab in subvocab_lengths:   # Iterate over n_multival, implicitly
        # Next unused vocabulary item. A running counter (instead of taking
        # the max of the items seen so far) also works when no item has been
        # handed out yet, e.g. when every sub-vocabulary length is 0.
        next_item = 0
        label_vocab_dict = dict()

        for clust in clusters:
            own_items = range(next_item, next_item + subvocab[clust])
            label_vocab_dict[clust] = set(own_items)
            next_item += len(own_items)

        # Add the pairwise intersections
        if level_of_intersection > 0:
            for first, second in cluster_pairs:
                shared_items = range(next_item,
                                     next_item + level_of_intersection)
                label_vocab_dict[first].update(shared_items)
                label_vocab_dict[second].update(shared_items)
                next_item += len(shared_items)

        attribute_label_dicts.append(label_vocab_dict)

        # Now build, for each attribute, the array containing the observations.
        # All observations of a cluster reference the same set object (the one
        # stored in the guide), they do not get individual copies.
        attribute = np.zeros(class_labels.shape[0], dtype=np.object_)
        for i_label, label in enumerate(class_labels):
            attribute[i_label] = label_vocab_dict[label]

        multi_valued_attribute_arrays.append(attribute)

    return (np.stack(multi_valued_attribute_arrays, axis=1),
            attribute_label_dicts)
    

In [15]:
# Derive the multi-valued (set-valued) columns from the true labels.
# multi_guide keeps, per attribute, the set of items attached to each cluster.
xmulti, multi_guide = assign_multi_valued_features(class_labels=y_true,
                                                   subvocab_lengths=config_dict['multival_subvocab_lengths'],
                                                   level_of_intersection=config_dict['multival_intersections'])


In [16]:
xmulti

array([[{6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11},
        {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}],
       [{3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
        {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
       [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}],
       ...,
       [{3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
        {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
       [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}],
       [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
        {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}]], dtype=object)

In [17]:
multi_guide

[{0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}},
 {0: {0, 1, 2, 9, 10}, 1: {3, 4, 5, 9, 11}, 2: {6, 7, 8, 10, 11}}]

Done!

### Put Everything Together

In [18]:
# (prefix, 2-D array) pairs, listed in the column order wanted in the joint
# frame. consolidate_attributes uses the prefix both to name the columns and
# as the key of its column-position lookup.
all_attributes = [
    ('num', xnum),
    ('cat', xcat),
    ('multi', xmulti)
]

all_attributes

[('num',
  array([[-2.60269058, -4.22476144,  3.45997033,  2.51034261,  2.93075784],
         [-0.1970824 , -3.10772864,  3.41599443,  6.02866969,  5.15555844],
         [-2.02381431,  3.14839166, -3.1550843 , -2.60587041, -1.76495351],
         ...,
         [ 3.89419075, -3.62263262,  3.72557832,  4.0990099 ,  1.46300835],
         [-2.25330148,  2.14794198, -3.15144734, -2.05743312, -2.78488292],
         [-2.42299465,  1.83934959, -2.3255219 , -4.04454782, -2.95657787]])),
 ('cat',
  array([[4, 4, 5, 4, 4],
         [3, 3, 3, 3, 3],
         [1, 1, 0, 1, 0],
         ...,
         [3, 3, 2, 3, 2],
         [0, 1, 0, 1, 1],
         [0, 0, 1, 1, 1]])),
 ('multi',
  array([[{6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11},
          {6, 7, 8, 10, 11}, {6, 7, 8, 10, 11}],
         [{3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11},
          {3, 4, 5, 9, 11}, {3, 4, 5, 9, 11}],
         [{0, 1, 2, 9, 10}, {0, 1, 2, 9, 10}, {0, 1, 2, 9, 10},
          {0, 1, 2, 9, 10}, {0, 1,

In [19]:
def consolidate_attributes(xlist: list[tuple[str, np.ndarray]]):
    """
    Merge several 2-D attribute arrays into a single DataFrame.

    Arguments
    ---------
    xlist:list
        ``(attribute_type, values)`` pairs. Column ``j`` of ``values`` becomes
        the DataFrame column named ``f'{attribute_type}_{j}'``; columns appear
        in the order in which the pairs are listed.

    Returns
    -------
    tuple
        ``(frame, positions)`` where ``frame`` is the combined DataFrame and
        ``positions`` maps every attribute type to the list of integer column
        positions that its columns occupy in ``frame``.
    """
    columns = {}
    positions = {}
    next_position = 0

    for kind, block in xlist:
        width = block.shape[1]
        for offset in range(width):
            columns[f'{kind}_{offset}'] = block[:, offset]

        # The columns of one block sit next to each other in the frame.
        positions[kind] = list(range(next_position, next_position + width))
        next_position += width

    return pd.DataFrame(columns), positions

In [20]:
joint_x, index_reference = consolidate_attributes(xlist=all_attributes)

joint_x

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,cat_0,cat_1,cat_2,cat_3,cat_4,multi_0,multi_1,multi_2,multi_3,multi_4
0,-2.602691,-4.224761,3.459970,2.510343,2.930758,4,4,5,4,4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1,-0.197082,-3.107729,3.415994,6.028670,5.155558,3,3,3,3,3,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
2,-2.023814,3.148392,-3.155084,-2.605870,-1.764954,1,1,0,1,0,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"
3,1.997969,-3.428115,2.034194,5.085620,3.579017,3,2,2,3,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
4,-2.744668,-2.208053,0.557755,2.809556,1.193116,5,5,4,5,4,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-3.827215,-3.512915,3.371360,0.135862,3.685980,5,4,4,4,5,"{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}","{6, 7, 8, 10, 11}"
1996,3.654517,-1.556371,4.098848,4.240478,0.000666,2,3,3,2,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
1997,3.894191,-3.622633,3.725578,4.099010,1.463008,3,3,2,3,2,"{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}","{3, 4, 5, 9, 11}"
1998,-2.253301,2.147942,-3.151447,-2.057433,-2.784883,0,1,0,1,1,"{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}","{0, 1, 2, 9, 10}"


In [21]:
index_reference['num']

[0, 1, 2, 3, 4]

Done!

## Apply Extended K-Prototypes

In [22]:
import os
import sys
# Make the parent of the current working directory importable, presumably so
# that the local `kmodes` package imported next can be found.
current_dir = os.getcwd()

# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
# Guarded so that re-running this cell does not append duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from kmodes.extended_kprototypes import ExtendedKPrototypes

In [23]:
config_dict

{'n_samples': 2000,
 'n_clusters': 3,
 'n_numeric_features': 5,
 'n_categorical_features': 5,
 'categorical_cardinalities': [6, 6, 6, 6, 6],
 'n_multival_features': 5,
 'multival_subvocab_lengths': [(3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3),
  (3, 3, 3)],
 'separability': 3.0,
 'multival_intersections': 1,
 'noise': 0.01,
 'class_weights': [0.33, 0.33],
 'gamma_c': 0.33,
 'gamma_m': 0.33,
 'theta': 0.001}

In [24]:
# Build the clustering model from the preset's algorithm settings.
# NOTE(review): no seed is passed here; if ExtendedKPrototypes initialises
# randomly, the scores further down may differ between runs — confirm whether
# it accepts a random_state.
kp = ExtendedKPrototypes(n_clusters=config_dict['n_clusters'],
                         gamma_c=config_dict['gamma_c'],
                         gamma_m=config_dict['gamma_m'],
                         theta=config_dict['theta'])

In [25]:
# Fit on the joint frame, passing the integer column positions of the
# categorical and multi-valued attributes.
# NOTE(review): the remaining columns are presumably treated as numeric —
# confirm against ExtendedKPrototypes.fit.
kp.fit(X=joint_x,
       categorical=index_reference['cat'],
       multi_valued=index_reference['multi'])

### Cluster Centroid Comparison

In [None]:
kp.cluster_centroids_

In [None]:
# index_reference holds integer column positions, while the columns of joint_x
# have string names, so the selection has to be positional.
joint_x.iloc[:, index_reference['multi']]

In [26]:
# Predict on the same frame that the model was fitted on, so the scores
# computed from y_pred describe how well the generated clusters are recovered.
y_pred = kp.predict(X=joint_x,
                    categorical=index_reference['cat'],
                    multi_valued=index_reference['multi'])

### Partition Similarity Evaluation

In [27]:
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [28]:
adjusted_mutual_info_score(labels_true=y_true, labels_pred=y_pred)

0.950279137401805

In [29]:
adjusted_rand_score(labels_true=y_true, labels_pred=y_pred)

0.973182055844923