# Benchmarking Experiments for Extended K-Prototypes (Paper Versions)

Variations on the experimental structure of the original extension as presented for the undergraduate thesis of the author.

In [1]:
import os
import sys
current_dir = os.getcwd()

# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.append(parent_dir)

from random import randint
from typing import Any, Optional

import numpy as np
import pandas as pd

from benchmark_extension import Experiment

Data generation has been changed to follow a probabilistic approach for the generation of the categorical and multi-valued attributes.  
  
Categorical attributes are sampled from a user-defined distribution of categorical items.  
  
Multi-valued attributes are sampled using a tree of conditional probability per cluster to simulate the way in which some items of multi-valued attributes tend to appear in common with others. This is preferred to repeating the approach pursued for categorical variables to better represent real-life conditions that practitioners may face.

In [2]:
sample_configuration = {
    'n_samples': 2000,
    'n_clusters': 3,
    'class_weights': [0.33, 0.33],
    # Numeric Features
    'n_numeric_features': 5,
    'separability': 3.0,
    'noise': 0.01,
    # Categroical Features
    'n_categorical_features': 5,
    'categorical_cardinalities': [6, 6, 6, 6, 6],
    'category_distributions':(  # One list per feature per cluster
        [
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05]],
        [
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05]],
        [
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4]]
    ),
    # Multi-valued Features
    'n_multival_features': 5,
    'base_chances': (   # One list per feature per cluster
        [   
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05],
            [0.4, 0.4, 0.05, 0.05, 0.05]
        ],
        [
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05],
            [0.05, 0.05, 0.4, 0.4, 0.05]
        ],
        [
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4],
            [0.05, 0.05, 0.05, 0.05, 0.4]
        ]
    ),
    'conditional_probabilities': (
        [  # One dict per feature, per cluster
            {
                0: [0.04, 0.80, 0.04, 0.04, 0.04],
                1: [0.80, 0.04, 0.04, 0.04, 0.04]
            },
            {
                0: [0.04, 0.80, 0.04, 0.04, 0.04],
                1: [0.80, 0.04, 0.04, 0.04, 0.04]
            },
            {
                0: [0.04, 0.80, 0.04, 0.04, 0.04],
                1: [0.80, 0.04, 0.04, 0.04, 0.04]
            },
            {
                0: [0.04, 0.80, 0.04, 0.04, 0.04],
                1: [0.80, 0.04, 0.04, 0.04, 0.04]
            },
            {
                0: [0.04, 0.80, 0.04, 0.04, 0.04],
                1: [0.80, 0.04, 0.04, 0.04, 0.04]
            }],
        [
            {
                2: [0.04, 0.04, 0.04, 0.80, 0.04],
                3: [0.04, 0.04, 0.80, 0.04, 0.04],
            },
            {
                2: [0.04, 0.04, 0.04, 0.80, 0.04],
                3: [0.04, 0.04, 0.80, 0.04, 0.04],
            },
            {
                2: [0.04, 0.04, 0.04, 0.80, 0.04],
                3: [0.04, 0.04, 0.80, 0.04, 0.04],
            },
            {
                2: [0.04, 0.04, 0.04, 0.80, 0.04],
                3: [0.04, 0.04, 0.80, 0.04, 0.04],
            },
            {
                2: [0.04, 0.04, 0.04, 0.80, 0.04],
                3: [0.04, 0.04, 0.80, 0.04, 0.04],
            }],
        [
            {
                4: [0.04, 0.04, 0.04, 0.04, 0.04],
                5: [0.04, 0.04, 0.04, 0.04, 0.80]
            },
            {
                4: [0.04, 0.04, 0.04, 0.04, 0.04],
                5: [0.04, 0.04, 0.04, 0.04, 0.80]
            },
            {
                4: [0.04, 0.04, 0.04, 0.04, 0.04],
                5: [0.04, 0.04, 0.04, 0.04, 0.80]
            },
            {
                4: [0.04, 0.04, 0.04, 0.04, 0.04],
                5: [0.04, 0.04, 0.04, 0.04, 0.80]
            },
            {
                4: [0.04, 0.04, 0.04, 0.04, 0.04],
                5: [0.04, 0.04, 0.04, 0.04, 0.80]
            }]
    ),
    # Approach Settings
    'approach_settings': {
        'naive': {
            'gamma': None
        },
        'one-hot': {
            'gamma': None,
            'max_dummies': 100
        },
        'one-hot-pca': {
            'gamma': None,
            'reduced_dimensions': 0.25
        },
        'extended': {
            'gamma_c': 0.33,
            'gamma_m': 0.33,
            'theta': 0.001
        }
    }
}

The sample configuration includes a bi-modal distribution for each categorical attribute that modifies the modes for each cluster. Harder-to-cluster configurations where some elements are modal in more than one cluster or where some levels are never modal could be tried.

**Input Rules**:  

- Class weights should be of length `n_clusters - 1`. The missing weight will be calculated with `1 - sum(class_weights)`.

- Length of `categorical_cardinalities` equal to `n_categorical_features`.

- Length of `category_distributions` should be equal to `n_clusters`. Each item is a list of lists specifying a categorical distribution for each categorical attribute describing the categorical characteristics of the cluster.
  
- Each item in `category_distributions` should include a list per categorical attribute containing the sampling probabilities for each category (as defined in `categorical_cardinalities` minus one) in the attribute. The probabilities should sum up to less than one. The probability for the missing category will be calculated as `1 - sum(distribution)`.
  
- Length of `probability_trees` should be equal to `n_multival_features`.
  
- The probabilities of the children of each node in each ``tree`` of ``probability_trees`` should sum up to one. Rule does not apply to leafs.
  
- In `approach_settings`, the field `reduced_dimensions` in `one-hot-pca` must be a float in the open interval $(0, 1)$.
  
- In `approach_settings`, the field the gamma fields in `extended` must be floats in the interval $[0, 1)$ and not sum up to more than one. 

In [3]:
def cat_distributions_to_generation(cluster_assignment_vector: np.ndarray[int],
                                    category_distributions: tuple[list[list[float]]],
                                    n_categorical_features: int,
                                    categorical_cardinalities: list[int],
                                    n_clusters: int,
                                    random_generator: np.random.Generator,
                                    round_digits: int = 5):

    # The expected structure of category distributions is
    # tuple (len n_clusters)[list(len n_categorical_features)[list(categorical_cardinality - 1)]]
    # A tuple containing matrices of shape (n_cat_features x (cardinality - 1)) ONLY IF THE CARDINALITY IS CONSTANT. 
    # Alternatively, it is a list of lists defining probabilities

    # Checks that everything should be the way it is
    if len(categorical_cardinalities) != n_categorical_features:
        raise ValueError(f"Mismatched categorical cardinalities ({len(categorical_cardinalities)}) and number of categorical features ({n_categorical_features})")

    if len(category_distributions) != n_clusters:
        raise ValueError(f"Mismatched distributions ({len(category_distributions)}) and clusters ({n_clusters})")
    
    for distr in category_distributions:
        if len(distr) != n_categorical_features:
            raise ValueError("Probability distributions must be provided for all"
                             f"categorical features ({n_categorical_features})."
                             f"Only {len(distr)} have been specified.")
        
        for i, var_distr in enumerate(distr):
            if len(var_distr) != (categorical_cardinalities[i]-1):
                raise ValueError("Probability distribution must include categorical_cardinalities - 1 values.")
            if round(sum(var_distr), round_digits) > 1:
                raise ValueError("Probability distribution must not sum up to more than one.")
    
    # Code
    output_columns = []

    for i_feature in range(n_categorical_features):
        # Check the cluster and get the respective prob distribution
        # Sample once from the multinomial and use argmax to get the category
        choice_func = np.vectorize(
            lambda i_cluster:
            np.argmax(
                random_generator.multinomial(1,
                    pvals=category_distributions[i_cluster][i_feature]+\
                            [np.round(1-sum(
                                category_distributions[i_cluster][i_feature]
                                ),
                            round_digits)],
                    size=1),
                axis=1)
        )
        
        output_columns.append(choice_func(cluster_assignment_vector))

    cat_df = pd.DataFrame(np.array(output_columns).T)

    colnames = []
    for col in cat_df.columns:
        colnames.append(f"cat_{col}")
    
    cat_df.columns = colnames

    return cat_df


In [4]:
cluster_array = np.array([0, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64)

In [5]:
cat_distributions_to_generation(cluster_assignment_vector=cluster_array,
                                category_distributions=sample_configuration['category_distributions'],
                                n_categorical_features=sample_configuration['n_categorical_features'],
                                categorical_cardinalities=sample_configuration['categorical_cardinalities'],
                                n_clusters=sample_configuration['n_clusters'],
                                random_generator=np.random.default_rng(42))

Unnamed: 0,cat_0,cat_1,cat_2,cat_3,cat_4
0,5,0,1,1,4
1,3,2,3,3,3
2,2,2,2,3,0
3,4,5,4,5,5
4,2,3,2,1,2
5,2,1,0,4,1
6,0,0,0,0,0
7,1,1,2,1,5
8,1,0,0,1,1
9,0,0,1,1,5


Now for the multi-valued attributes, as generated from probability trees.

In [6]:
from collections import defaultdict

base_chance = [0.05, 0.05, 0.4, 0.4, 0.05]

conditional_probabilities = defaultdict(lambda: base_chance+[round(1-sum(base_chance))])

specified_conditional_probabilities = {
    2: [0.04, 0.04, 0.04, 0.80, 0.04],
    3: [0.04, 0.04, 0.80, 0.04, 0.04]
}

conditional_probabilities.update(specified_conditional_probabilities)

In [7]:
conditional_probabilities[1]

[0.05, 0.05, 0.4, 0.4, 0.05, 0]

In [8]:
conditional_probabilities[2]

[0.04, 0.04, 0.04, 0.8, 0.04]

In [9]:
def sample_multival_once(iterations: int,
                         base_chance: list[float],
                         conditional_probabilities: dict,
                         random_generator: np.random.Generator):
    choice = np.argmax(
        random_generator.multinomial(1,
            pvals=base_chance+[round(1-sum(base_chance))]
        )
    )
    choice_set = {choice}

    for i in range(iterations):
        # Will get the base chance or the specified ones
        choice = np.argmax(
            random_generator.multinomial(1,
                pvals=conditional_probabilities[choice]+[
                    round(1-sum(conditional_probabilities[choice]))
                ]
            )
        )
        choice_set.add(choice)

    return choice_set

In [10]:
sample_multival_once(iterations=10,
                     base_chance=base_chance,
                     conditional_probabilities=conditional_probabilities,
                     random_generator=np.random.default_rng(42))

{1, 2, 3}

In [11]:
from copy import deepcopy

In [12]:
def fixed_lambda(val):
    return lambda: val
# We trick python by defining a new scope over the lambda,
# such that the p array will be stored instead of pointing to the base_chance
# variable.

In [23]:
def multival_generation(cluster_assignment_vector: np.ndarray,
                        n_clusters: int,
                        n_multival_features: int,
                        base_chances: list[list[float]],
                        conditional_probabilities: list[dict],
                        iterations: int,
                        random_generator: np.random.Generator):
    # Check input integrity
    if len(base_chances) != n_clusters:
        pass

    # Cache the p-dicts here for the function call below
    cluster_p_dicts = []
    for i_cluster in range(n_clusters):
        feature_dicts = []
        
        for i_feature in range(n_multival_features):
            base_chance = deepcopy(
                base_chances[i_cluster][i_feature]+[round(
                    1-sum(base_chances[i_cluster][i_feature]))
                ])
            p_dict = defaultdict(fixed_lambda(base_chance))
            
            p_dict.update(conditional_probabilities[i_cluster][i_feature])
            feature_dicts.append(p_dict)
        
        cluster_p_dicts.append(feature_dicts)
    
    # Vectorize the sampling function to create the features
    output_columns = []

    for i_feature in range(n_multival_features):
        sampling_func = np.vectorize(lambda i_cl:
                            sample_multival_once(iterations=iterations,
                                base_chance=base_chances[i_cl][i_feature],
                                conditional_probabilities=cluster_p_dicts[i_cl][i_feature],
                                random_generator=random_generator
                            )
                        )
        output_columns.append(sampling_func(cluster_assignment_vector))

    multi_df = pd.DataFrame(np.array(output_columns).T)

    colnames = []
    for col in multi_df.columns:
        colnames.append(f"multi_{col}")
    
    multi_df.columns = colnames

    return multi_df
    

In [25]:
out_cols = multival_generation(cluster_assignment_vector=[0, 0, 0, 1, 1, 1, 2, 2, 2],
                    n_clusters=sample_configuration['n_clusters'],
                    n_multival_features=sample_configuration['n_multival_features'],
                    base_chances=sample_configuration['base_chances'],
                    conditional_probabilities=sample_configuration['conditional_probabilities'],
                    iterations=4,
                    random_generator=np.random.default_rng(42))

out_cols

Unnamed: 0,multi_0,multi_1,multi_2,multi_3,multi_4
0,"{0, 1, 3}","{0, 1, 4}","{0, 1}","{0, 1}","{0, 1, 2}"
1,"{0, 1, 2}","{0, 1, 4, 5}","{0, 1, 2}","{0, 1, 4, 5}","{0, 1, 3}"
2,"{0, 1, 3}","{0, 1}","{0, 1}","{0, 1}","{0, 4}"
3,"{2, 3, 5}","{2, 3}","{2, 3}","{0, 1, 2, 3}","{2, 3}"
4,"{2, 3}","{1, 2, 3}","{2, 3, 5}","{0, 2, 3, 5}","{0, 2, 3, 5}"
5,"{2, 3}","{2, 3}","{0, 1, 2, 3}","{2, 3}","{2, 3}"
6,"{2, 3, 4, 5}","{4, 5}","{4, 5}","{4, 5}","{4, 5}"
7,"{2, 4, 5}","{2, 4, 5}","{1, 4, 5, 6}","{0, 2, 4, 6}","{4, 5}"
8,"{4, 5}","{4, 5}","{4, 5}","{0, 1, 2, 3, 4}","{1, 4, 5, 6}"
