# Benchmarking Experiments for Extended K-Prototypes (Paper Versions)

Variations on the experimental structure of the original extension, as presented in the author's undergraduate thesis.

In [None]:
import os
import sys
# Working directory of the running notebook/script.
current_dir = os.getcwd()

# Absolute path of the directory one level above the working directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Add the parent directory to the module search path so that modules stored
# there can be imported. The membership check keeps repeated executions of
# this cell from accumulating duplicate entries in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from random import randint
from typing import Any

import pandas as pd

from benchmark_extension import Experiment

Data generation now follows a probabilistic approach for the categorical and multi-valued attributes.  
  
Categorical attributes are sampled from a user-defined distribution of categorical items.  
  
Multi-valued attributes are sampled using a tree of conditional probabilities per cluster, to simulate the way in which some items of a multi-valued attribute tend to co-occur with others. This is preferred over reusing the approach taken for categorical variables because it better represents the real-life conditions that practitioners may face.

In [None]:
# Template configuration: synthetic-data parameters first, followed by the
# settings for each benchmarked approach.
sample_configuration = {
    # --- Dataset size and cluster structure ---
    "n_samples": 2000,
    "n_clusters": 3,
    # NOTE(review): two weights are listed for three clusters; presumably the
    # remaining weight is inferred by the consumer -- confirm.
    "class_weights": [0.33, 0.33],
    # --- Numeric features ---
    "n_numeric_features": 5,
    "separability": 3.0,
    "noise": 0.01,
    # --- Categorical features ---
    "n_categorical_features": 5,
    "categorical_cardinalities": [6] * 5,
    # Five empty placeholders (presumably one distribution per categorical
    # feature -- to be filled in by the user).
    # NOTE(review): this key contains a space whereas the sibling keys use
    # underscores; confirm it matches what the consumer reads.
    "category distributions": [[] for _ in range(5)],
    # --- Multi-valued features ---
    "n_multival_features": 5,
    "probability_trees": [],
    # --- Settings for each approach ---
    "approach_settings": {
        "naive": {"gamma": None},
        "one-hot": {"gamma": None, "max_dummies": 100},
        "one-hot-pca": {"gamma": None, "reduced_dimensions": 0.25},
        "extended": {"gamma_c": 0.33, "gamma_m": 0.33, "theta": 0.001},
    },
}