In [13]:
import pandas as pd
import numpy as np
import itertools

# Datos de entrada
data = {
    "job_title": ["Graphic Designer", "Software Engineer", "Warehouse Associate", "Software Engineer", "Graphic Designer"],
    "gender": ["Female", "Male", "Female", "Male", "Male"],
    "perf_eval": [5, 5, 4, 5, 5],
    "edu": ["College", "College", "PhD", "Masters", "Masters"],
    "dept": ["Operations", "Management", "Administration", "Sales", "Engineering"],
    "seniority": [2, 5, 5, 4, 5],
    "total_salary_range": ["40,000-80,000", "80,001-120,000", "80,001-120,000", "80,001-120,000", "80,001-120,000"],
    "age_group": ["0-19", "20-39", "0-19", "20-39", "20-39"]
}

df = pd.DataFrame(data)

def generate_contrast_sets(data, max_level):
    tree = {}
    columns = list(data.keys())
    classes_level_1 = list(set(data[columns[0]]))
    tree[1] = classes_level_1

    for level in range(2, max_level + 1):
        new_level_classes = []
        if level == 2:
            combinations = list(itertools.product(tree[1], set(data[columns[1]])))
        else:
            combinations = generate_combinations(tree[level - 1], set(data[columns[level - 1]]))

        for combination in combinations:
            new_classes = tuple(combination)
            if not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)

        if new_level_classes:
            tree[level] = new_level_classes
        else:
            break

    return tree

def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            combinations.append(new_combination)
    return combinations

def calculate_support(contrast_sets, df):
    supports = {}
    for level, classes in contrast_sets.items():
        supports[level] = {}
        for c in classes:
            if type(c) == tuple:
                support = np.mean(df.apply(lambda row: all(item in row.values for item in c), axis=1))
            else:
                support = (df[df.columns[0]] == c).mean()
            supports[level][c] = support
    return supports
            
max_level = 8
mindev = 0.1

contrast_sets_tree = generate_contrast_sets(data, max_level)
filtered_supports = calculate_support(contrast_sets_tree, df)

for level, support in filtered_supports.items():
    if support:  # Si el nivel tiene soportes que cumplen con mindev
        print(f'Level {level}: {support}')


Level 1: {'Warehouse Associate': 0.2, 'Graphic Designer': 0.4, 'Software Engineer': 0.4}
Level 2: {('Warehouse Associate', 'Female'): 0.2, ('Warehouse Associate', 'Male'): 0.0, ('Graphic Designer', 'Female'): 0.2, ('Graphic Designer', 'Male'): 0.2, ('Software Engineer', 'Female'): 0.0, ('Software Engineer', 'Male'): 0.4}
Level 3: {('Warehouse Associate', 'Female', 4): 0.2, ('Warehouse Associate', 'Female', 5): 0.2, ('Warehouse Associate', 'Male', 4): 0.0, ('Warehouse Associate', 'Male', 5): 0.0, ('Graphic Designer', 'Female', 4): 0.0, ('Graphic Designer', 'Female', 5): 0.2, ('Graphic Designer', 'Male', 4): 0.0, ('Graphic Designer', 'Male', 5): 0.2, ('Software Engineer', 'Female', 4): 0.0, ('Software Engineer', 'Female', 5): 0.0, ('Software Engineer', 'Male', 4): 0.2, ('Software Engineer', 'Male', 5): 0.4}
Level 4: {('Warehouse Associate', 'Female', 4, 'College'): 0.0, ('Warehouse Associate', 'Female', 4, 'Masters'): 0.0, ('Warehouse Associate', 'Female', 4, 'PhD'): 0.2, ('Warehouse Ass

In [14]:
import pandas as pd
import numpy as np
import itertools

# Datos de entrada
data = {
    "job_title": ["Graphic Designer", "Software Engineer", "Warehouse Associate", "Software Engineer", "Graphic Designer"],
    "gender": ["Female", "Male", "Female", "Male", "Male"],
    "perf_eval": [5, 5, 4, 5, 5],
    "edu": ["College", "College", "PhD", "Masters", "Masters"],
    "dept": ["Operations", "Management", "Administration", "Sales", "Engineering"],
    "seniority": [2, 5, 5, 4, 5],
    "total_salary_range": ["40,000-80,000", "80,001-120,000", "80,001-120,000", "80,001-120,000", "80,001-120,000"],
    "age_group": ["0-19", "20-39", "0-19", "20-39", "20-39"]
}

df = pd.DataFrame(data)

def generate_contrast_sets(data, max_level):
    tree = {}
    columns = list(data.keys())
    classes_level_1 = list(set(data[columns[0]]))
    tree[1] = classes_level_1

    for level in range(2, max_level + 1):
        new_level_classes = []
        if level == 2:
            combinations = list(itertools.product(tree[1], set(data[columns[1]])))
        else:
            combinations = generate_combinations(tree[level - 1], set(data[columns[level - 1]]))

        for combination in combinations:
            new_classes = tuple(combination)
            if not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)

        if new_level_classes:
            tree[level] = new_level_classes
        else:
            break

    return tree

def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            if len(set(new_combination)) == len(new_combination):
                combinations.append(new_combination)
    return combinations

def calculate_support(contrast_sets, df, mindev):
    supports = {}
    for level, classes in contrast_sets.items():
        supports[level] = {}
        for c in classes:
            if type(c) == tuple:
                support = np.mean(df.apply(lambda row: all(item in row.values for item in c), axis=1))
            else:
                support = (df[df.columns[0]] == c).mean()
            supports[level][c] = support
            
    # Seleccionar solo aquellos conjuntos de contraste cuyo soporte sea mayor o igual a mindev
    filtered_supports = {}
    for level, support_dict in supports.items():
        filtered_support_dict = {}
        for c, support in support_dict.items():
            if level + 1 in contrast_sets:  # Si existe el nivel de los hijos
                children_supports = [supports[level + 1][child_c] for child_c in contrast_sets[level + 1]]
                max_diff = np.max(np.abs(np.array(children_supports) - support))  # Calcular la máxima diferencia de soporte 
                if max_diff >= mindev:  # Si la máxima diferencia de soporte es mayor o igual a mindev
                    filtered_support_dict[c] = support
        if filtered_support_dict:  # Si el diccionario no está vacío
            filtered_supports[level] = filtered_support_dict

    return filtered_supports

max_level = 8
mindev = 0.1

contrast_sets_tree = generate_contrast_sets(data, max_level)
filtered_supports = calculate_support(contrast_sets_tree, df, mindev)

for level, support in filtered_supports.items():
    if support:  # Si el nivel tiene soportes que cumplen con mindev
        print(f'Level {level}: {support}')

Level 1: {'Warehouse Associate': 0.2, 'Graphic Designer': 0.4, 'Software Engineer': 0.4}
Level 2: {('Warehouse Associate', 'Female'): 0.2, ('Warehouse Associate', 'Male'): 0.0, ('Graphic Designer', 'Female'): 0.2, ('Graphic Designer', 'Male'): 0.2, ('Software Engineer', 'Female'): 0.0, ('Software Engineer', 'Male'): 0.4}
Level 3: {('Warehouse Associate', 'Female', 4): 0.2, ('Warehouse Associate', 'Female', 5): 0.2, ('Warehouse Associate', 'Male', 4): 0.0, ('Warehouse Associate', 'Male', 5): 0.0, ('Graphic Designer', 'Female', 4): 0.0, ('Graphic Designer', 'Female', 5): 0.2, ('Graphic Designer', 'Male', 4): 0.0, ('Graphic Designer', 'Male', 5): 0.2, ('Software Engineer', 'Female', 4): 0.0, ('Software Engineer', 'Female', 5): 0.0, ('Software Engineer', 'Male', 4): 0.2, ('Software Engineer', 'Male', 5): 0.4}
Level 4: {('Warehouse Associate', 'Female', 4, 'College'): 0.0, ('Warehouse Associate', 'Female', 4, 'Masters'): 0.0, ('Warehouse Associate', 'Female', 4, 'PhD'): 0.2, ('Warehouse Ass