Preprocesamiento de datos

In [2]:
### importacion de librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import itertools

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# carga de datos

try:
    data = pd.read_csv('/datasets/data.csv')
except:
    data = pd.read_csv('datasets/data.csv')

In [4]:
# Configura pandas para mostrar el contenido completo de las filas
pd.set_option('display.max_colwidth', None)

In [5]:
data = data.rename(columns={'jobTitle':'job_title',
                            'perfEval':'perf_eval',
                            'basePay':'base_pay'})

In [6]:
# creamos una columna nueva con el salario total

data['total_salary'] = data['base_pay'] + data['bonus']

In [7]:
# función que calcule la categoría de edad

def assing_age_range(age):
    if age < 0 or pd.isna(age):
        return 'NA'
    elif age < 20:
        return '0-19'
    elif age < 40:
        return '20-39'
    elif age < 60:
        return '40-59'
    elif age >= 60:
        return '60+'


In [8]:
# función que calcule la categoría de ingresos

def total_salary_range(salary):
    if salary < 40000 or pd.isna(salary):
        return '< 40,000'
    elif salary < 80001:
        return '40,000-80,000'
    elif salary < 120001:
        return '80,001-120,000'
    elif salary < 160001:
        return '120,001-160,000'
    elif salary >= 160001:
        return '> 160,000'

In [9]:
data['total_salary_range'] = data['total_salary'].apply(total_salary_range)
data['age_group'] = data['age'].apply(assing_age_range)


In [10]:
data.head()

Unnamed: 0,job_title,gender,age,perf_eval,edu,dept,seniority,base_pay,bonus,total_salary,total_salary_range,age_group
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938,52301,"40,000-80,000",0-19
1,Software Engineer,Male,21,5,College,Management,5,108476,11128,119604,"80,001-120,000",20-39
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268,99476,"80,001-120,000",0-19
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154,118234,"80,001-120,000",20-39
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319,108783,"80,001-120,000",20-39


In [11]:
testisng_df = (
    data
    .drop(columns={'age','base_pay','bonus','total_salary'})
)

In [27]:
import pandas as pd
import numpy as np
import itertools

# Datos de entrada
data = {
    "job_title": ["Graphic Designer", "Software Engineer", "Warehouse Associate", "Software Engineer", "Graphic Designer"],
    "gender": ["Female", "Male", "Female", "Male", "Male"],
    "perf_eval": [5, 5, 4, 5, 5],
    "edu": ["College", "College", "PhD", "Masters", "Masters"],
    "dept": ["Operations", "Management", "Administration", "Sales", "Engineering"],
    "seniority": [2, 5, 5, 4, 5],
    "total_salary_range": ["40,000-80,000", "80,001-120,000", "80,001-120,000", "80,001-120,000", "80,001-120,000"],
    "age_group": ["0-19", "20-39", "0-19", "20-39", "20-39"]
}

df = pd.DataFrame(data)

def generate_contrast_sets(data, max_level):
    tree = {}
    columns = list(data.keys())
    classes_level_1 = list(set(data[columns[0]]))
    tree[1] = classes_level_1

    for level in range(2, max_level + 1):
        new_level_classes = []
        if level == 2:
            combinations = list(itertools.product(tree[1], set(data[columns[1]])))
        else:
            combinations = generate_combinations(tree[level - 1], set(data[columns[level - 1]]))

        for combination in combinations:
            new_classes = tuple(combination)
            if not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)

        if new_level_classes:
            tree[level] = new_level_classes
        else:
            break

    return tree

def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            if len(set(new_combination)) == len(new_combination):
                combinations.append(new_combination)
    return combinations

def calculate_support(contrast_sets, df, mindev):
    supports = {}
    for level, classes in contrast_sets.items():
        supports[level] = {}
        for c in classes:
            if type(c) == tuple:
                support = np.mean(df.apply(lambda row: all(item in row.values for item in c), axis=1))
            else:
                support = (df[df.columns[0]] == c).mean()
            supports[level][c] = support
            
    # Seleccionar solo aquellos conjuntos de contraste cuyo soporte sea mayor o igual a mindev
    filtered_supports = {}
    for level, support_dict in supports.items():
        filtered_support_dict = {}
        for c, support in support_dict.items():
            if level + 1 in contrast_sets:  # Si existe el nivel de los hijos
                children_supports = [supports[level + 1][child_c] for child_c in contrast_sets[level + 1]]
                max_diff = np.max(np.abs(np.array(children_supports) - support))  # Calcular la máxima diferencia de soporte con los hijos
                if max_diff >= mindev:  # Si la máxima diferencia de soporte es mayor o igual a mindev
                    filtered_support_dict[c] = support
        if filtered_support_dict:  # Si el diccionario no está vacío
            filtered_supports[level] = filtered_support_dict

    return filtered_supports

max_level = 8
mindev = 0.1

contrast_sets_tree = generate_contrast_sets(data, max_level)
filtered_supports = calculate_support(contrast_sets_tree, df, mindev)

for level, support in filtered_supports.items():
    if support:  # Si el nivel tiene soportes que cumplen con mindev
        print(f'Level {level}: {support}')

Level 1: {'Warehouse Associate': 0.2, 'Graphic Designer': 0.4, 'Software Engineer': 0.4}
Level 2: {('Warehouse Associate', 'Female'): 0.2, ('Warehouse Associate', 'Male'): 0.0, ('Graphic Designer', 'Female'): 0.2, ('Graphic Designer', 'Male'): 0.2, ('Software Engineer', 'Female'): 0.0, ('Software Engineer', 'Male'): 0.4}
Level 3: {('Warehouse Associate', 'Female', 4): 0.2, ('Warehouse Associate', 'Female', 5): 0.2, ('Warehouse Associate', 'Male', 4): 0.0, ('Warehouse Associate', 'Male', 5): 0.0, ('Graphic Designer', 'Female', 4): 0.0, ('Graphic Designer', 'Female', 5): 0.2, ('Graphic Designer', 'Male', 4): 0.0, ('Graphic Designer', 'Male', 5): 0.2, ('Software Engineer', 'Female', 4): 0.0, ('Software Engineer', 'Female', 5): 0.0, ('Software Engineer', 'Male', 4): 0.2, ('Software Engineer', 'Male', 5): 0.4}
Level 4: {('Warehouse Associate', 'Female', 4, 'PhD'): 0.2, ('Warehouse Associate', 'Female', 4, 'College'): 0.0, ('Warehouse Associate', 'Female', 4, 'Masters'): 0.0, ('Warehouse Ass

In [29]:
# Generar tabla de resumen
summary_table = []
for level, support in filtered_supports.items():
    for cand, sup in support.items():
        dev = sup - mindev
        surp = abs(dev)
        summary_table.append([level, cand, dev, surp])

# Convertir a DataFrame de pandas y mostrar
summary_df = pd.DataFrame(summary_table, columns=["Level", "Cand.", "Dev.", "Surp."])

# Agregamos los valores por cada nivel y mostramos los resultados
summary_df_agg = summary_df.groupby('Level').agg({'Cand.': 'count', 'Dev.': 'sum', 'Surp.': 'sum'}).reset_index()
summary_df_agg.loc['total'] = summary_df_agg.sum()
summary_df_agg['Level'].iloc[-1] = 'total'
print(summary_df_agg) 

       Level   Cand.   Dev.  Surp.
0        1.0     3.0    0.7    0.7
1        2.0     6.0    0.4    0.8
2        3.0    12.0    0.2    1.4
3        4.0    36.0   -2.2    3.6
4        5.0   180.0  -16.6   18.0
5        6.0   360.0  -35.0   36.0
6        7.0   720.0  -71.0   72.0
total  total  1317.0 -123.5  132.5


In [32]:
import pandas as pd
import numpy as np
import itertools
from scipy.stats import chisquare

# Datos de entrada
data = {
    "job_title": ["Graphic Designer", "Software Engineer", "Warehouse Associate", "Software Engineer", "Graphic Designer"],
    "gender": ["Female", "Male", "Female", "Male", "Male"],
    "perf_eval": [5, 5, 4, 5, 5],
    "edu": ["College", "College", "PhD", "Masters", "Masters"],
    "dept": ["Operations", "Management", "Administration", "Sales", "Engineering"],
    "seniority": [2, 5, 5, 4, 5],
    "total_salary_range": ["40,000-80,000", "80,001-120,000", "80,001-120,000", "80,001-120,000", "80,001-120,000"],
    "age_group": ["0-19", "20-39", "0-19", "20-39", "20-39"]
}

df = pd.DataFrame(data)

def generate_contrast_sets(data, max_level):
    tree = {}
    columns = list(data.keys())
    classes_level_1 = list(set(data[columns[0]]))
    tree[1] = classes_level_1

    for level in range(2, max_level + 1):
        new_level_classes = []
        if level == 2:
            combinations = list(itertools.product(tree[1], set(data[columns[1]])))
        else:
            combinations = generate_combinations(tree[level - 1], set(data[columns[level - 1]]))

        for combination in combinations:
            new_classes = tuple(combination)
            if not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)

        if new_level_classes:
            tree[level] = new_level_classes
        else:
            break

    return tree

def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            if len(set(new_combination)) == len(new_combination):
                combinations.append(new_combination)
    return combinations

def calculate_support(contrast_sets, df, mindev, alpha=0.05):
    supports = {}
    for level, classes in contrast_sets.items():
        supports[level] = {}
        for c in classes:
            if type(c) == tuple:
                support = np.mean(df.apply(lambda row: all(item in row.values for item in c), axis=1))
            else:
                support = (df[df.columns[0]] == c).mean()
            supports[level][c] = support

    filtered_supports = {}
    for level, support_dict in supports.items():
        filtered_support_dict = {}
        for c, support in support_dict.items():
            if level + 1 in contrast_sets:
                children_supports = [supports[level + 1][child_c] for child_c in contrast_sets[level + 1] if child_c in supports[level + 1]]
                max_diff = np.max(np.abs(np.array(children_supports) - support))

                # Round the observed and expected frequencies
                obs_freq = np.around(children_supports, decimals=6)
                exp_freq = np.around([support]*len(children_supports), decimals=6)

                # Perform chi-square test
                chi2, p = chisquare(obs_freq, f_exp=exp_freq)

                if p <= alpha:
                    filtered_support_dict[c] = support

        if filtered_support_dict:
            filtered_supports[level] = filtered_support_dict

    return filtered_supports

max_level = 8
mindev = 0.1

contrast_sets_tree = generate_contrast_sets(data, max_level)
filtered_supports = calculate_support(contrast_sets_tree, df, mindev)

for level, support in filtered_supports.items():
    if support:
        print(f'Level {level}: {support}')


ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
0.19999999999999996