Preprocesamiento de datos

In [17]:
### importacion de librerias

import pandas as pd
import numpy as np
import itertools
from scipy.stats import chi2_contingency

In [18]:
# carga de datos

try:
    data = pd.read_csv('/datasets/data.csv')
except:
    data = pd.read_csv('datasets/data.csv')

In [19]:
# Configura pandas para mostrar el contenido completo de las filas
pd.set_option('display.max_colwidth', None)

In [20]:
data = data.rename(columns={'jobTitle':'job_title',
                            'perfEval':'perf_eval',
                            'basePay':'base_pay'})

In [21]:
# creamos una columna nueva con el salario total

data['total_salary'] = data['base_pay'] + data['bonus']

In [22]:
# función que calcule la categoría de edad

def assing_age_range(age):
    if age < 0 or pd.isna(age):
        return 'NA'
    elif age < 20:
        return '0-19'
    elif age < 40:
        return '20-39'
    elif age < 60:
        return '40-59'
    elif age >= 60:
        return '60+'


In [23]:
# función que calcule la categoría de ingresos

def total_salary_range(salary):
    if salary < 40000 or pd.isna(salary):
        return '< 40,000'
    elif salary < 80001:
        return '40,000-80,000'
    elif salary < 120001:
        return '80,001-120,000'
    elif salary < 160001:
        return '120,001-160,000'
    elif salary >= 160001:
        return '> 160,000'

In [24]:
data['total_salary_range'] = data['total_salary'].apply(total_salary_range)
data['age_group'] = data['age'].apply(assing_age_range)


In [25]:
data.head()

Unnamed: 0,job_title,gender,age,perf_eval,edu,dept,seniority,base_pay,bonus,total_salary,total_salary_range,age_group
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938,52301,"40,000-80,000",0-19
1,Software Engineer,Male,21,5,College,Management,5,108476,11128,119604,"80,001-120,000",20-39
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268,99476,"80,001-120,000",0-19
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154,118234,"80,001-120,000",20-39
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319,108783,"80,001-120,000",20-39


In [26]:
df = (
    data
    .drop(columns={'age','base_pay','bonus','total_salary'})
)

In [27]:
df.head()

Unnamed: 0,job_title,gender,perf_eval,edu,dept,seniority,total_salary_range,age_group
0,Graphic Designer,Female,5,College,Operations,2,"40,000-80,000",0-19
1,Software Engineer,Male,5,College,Management,5,"80,001-120,000",20-39
2,Warehouse Associate,Female,4,PhD,Administration,5,"80,001-120,000",0-19
3,Software Engineer,Male,5,Masters,Sales,4,"80,001-120,000",20-39
4,Graphic Designer,Male,5,Masters,Engineering,5,"80,001-120,000",20-39


In [28]:


# Función para calcular soporte
def calculate_support(df, c):
    if type(c) == tuple:
        support = np.mean(df.apply(lambda row: all(item in row.values for item in c), axis=1))
    else:
        support = (df[df.columns[0]] == c).mean()
    return support

# Función para calcular chi2 y p-value
def calculate_chi2(df, c):
    contingency_table = pd.crosstab(df[df.columns[0]], df.apply(lambda row: all(item in row.values for item in c), axis=1))
    chi2, p, _, _ = chi2_contingency(contingency_table)
    return chi2, p

# Función para generar conjuntos de contraste
def generate_contrast_sets(df, max_level, mindev, level1_column):
    # Reordenar las columnas para que la columna del nivel 1 esté al inicio
    cols = df.columns.tolist()
    cols.remove(level1_column)
    cols.insert(0, level1_column)
    df = df[cols]

    tree = {}
    supports = {}
    chi2s = {}
    ps = {}
    columns = df.columns
    classes_level_1 = list(df[columns[0]].unique())
    tree[1] = classes_level_1
    supports[1] = {c: calculate_support(df, c) for c in classes_level_1}
    chi2s[1] = {c: calculate_chi2(df, c)[0] for c in classes_level_1}
    ps[1] = {c: calculate_chi2(df, c)[1] for c in classes_level_1}

    for level in range(2, max_level + 1):
        new_level_classes = []
        new_level_supports = {}
        new_level_chi2s = {}
        new_level_ps = {}
        if level == 2:
            combinations = list(itertools.product(tree[1], df[columns[1]].unique()))
        else:
            combinations = generate_combinations(tree[level - 1], df[columns[level - 1]].unique())

        for combination in combinations:
            new_classes = tuple(combination)
            new_support = calculate_support(df, new_classes)
            new_chi2, new_p = calculate_chi2(df, new_classes)
            if new_support >= mindev and not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)
                new_level_supports[new_classes] = new_support
                new_level_chi2s[new_classes] = new_chi2
                new_level_ps[new_classes] = new_p

        if new_level_classes:
            tree[level] = new_level_classes
            supports[level] = new_level_supports
            chi2s[level] = new_level_chi2s
            ps[level] = new_level_ps
        else:
            break

    return tree, supports, chi2s, ps

# Función para generar combinaciones
def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            if len(set(new_combination)) == len(new_combination):
                combinations.append(new_combination)
    return combinations

# Función para reestructurar soportes, chi2s y ps
def restructure(supports, chi2s, ps):
    restructured = []
    for level, support in supports.items():
        for classes, value in support.items():
            contrast_set = ", ".join(classes) if isinstance(classes, tuple) else classes
            restructured.append({
                'Contrast Set': contrast_set,
                'Support': value,
                'Chi2': chi2s[level][classes],
                'P-value': ps[level][classes]
            })
    return restructured

# Configurar variables
max_level = 8
mindev = 0.05




In [29]:
df.head()

Unnamed: 0,job_title,gender,perf_eval,edu,dept,seniority,total_salary_range,age_group
0,Graphic Designer,Female,5,College,Operations,2,"40,000-80,000",0-19
1,Software Engineer,Male,5,College,Management,5,"80,001-120,000",20-39
2,Warehouse Associate,Female,4,PhD,Administration,5,"80,001-120,000",0-19
3,Software Engineer,Male,5,Masters,Sales,4,"80,001-120,000",20-39
4,Graphic Designer,Male,5,Masters,Engineering,5,"80,001-120,000",20-39


In [30]:
# Indicar la columna para el nivel 1
level1_column = 'job_title'  

# Generar conjuntos de contraste y calcular soportes, chi2s y ps
contrast_sets_tree, supports, chi2s, ps = generate_contrast_sets(df, max_level, mindev, level1_column)

# Reestructurar los soportes, chi2s y ps y convertirlos a un DataFrame de pandas
restructured = restructure(supports, chi2s, ps)
df_restructured = pd.DataFrame(restructured)

df_restructured

Unnamed: 0,Contrast Set,Support,Chi2,P-value
0,Graphic Designer,0.098,0.0,1.0
1,Software Engineer,0.109,0.0,1.0
2,Warehouse Associate,0.09,0.0,1.0
3,IT,0.096,0.0,1.0
4,Sales Associate,0.094,0.0,1.0
5,Driver,0.091,0.0,1.0
6,Financial Analyst,0.107,0.0,1.0
7,Marketing Associate,0.118,0.0,1.0
8,Data Scientist,0.107,0.0,1.0
9,Manager,0.09,0.0,1.0


In [31]:
# Indicar la columna para el nivel 1
level1_column = 'gender'  

# Generar conjuntos de contraste y calcular soportes, chi2s y ps
contrast_sets_tree, supports, chi2s, ps = generate_contrast_sets(df, max_level, mindev, level1_column)

# Reestructurar los soportes, chi2s y ps y convertirlos a un DataFrame de pandas
restructured = restructure(supports, chi2s, ps)
df_restructured = pd.DataFrame(restructured)

df_restructured

Unnamed: 0,Contrast Set,Support,Chi2,P-value
0,Female,0.468,0.0,1.0
1,Male,0.532,0.0,1.0
2,"Female, IT",0.05,57.600986,3.210646e-14
3,"Female, Marketing Associate",0.107,133.824312,5.969056e-31
4,"Female, Data Scientist",0.053,61.383376,4.697603e-15
5,"Male, Graphic Designer",0.05,44.342469,2.756705e-11
6,"Male, Software Engineer",0.101,96.751798,7.858619000000001e-23
7,"Male, Sales Associate",0.051,45.315754,1.676947e-11
8,"Male, Financial Analyst",0.058,52.187016,5.045866e-13
9,"Male, Data Scientist",0.054,48.24805,3.755716e-12


In [32]:
# Indicar la columna para el nivel 1
level1_column = 'edu'  

# Generar conjuntos de contraste y calcular soportes, chi2s y ps
contrast_sets_tree, supports, chi2s, ps = generate_contrast_sets(df, max_level, mindev, level1_column)

# Reestructurar los soportes, chi2s y ps y convertirlos a un DataFrame de pandas
restructured = restructure(supports, chi2s, ps)
df_restructured = pd.DataFrame(restructured)

df_restructured

Unnamed: 0,Contrast Set,Support,Chi2,P-value
0,College,0.241,0.0,1.0
1,PhD,0.238,0.0,1.0
2,Masters,0.256,0.0,1.0
3,High School,0.265,0.0,1.0


In [33]:
# Indicar la columna para el nivel 1
level1_column = 'dept'  

# Generar conjuntos de contraste y calcular soportes, chi2s y ps
contrast_sets_tree, supports, chi2s, ps = generate_contrast_sets(df, max_level, mindev, level1_column)

# Reestructurar los soportes, chi2s y ps y convertirlos a un DataFrame de pandas
restructured = restructure(supports, chi2s, ps)
df_restructured = pd.DataFrame(restructured)

df_restructured

Unnamed: 0,Contrast Set,Support,Chi2,P-value
0,Operations,0.21,0.0,1.0
1,Management,0.198,0.0,1.0
2,Administration,0.193,0.0,1.0
3,Sales,0.207,0.0,1.0
4,Engineering,0.192,0.0,1.0


In [34]:
# Indicar la columna para el nivel 1
level1_column = 'total_salary_range'  

# Generar conjuntos de contraste y calcular soportes, chi2s y ps
contrast_sets_tree, supports, chi2s, ps = generate_contrast_sets(df, max_level, mindev, level1_column)

# Reestructurar los soportes, chi2s y ps y convertirlos a un DataFrame de pandas
restructured = restructure(supports, chi2s, ps)
df_restructured = pd.DataFrame(restructured)

df_restructured

Unnamed: 0,Contrast Set,Support,Chi2,P-value
0,"40,000-80,000",0.206,0.0,1.0
1,"80,001-120,000",0.575,0.0,1.0
2,"120,001-160,000",0.203,0.0,1.0
3,"> 160,000",0.016,0.0,1.0
4,"40,000-80,000, Marketing Associate",0.052,211.421081,1.435654e-45
5,"80,001-120,000, Graphic Designer",0.06,47.178538,3.184578e-10
6,"80,001-120,000, Software Engineer",0.058,45.509093,7.212103e-10
7,"80,001-120,000, Warehouse Associate",0.057,44.677025,1.083685e-09
8,"80,001-120,000, IT",0.061,48.015928,2.112926e-10
9,"80,001-120,000, Sales Associate",0.058,45.509093,7.212103e-10


In [35]:
# Indicar la columna para el nivel 1
level1_column = 'age_group'  

# Generar conjuntos de contraste y calcular soportes, chi2s y ps
contrast_sets_tree, supports, chi2s, ps = generate_contrast_sets(df, max_level, mindev, level1_column)

# Reestructurar los soportes, chi2s y ps y convertirlos a un DataFrame de pandas
restructured = restructure(supports, chi2s, ps)
df_restructured = pd.DataFrame(restructured)

df_restructured

Unnamed: 0,Contrast Set,Support,Chi2,P-value
0,0-19,0.049,0.0,1.0
1,20-39,0.415,0.0,1.0
2,40-59,0.407,0.0,1.0
3,60+,0.129,0.0,1.0
4,"20-39, Marketing Associate",0.051,75.755075,2.496111e-16
5,"40-59, Marketing Associate",0.052,79.919966,3.193032e-17
