Preprocesamiento de datos

In [1]:
### importacion de librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import itertools

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# carga de datos

try:
    data = pd.read_csv('/datasets/data.csv')
except:
    data = pd.read_csv('datasets/data.csv')

In [3]:
# Configura pandas para mostrar el contenido completo de las filas
pd.set_option('display.max_colwidth', None)

In [4]:
data = data.rename(columns={'jobTitle':'job_title',
                            'perfEval':'perf_eval',
                            'basePay':'base_pay'})

In [5]:
# creamos una columna nueva con el salario total

data['total_salary'] = data['base_pay'] + data['bonus']

In [6]:
# función que calcule la categoría de edad

def assing_age_range(age):
    if age < 0 or pd.isna(age):
        return 'NA'
    elif age < 20:
        return '0-19'
    elif age < 40:
        return '20-39'
    elif age < 60:
        return '40-59'
    elif age >= 60:
        return '60+'


In [7]:
# función que calcule la categoría de ingresos

def total_salary_range(salary):
    if salary < 40000 or pd.isna(salary):
        return '< 40,000'
    elif salary < 80001:
        return '40,000-80,000'
    elif salary < 120001:
        return '80,001-120,000'
    elif salary < 160001:
        return '120,001-160,000'
    elif salary >= 160001:
        return '> 160,000'

In [8]:
data['total_salary_range'] = data['total_salary'].apply(total_salary_range)
data['age_group'] = data['age'].apply(assing_age_range)


In [9]:
data.head()

Unnamed: 0,job_title,gender,age,perf_eval,edu,dept,seniority,base_pay,bonus,total_salary,total_salary_range,age_group
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938,52301,"40,000-80,000",0-19
1,Software Engineer,Male,21,5,College,Management,5,108476,11128,119604,"80,001-120,000",20-39
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268,99476,"80,001-120,000",0-19
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154,118234,"80,001-120,000",20-39
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319,108783,"80,001-120,000",20-39


In [10]:
testisng_df = (
    data
    .drop(columns={'age','base_pay','bonus','total_salary'})
)

In [11]:
testisng_df.head()

Unnamed: 0,job_title,gender,perf_eval,edu,dept,seniority,total_salary_range,age_group
0,Graphic Designer,Female,5,College,Operations,2,"40,000-80,000",0-19
1,Software Engineer,Male,5,College,Management,5,"80,001-120,000",20-39
2,Warehouse Associate,Female,4,PhD,Administration,5,"80,001-120,000",0-19
3,Software Engineer,Male,5,Masters,Sales,4,"80,001-120,000",20-39
4,Graphic Designer,Male,5,Masters,Engineering,5,"80,001-120,000",20-39


In [12]:

df = pd.DataFrame(testisng_df.head(100))

def generate_contrast_sets(data, max_level):
    tree = {}
    columns = list(data.keys())
    classes_level_1 = list(set(data[columns[0]]))
    tree[1] = classes_level_1

    for level in range(2, max_level + 1):
        new_level_classes = []
        if level == 2:
            combinations = list(itertools.product(tree[1], set(data[columns[1]])))
        else:
            combinations = generate_combinations(tree[level - 1], set(data[columns[level - 1]]))

        for combination in combinations:
            new_classes = tuple(combination)
            if not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)

        if new_level_classes:
            tree[level] = new_level_classes
        else:
            break

    return tree

def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            if len(set(new_combination)) == len(new_combination):
                combinations.append(new_combination)
    return combinations

def calculate_support(contrast_sets, df, mindev):
    supports = {}
    for level, classes in contrast_sets.items():
        supports[level] = {}
        for c in classes:
            if type(c) == tuple:
                support = np.mean(df.apply(lambda row: all(item in row.values for item in c), axis=1))
            else:
                support = (df[df.columns[0]] == c).mean()
            supports[level][c] = support
            
    # Seleccionar solo aquellos conjuntos de contraste cuyo soporte sea mayor o igual a mindev
    filtered_supports = {}
    for level, support_dict in supports.items():
        filtered_support_dict = {}
        for c, support in support_dict.items():
            if level + 1 in contrast_sets:  # Si existe el nivel de los hijos
                children_supports = [supports[level + 1][child_c] for child_c in contrast_sets[level + 1]]
                max_diff = np.max(np.abs(np.array(children_supports) - support))  # Calcular la máxima diferencia de soporte 
                if max_diff >= mindev:  # Si la máxima diferencia de soporte es mayor o igual a mindev
                    filtered_support_dict[c] = support
        if filtered_support_dict:  # Si el diccionario no está vacío
            filtered_supports[level] = filtered_support_dict

    return filtered_supports

max_level = 8
mindev = 0.1

contrast_sets_tree = generate_contrast_sets(df, max_level)
filtered_supports = calculate_support(contrast_sets_tree, df, mindev)

for level, support in filtered_supports.items():
    if support:  # Si el nivel tiene soportes que cumplen con mindev
        print(f'Level {level}: {support}')

Level 1: {'Manager': 0.03, 'Data Scientist': 0.14, 'Sales Associate': 0.07, 'Software Engineer': 0.19, 'Graphic Designer': 0.12, 'Driver': 0.05, 'IT': 0.07, 'Warehouse Associate': 0.17}
Level 2: {('Manager', 'Male'): 0.03, ('Manager', 'Female'): 0.0, ('Financial Analyst', 'Male'): 0.03, ('Financial Analyst', 'Female'): 0.05, ('Data Scientist', 'Male'): 0.05, ('Sales Associate', 'Male'): 0.04, ('Sales Associate', 'Female'): 0.03, ('Software Engineer', 'Male'): 0.18, ('Software Engineer', 'Female'): 0.01, ('Graphic Designer', 'Male'): 0.06, ('Graphic Designer', 'Female'): 0.06, ('Driver', 'Male'): 0.01, ('Driver', 'Female'): 0.04, ('Marketing Associate', 'Male'): 0.01, ('Marketing Associate', 'Female'): 0.07, ('IT', 'Male'): 0.02, ('IT', 'Female'): 0.05, ('Warehouse Associate', 'Male'): 0.07, ('Warehouse Associate', 'Female'): 0.1}
Level 3: {('Software Engineer', 'Male', 5): 0.17}


In [13]:
filtered_supports

{1: {'Manager': 0.03,
  'Data Scientist': 0.14,
  'Sales Associate': 0.07,
  'Software Engineer': 0.19,
  'Graphic Designer': 0.12,
  'Driver': 0.05,
  'IT': 0.07,
  'Warehouse Associate': 0.17},
 2: {('Manager', 'Male'): 0.03,
  ('Manager', 'Female'): 0.0,
  ('Financial Analyst', 'Male'): 0.03,
  ('Financial Analyst', 'Female'): 0.05,
  ('Data Scientist', 'Male'): 0.05,
  ('Sales Associate', 'Male'): 0.04,
  ('Sales Associate', 'Female'): 0.03,
  ('Software Engineer', 'Male'): 0.18,
  ('Software Engineer', 'Female'): 0.01,
  ('Graphic Designer', 'Male'): 0.06,
  ('Graphic Designer', 'Female'): 0.06,
  ('Driver', 'Male'): 0.01,
  ('Driver', 'Female'): 0.04,
  ('Marketing Associate', 'Male'): 0.01,
  ('Marketing Associate', 'Female'): 0.07,
  ('IT', 'Male'): 0.02,
  ('IT', 'Female'): 0.05,
  ('Warehouse Associate', 'Male'): 0.07,
  ('Warehouse Associate', 'Female'): 0.1},
 3: {('Software Engineer', 'Male', 5): 0.17}}

In [14]:
# Generar tabla de resumen
summary_table = []
for level, support in filtered_supports.items():
    for cand, sup in support.items():
        dev = sup - mindev
        surp = abs(dev)
        summary_table.append([level, cand, dev, surp])

# Convertir a DataFrame de pandas y mostrar
summary_df = pd.DataFrame(summary_table, columns=["Level", "Cand.", "Dev.", "Surp."])

# Agregamos los valores por cada nivel y mostramos los resultados
summary_df_agg = summary_df.groupby('Level').agg({'Cand.': 'count', 'Dev.': 'sum', 'Surp.': 'sum'}).reset_index()
summary_df_agg.loc['total'] = summary_df_agg.sum()
summary_df_agg['Level'].iloc[-1] = 'total'
print(summary_df_agg) 

       Level  Cand.  Dev.  Surp.
0        1.0    8.0  0.04   0.40
1        2.0   19.0 -0.99   1.15
2        3.0    1.0  0.07   0.07
total  total   28.0 -0.88   1.62
