Preprocesamiento de datos

In [1]:
### importacion de librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import itertools

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# carga de datos

try:
    data = pd.read_csv('/datasets/data.csv')
except:
    data = pd.read_csv('datasets/data.csv')

In [3]:
# Configura pandas para mostrar el contenido completo de las filas
pd.set_option('display.max_colwidth', None)

In [4]:
data = data.rename(columns={'jobTitle':'job_title',
                            'perfEval':'perf_eval',
                            'basePay':'base_pay'})

In [5]:
# creamos una columna nueva con el salario total

data['total_salary'] = data['base_pay'] + data['bonus']

In [6]:
# función que calcule la categoría de edad

def assing_age_range(age):
    if age < 0 or pd.isna(age):
        return 'NA'
    elif age < 20:
        return '0-19'
    elif age < 40:
        return '20-39'
    elif age < 60:
        return '40-59'
    elif age >= 60:
        return '60+'


In [7]:
# función que calcule la categoría de ingresos

def total_salary_range(salary):
    if salary < 40000 or pd.isna(salary):
        return '< 40,000'
    elif salary < 80001:
        return '40,000-80,000'
    elif salary < 120001:
        return '80,001-120,000'
    elif salary < 160001:
        return '120,001-160,000'
    elif salary >= 160001:
        return '> 160,000'

In [8]:
data['total_salary_range'] = data['total_salary'].apply(total_salary_range)
data['age_group'] = data['age'].apply(assing_age_range)


In [9]:
data.head()

Unnamed: 0,job_title,gender,age,perf_eval,edu,dept,seniority,base_pay,bonus,total_salary,total_salary_range,age_group
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938,52301,"40,000-80,000",0-19
1,Software Engineer,Male,21,5,College,Management,5,108476,11128,119604,"80,001-120,000",20-39
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268,99476,"80,001-120,000",0-19
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154,118234,"80,001-120,000",20-39
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319,108783,"80,001-120,000",20-39


In [10]:
testisng_df = (
    data
    .drop(columns={'age','base_pay','bonus','total_salary'})
)

In [11]:
testisng_df.head()

Unnamed: 0,job_title,gender,perf_eval,edu,dept,seniority,total_salary_range,age_group
0,Graphic Designer,Female,5,College,Operations,2,"40,000-80,000",0-19
1,Software Engineer,Male,5,College,Management,5,"80,001-120,000",20-39
2,Warehouse Associate,Female,4,PhD,Administration,5,"80,001-120,000",0-19
3,Software Engineer,Male,5,Masters,Sales,4,"80,001-120,000",20-39
4,Graphic Designer,Male,5,Masters,Engineering,5,"80,001-120,000",20-39


In [17]:
data = {
    "job_title": ["Graphic Designer", "Software Engineer", "Warehouse Associate", "Software Engineer", "Graphic Designer"],
    "gender": ["Female", "Male", "Female", "Male", "Male"],
    "perf_eval": [5, 5, 4, 5, 5],
    "edu": ["College", "College", "PhD", "Masters", "Masters"],
    "dept": ["Operations", "Management", "Administration", "Sales", "Engineering"],
    "seniority": [2, 5, 5, 4, 5],
    "total_salary_range": ["40,000-80,000", "80,001-120,000", "80,001-120,000", "80,001-120,000", "80,001-120,000"],
    "age_group": ["0-19", "20-39", "0-19", "20-39", "20-39"]
}


def generate_contrast_sets(data, max_level):
    tree = {}

    columns = list(data.keys())

    classes_level_1 = list(set(data[columns[0]]))
    tree[1] = classes_level_1

    for level in range(2, max_level + 1):
        new_level_classes = []
        if level == 2:
            combinations = list(itertools.product(tree[1], set(data[columns[1]])))
        else:
            combinations = generate_combinations(tree[level - 1], set(data[columns[level - 1]]))

        for combination in combinations:
            new_classes = tuple(combination)
            if not any(set(c) == set(new_classes) for c in tree.values()):
                new_level_classes.append(new_classes)

        if new_level_classes:
            tree[level] = new_level_classes
        else:
            break

    return tree

def generate_combinations(previous_level, column_data):
    combinations = []
    for prev_class in previous_level:
        for column_value in column_data:
            new_combination = prev_class + (column_value,)
            if len(set(new_combination)) == len(new_combination):
                combinations.append(new_combination)
    return combinations



max_level = 8

contrast_sets_tree = generate_contrast_sets(data, max_level)

for level, classes in contrast_sets_tree.items():
    print(f'Level {level}: {classes}')

Level 1: ['Graphic Designer', 'Warehouse Associate', 'Software Engineer']
Level 2: [('Graphic Designer', 'Female'), ('Graphic Designer', 'Male'), ('Warehouse Associate', 'Female'), ('Warehouse Associate', 'Male'), ('Software Engineer', 'Female'), ('Software Engineer', 'Male')]
Level 3: [('Graphic Designer', 'Female', 4), ('Graphic Designer', 'Female', 5), ('Graphic Designer', 'Male', 4), ('Graphic Designer', 'Male', 5), ('Warehouse Associate', 'Female', 4), ('Warehouse Associate', 'Female', 5), ('Warehouse Associate', 'Male', 4), ('Warehouse Associate', 'Male', 5), ('Software Engineer', 'Female', 4), ('Software Engineer', 'Female', 5), ('Software Engineer', 'Male', 4), ('Software Engineer', 'Male', 5)]
Level 4: [('Graphic Designer', 'Female', 4, 'Masters'), ('Graphic Designer', 'Female', 4, 'PhD'), ('Graphic Designer', 'Female', 4, 'College'), ('Graphic Designer', 'Female', 5, 'Masters'), ('Graphic Designer', 'Female', 5, 'PhD'), ('Graphic Designer', 'Female', 5, 'College'), ('Graphic 