Preprocesamiento de datos

In [1]:
### importacion de librerias

import pandas as pd

import matplotlib.pyplot as plt

import itertools

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# carga de datos

try:
    data = pd.read_csv('/datasets/data.csv')
except:
    data = pd.read_csv('datasets/data.csv')

In [3]:
data = data.rename(columns={'jobTitle':'job_title',
                            'perfEval':'perf_eval',
                            'basePay':'base_pay'})

In [4]:
# creamos una columna nueva con el salario total

data['total_salary'] = data['base_pay'] + data['bonus']

In [5]:
# función que calcule la categoría de edad

def assing_age_range(age):
    if age < 0 or pd.isna(age):
        return 'NA'
    elif age < 20:
        return '0-19'
    elif age < 40:
        return '20-39'
    elif age < 60:
        return '40-59'
    elif age >= 60:
        return '60+'


In [6]:
# función que calcule la categoría de ingresos

def total_salary_range(salary):
    if salary < 40000 or pd.isna(salary):
        return '< 40,000'
    elif salary < 80001:
        return '40,000-80,000'
    elif salary < 120001:
        return '80,001-120,000'
    elif salary < 160001:
        return '120,001-160,000'
    elif salary >= 160001:
        return '> 160,000'

In [7]:
data['t_salary_rg'] = data['total_salary'].apply(total_salary_range)
data['age_group'] = data['age'].apply(assing_age_range)


In [8]:
data.head()

Unnamed: 0,job_title,gender,age,perf_eval,edu,dept,seniority,base_pay,bonus,total_salary,t_salary_rg,age_group
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938,52301,"40,000-80,000",0-19
1,Software Engineer,Male,21,5,College,Management,5,108476,11128,119604,"80,001-120,000",20-39
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268,99476,"80,001-120,000",0-19
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154,118234,"80,001-120,000",20-39
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319,108783,"80,001-120,000",20-39


In [9]:
testisng_df = (
    data
    .drop(columns={'age','base_pay','bonus','total_salary'})
)

In [10]:

# Crea una lista de todas las combinaciones posibles de dos columnas
column_pairs = list(itertools.combinations(testisng_df.columns, 2))

# Crea una lista de conjuntos de contraste con pares de clave:valor
contrast_sets = []
for pair in column_pairs:
    values = zip(testisng_df[pair[0]], testisng_df[pair[1]])
    for value in values:
        contrast_set = dict(zip(pair, value))
        contrast_sets.append(contrast_set)

# Imprime los conjuntos de contraste resultantes
print("Conjuntos de contraste:")
for s in contrast_sets:
    print(s)



Conjuntos de contraste:
{'job_title': 'Graphic Designer', 'gender': 'Female'}
{'job_title': 'Software Engineer', 'gender': 'Male'}
{'job_title': 'Warehouse Associate', 'gender': 'Female'}
{'job_title': 'Software Engineer', 'gender': 'Male'}
{'job_title': 'Graphic Designer', 'gender': 'Male'}
{'job_title': 'IT', 'gender': 'Female'}
{'job_title': 'Graphic Designer', 'gender': 'Female'}
{'job_title': 'Software Engineer', 'gender': 'Male'}
{'job_title': 'Graphic Designer', 'gender': 'Female'}
{'job_title': 'Sales Associate', 'gender': 'Female'}
{'job_title': 'Graphic Designer', 'gender': 'Male'}
{'job_title': 'Driver', 'gender': 'Female'}
{'job_title': 'Financial Analyst', 'gender': 'Female'}
{'job_title': 'Warehouse Associate', 'gender': 'Female'}
{'job_title': 'Warehouse Associate', 'gender': 'Female'}
{'job_title': 'Marketing Associate', 'gender': 'Female'}
{'job_title': 'Financial Analyst', 'gender': 'Female'}
{'job_title': 'Warehouse Associate', 'gender': 'Female'}
{'job_title': 'Sale

In [11]:
import itertools
import pandas as pd
from scipy.stats import chi2_contingency

# Crea una lista de todas las combinaciones posibles de dos columnas
column_pairs = list(itertools.combinations(testisng_df.columns, 2))

# Crea una lista de conjuntos de contraste con pares de clave:valor
contrast_sets = []
probabilities = []
chi2_values = []
p_values = []

for pair in column_pairs:
    values = zip(testisng_df[pair[0]], testisng_df[pair[1]])
    for value in values:
        contrast_set = dict(zip(pair, value))
        contrast_sets.append(contrast_set)
        
        data = pd.DataFrame(contrast_set, index=[0])
        contingency_table = pd.crosstab(testisng_df[pair[0]], testisng_df[pair[1]])
        chi2, p_value, _, _ = chi2_contingency(contingency_table)
        
        probabilities.append(1 / len(contrast_sets))
        chi2_values.append(chi2)
        p_values.append(p_value)

# Crea un DataFrame con los conjuntos de contraste, probabilidades, chi2 y p-values
result_df_1 = pd.DataFrame({
    'Set de contraste': contrast_sets,
    'Probabilidad': probabilities,
    'Chi2': chi2_values,
    'p-value': p_values
})

# Imprime el DataFrame resultante
print(result_df_1)


                                        Set de contraste  Probabilidad  \
0      {'job_title': 'Graphic Designer', 'gender': 'F...      1.000000   
1      {'job_title': 'Software Engineer', 'gender': '...      0.500000   
2      {'job_title': 'Warehouse Associate', 'gender':...      0.333333   
3      {'job_title': 'Software Engineer', 'gender': '...      0.250000   
4      {'job_title': 'Graphic Designer', 'gender': 'M...      0.200000   
...                                                  ...           ...   
27995  {'t_salary_rg': '40,000-80,000', 'age_group': ...      0.000036   
27996  {'t_salary_rg': '80,001-120,000', 'age_group':...      0.000036   
27997  {'t_salary_rg': '80,001-120,000', 'age_group':...      0.000036   
27998  {'t_salary_rg': '80,001-120,000', 'age_group':...      0.000036   
27999  {'t_salary_rg': '120,001-160,000', 'age_group'...      0.000036   

             Chi2       p-value  
0      188.235454  9.631049e-36  
1      188.235454  9.631049e-36  
2      18

In [12]:
import itertools
import pandas as pd
from scipy.stats import chi2_contingency

# Crea una lista de todas las combinaciones posibles de dos columnas
column_pairs = list(itertools.combinations(testisng_df.columns, 2))

# Crea una lista de conjuntos de contraste con pares de clave:valor
contrast_sets = []
probabilities_male = []  # Probabilidades para hombres
probabilities_female = []  # Probabilidades para mujeres
chi2_values = []
p_values = []

variable_to_compare = 'gender'  # Reemplaza 'gender' con el nombre de la variable que deseas comparar

for pair in column_pairs:
    if variable_to_compare not in pair:
        continue  # Salta la iteración si la variable a comparar no está presente en el par
        
    values = zip(testisng_df[pair[0]], testisng_df[pair[1]])
    for value in values:
        contrast_set = dict(zip(pair, value))
        contrast_sets.append(contrast_set)
        
        data = pd.DataFrame(contrast_set, index=[0])
        contingency_table = pd.crosstab(data[variable_to_compare], columns=[data.columns[0]])
        chi2, p_value, _, _ = chi2_contingency(contingency_table)
        
        # Divide las probabilidades según el valor de la variable a comparar
        if data[variable_to_compare].iloc[0] == 'Male':
            probabilities_male.append(1 / len(contrast_sets))
            probabilities_female.append(0)
        else:
            probabilities_male.append(0)
            probabilities_female.append(1 / len(contrast_sets))
        
        chi2_values.append(chi2)
        p_values.append(p_value)

# Crea un DataFrame con los conjuntos de contraste, probabilidades, chi2 y p-values
result_df = pd.DataFrame({
    'Set de contraste': contrast_sets,
    'Probabilidad (Male)': probabilities_male,
    'Probabilidad (Female)': probabilities_female,
    'Chi2': chi2_values,
    'p-value': p_values
})

# Imprime el DataFrame resultante
print(result_df)


                                       Set de contraste  Probabilidad (Male)  \
0     {'job_title': 'Graphic Designer', 'gender': 'F...             0.000000   
1     {'job_title': 'Software Engineer', 'gender': '...             0.500000   
2     {'job_title': 'Warehouse Associate', 'gender':...             0.000000   
3     {'job_title': 'Software Engineer', 'gender': '...             0.250000   
4     {'job_title': 'Graphic Designer', 'gender': 'M...             0.200000   
...                                                 ...                  ...   
6995           {'gender': 'Female', 'age_group': '60+'}             0.000000   
6996           {'gender': 'Male', 'age_group': '40-59'}             0.000143   
6997           {'gender': 'Male', 'age_group': '40-59'}             0.000143   
6998             {'gender': 'Male', 'age_group': '60+'}             0.000143   
6999             {'gender': 'Male', 'age_group': '60+'}             0.000143   

      Probabilidad (Female)  Chi2  p-va

In [13]:

pd.set_option('display.max_colwidth', None)  # Configura para mostrar todos los caracteres en una columna

result_df.head()

Unnamed: 0,Set de contraste,Probabilidad (Male),Probabilidad (Female),Chi2,p-value
0,"{'job_title': 'Graphic Designer', 'gender': 'Female'}",0.0,1.0,0.0,1.0
1,"{'job_title': 'Software Engineer', 'gender': 'Male'}",0.5,0.0,0.0,1.0
2,"{'job_title': 'Warehouse Associate', 'gender': 'Female'}",0.0,0.333333,0.0,1.0
3,"{'job_title': 'Software Engineer', 'gender': 'Male'}",0.25,0.0,0.0,1.0
4,"{'job_title': 'Graphic Designer', 'gender': 'Male'}",0.2,0.0,0.0,1.0
