In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

---

# Columnas Categoricas

In [2]:
df = pd.read_csv("../datos/hr_raw_data_final.csv", index_col = 0)

In [3]:
df.dtypes

age                          object
attrition                    object
businesstravel               object
dailyrate                   float64
department                   object
distancefromhome              int64
education                     int64
educationfield               object
employeecount                 int64
employeenumber                int64
environmentsatisfaction       int64
gender                        int64
hourlyrate                  float64
jobinvolvement                int64
joblevel                      int64
jobrole                      object
jobsatisfaction               int64
maritalstatus                object
monthlyincome                object
monthlyrate                  object
numcompaniesworked            int64
over18                       object
overtime                     object
percentsalaryhike             int64
performancerating            object
relationshipsatisfaction      int64
standardhours                object
stockoptionlevel            

In [4]:
def buscar_columnas_categoricas(df):
    """
    Busca y devuelve las columnas de tipo categórico en un DataFrame.

    Parámetros:
    - df (pd.DataFrame): El DataFrame en el que se buscarán columnas categóricas.

    Retorna:
    - list: Lista de nombres de columnas que son de tipo categórico.
    """
    # Verifica si las columnas son del tipo 'category' o de tipo 'object' (usualmente usadas para categóricos)
    columnas_categoricas = df.select_dtypes(include=['O']).columns.tolist()
    return columnas_categoricas


In [5]:
print(f"Columnas categoricas: {buscar_columnas_categoricas(df)}")

Columnas categoricas: ['age', 'attrition', 'businesstravel', 'department', 'educationfield', 'jobrole', 'maritalstatus', 'monthlyincome', 'monthlyrate', 'over18', 'overtime', 'performancerating', 'standardhours', 'totalworkingyears', 'worklifebalance', 'yearsincurrentrole', 'sameasmonthlyincome', 'salary', 'roledepartament', 'remotework']


---

# Transformación de los NAN

In [6]:
def transformar_nan_sql(df, columna=None, valor_reemplazo='NULL'):
    """
    Reemplaza los valores NaN en una columna específica o en todas las columnas 
    de un DataFrame con un valor compatible con SQL.

    Parámetros:
    - df (pd.DataFrame): El DataFrame en el que se reemplazarán los NaN.
    - columna (str, opcional): El nombre de la columna donde se reemplazarán los NaN. 
                               Si es None, reemplazará en todas las columnas.
    - valor_reemplazo: El valor con el que se reemplazarán los NaN. Ejemplo: 'NULL', '', 0.

    Retorna:
    - pd.DataFrame: El DataFrame con los NaN reemplazados.
    """
    if columna:
        # Si se especifica una columna, verifica que exista
        if columna not in df.columns:
            raise ValueError(f"La columna '{columna}' no existe en el DataFrame.")
        # Reemplaza los NaN en la columna específica
        df[columna] = df[columna].replace({np.nan: valor_reemplazo})
    else:
        # Reemplaza los NaN en todas las columnas
        df = df.replace({np.nan: valor_reemplazo})
    
    return df

In [7]:
# Reemplazar los NaN en una columna específica
transformar_nan_sql(df, columna="department", valor_reemplazo='NULL')

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340,5,30,33,,11,9,,1971,"199990,00$",,,1
2,42,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,"41669,33$",1,,No,11,30,4,,0,220,3,,22,,11,15,,1981,"192320,00$",ManaGER - Research & Development,,1
3,47,No,travel_rarely,1771.404762,,2,4,Medical,1,4,1,1,,3,4,ReseArCH DIrECtOr,3,Married,"14307,50$","37199,50$",3,Y,,19,30,2,Full Time,2,,2,,20,,5,6,"14307,50$",1976,"171690,00$",,,False
4,46,No,,1582.771346,,3,3,Technical Degree,1,5,1,1,,4,4,sAleS EXECUtIve,1,Divorced,"12783,92$","33238,20$",2,Y,No,12,30,4,,1,,5,30,19,,2,8,"12783,92$",1977,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,43,No,,488.944444,,-26,3,Medical,1,824,2,1,,4,1,rESEaRcH SciEnTiST,3,Single,"3949,17$","10267,83$",4,,,12,30,4,,0,,2,30,3,,1,2,"3949,17$",1980,,,,Yes
1674,47,No,,1973.984127,,26,4,,1,1087,4,1,,3,5,mANager,3,Married,"15943,72$","41453,67$",3,Y,No,11,30,3,Full Time,1,270,2,30,5,,1,0,"15943,72$",1976,"191324,62$",,,False
1675,29,No,travel_rarely,290.035510,,15,3,,1,528,3,0,,3,1,reSearch sCienTiSt,4,,,"6090,75$",1,,No,19,30,1,Part Time,0,60,1,30,6,,1,5,,1994,"28111,13$",,,False
1676,47,No,travel_rarely,1032.487286,,4,3,Life Sciences,1,76,3,1,,2,3,maNufACTURING DIREctOr,2,Divorced,"8339,32$","21682,23$",8,,Yes,12,,3,Part Time,1,,4,30,22,,14,10,"8339,32$",1976,"100071,84$",,,Yes


In [8]:
# Reemplazar los NaN en todas las columnas
transformar_nan_sql(df, valor_reemplazo='NULL')

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340,5,30,33,,11,9,,1971,"199990,00$",,,1
2,42,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,"41669,33$",1,,No,11,30,4,,0,220,3,,22,,11,15,,1981,"192320,00$",ManaGER - Research & Development,,1
3,47,No,travel_rarely,1771.404762,,2,4,Medical,1,4,1,1,,3,4,ReseArCH DIrECtOr,3,Married,"14307,50$","37199,50$",3,Y,,19,30,2,Full Time,2,,2,,20,,5,6,"14307,50$",1976,"171690,00$",,,False
4,46,No,,1582.771346,,3,3,Technical Degree,1,5,1,1,,4,4,sAleS EXECUtIve,1,Divorced,"12783,92$","33238,20$",2,Y,No,12,30,4,,1,,5,30,19,,2,8,"12783,92$",1977,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673,43,No,,488.944444,,-26,3,Medical,1,824,2,1,,4,1,rESEaRcH SciEnTiST,3,Single,"3949,17$","10267,83$",4,,,12,30,4,,0,,2,30,3,,1,2,"3949,17$",1980,,,,Yes
1674,47,No,,1973.984127,,26,4,,1,1087,4,1,,3,5,mANager,3,Married,"15943,72$","41453,67$",3,Y,No,11,30,3,Full Time,1,270,2,30,5,,1,0,"15943,72$",1976,"191324,62$",,,False
1675,29,No,travel_rarely,290.035510,,15,3,,1,528,3,0,,3,1,reSearch sCienTiSt,4,,,"6090,75$",1,,No,19,30,1,Part Time,0,60,1,30,6,,1,5,,1994,"28111,13$",,,False
1676,47,No,travel_rarely,1032.487286,,4,3,Life Sciences,1,76,3,1,,2,3,maNufACTURING DIREctOr,2,Divorced,"8339,32$","21682,23$",8,,Yes,12,,3,Part Time,1,,4,30,22,,14,10,"8339,32$",1976,"100071,84$",,,Yes


In [9]:
df.head(2)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,"16280,83$","42330,17$",7,Y,No,13,30,3,Full Time,0,,5,30,20,,15,15,"16280,83$",1972,"195370,00$",,,Yes
1,52,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,"43331,17$",0,,,14,30,1,,1,340.0,5,30,33,,11,9,,1971,"199990,00$",,,1


In [10]:
# Crear función de conversión de números a numeros absolutos

def convertir_a_absoluto(numero):
    """
    Convierte un número a su valor absoluto.

    Parámetros:
    numero (int | float): Número a convertir.

    Retorno:
    int | float: Valor absoluto del número.
    """
    return abs(numero)

# Ejemplo de uso
print(convertir_a_absoluto(-5))      
print(convertir_a_absoluto(-3.14)) 

5
3.14


In [11]:
# Crear función para transformar float a string

def float_a_string(dato):
    """
    Convierte un número float a string.
    
    Parámetros:
    dato (float): Número a convertir.
    
    Retorno:
    str: Representación del número en formato string.
    """
    return str(dato)

# Ejemplo de uso
resultado = float_a_string(123.456)
print(resultado)

123.456


In [12]:
type(resultado)

str

In [49]:
# Crear función para transformar string a int. No esta hecho para las columnas adecuadas. 

def string_a_int (dato):
    try:
        return int (dato)
    except:
        return np.nan
columns = ['age', 'monthlyincome','monthlyrate','yearsincurrentrole', 'salary']
for col in columns:
    df[col] = df[col].apply(string_a_int)

# Ejemplo de uso
df['age']

0       51.0
1       52.0
2       42.0
3       47.0
4       46.0
        ... 
1673    43.0
1674    47.0
1675    29.0
1676    47.0
1677    32.0
Name: age, Length: 1678, dtype: float64

In [50]:
def cambiar_coma_punto(columna):
    """
    Cambia las comas por puntos y elimina símbolos como '$' en los valores de una columna.
    Parámetros:
        columna (str): Cadena de texto a modificar.
    Retorna:
        float: Número transformado, o np.nan si ocurre un error.
    """
    if isinstance(columna, (int, float)):
            return columna
    try:
        # Reemplazar comas por puntos y eliminar el símbolo '$'
        return float(columna.replace(',', '.').replace('$', ''))
    except:
        # Si hay un error (como valores no numéricos), devolver NaN
        return np.nan
columnas = ["totalworkingyears", "worklifebalance", "yearsincurrentrole", "monthlyincome", "monthlyrate", "sameasmonthlyincome", "salary"]
# Aplicar la transformación
for col in columnas:
    if col in df.columns:
        df[col] = df[col].apply(cambiar_coma_punto)
df.head(30)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51.0,No,,2015.722222,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,,,7,Y,No,13,30.0,3,Full Time,0,,5,,20,,15,15,,1972,,,,Yes
1,52.0,No,,2063.388889,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,,0,,,14,30.0,1,,1,,5,,33,,11,9,,1971,,,,1
2,42.0,No,travel_rarely,1984.253968,Research & Development,4,2,Technical Degree,1,3,3,0,,3,5,ManaGER,4,Married,,,1,,No,11,30.0,4,,0,,3,,22,,11,15,,1981,,ManaGER - Research & Development,,1
3,47.0,No,travel_rarely,1771.404762,,2,4,Medical,1,4,1,1,,3,4,ReseArCH DIrECtOr,3,Married,,,3,Y,,19,30.0,2,Full Time,2,,2,,20,,5,6,,1976,,,,False
4,46.0,No,,1582.771346,,3,3,Technical Degree,1,5,1,1,,4,4,sAleS EXECUtIve,1,Divorced,,,2,Y,No,12,30.0,4,,1,,5,,19,,2,8,,1977,,,,0
5,48.0,No,,1771.920635,Research & Development,22,3,Medical,1,6,4,1,,3,4,MANAger,4,,,,3,,No,11,30.0,2,,1,,3,,22,,4,7,,1975,,MANAger - Research & Development,,Yes
6,59.0,No,,1032.487286,,25,3,Life Sciences,1,7,1,1,,3,3,Sales ExeCutIVe,1,,,,7,Y,,11,30.0,4,Part Time,0,,3,,21,,7,9,,1964,,,,True
7,42.0,No,travel_rarely,556.256661,,1,1,,1,8,2,0,69.532083,3,2,Sales eXEcUTiVe,3,Married,,,1,,No,25,40.0,3,Part Time,0,,3,,20,,11,6,,1981,,,,0
8,41.0,No,,1712.18254,,2,5,,1,9,2,1,,3,4,mANAGEr,1,Married,,,7,,No,16,30.0,2,Full Time,1,,2,,18,,11,8,,1982,,,,True
9,41.0,No,travel_frequently,1973.984127,,9,3,,1,10,1,0,,3,5,reSEaRCH DIrectoR,3,,,,2,,No,17,30.0,2,,1,,2,,18,,0,11,,1982,,,,0


In [30]:
type(resultado2)

float

In [17]:
# Quedarse con el Primer Dígito de un Número de Dos Dígitos:
def primer_digito(df, column):
    df[column] = df[column].astype(str).str[0].astype(int)
    return df
# Uso:
df = primer_digito(df, 'environmentsatisfaction')


In [18]:
df['environmentsatisfaction'].unique()

array([1, 3, 4, 2])