In [4]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [8]:
def explporacion_csv(archivo):
    """
    Función para explorar un archivo csv
    
    Args:
        archivo (str): Archivo csv separado por comas

    Returns:
        df (DataFrame): Devuelve un csv directamente del archivo  función no devuleve nada (None) pero realiza varios prints con información de los datos incluidos en el csv    """
    
    # Convertimos el csv a DataFrame
    df = pd.read_csv(archivo,index_col=0)

    #Echamos un vistazo
    print("Primeras filas:")
    display(df.head(10))
    print("-----------------------------")

    print("Últimas filas:")
    display(df.tail(10))
    print("-----------------------------")

    print("Filas aleatorias:")
    display(df.sample(10))
    print("-----------------------------")

    print("El df tiene {df.shape[0]} filas y {df.shape[1]} columnas")
    print("-----------------------------")

    print("Tipos de datos y nulos:")
    display(df.info())
    print("-----------------------------")
    
    print(f"Características columnas númericas:\n{df.describe().T}")
    print(f"-----------------------------")

    print(f"Características columnas de texto:\n{df.describe (include='object').T}")
    print(f"-----------------------------")

    print(f"Duplicados:\n{df.duplicated().sum()}")
    print(f"-----------------------------")

    # generamos un DataFrame para los valores nulos
    print("Los nulos que tenemos en el conjunto de datos son:")
    df_nulos = pd.DataFrame(df.isnull().sum() / df.shape[0] * 100, columns = ["%_nulos"])
    display(df_nulos[df_nulos["%_nulos"] > 0])

    print("\n ..................... \n")
    print("Los valores que tenemos para las columnas categóricas son: ")
    df_categoricas = df.select_dtypes(include = "O")
    
    for col in df_categoricas.columns:
        print(f"La columna {col.upper()} tiene las siguientes valore únicos:")
        display(pd.DataFrame(df[col].value_counts()/df[col].shape[0])*100)  

In [9]:
explporacion_csv("../data/HR_RAW_DATA.csv")

Primeras filas:


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
0,51,No,,"684,0$",,6,3,,1,1620.0,1,0,51,3,5,resEArch DIREcToR,3,,195370.0,6462,7,Y,No,13,30,3,,0,,5,30.0,20,,15,15,195370.0,1972,1000000000$,,,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,1,2590.0,3,0,65,2,5,ManAGeR,3,,199990.0,5678,0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,199990.0,1971,1000000000$,,,1
2,42,No,travel_rarely,"532,0$",Research & Development,4,2,Technical Degree,1,3190.0,3,0,58,3,5,ManaGER,4,Married,192320.0,4933,1,,No,11,30,4,,0,220.0,3,,22,,11,15,192320.0,1981,1000000000$,ManaGER - Research & Development,,1
3,47,No,travel_rarely,"359,0$",,2,4,Medical,1,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,171690.0,1976,1000000000$,,,False
4,46,No,,"1319,0$",,3,3,Technical Degree,1,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,,1977,1000000000$,,,0
5,48,No,,"117,0$",Research & Development,22,3,Medical,1,19000.0,4,1,58,3,4,MANAger,4,,171740.0,2437,3,,No,11,30,2,,1,,3,30.0,22,,4,7,171740.0,1975,1000000000$,MANAger - Research & Development,,Yes
6,59,No,,"1435,0$",,25,3,Life Sciences,1,810.0,1,1,99,3,3,Sales ExeCutIVe,1,,,2354,7,Y,,11,30,4,,0,280.0,3,20.0,21,,7,9,,1964,1000000000$,,,True
7,42,No,travel_rarely,"635,0$",,1,1,,1,3870.0,2,0,99,3,2,Sales eXEcUTiVe,3,Married,,24532,1,,No,25,40,3,,0,200.0,3,30.0,20,,11,6,,1981,1000000000$,,,0
8,41,No,,"1276,0$",,2,5,,1,,2,1,91,3,4,mANAGEr,1,Married,165950.0,5626,7,,No,16,30,2,,1,220.0,2,30.0,18,,11,8,165950.0,1982,1000000000$,,,True
9,41,No,travel_frequently,"840,0$",,9,3,,1,9990.0,1,0,64,3,5,reSEaRCH DIrectoR,3,,,3735,2,,No,17,30,2,,1,210.0,2,40.0,18,,0,11,,1982,1000000000$,,,0


-----------------------------
Últimas filas:
     Age Attrition     BusinessTravel DailyRate                Department  \
1604  41       Yes                NaN   1085,0$   Research & Development    
1605  31        No      travel_rarely    154,0$                       NaN   
1606  26        No  travel_frequently   1283,0$                    Sales    
1607  31        No                NaN    616,0$                       NaN   
1608  32        No                NaN    498,0$                       NaN   
1609  36       Yes      travel_rarely    530,0$                       NaN   
1610  45        No         non-travel    805,0$                       NaN   
1611  39        No      travel_rarely    903,0$                       NaN   
1612  36        No         non-travel   1229,0$                       NaN   
1613  46        No                NaN    566,0$                       NaN   

      DistanceFromHome  Education    EducationField  employeecount  \
1604                 2          4    

Unnamed: 0,%_nulos
BusinessTravel,47.831475
Department,81.288724
EducationField,46.158612
employeenumber,26.703841
MaritalStatus,40.334572
MonthlyIncome,52.230483
Over18,55.82404
OverTime,41.883519
PerformanceRating,12.081784
StandardHours,74.039653



 ..................... 

Los valores que tenemos para las columnas categóricas son: 
La columna AGE tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
35,5.204461
34,5.142503
31,5.080545
29,4.832714
36,4.584882
32,3.965304
38,3.965304
30,3.903346
33,3.77943
40,3.717472


La columna ATTRITION tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,83.952912
Yes,16.047088


La columna BUSINESSTRAVEL tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
BusinessTravel,Unnamed: 1_level_1
travel_rarely,36.307311
travel_frequently,10.223048
non-travel,5.638166


La columna DAILYRATE tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
DailyRate,Unnamed: 1_level_1
nan$,7.682776
"691,0$",0.433705
"329,0$",0.433705
"147,0$",0.371747
"530,0$",0.371747
...,...
"317,0$",0.061958
"891,0$",0.061958
"759,0$",0.061958
"483,0$",0.061958


La columna DEPARTMENT tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
Department,Unnamed: 1_level_1
Research & Development,12.143742
Sales,5.638166
Human Resources,0.929368


La columna EDUCATIONFIELD tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
EducationField,Unnamed: 1_level_1
Life Sciences,21.623296
Medical,17.100372
Marketing,6.443618
Technical Degree,4.275093
Other,3.655514
Human Resources,0.743494


La columna EMPLOYEENUMBER tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
employeenumber,Unnamed: 1_level_1
4820,0.123916
5300,0.123916
5070,0.123916
5170,0.123916
5220,0.123916
...,...
1610,0.061958
1640,0.061958
1900,0.061958
1940,0.061958


La columna HOURLYRATE tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
HourlyRate,Unnamed: 1_level_1
Not Available,5.204461
42,2.044610
66,1.982652
48,1.858736
57,1.796778
...,...
47,0.929368
53,0.805452
68,0.805452
38,0.743494


La columna JOBROLE tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
JobRole,Unnamed: 1_level_1
mANager,0.247831
ManageR,0.185874
ManagEr,0.185874
mAnaGeR,0.185874
MANAgER,0.185874
...,...
ResEArch ScieNTiST,0.061958
HealthcARE RePreSENtAtiVe,0.061958
ReSearcH scIEntist,0.061958
LAbOrATOry techNicIan,0.061958


La columna MARITALSTATUS tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
MaritalStatus,Unnamed: 1_level_1
Married,25.030979
Single,20.136307
Divorced,11.648079
Marreid,2.168525
divorced,0.681537


La columna MONTHLYINCOME tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
MonthlyIncome,Unnamed: 1_level_1
63470,0.247831
53040,0.247831
26570,0.185874
22580,0.185874
54050,0.123916
...,...
31020,0.061958
45560,0.061958
42300,0.061958
48590,0.061958


La columna OVER18 tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
Over18,Unnamed: 1_level_1
Y,44.17596


La columna OVERTIME tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
No,42.255266
Yes,15.861214


La columna PERFORMANCERATING tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
PerformanceRating,Unnamed: 1_level_1
30,74.659232
40,13.258984


La columna STANDARDHOURS tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
StandardHours,Unnamed: 1_level_1
800,25.960347


La columna TOTALWORKINGYEARS tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
TOTALWORKINGYEARS,Unnamed: 1_level_1
100,8.921933
80,5.328377
60,5.204461
90,4.275093
50,4.089219
70,3.469641
40,3.345725
10,3.283767
120,2.106568
30,1.982652


La columna WORKLIFEBALANCE tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
WORKLIFEBALANCE,Unnamed: 1_level_1
30,56.567534
20,22.242875
40,9.60347
10,4.894672


La columna YEARSINCURRENTROLE tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
YearsInCurrentRole,Unnamed: 1_level_1
20,0.681537
70,0.309789
0,0.247831
40,0.185874
10,0.185874
110,0.123916
60,0.123916
30,0.123916
130,0.061958
120,0.061958


La columna SAMEASMONTHLYINCOME tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
SameAsMonthlyIncome,Unnamed: 1_level_1
63470,0.247831
53040,0.247831
26570,0.185874
22580,0.185874
54050,0.123916
...,...
31020,0.061958
45560,0.061958
42300,0.061958
48590,0.061958


La columna SALARY tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
Salary,Unnamed: 1_level_1
1000000000$,100.0


La columna ROLEDEPARTAMENT tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
RoleDepartament,Unnamed: 1_level_1
MaNAgeR - Sales,0.123916
ManaGER - Research & Development,0.061958
ReseaRch scIENTisT - Research & Development,0.061958
ManufacTURInG DIRECtOR - Research & Development,0.061958
hEalthCaRe reprEseNTaTiVe - Research & Development,0.061958
...,...
saLES eXEcUTiVE - Sales,0.061958
mANUfacTURiNG dIRectOR - Research & Development,0.061958
huMAn ResOurces - Human Resources,0.061958
HUMAN ResoURCeS - Human Resources,0.061958


La columna REMOTEWORK tiene las siguientes valore únicos:


Unnamed: 0_level_0,count
RemoteWork,Unnamed: 1_level_1
1,22.304833
True,21.375465
0,19.144981
False,18.89715
Yes,18.277571


In [3]:
# Convertimos el csv a DataFrame
df = pd.read_csv("../data/HR_RAW_DATA.csv",index_col=0)

#Echamos un vistazo
df.head(10)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
0,51,No,,"684,0$",,6,3,,1,1620.0,1,0,51,3,5,resEArch DIREcToR,3,,195370.0,6462,7,Y,No,13,30,3,,0,,5,30.0,20,,15,15,195370.0,1972,1000000000$,,,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,1,2590.0,3,0,65,2,5,ManAGeR,3,,199990.0,5678,0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,199990.0,1971,1000000000$,,,1
2,42,No,travel_rarely,"532,0$",Research & Development,4,2,Technical Degree,1,3190.0,3,0,58,3,5,ManaGER,4,Married,192320.0,4933,1,,No,11,30,4,,0,220.0,3,,22,,11,15,192320.0,1981,1000000000$,ManaGER - Research & Development,,1
3,47,No,travel_rarely,"359,0$",,2,4,Medical,1,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,171690.0,1976,1000000000$,,,False
4,46,No,,"1319,0$",,3,3,Technical Degree,1,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,,1977,1000000000$,,,0
5,48,No,,"117,0$",Research & Development,22,3,Medical,1,19000.0,4,1,58,3,4,MANAger,4,,171740.0,2437,3,,No,11,30,2,,1,,3,30.0,22,,4,7,171740.0,1975,1000000000$,MANAger - Research & Development,,Yes
6,59,No,,"1435,0$",,25,3,Life Sciences,1,810.0,1,1,99,3,3,Sales ExeCutIVe,1,,,2354,7,Y,,11,30,4,,0,280.0,3,20.0,21,,7,9,,1964,1000000000$,,,True
7,42,No,travel_rarely,"635,0$",,1,1,,1,3870.0,2,0,99,3,2,Sales eXEcUTiVe,3,Married,,24532,1,,No,25,40,3,,0,200.0,3,30.0,20,,11,6,,1981,1000000000$,,,0
8,41,No,,"1276,0$",,2,5,,1,,2,1,91,3,4,mANAGEr,1,Married,165950.0,5626,7,,No,16,30,2,,1,220.0,2,30.0,18,,11,8,165950.0,1982,1000000000$,,,True
9,41,No,travel_frequently,"840,0$",,9,3,,1,9990.0,1,0,64,3,5,reSEaRCH DIrectoR,3,,,3735,2,,No,17,30,2,,1,210.0,2,40.0,18,,0,11,,1982,1000000000$,,,0


pd.set_option('display.max_columns', None)pd.set_option('display.max_rows', None)

In [5]:
df.tail()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
1609,36,Yes,travel_rarely,"530,0$",,3,1,Life Sciences,1,9670.0,3,0,51,2,3,saLEs ExeCUTiVe,4,Married,103250.0,5518,1,Y,,11,,1,,1,,6,30,16,,3,7,103250.0,1987,1000000000$,,,0
1610,45,No,non-travel,"805,0$",,4,2,,1,9720.0,3,0,57,3,2,LAboRaTOry tECHNiCIAn,2,,44470.0,23163,1,,,12,30.0,2,,0,,5,20,9,,0,8,44470.0,1978,1000000000$,,,1
1611,39,No,travel_rarely,"903,0$",,-13,5,,1,,13,0,41,4,3,sAlES ExECUTivE,3,Single,,2560,0,,No,18,30.0,4,,0,90.0,3,30,8,,0,7,,1984,1000000000$,,,Yes
1612,36,No,non-travel,"1229,0$",,8,4,Technical Degree,1,9900.0,1,0,84,3,2,SaLes ExecUtIVe,4,Divorced,,25952,4,,No,13,,4,,2,120.0,3,30,7,,0,7,,1987,1000000000$,,,True
1613,46,No,,"566,0$",,7,2,Medical,1,,4,0,75,3,3,mAnUfactURInG DiRECTOr,3,,108450.0,24208,6,Y,,13,30.0,2,,1,,3,30,8,,0,7,108450.0,1977,1000000000$,,,0


In [6]:
df.sample(10)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
1579,34,No,,"1442,0$",,9,3,Medical,1,7170.0,4,1,46,2,3,HEaLTHCArE rEpreSeNtatiVE,2,,,17654,1,,,14,30,2,,0,90.0,3,40,8,,7,7,,1989,1000000000$,,,Yes
288,38,No,,"1084,0$",,29,3,,1,,4,0,54,3,2,MAnuFaCtURING DirECTOr,4,Married,62610.0,4185,3,Y,No,18,30,1,,1,90.0,3,10,7,,1,7,62610.0,1985,1000000000$,,,Yes
1150,38,No,travel_frequently,"1444,0$",Human Resources,1,4,Other,1,19720.0,4,0,Not Available,3,1,Human REsouRCeS,2,,,5224,0,Y,Yes,11,30,2,,1,,2,30,6,,1,2,,1985,1000000000$,Human REsouRCeS - Human Resources,,0
631,36,No,,"172,0$",,-47,4,Life Sciences,1,14350.0,47,0,Not Available,2,2,LabORAtORY TechNiciAN,4,Single,,22604,1,Y,,16,30,3,800.0,0,100.0,2,20,10,,1,8,,1987,1000000000$,,,True
816,34,No,travel_rarely,"1153,0$",,1,2,Medical,1,1100.0,1,0,94,3,2,MaNuFaCTUrING dirECtoR,2,Married,,17736,1,,No,15,30,3,800.0,0,,2,30,5,,1,3,,1989,1000000000$,,,True
953,34,No,,"829,0$",,3,2,,1,8470.0,3,0,Not Available,3,1,humAn resourCeS,4,,,2243,0,,,19,30,3,,1,40.0,1,10,3,,0,2,,1989,1000000000$,,,False
1292,30,Yes,travel_frequently,"464,0$",,4,3,,1,5140.0,3,0,40,3,1,rEsearCh sciEnTISt,4,Single,22850.0,3427,9,Y,Yes,23,40,3,,0,30.0,4,30,1,,0,0,22850.0,1993,1000000000$,,,False
29,34,No,non-travel,nan$,Sales,10,3,Life Sciences,1,17740.0,4,0,87,3,2,saLEs exEcutIVe,3,,40010.0,12313,1,Y,Yes,14,30,3,,1,150.0,3,30,15,,0,7,40010.0,1989,1000000000$,saLEs exEcutIVe - Sales,,1
378,40,No,travel_rarely,"300,0$",,26,3,,1,,3,0,74,3,2,SAles ExECutiVE,1,Married,,22217,1,,,14,30,2,800.0,1,80.0,3,20,7,,7,5,,1983,1000000000$,,,0
460,30,No,,"1092,0$",,10,3,Medical,1,18160.0,1,1,64,3,3,MANufaCturiNg dIrECtOr,3,Single,96670.0,2739,9,Y,No,14,30,2,,0,90.0,3,30,7,,0,2,96670.0,1993,1000000000$,,,False


In [4]:
print(f"El df tiene {df.shape[0]} filas y {df.shape[1]} columnas")

El df tiene 1614 filas y 41 columnas


In [None]:
# Se comprueban tipos de datos y y nulos 
df.info()

In [6]:
# Principales descriptivos para las columnas numéricas
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DistanceFromHome,1614.0,4.527261,14.591913,-49.0,2.0,5.0,11.0,29.0
Education,1614.0,2.925031,1.022357,1.0,2.0,3.0,4.0,5.0
employeecount,1614.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EnvironmentSatisfaction,1614.0,4.294919,6.993559,1.0,2.0,3.0,4.0,49.0
Gender,1614.0,0.398389,0.489718,0.0,0.0,0.0,1.0,1.0
JobInvolvement,1614.0,2.739777,0.711567,1.0,2.0,3.0,3.0,4.0
JobLevel,1614.0,2.068154,1.101344,1.0,1.0,2.0,3.0,5.0
JobSatisfaction,1614.0,2.738538,1.106163,1.0,2.0,3.0,4.0,4.0
MonthlyRate,1614.0,14284.495663,7110.414585,2094.0,8001.0,14248.5,20364.0,26999.0
NUMCOMPANIESWORKED,1614.0,2.673482,2.506152,0.0,1.0,2.0,4.0,9.0


In [7]:
# Principales descriptivos para las columnas de tipo texto
df.describe (include="object").T   

Unnamed: 0,count,unique,top,freq
Age,1614,54,35,84
Attrition,1614,2,No,1355
BusinessTravel,842,3,travel_rarely,586
DailyRate,1614,849,nan$,124
Department,302,3,Research & Development,196
EducationField,869,6,Life Sciences,349
employeenumber,1183,1079,4820,2
HourlyRate,1614,72,Not Available,84
JobRole,1614,1579,mANager,4
MaritalStatus,963,5,Married,404


In [8]:
#Comprobamos duplicados
df.duplicated().sum()

0

In [7]:
#Comprobamos nulos
df.isnull().sum()

Age                            0
Attrition                      0
BusinessTravel               772
DailyRate                      0
Department                  1312
DistanceFromHome               0
Education                      0
EducationField               745
employeecount                  0
employeenumber               431
EnvironmentSatisfaction        0
Gender                         0
HourlyRate                     0
JobInvolvement                 0
JobLevel                       0
JobRole                        0
JobSatisfaction                0
MaritalStatus                651
MonthlyIncome                843
MonthlyRate                    0
NUMCOMPANIESWORKED             0
Over18                       901
OverTime                     676
PercentSalaryHike              0
PerformanceRating            195
RelationshipSatisfaction       0
StandardHours               1195
StockOptionLevel               0
TOTALWORKINGYEARS            526
TrainingTimesLastYear          0
WORKLIFEBA

In [11]:
# Porcentage nulos
(df.isnull().sum()/df.shape[0]*100).round(2)

Age                           0.00
Attrition                     0.00
BusinessTravel               47.83
DailyRate                     0.00
Department                   81.29
DistanceFromHome              0.00
Education                     0.00
EducationField               46.16
employeecount                 0.00
employeenumber               26.70
EnvironmentSatisfaction       0.00
Gender                        0.00
HourlyRate                    0.00
JobInvolvement                0.00
JobLevel                      0.00
JobRole                       0.00
JobSatisfaction               0.00
MaritalStatus                40.33
MonthlyIncome                52.23
MonthlyRate                   0.00
NUMCOMPANIESWORKED            0.00
Over18                       55.82
OverTime                     41.88
PercentSalaryHike             0.00
PerformanceRating            12.08
RelationshipSatisfaction      0.00
StandardHours                74.04
StockOptionLevel              0.00
TOTALWORKINGYEARS   

In [18]:
# Valores únicos columnas tipo texto
for col in df.select_dtypes(include = "object"):
    print(f" Frecuencias valores únicos:")
    print(f"{df[col].value_counts()}")
    print("----------------------")

 Frecuencias valores únicos:
Age
35              84
34              83
31              82
29              78
36              74
32              64
38              64
30              63
33              61
40              60
27              54
37              53
28              53
42              50
45              48
41              46
26              45
39              45
43              40
46              38
44              34
50              31
25              27
24              26
47              26
49              25
55              24
54              20
51              20
53              20
48              20
52              18
56              17
21              16
22              16
23              15
58              14
20              11
59              10
19               9
18               8
60               5
57               5
thirty-two       2
fifty-eight      1
twenty-six       1
thirty-seven     1
thirty-one       1
thirty           1
fifty-two        1
fifty-five       

In [24]:
# Porcentage valores únicos columnas de texto
for col in df.select_dtypes(include = "object"):
    print(f" Frecuencias valores únicos:")
    print(f"{(df[col].value_counts()/df[col].shape[0])*100}")
    print("----------------------")

 Frecuencias valores únicos:
Age
35              5.204461
34              5.142503
31              5.080545
29              4.832714
36              4.584882
32              3.965304
38              3.965304
30              3.903346
33              3.779430
40              3.717472
27              3.345725
37              3.283767
28              3.283767
42              3.097893
45              2.973978
41              2.850062
26              2.788104
39              2.788104
43              2.478315
46              2.354399
44              2.106568
50              1.920694
25              1.672862
24              1.610905
47              1.610905
49              1.548947
55              1.486989
54              1.239157
51              1.239157
53              1.239157
48              1.239157
52              1.115242
56              1.053284
21              0.991326
22              0.991326
23              0.929368
58              0.867410
20              0.681537
59              0

In [9]:
df["Age"].value_counts() ##Hay valores no numéricos como fifty-eight. Hacer un replace de estos y cambiarlos a números.

Age
35              84
34              83
31              82
29              78
36              74
32              64
38              64
30              63
33              61
40              60
27              54
37              53
28              53
42              50
45              48
41              46
26              45
39              45
43              40
46              38
44              34
50              31
25              27
24              26
47              26
49              25
55              24
54              20
51              20
53              20
48              20
52              18
56              17
21              16
22              16
23              15
58              14
20              11
59              10
19               9
18               8
60               5
57               5
thirty-two       2
fifty-eight      1
twenty-six       1
thirty-seven     1
thirty-one       1
thirty           1
fifty-two        1
fifty-five       1
thirty-six       1
forty-se

In [10]:
df['Age'] = df['Age'].replace(['fifty_five'], '55') 
df['Age'].value_counts()

Age
35              84
34              83
31              82
29              78
36              74
32              64
38              64
30              63
33              61
40              60
27              54
37              53
28              53
42              50
45              48
41              46
26              45
39              45
43              40
46              38
44              34
50              31
25              27
24              26
47              26
49              25
55              24
54              20
51              20
53              20
48              20
52              18
56              17
21              16
22              16
23              15
58              14
20              11
59              10
19               9
18               8
60               5
57               5
thirty-two       2
fifty-eight      1
twenty-six       1
thirty-seven     1
thirty-one       1
thirty           1
fifty-two        1
fifty-five       1
thirty-six       1
forty-se

In [11]:
nuevas_columnas = {columna: columna.lower()for columna in df.columns}

df.rename (columns=nuevas_columnas, inplace= True)

df.columns

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeecount',
       'employeenumber', 'environmentsatisfaction', 'gender', 'hourlyrate',
       'jobinvolvement', 'joblevel', 'jobrole', 'jobsatisfaction',
       'maritalstatus', 'monthlyincome', 'monthlyrate', 'numcompaniesworked',
       'over18', 'overtime', 'percentsalaryhike', 'performancerating',
       'relationshipsatisfaction', 'standardhours', 'stockoptionlevel',
       'totalworkingyears', 'trainingtimeslastyear', 'worklifebalance',
       'yearsatcompany', 'yearsincurrentrole', 'yearssincelastpromotion',
       'yearswithcurrmanager', 'sameasmonthlyincome', 'datebirth', 'salary',
       'roledepartament', 'numberchildren', 'remotework'],
      dtype='object')

In [12]:
# Crear un diccionario con el cambio de edades a valores numéricos
edades_cambio =  {'fifty-five': '55', 'thirty-six': '36', 'forty-seven': '47', 'twenty-four': '24', 'thirty-seven': '37', 'fifty-two': '52', 'fifty-eight': '58', 'twenty-six': '26', 'thirty-one': '31', 'thirty': '30', 'thirty-two': '32'} 

# Reemplazar las edades en la columna "Age" utilizando el diccionario
df['age'] = df['age'].replace(edades_cambio)

# Contar las ocurrencias de cada edad en la columna "Age"
conteo_edades = df['age'].value_counts()

# Imprimir el resultado
print(conteo_edades)


age
35    84
31    83
34    83
29    78
36    75
32    66
38    64
30    64
33    61
40    60
27    54
37    54
28    53
42    50
45    48
41    46
26    46
39    45
43    40
46    38
44    34
50    31
24    27
47    27
25    27
55    25
49    25
54    20
51    20
53    20
48    20
52    19
56    17
22    16
21    16
58    15
23    15
20    11
59    10
19     9
18     8
60     5
57     5
Name: count, dtype: int64


In [13]:
df['age']=df['age'].apply(pd.to_numeric)


In [14]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1614.0,36.920694,9.102157,18.0,30.0,36.0,43.0,60.0
distancefromhome,1614.0,4.527261,14.591913,-49.0,2.0,5.0,11.0,29.0
education,1614.0,2.925031,1.022357,1.0,2.0,3.0,4.0,5.0
employeecount,1614.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
environmentsatisfaction,1614.0,4.294919,6.993559,1.0,2.0,3.0,4.0,49.0
gender,1614.0,0.398389,0.489718,0.0,0.0,0.0,1.0,1.0
jobinvolvement,1614.0,2.739777,0.711567,1.0,2.0,3.0,3.0,4.0
joblevel,1614.0,2.068154,1.101344,1.0,1.0,2.0,3.0,5.0
jobsatisfaction,1614.0,2.738538,1.106163,1.0,2.0,3.0,4.0,4.0
monthlyrate,1614.0,14284.495663,7110.414585,2094.0,8001.0,14248.5,20364.0,26999.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   businesstravel            842 non-null    object 
 3   dailyrate                 1614 non-null   object 
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   employeecount             1614 non-null   int64  
 9   employeenumber            1183 non-null   object 
 10  environmentsatisfaction   1614 non-null   int64  
 11  gender                    1614 non-null   int64  
 12  hourlyrate                1614 non-null   object 
 13  jobinvolvement            1614 non-null   int64  
 14  joblevel     

In [16]:
df['attrition'].value_counts()

attrition
No     1355
Yes     259
Name: count, dtype: int64

In [17]:
df['businesstravel'].value_counts()

businesstravel
travel_rarely        586
travel_frequently    165
non-travel            91
Name: count, dtype: int64

In [18]:
df['businesstravel']=df['businesstravel'].str.replace("-","_")

In [19]:
df['businesstravel']=df['businesstravel'].replace(np.nan, "desconocido")

df['businesstravel']

0         desconocido
1         desconocido
2       travel_rarely
3       travel_rarely
4         desconocido
            ...      
1609    travel_rarely
1610       non_travel
1611    travel_rarely
1612       non_travel
1613      desconocido
Name: businesstravel, Length: 1614, dtype: object

In [20]:
df.rename(columns={"businesstravel": "business_travel"}, inplace=True)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   business_travel           1614 non-null   object 
 3   dailyrate                 1614 non-null   object 
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   employeecount             1614 non-null   int64  
 9   employeenumber            1183 non-null   object 
 10  environmentsatisfaction   1614 non-null   int64  
 11  gender                    1614 non-null   int64  
 12  hourlyrate                1614 non-null   object 
 13  jobinvolvement            1614 non-null   int64  
 14  joblevel     

In [22]:
df['dailyrate'].value_counts()

dailyrate
nan$      124
691,0$      7
329,0$      7
147,0$      6
530,0$      6
         ... 
317,0$      1
891,0$      1
759,0$      1
483,0$      1
105,0$      1
Name: count, Length: 849, dtype: int64

In [23]:
df.rename(columns={"dailyrate": "daily_rate"}, inplace=True)
df

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51,No,desconocido,"684,0$",,6,3,,1,1620,1,0,51,3,5,resEArch DIREcToR,3,,195370,6462,7,Y,No,13,30,3,,0,,5,30,20,,15,15,195370,1972,1000000000$,,,Yes
1,52,No,desconocido,"699,0$",,1,4,Life Sciences,1,2590,3,0,65,2,5,ManAGeR,3,,199990,5678,0,,,14,30,1,,1,340,5,30,33,,11,9,199990,1971,1000000000$,,,1
2,42,No,travel_rarely,"532,0$",Research & Development,4,2,Technical Degree,1,3190,3,0,58,3,5,ManaGER,4,Married,192320,4933,1,,No,11,30,4,,0,220,3,,22,,11,15,192320,1981,1000000000$,ManaGER - Research & Development,,1
3,47,No,travel_rarely,"359,0$",,2,4,Medical,1,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,171690,1976,1000000000$,,,False
4,46,No,desconocido,"1319,0$",,3,3,Technical Degree,1,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30,19,,2,8,,1977,1000000000$,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,36,Yes,travel_rarely,"530,0$",,3,1,Life Sciences,1,9670,3,0,51,2,3,saLEs ExeCUTiVe,4,Married,103250,5518,1,Y,,11,,1,,1,,6,30,16,,3,7,103250,1987,1000000000$,,,0
1610,45,No,non_travel,"805,0$",,4,2,,1,9720,3,0,57,3,2,LAboRaTOry tECHNiCIAn,2,,44470,23163,1,,,12,30,2,,0,,5,20,9,,0,8,44470,1978,1000000000$,,,1
1611,39,No,travel_rarely,"903,0$",,-13,5,,1,,13,0,41,4,3,sAlES ExECUTivE,3,Single,,2560,0,,No,18,30,4,,0,90,3,30,8,,0,7,,1984,1000000000$,,,Yes
1612,36,No,non_travel,"1229,0$",,8,4,Technical Degree,1,9900,1,0,84,3,2,SaLes ExecUtIVe,4,Divorced,,25952,4,,No,13,,4,,2,120,3,30,7,,0,7,,1987,1000000000$,,,True


In [24]:
df['daily_rate']=df['daily_rate'].str.replace ("$", "").str.replace (",", ".")
df['daily_rate']

0        684.0
1        699.0
2        532.0
3        359.0
4       1319.0
         ...  
1609     530.0
1610     805.0
1611     903.0
1612    1229.0
1613     566.0
Name: daily_rate, Length: 1614, dtype: object

In [25]:
df['daily_rate'].isnull().sum()

0

In [26]:
df['daily_rate']=df['daily_rate'].replace ("nan", np.nan)
df['daily_rate']

0        684.0
1        699.0
2        532.0
3        359.0
4       1319.0
         ...  
1609     530.0
1610     805.0
1611     903.0
1612    1229.0
1613     566.0
Name: daily_rate, Length: 1614, dtype: object

In [27]:
df['daily_rate'].isnull().sum()

124

In [28]:
df['daily_rate']=df['daily_rate'].apply(pd.to_numeric)
df['daily_rate']

0        684.0
1        699.0
2        532.0
3        359.0
4       1319.0
         ...  
1609     530.0
1610     805.0
1611     903.0
1612    1229.0
1613     566.0
Name: daily_rate, Length: 1614, dtype: float64

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   business_travel           1614 non-null   object 
 3   daily_rate                1490 non-null   float64
 4   department                302 non-null    object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   employeecount             1614 non-null   int64  
 9   employeenumber            1183 non-null   object 
 10  environmentsatisfaction   1614 non-null   int64  
 11  gender                    1614 non-null   int64  
 12  hourlyrate                1614 non-null   object 
 13  jobinvolvement            1614 non-null   int64  
 14  joblevel     

In [30]:
df['department'].value_counts()

department
 Research & Development     196
 Sales                       91
 Human Resources             15
Name: count, dtype: int64

In [31]:
df['department']=df['department'].str.lower().str.replace("research & development","research_development").str.replace("human resources", "human_resources")

df['department'].value_counts()

department
 research_development     196
 sales                     91
 human_resources           15
Name: count, dtype: int64

In [32]:
df['department']=df['department'].replace (np.nan, "desconocido")

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       1614 non-null   int64  
 1   attrition                 1614 non-null   object 
 2   business_travel           1614 non-null   object 
 3   daily_rate                1490 non-null   float64
 4   department                1614 non-null   object 
 5   distancefromhome          1614 non-null   int64  
 6   education                 1614 non-null   int64  
 7   educationfield            869 non-null    object 
 8   employeecount             1614 non-null   int64  
 9   employeenumber            1183 non-null   object 
 10  environmentsatisfaction   1614 non-null   int64  
 11  gender                    1614 non-null   int64  
 12  hourlyrate                1614 non-null   object 
 13  jobinvolvement            1614 non-null   int64  
 14  joblevel     

In [34]:
df.rename (columns={"distancefromhome": "distance_from_home", 
                    "educationfield": "education_field", 
                    "employeecount": "employee_count",
                    "employeenumber": "employee_number",
                    "environmentsatisfaction": "environment_satisfaction",
                    "hourlyrate": "hourly_rate", 
                    "jobinvolvement": "job_involvement",
                    "joblevel": "job_level",
                    "jobrole": "job_role",
                    "jobsatisfaction": "job_satisfaction",
                    "maritalstatus": "marital_status",
                    "monthlyincome": "monthly_income",
                    "monthlyrate": " monthly_rate",
                    "numcompaniesworked": "num_companies_worked",
                    "over18": "over_18",
                    "percentsalaryhike": "percent_salary_hike",
                    "performancerating": "performance_rating",
                    "relationshipsatisfaction": "relationship_satisfaction",
                    "standardhours": "standard_hours",
                    "stockoptionlevel": "stock_option_level",
                    "totalworkingyears": "total_working_years",
                    "trainingtimeslastyear": "training_times_last_year",
                    "worklifebalance": "work_life_balance",
                    "yearsatcompany": "years_at_company",
                    "yearsincurrentrole": "years_in_current_role",
                    "yearssincelastpromotion": "years_since_last_promotion",
                    "yearswithcurrmanager": "years_with_curr_manager",
                    "sameasmonthlyincome": "same_as_monthly_income",
                    "datebirth": "date_birth",
                    "roledepartament": "role_departament",
                    "numberchildren": "number_children",
                    "remotework": "remote_work"}, inplace= True)

In [35]:
df["distance_from_home"].value_counts()
pd.set_option('display.max_rows', 70)


In [36]:
df["distance_from_home"].value_counts()

distance_from_home
 2     217
 1     203
 10     86
 9      85
 8      81
 7      80
 3      79
 5      62
 6      61
 4      61
 16     30
 11     28
 15     27
 23     26
 29     25
 25     24
 24     24
 18     22
 26     22
 14     21
 12     21
 21     19
 17     19
 13     19
 20     18
 28     17
 22     16
 19     16
 27     13
-13     11
-12     11
-24     11
-18      8
-47      7
-35      7
-38      7
-19      6
-36      6
-14      6
-31      6
-26      6
-25      6
-42      6
-22      5
-45      5
-48      5
-10      5
-46      5
-16      5
-27      4
-30      4
-20      4
-29      4
-41      4
-32      4
-37      4
-17      3
-11      3
-15      3
-23      3
-33      3
-44      3
-21      2
-49      2
-28      2
-34      2
-43      2
-39      1
-40      1
Name: count, dtype: int64

In [37]:
def tonull(numero):
    if numero <0:
        return np.nan
    else:
        return numero
      

In [38]:
df["distance_from_home"]= df["distance_from_home"].apply(tonull)

In [39]:
df.head()

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,overtime,percent_salary_hike,performance_rating,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,same_as_monthly_income,date_birth,salary,role_departament,number_children,remote_work
0,51,No,desconocido,684.0,desconocido,6.0,3,,1,1620.0,1,0,51,3,5,resEArch DIREcToR,3,,195370.0,6462,7,Y,No,13,30,3,,0,,5,30.0,20,,15,15,195370.0,1972,1000000000$,,,Yes
1,52,No,desconocido,699.0,desconocido,1.0,4,Life Sciences,1,2590.0,3,0,65,2,5,ManAGeR,3,,199990.0,5678,0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,199990.0,1971,1000000000$,,,1
2,42,No,travel_rarely,532.0,research_development,4.0,2,Technical Degree,1,3190.0,3,0,58,3,5,ManaGER,4,Married,192320.0,4933,1,,No,11,30,4,,0,220.0,3,,22,,11,15,192320.0,1981,1000000000$,ManaGER - Research & Development,,1
3,47,No,travel_rarely,359.0,desconocido,2.0,4,Medical,1,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,171690.0,1976,1000000000$,,,False
4,46,No,desconocido,1319.0,desconocido,3.0,3,Technical Degree,1,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,,1977,1000000000$,,,0


In [40]:
df["distance_from_home"].describe()

count    1422.000000
mean        8.892405
std         7.920281
min         1.000000
25%         2.000000
50%         7.000000
75%        13.000000
max        29.000000
Name: distance_from_home, dtype: float64

In [41]:
df[["education", "education_field" ]]
pd.set_option('display.max_rows', 100)


In [42]:
df[["education", "education_field" ]]

Unnamed: 0,education,education_field
0,3,
1,4,Life Sciences
2,2,Technical Degree
3,4,Medical
4,3,Technical Degree
...,...,...
1609,1,Life Sciences
1610,2,
1611,5,
1612,4,Technical Degree


In [43]:
df["education"].describe()

count    1614.000000
mean        2.925031
std         1.022357
min         1.000000
25%         2.000000
50%         3.000000
75%         4.000000
max         5.000000
Name: education, dtype: float64

In [44]:
df["education"].value_counts()

education
3    621
4    445
2    314
1    180
5     54
Name: count, dtype: int64

In [45]:
df["education_field"].value_counts()

education_field
Life Sciences       349
Medical             276
Marketing           104
Technical Degree     69
Other                59
Human Resources      12
Name: count, dtype: int64

In [46]:
mapa_education={1: "eso", 2:"fp", 3:"bachillerato", 4:"universidad", 5:"doctorado"}

In [47]:
df["education"]=df["education"].map(mapa_education)
df["education"]

0       bachillerato
1        universidad
2                 fp
3        universidad
4       bachillerato
            ...     
1609             eso
1610              fp
1611       doctorado
1612     universidad
1613              fp
Name: education, Length: 1614, dtype: object

In [48]:
df["education"].unique()

array(['bachillerato', 'universidad', 'fp', 'eso', 'doctorado'],
      dtype=object)

In [49]:
df["education_field"]=df["education_field"].replace(np.nan, "desconocido")

df["education_field"]

0            desconocido
1          Life Sciences
2       Technical Degree
3                Medical
4       Technical Degree
              ...       
1609       Life Sciences
1610         desconocido
1611         desconocido
1612    Technical Degree
1613             Medical
Name: education_field, Length: 1614, dtype: object

In [50]:
dict_education= {"Life Sciences":"life_sciences",
                 "Medical"  :"medical",
                 "Marketing": "marketing",
                 "Technical Degree" : "technical_degree",
                 "Other": "other",
                 "Human Resources": "human_resources",
                 "desconocido": "desconocido"}
                 

In [51]:
df["education_field"]=df["education_field"].map (dict_education)
df["education_field"]

0            desconocido
1          life_sciences
2       technical_degree
3                medical
4       technical_degree
              ...       
1609       life_sciences
1610         desconocido
1611         desconocido
1612    technical_degree
1613             medical
Name: education_field, Length: 1614, dtype: object

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         1614 non-null   int64  
 1   attrition                   1614 non-null   object 
 2   business_travel             1614 non-null   object 
 3   daily_rate                  1490 non-null   float64
 4   department                  1614 non-null   object 
 5   distance_from_home          1422 non-null   float64
 6   education                   1614 non-null   object 
 7   education_field             1614 non-null   object 
 8   employee_count              1614 non-null   int64  
 9   employee_number             1183 non-null   object 
 10  environment_satisfaction    1614 non-null   int64  
 11  gender                      1614 non-null   int64  
 12  hourly_rate                 1614 non-null   object 
 13  job_involvement             1614 non-n

In [53]:
df.drop("employee_count", axis=1, inplace= True)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 40 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         1614 non-null   int64  
 1   attrition                   1614 non-null   object 
 2   business_travel             1614 non-null   object 
 3   daily_rate                  1490 non-null   float64
 4   department                  1614 non-null   object 
 5   distance_from_home          1422 non-null   float64
 6   education                   1614 non-null   object 
 7   education_field             1614 non-null   object 
 8   employee_number             1183 non-null   object 
 9   environment_satisfaction    1614 non-null   int64  
 10  gender                      1614 non-null   int64  
 11  hourly_rate                 1614 non-null   object 
 12  job_involvement             1614 non-null   int64  
 13  job_level                   1614 non-n

In [55]:

#pd.set_option('display.max_rows', None)
df["employee_number"].value_counts()

employee_number
482,0     2
530,0     2
507,0     2
517,0     2
522,0     2
         ..
161,0     1
164,0     1
190,0     1
194,0     1
2040,0    1
Name: count, Length: 1079, dtype: int64

In [56]:
df["employee_number"].value_counts()
df[["attrition","employee_number"]].sort_values(by="employee_number")

Unnamed: 0,attrition,employee_number
1227,No,100
33,No,1000
977,No,10010
215,No,10020
1351,No,10030
...,...,...
1601,No,
1602,Yes,
1607,No,
1611,No,


In [57]:
df["employee_number"]==424

0       False
1       False
2       False
3       False
4       False
        ...  
1609    False
1610    False
1611    False
1612    False
1613    False
Name: employee_number, Length: 1614, dtype: bool

In [58]:
df[df["employee_number"]=="424,0"]

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,overtime,percent_salary_hike,performance_rating,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,same_as_monthly_income,date_birth,salary,role_departament,number_children,remote_work
872,31,No,travel_rarely,106.0,desconocido,2.0,bachillerato,human_resources,4240,1,0,62,2,2,hUmAN rESOuRCEs,1,,64100,17822,3,Y,No,12,30,4,,0,,1,30,2,,1,0,64100,1992,1000000000$,,,False
1513,31,No,travel_rarely,106.0,desconocido,2.0,bachillerato,human_resources,4240,1,0,62,2,2,hUMan rEsOuRCES,1,,64100,17822,3,Y,No,12,30,4,,0,,1,30,2,,1,0,64100,1992,1000000000$,,,True


In [59]:
df.duplicated().index #filas que tienen duplicados

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613],
      dtype='int64', length=1614)

In [60]:
df["environment_satisfaction"].value_counts()

environment_satisfaction
4     460
3     459
1     298
2     297
12      7
35      6
13      6
14      5
24      5
47      5
36      4
48      4
41      4
46      4
42      4
45      3
11      3
22      3
17      3
18      3
20      3
25      3
27      3
37      2
29      2
19      2
38      2
31      2
15      2
16      2
26      1
39      1
10      1
49      1
21      1
28      1
33      1
43      1
Name: count, dtype: int64

In [61]:
def primer_digito(numero):
    digitos = [int(d) for d in str(numero)]
    return digitos[0]

In [62]:
df["environment_satisfaction"]=df["environment_satisfaction"].apply(primer_digito)

In [63]:
df["environment_satisfaction"].value_counts()

environment_satisfaction
4    486
3    477
1    332
2    319
Name: count, dtype: int64

In [64]:
for columna in df.columns[10:]:
    print(columna)
    display (df[columna].value_counts())
    print("------------------------------")


gender


gender
0    971
1    643
Name: count, dtype: int64

------------------------------
hourly_rate


hourly_rate
Not Available    84
42               33
66               32
48               30
57               29
84               29
54               28
46               28
98               28
87               27
72               27
96               27
92               27
32               26
56               26
79               26
86               25
81               25
52               25
61               25
62               24
83               24
51               24
82               24
43               23
74               23
88               23
45               23
41               23
77               23
73               23
94               23
60               22
78               22
95               22
99               21
76               21
80               21
59               21
67               21
91               20
97               20
75               20
90               20
85               20
44               20
49               20
64               19
55               19
65      

------------------------------
job_involvement


job_involvement
3    955
2    406
4    164
1     89
Name: count, dtype: int64

------------------------------
job_level


job_level
2    597
1    586
3    242
4    113
5     76
Name: count, dtype: int64

------------------------------
job_role


job_role
 mANager                       4
 ManageR                       3
 ManagEr                       3
 mAnaGeR                       3
 MANAgER                       3
                              ..
 ResEArch ScieNTiST            1
 HealthcARE RePreSENtAtiVe     1
 ReSearcH scIEntist            1
 LAbOrATOry techNicIan         1
 mAnUfactURInG DiRECTOr        1
Name: count, Length: 1579, dtype: int64

------------------------------
job_satisfaction


job_satisfaction
4    514
3    481
1    317
2    302
Name: count, dtype: int64

------------------------------
marital_status


marital_status
Married     404
Single      325
Divorced    188
Marreid      35
divorced     11
Name: count, dtype: int64

------------------------------
monthly_income


monthly_income
6347,0     4
5304,0     4
2657,0     3
2258,0     3
5405,0     2
          ..
3102,0     1
4556,0     1
4230,0     1
4859,0     1
19431,0    1
Name: count, Length: 668, dtype: int64

------------------------------
 monthly_rate


 monthly_rate
9150     4
11737    3
17001    3
6069     3
21981    3
        ..
25470    1
10205    1
9973     1
17089    1
16642    1
Name: count, Length: 1427, dtype: int64

------------------------------
num_companies_worked


num_companies_worked
1    573
0    226
3    169
4    157
2    156
7     84
6     73
5     66
9     59
8     51
Name: count, dtype: int64

------------------------------
over_18


over_18
Y    713
Name: count, dtype: int64

------------------------------
overtime


overtime
No     682
Yes    256
Name: count, dtype: int64

------------------------------
percent_salary_hike


percent_salary_hike
11    232
13    230
12    225
14    220
15    110
18     98
17     88
16     86
19     82
20     60
22     59
21     51
23     29
24     25
25     19
Name: count, dtype: int64

------------------------------
performance_rating


performance_rating
3,0    1205
4,0     214
Name: count, dtype: int64

------------------------------
relationship_satisfaction


relationship_satisfaction
3    504
4    468
2    339
1    303
Name: count, dtype: int64

------------------------------
standard_hours


standard_hours
80,0    419
Name: count, dtype: int64

------------------------------
stock_option_level


stock_option_level
0    687
1    666
2    172
3     89
Name: count, dtype: int64

------------------------------
total_working_years


total_working_years
10,0    144
8,0      86
6,0      84
9,0      69
5,0      66
7,0      56
4,0      54
1,0      53
12,0     34
3,0      32
14,0     30
13,0     30
11,0     29
15,0     28
16,0     28
20,0     28
18,0     27
21,0     23
17,0     22
2,0      21
22,0     18
19,0     17
24,0     14
23,0     13
28,0     13
26,0      8
0,0       8
29,0      6
36,0      6
25,0      6
33,0      6
37,0      5
27,0      5
31,0      4
30,0      3
32,0      3
35,0      3
40,0      3
34,0      2
38,0      1
Name: count, dtype: int64

------------------------------
training_times_last_year


training_times_last_year
2    598
3    534
4    137
5    136
1     77
6     72
0     60
Name: count, dtype: int64

------------------------------
work_life_balance


work_life_balance
3,0    913
2,0    359
4,0    155
1,0     79
Name: count, dtype: int64

------------------------------
years_at_company


years_at_company
5     208
1     171
3     141
2     141
10    133
7     115
4     114
8     106
9      94
6      78
0      44
11     36
20     29
13     26
15     21
14     19
22     17
12     15
18     15
16     14
21     14
19     12
17      9
24      7
25      5
33      5
26      4
32      3
27      3
31      3
36      3
29      2
23      2
40      2
34      1
37      1
30      1
Name: count, dtype: int64

------------------------------
years_in_current_role


years_in_current_role
2,0     11
7,0      5
0,0      4
4,0      3
1,0      3
11,0     2
6,0      2
3,0      2
13,0     1
12,0     1
Name: count, dtype: int64

------------------------------
years_since_last_promotion


years_since_last_promotion
0     625
1     384
2     177
7      93
4      67
3      62
5      53
6      37
11     26
8      20
9      18
15     15
12     11
13     10
14     10
10      6
Name: count, dtype: int64

------------------------------
years_with_curr_manager


years_with_curr_manager
2     380
0     270
7     267
3     148
8     115
4     104
1      84
9      70
5      36
10     31
6      30
11     22
12     20
13     16
17      8
15      5
14      5
16      3
Name: count, dtype: int64

------------------------------
same_as_monthly_income


same_as_monthly_income
6347,0     4
5304,0     4
2657,0     3
2258,0     3
5405,0     2
          ..
3102,0     1
4556,0     1
4230,0     1
4859,0     1
19431,0    1
Name: count, Length: 668, dtype: int64

------------------------------
date_birth


date_birth
1988    84
1992    83
1989    83
1994    78
1987    75
1991    65
1985    64
1993    64
1990    61
1983    60
1986    55
1996    54
1995    53
1981    50
1978    48
1982    46
1997    46
1984    45
1980    40
1977    38
1979    34
1973    31
1999    27
1976    27
1998    27
1968    25
1974    25
1969    20
1972    20
1970    20
1975    20
1971    19
1967    17
2001    16
2002    16
1965    15
2000    15
2003    11
1964    10
2004     9
2005     8
1963     5
1966     5
Name: count, dtype: int64

------------------------------
salary


salary
1000000000$    1614
Name: count, dtype: int64

------------------------------
role_departament


role_departament
 MaNAgeR  -  Sales                                        2
 ManaGER  -  Research & Development                       1
 ReseaRch scIENTisT  -  Research & Development            1
 ManufacTURInG DIRECtOR  -  Research & Development        1
 hEalthCaRe reprEseNTaTiVe  -  Research & Development     1
                                                         ..
 saLES eXEcUTiVE  -  Sales                                1
 mANUfacTURiNG dIRectOR  -  Research & Development        1
 huMAn ResOurces  -  Human Resources                      1
 HUMAN ResoURCeS  -  Human Resources                      1
 sAleS EXECUtIvE  -  Sales                                1
Name: count, Length: 301, dtype: int64

------------------------------
number_children


Series([], Name: count, dtype: int64)

------------------------------
remote_work


remote_work
1        360
True     345
0        309
False    305
Yes      295
Name: count, dtype: int64

------------------------------


In [65]:
mapa ={0:"male", 1:"female"}
df["gender"]=df["gender"].map(mapa)
df["gender"].unique()

array(['male', 'female'], dtype=object)

In [66]:
df["gender"].value_counts()

gender
male      971
female    643
Name: count, dtype: int64

In [67]:
df["hourly_rate"]= df["hourly_rate"].replace("Not Available", np.nan)

In [68]:
df["hourly_rate"].isnull().sum()

84

In [69]:
df["hourly_rate"]= df["hourly_rate"].apply(pd.to_numeric)

In [70]:
df["job_role"]=df["job_role"].str.lower()

In [71]:
df["job_role"].value_counts()

job_role
 sales executive               369
 research scientist            314
 laboratory technician         278
 manufacturing director        158
 healthcare representative     149
 manager                       111
 sales representative           90
 research director              88
 human resources                57
Name: count, dtype: int64

In [72]:
df["job_role"].str.replace(" ", "_")

0            _research_director_
1                      _manager_
2                      _manager_
3            _research_director_
4              _sales_executive_
                  ...           
1609           _sales_executive_
1610     _laboratory_technician_
1611           _sales_executive_
1612           _sales_executive_
1613    _manufacturing_director_
Name: job_role, Length: 1614, dtype: object

In [73]:
df["job_role"]=df["job_role"].str.strip().str.replace(" ", "_")

In [74]:
df["job_role"].unique()

array(['research_director', 'manager', 'sales_executive',
       'manufacturing_director', 'research_scientist',
       'healthcare_representative', 'laboratory_technician',
       'sales_representative', 'human_resources'], dtype=object)

In [75]:
df["marital_status"] =df["marital_status"].str.lower().str.replace("marreid", "married")

In [76]:
df["marital_status"].value_counts()

marital_status
married     439
single      325
divorced    199
Name: count, dtype: int64

In [77]:
df["monthly_income"].sample(10)

1001     2791,0
903     11935,0
1125        NaN
918         NaN
364         NaN
1586     8793,0
341     11159,0
474         NaN
1275     3161,0
1313        NaN
Name: monthly_income, dtype: object

In [78]:
df.drop("same_as_monthly_income", axis=1, inplace=True)

In [79]:
df["monthly_income"] = df["monthly_income"].str.replace(",",".")
df["monthly_income"]

0       19537.0
1       19999.0
2       19232.0
3       17169.0
4           NaN
         ...   
1609    10325.0
1610     4447.0
1611        NaN
1612        NaN
1613    10845.0
Name: monthly_income, Length: 1614, dtype: object

In [80]:
df["monthly_income"] = df["monthly_income"].apply(pd.to_numeric)

In [81]:
df.rename(columns={" monthly_rate": "monthly_rate"}, inplace=True)

In [82]:
df[["monthly_rate", "monthly_income"]].sample(10)

Unnamed: 0,monthly_rate,monthly_income
999,13401,
1559,9558,
185,4609,
379,17323,
837,6984,3929.0
174,22021,15427.0
1498,5033,3539.0
982,13119,4240.0
1478,10942,9613.0
648,20586,3989.0


In [83]:
df.drop(["over_18", "number_children"], axis=1, inplace=True)

In [84]:
df["overtime"]=df["overtime"].replace(np.nan, "desconocido")

In [85]:
df.columns

Index(['age', 'attrition', 'business_travel', 'daily_rate', 'department',
       'distance_from_home', 'education', 'education_field', 'employee_number',
       'environment_satisfaction', 'gender', 'hourly_rate', 'job_involvement',
       'job_level', 'job_role', 'job_satisfaction', 'marital_status',
       'monthly_income', 'monthly_rate', 'num_companies_worked', 'overtime',
       'percent_salary_hike', 'performance_rating',
       'relationship_satisfaction', 'standard_hours', 'stock_option_level',
       'total_working_years', 'training_times_last_year', 'work_life_balance',
       'years_at_company', 'years_in_current_role',
       'years_since_last_promotion', 'years_with_curr_manager', 'date_birth',
       'salary', 'role_departament', 'remote_work'],
      dtype='object')

In [86]:
df["performance_rating"]=df["performance_rating"].replace(np.nan, "3,0").str.replace(",", ".").apply(pd.to_numeric)

In [87]:
df.drop("standard_hours", axis=1, inplace=True)

In [88]:
df["stock_option_level"].describe()

count    1614.000000
mean        0.791202
std         0.842396
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: stock_option_level, dtype: float64

In [89]:
df["total_working_years"]=df["total_working_years"].str.replace(",", ".").apply(pd.to_numeric)

In [90]:
df["work_life_balance"]=df["work_life_balance"].replace(np.nan, "3,0").str.replace(",", ".").apply(pd.to_numeric)

In [91]:
df.drop("years_in_current_role", axis=1, inplace=True)

In [92]:
df["date_birth"]=pd.to_datetime(df["date_birth"], format='%Y')

In [93]:
df["date_birth"]=df["date_birth"].dt.year

In [94]:
df["date_birth"].dtypes

dtype('int32')

In [95]:
df.drop("salary", axis=1, inplace=True)

In [96]:
df["role_departament"]=df["role_departament"].str.lower()

In [97]:
df[["role_departament", "job_role", "department"]].sample(10)

Unnamed: 0,role_departament,job_role,department
892,,laboratory_technician,desconocido
1069,,healthcare_representative,desconocido
1330,,research_scientist,desconocido
372,,research_director,desconocido
923,,laboratory_technician,desconocido
482,sales executive - sales,sales_executive,sales
194,,sales_executive,desconocido
1087,,manufacturing_director,desconocido
989,,research_scientist,desconocido
754,,manufacturing_director,desconocido


In [98]:
df.drop("role_departament", axis=1, inplace=True)

In [99]:
df["remote_work"]=df["remote_work"].map({"1": "yes", "0":"no", "True":"yes", "False":"no", "Yes":"yes"})

In [100]:
df["remote_work"].unique()

array(['yes', 'no'], dtype=object)

In [101]:
df

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,overtime,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_since_last_promotion,years_with_curr_manager,date_birth,remote_work
0,51,No,desconocido,684.0,desconocido,6.0,bachillerato,desconocido,1620,1,male,51.0,3,5,research_director,3,,19537.0,6462,7,No,13,3.0,3,0,,5,3.0,20,15,15,1972,yes
1,52,No,desconocido,699.0,desconocido,1.0,universidad,life_sciences,2590,3,male,65.0,2,5,manager,3,,19999.0,5678,0,desconocido,14,3.0,1,1,34.0,5,3.0,33,11,9,1971,yes
2,42,No,travel_rarely,532.0,research_development,4.0,fp,technical_degree,3190,3,male,58.0,3,5,manager,4,married,19232.0,4933,1,No,11,3.0,4,0,22.0,3,3.0,22,11,15,1981,yes
3,47,No,travel_rarely,359.0,desconocido,2.0,universidad,medical,,1,female,82.0,3,4,research_director,3,married,17169.0,26703,3,desconocido,19,3.0,2,2,,2,3.0,20,5,6,1976,no
4,46,No,desconocido,1319.0,desconocido,3.0,bachillerato,technical_degree,,1,female,45.0,4,4,sales_executive,1,divorced,,7739,2,No,12,3.0,4,1,,5,3.0,19,2,8,1977,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,36,Yes,travel_rarely,530.0,desconocido,3.0,eso,life_sciences,9670,3,male,51.0,2,3,sales_executive,4,married,10325.0,5518,1,desconocido,11,3.0,1,1,,6,3.0,16,3,7,1987,no
1610,45,No,non_travel,805.0,desconocido,4.0,fp,desconocido,9720,3,male,57.0,3,2,laboratory_technician,2,,4447.0,23163,1,desconocido,12,3.0,2,0,,5,2.0,9,0,8,1978,yes
1611,39,No,travel_rarely,903.0,desconocido,,doctorado,desconocido,,1,male,41.0,4,3,sales_executive,3,single,,2560,0,No,18,3.0,4,0,9.0,3,3.0,8,0,7,1984,yes
1612,36,No,non_travel,1229.0,desconocido,8.0,universidad,technical_degree,9900,1,male,84.0,3,2,sales_executive,4,divorced,,25952,4,No,13,3.0,4,2,12.0,3,3.0,7,0,7,1987,yes


In [102]:
pd.set_option('display.max_rows', None)
dfduplis = df[df["employee_number"].duplicated(keep= False)].sort_values(by="employee_number")



In [103]:
df["employee_number"].isnull().sum()

431

In [104]:
df.loc[df["employee_number"].isnull(), "employee_number"] = df["employee_number"].isnull().cumsum()

In [105]:
df["employee_number"]

0        162,0
1        259,0
2        319,0
3            1
4            2
5       1900,0
6         81,0
7        387,0
8            3
9        999,0
10      1035,0
11           4
12           5
13        58,0
14           6
15       558,0
16       825,0
17           7
18           8
19           9
20      1712,0
21        73,0
22          10
23       820,0
24          11
25          12
26          13
27      1294,0
28          14
29      1774,0
30      1815,0
31      1993,0
32        32,0
33       100,0
34          15
35       244,0
36       327,0
37       373,0
38          16
39       729,0
40       830,0
41          17
42       981,0
43      1042,0
44      1080,0
45      1995,0
46       145,0
47       158,0
48       391,0
49       436,0
50          18
51      1527,0
52          19
53      1866,0
54      2026,0
55          20
56       107,0
57       170,0
58          21
59       215,0
60       304,0
61       529,0
62       691,0
63       692,0
64      1005,0
65      1024,0
66      10

In [106]:
df.drop_duplicates(subset="employee_number",keep='last', inplace=True)

In [107]:
df["employee_number"].duplicated().sum()

0

In [108]:
df.drop("employee_number", axis=1, inplace=True)

In [109]:
df.insert(0, 'employee_number', range(1, 1 + len(df)))

In [110]:
len(df["employee_number"].unique())

1510

In [111]:
df.shape

(1510, 33)

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1510 entries, 0 to 1613
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   employee_number             1510 non-null   int64  
 1   age                         1510 non-null   int64  
 2   attrition                   1510 non-null   object 
 3   business_travel             1510 non-null   object 
 4   daily_rate                  1394 non-null   float64
 5   department                  1510 non-null   object 
 6   distance_from_home          1332 non-null   float64
 7   education                   1510 non-null   object 
 8   education_field             1510 non-null   object 
 9   environment_satisfaction    1510 non-null   int64  
 10  gender                      1510 non-null   object 
 11  hourly_rate                 1430 non-null   float64
 12  job_involvement             1510 non-null   int64  
 13  job_level                   1510 non-n

In [113]:
df.isnull().sum()/df.shape[0]

employee_number               0.000000
age                           0.000000
attrition                     0.000000
business_travel               0.000000
daily_rate                    0.076821
department                    0.000000
distance_from_home            0.117881
education                     0.000000
education_field               0.000000
environment_satisfaction      0.000000
gender                        0.000000
hourly_rate                   0.052980
job_involvement               0.000000
job_level                     0.000000
job_role                      0.000000
job_satisfaction              0.000000
marital_status                0.402649
monthly_income                0.529139
monthly_rate                  0.000000
num_companies_worked          0.000000
overtime                      0.000000
percent_salary_hike           0.000000
performance_rating            0.000000
relationship_satisfaction     0.000000
stock_option_level            0.000000
total_working_years      

In [114]:
df["marital_status"]=df["marital_status"].replace(np.nan, "desconocido")

In [115]:
lista_nulos=["hourly_rate","daily_rate", "distance_from_home", "monthly_income", "total_working_years"]

In [116]:
dfcopia=df.copy()

In [117]:
imputer_knn = KNNImputer(n_neighbors = 5)

# ajustamos y transformamos los datos
imputer_knn_imputado = imputer_knn.fit_transform(dfcopia[lista_nulos])

# comprobamos que es lo que nos devuelve, que sigue siendo un array
imputer_knn_imputado

array([[5.1000e+01, 6.8400e+02, 6.0000e+00, 1.9537e+04, 6.6000e+00],
       [6.5000e+01, 6.9900e+02, 1.0000e+00, 1.9999e+04, 3.4000e+01],
       [5.8000e+01, 5.3200e+02, 4.0000e+00, 1.9232e+04, 2.2000e+01],
       ...,
       [4.1000e+01, 9.0300e+02, 1.0600e+01, 7.1594e+03, 9.0000e+00],
       [8.4000e+01, 1.2290e+03, 8.0000e+00, 5.3088e+03, 1.2000e+01],
       [7.5000e+01, 5.6600e+02, 7.0000e+00, 1.0845e+04, 9.0000e+00]])

In [118]:
dfcopia[["hourly_rate_knn","daily_rate_knn", "distance_from_home_knn", "monthly_income_knn", "total_working_years_knn"]]= imputer_knn_imputado

In [119]:
dfcopia.describe()[["hourly_rate","hourly_rate_knn","daily_rate","daily_rate_knn", "distance_from_home","distance_from_home_knn", "monthly_income", "monthly_income_knn", "total_working_years", "total_working_years_knn"]]

Unnamed: 0,hourly_rate,hourly_rate_knn,daily_rate,daily_rate_knn,distance_from_home,distance_from_home_knn,monthly_income,monthly_income_knn,total_working_years,total_working_years_knn
count,1430.0,1510.0,1394.0,1510.0,1332.0,1510.0,711.0,1510.0,1016.0,1510.0
mean,66.025175,65.979735,807.050215,807.261854,8.960961,9.057351,6527.918425,6201.509272,11.314961,10.848874
std,20.270996,19.814085,401.842182,390.086253,7.954886,7.565011,4810.541121,3627.729599,7.7941,6.619995
min,30.0,30.0,103.0,103.0,1.0,1.0,1009.0,1009.0,0.0,0.0
25%,48.0,49.0,472.5,499.25,2.0,2.0,2906.5,3902.5,6.0,7.0
50%,66.0,66.0,805.5,804.5,7.0,7.2,4850.0,5368.0,10.0,9.6
75%,84.0,83.0,1157.75,1140.65,13.0,13.0,8456.0,7260.2,15.0,13.2
max,100.0,100.0,1499.0,1499.0,29.0,29.0,19999.0,19999.0,40.0,40.0


In [120]:
dfcopia.drop(lista_nulos,axis=1,inplace= True)

In [121]:
nuevo_nm={"hourly_rate_knn":"hourly_rate","daily_rate_knn":"daily_rate", "distance_from_home_knn":"distance_from_home", "monthly_income_knn":"monthly_income", "total_working_years_knn":"total_working_years"}

In [122]:
dfcopia.rename(columns=nuevo_nm, inplace= True)

In [123]:
dfcopia.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1510 entries, 0 to 1613
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   employee_number             1510 non-null   int64  
 1   age                         1510 non-null   int64  
 2   attrition                   1510 non-null   object 
 3   business_travel             1510 non-null   object 
 4   department                  1510 non-null   object 
 5   education                   1510 non-null   object 
 6   education_field             1510 non-null   object 
 7   environment_satisfaction    1510 non-null   int64  
 8   gender                      1510 non-null   object 
 9   job_involvement             1510 non-null   int64  
 10  job_level                   1510 non-null   int64  
 11  job_role                    1510 non-null   object 
 12  job_satisfaction            1510 non-null   int64  
 13  marital_status              1510 non-n

In [124]:
cambionumeros= ["employee_number","hourly_rate","total_working_years","work_life_balance"]

In [125]:
pd.set_option('display.max_rows', None)
dfcopia["employee_number"]

0          1
1          2
2          3
3          4
4          5
5          6
6          7
7          8
8          9
9         10
10        11
11        12
12        13
13        14
14        15
15        16
16        17
17        18
18        19
19        20
20        21
21        22
22        23
23        24
24        25
25        26
26        27
27        28
28        29
29        30
30        31
31        32
32        33
33        34
34        35
35        36
36        37
37        38
38        39
39        40
40        41
41        42
42        43
43        44
44        45
45        46
46        47
47        48
48        49
49        50
50        51
51        52
52        53
53        54
54        55
55        56
56        57
57        58
58        59
59        60
60        61
61        62
62        63
63        64
64        65
65        66
66        67
67        68
68        69
69        70
70        71
71        72
72        73
73        74
74        75
75        76
76        77

In [126]:
df['employee_number']

0          1
1          2
2          3
3          4
4          5
5          6
6          7
7          8
8          9
9         10
10        11
11        12
12        13
13        14
14        15
15        16
16        17
17        18
18        19
19        20
20        21
21        22
22        23
23        24
24        25
25        26
26        27
27        28
28        29
29        30
30        31
31        32
32        33
33        34
34        35
35        36
36        37
37        38
38        39
39        40
40        41
41        42
42        43
43        44
44        45
45        46
46        47
47        48
48        49
49        50
50        51
51        52
52        53
53        54
54        55
55        56
56        57
57        58
58        59
59        60
60        61
61        62
62        63
63        64
64        65
65        66
66        67
67        68
68        69
69        70
70        71
71        72
72        73
73        74
74        75
75        76
76        77

In [127]:
def cambio_coma(cadena):
    try:
        if "," in cadena:
            return cadena.replace(",",".")
        else:
            return cadena
    except:
        return cadena    

In [128]:
dfcopia["employee_number"]=dfcopia["employee_number"].apply(pd.to_numeric)

In [129]:
def to_int(num):
    return int(num)

In [130]:
for col in cambionumeros:
    dfcopia[col]=dfcopia[col].apply(to_int).astype(int)

In [131]:
dfcopia[["employee_number","hourly_rate","total_working_years","work_life_balance"]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1510 entries, 0 to 1613
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   employee_number      1510 non-null   int64
 1   hourly_rate          1510 non-null   int64
 2   total_working_years  1510 non-null   int64
 3   work_life_balance    1510 non-null   int64
dtypes: int64(4)
memory usage: 59.0 KB


In [132]:
dfcopia.to_csv("../data/datos_empleados.csv")