In [60]:
# Imports 📥

# Packages
#-----------------------------------------------------------------------
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Visualization
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Data 
# -----------------------------------------------------------------------
abc_coorp_df = pd.read_csv("HR_RAW_DATA.csv", index_col=0)
abc_coorp_df.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
0,51,No,,"684,0$",,6,3,,1,1620,1,0,51,3,5,resEArch DIREcToR,3,,195370,6462,7,Y,No,13,30,3,,0,,5,30,20,,15,15,195370,1972,1000000000$,,,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,1,2590,3,0,65,2,5,ManAGeR,3,,199990,5678,0,,,14,30,1,,1,340.0,5,30,33,,11,9,199990,1971,1000000000$,,,1


In [50]:
abc_coorp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1614 non-null   object 
 1   Attrition                 1614 non-null   object 
 2   BusinessTravel            842 non-null    object 
 3   DailyRate                 1614 non-null   object 
 4   Department                302 non-null    object 
 5   DistanceFromHome          1614 non-null   int64  
 6   Education                 1614 non-null   int64  
 7   EducationField            869 non-null    object 
 8   employeecount             1614 non-null   int64  
 9   employeenumber            1183 non-null   object 
 10  EnvironmentSatisfaction   1614 non-null   int64  
 11  Gender                    1614 non-null   int64  
 12  HourlyRate                1614 non-null   object 
 13  JobInvolvement            1614 non-null   int64  
 14  JobLevel     

In [55]:
# DataTransformer class definition ✍️

class DataTransformer:
    def __init__(self, dataframe):
        self.df = dataframe

    def replace_gender_values(self):
        """Replaces the values in the Gender column with 'Male' and 'Female'."""
        self.df['Gender'] = self.df['Gender'].replace({0: 'Male', 1: 'Female'})

    def convert_object_to_float_eliminate_dolar(self, column_name):
        """
        Converts a column with object type values containing $ signs at the end
        and commas as decimal separators to numeric float values.
        """
        if column_name in self.df.columns:
            # Remove dollar signs from the end
            self.df[column_name] = self.df[column_name].str.replace('$', '', regex=False)
            
            # Replace commas with dots for decimal points
            self.df[column_name] = self.df[column_name].str.replace(',', '.', regex=False)
            
            # Convert the cleaned column to float type
            self.df[column_name] = self.df[column_name].astype(float)
        else:
            print(f"Column {column_name} doesn't exist in the DataFrame.")
        

    def convert_role_to_department_normalize_job_role(self):
        # First change de type of data to be capitalize and the same way
        self.df['JobRole'] = self.df['JobRole'].str.title()
        self.df['Department'] = self.df['Department'].str.title()
       # Clean empty spaces
        self.df['JobRole'] = self.df['JobRole'].str.strip()
        self.df['Department'] = self.df['Department'].str.strip()
        
        conversion_dictionary = {
        'Healthcare Representative': 'Research & Development',
        'Sales Executive': 'Sales',
        'Healthcare Representative': 'Research & Development',
        'Laboratory Technician': 'Research & Development',
        'Manufacturing Director': 'Research & Development',
        'Research Scientist': 'Research & Development',
        'Sales Executive': 'Sales',
        'Sales Representative':'Sales',
        'Research Director': 'Research & Development',
        'Human Resources': 'Human Resources',
         }
        # Iterate over the rows of the DataFrame
        for index, row in self.df.iterrows():
            job_role = row['JobRole']
            # Assign the corresponding value to Department using the dictionary
            if job_role in conversion_dictionary:
                self.df.at[index, 'Department'] = conversion_dictionary[job_role]
                print(f"Value '{job_role}' was transformed into '{conversion_dictionary[job_role]}'")

    def change_marital_status(self):
        
        self.df['MaritalStatus'] = self.df['MaritalStatus'].replace({
        "Marreid": "Married",
        "divorced": "Divorced"})
        self.df["MaritalStatus"] = self.df["MaritalStatus"].fillna("Unknown")
        
    def map_column_remote_work(self):
        dicc = {1: "True", 0: "False", "Yes": "True"}

        # Replace the values in the RemoteWork column according to the dictionary
        self.df["RemoteWork"] = self.df["RemoteWork"].replace(dicc)


    def change_null_for_unknown(self, column_list): # when doesnt exist a dominant category in categorical variable
        # Iterate through the list of columns to replace nulls with "Unknown"
        for column in column_list:
            if column in self.df.columns:
                # Replace nulls with the value "Unknown" for each column in the list
                self.df[column] = self.df[column].fillna("Unknown")
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return self.df
    
    def change_null_for_mode(self, column_list): # When we have a dominant category in categorical variables
        for column in column_list:
            if column in self.df.columns:
                # Calculate the mode of the column
                mode = self.df[column].mode()[0]
                # Replace nulls with the mode for each column in the list
                self.df[column] = self.df[column].fillna(mode)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return self.df
    
    def change_null_for_mean(self, column_list): # when we have a 0-10% of nulls in numerical category and distribution is normal
        # Iterate through the list of columns to replace nulls with mean
        for column in column_list:
            if column in self.df.columns:
                    mean= self.df[column].mean()
                # Replace nulls with the mode for each column in the list
                    self.df[column] = self.df[column].fillna(mean)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return self.df
    
    def change_null_for_median(self, column_list): # when we have a 0-10% of nulls in numerical category and distribution is atypical
        # Iterate through the list of columns to replace nulls with median
        for column in column_list:
            if column in self.df.columns:
                    median= self.df[column].median()
                # Replace nulls with the mode for each column in the list
                    self.df[column] = self.df[column].fillna(median)
            else:
                print(f"Warning: The column '{column}' does not exist in the DataFrame.")
        return self.df
    
    def impute_with_knn(self, column_list, n_neighbors=5): # when we have a numerical variable with more than 10% of nulls
        # Create an instance of KNNImputer
        imputer_knn = KNNImputer(n_neighbors=n_neighbors)

        # Fit and transform the data
        imputed_data = imputer_knn.fit_transform(self.df[column_list])

        # Convert the result to a DataFrame
        imputed_df = pd.DataFrame(imputed_data, columns=column_list)

        # Add the imputed columns to the original DataFrame
        for column in column_list:
            self.df[f"{column}_knn"] = imputed_df[column]

        return self.df

In [56]:
# Instantiating a class object 🐣

abc_data = DataTransformer(abc_coorp_df)

In [None]:
# ▶️ convert_object_to_float_eliminate_dolar

abc_data.convert_object_to_float_eliminate_dolar()
print(abc_data)

abc_data.df["DailyRate"].sample(50)
abc_data.df["DailyRate"].unique()
abc_data.df["DailyRate"].value_counts()
abc_coorp_df

In [None]:
# ▶️ Convert_role_to_department and normalize jobrole letters
abc_data.convert_role_to_department_normalize_job_role()
print(abc_data)

abc_data.df["JobRole"].sample(50)
abc_data.df["JobRole"].unique()
abc_data.df["Department"].value_counts()
abc_data.df[["JobRole","Department"]].sample(50)
abc_coorp_df

In [None]:
# ▶️ Transform "marital_status" data

abc_data.change_marital_status()
abc_data.df["MaritalStatus"].unique()
abc_data.df["MaritalStatus"].value_counts()
abc_data.df[["MaritalStatus"]].sample(50)
abc_coorp_df



In [6]:
# ▶️ Transform "remote work"
abc_data.map_column_remote_work()
abc_coorp_df["RemoteWork"].unique()

abc_coorp_df["RemoteWork"].value_counts()

RemoteWork
True     640
1        360
0        309
False    305
Name: count, dtype: int64

In [57]:
# ▶️ Change nulls for unknown category in categorial variable

# if we want to change more than one column , put a list
columns_modify = []
abc_data.change_null_for_unknown(["Over18"])
abc_coorp_df["Over18"].isnull().sum()

0

In [58]:
# ▶️ Change nulls for mode in categorical variable

# if we want to change more than one column , put a list
columns_modify = []
abc_data.change_null_for_mode(["OverTime"])
abc_coorp_df["OverTime"].isnull().sum()

0

In [59]:
# ▶️ Change nulls for mean in numerical variable when we have a 0-10% of nulls in numerical category and distribution is normal

# if we want to change more than one column , put a list
columns_modify = []
abc_data.change_null_for_mean(["DailyRate"])
abc_coorp_df["DailyRate"].isnull().sum()

TypeError: Could not convert string '684,0$699,0$532,0$359,0$1319,0$117,0$1435,0$635,0$1276,0$840,0$247,0$1369,0$201,0$1360,0$692,0$1398,0$286,0$1402,0$819,0$884,0$1238,0$515,0$1223,0$202,0$928,0$607,0$266,0$429,0$589,0$nan$nan$1180,0$1282,0$776,0$665,0$526,0$1034,0$1403,0$1499,0$580,0$859,0$263,0$1376,0$885,0$1003,0$1321,0$nan$394,0$1372,0$nan$1333,0$228,0$737,0$823,0$667,0$301,0$573,0$nan$1329,0$630,0$nan$1063,0$1017,0$1296,0$939,0$1355,0$1448,0$200,0$1202,0$404,0$208,0$813,0$465,0$1189,0$nan$1001,0$1394,0$161,0$288,0$682,0$1354,0$147,0$119,0$1413,0$452,0$334,0$1132,0$nan$982,0$480,0$1099,0$672,0$1379,0$583,0$1492,0$1050,0$117,0$469,0$nan$237,0$1440,0$1291,0$1157,0$1336,0$1224,0$735,0$nan$1389,0$638,0$1240,0$194,0$1339,0$111,0$nan$1469,0$470,0$1232,0$1249,0$117,0$798,0$672,0$549,0$570,0$1232,0$541,0$164,0$1117,0$619,0$319,0$956,0$1245,0$1397,0$527,0$213,0$nan$527,0$882,0$330,0$406,0$217,0$481,0$669,0$1465,0$685,0$1062,0$1099,0$177,0$1199,0$648,0$412,0$282,0$770,0$601,0$855,0$332,0$1326,0$748,0$405,0$790,0$1050,0$971,0$1370,0$750,0$880,0$678,0$1179,0$350,0$1266,0$574,0$189,0$nan$692,0$310,0$722,0$1219,0$994,0$721,0$853,0$427,0$852,0$669,0$1334,0$1093,0$nan$967,0$465,0$989,0$1195,0$1045,0$829,0$663,0$319,0$1300,0$1105,0$906,0$849,0$1275,0$1218,0$422,0$1398,0$1274,0$1076,0$1207,0$430,0$933,0$1184,0$nan$586,0$890,0$379,0$1425,0$1188,0$938,0$1053,0$722,0$248,0$796,0$174,0$408,0$nan$977,0$536,0$135,0$1476,0$587,0$495,0$523,0$507,0$1055,0$1445,0$218,0$447,0$829,0$1029,0$674,0$nan$538,0$740,0$1245,0$1234,0$170,0$1130,0$584,0$918,0$792,0$611,0$nan$1383,0$867,0$nan$574,0$706,0$728,0$311,0$556,0$937,0$1125,0$1224,0$119,0$444,0$911,0$nan$585,0$nan$1392,0$1005,0$nan$1299,0$nan$655,0$746,0$632,0$658,0$1467,0$1312,0$750,0$249,0$802,0$1355,0$689,0$1395,0$1436,0$1496,0$1480,0$nan$313,0$894,0$290,0$1084,0$1002,0$1242,0$1052,0$944,0$465,0$771,0$1416,0$1327,0$488,0$1385,0$666,0$1315,0$442,0$147,0$950,0$218,0$nan$691,0$1018,0$nan$559,0$318,0$1130,0$144,0$1319,0$933,0$1171,0$143,0$1178,0$1107,0$906,0$645,0$1485,0$1116,0$1276,0$163,0$561,0$426,0$509,0$258,0$525,0$nan$238,0$482,0$397,0$243,0$806,0$1442,0$408,0$929,0$827,0$nan$1018,0$703,0$294,0$314,0$654,0$427,0$501,0$nan$954,0$1434,0$657,0$240,0$791,0$570,0$676,0$nan$436,0$760,0$nan$661,0$1085,0$154,0$1283,0$616,0$498,0$530,0$805,0$903,0$1229,0$566,0$1441,0$153,0$1066,0$135,0$145,0$1109,0$124,0$300,0$444,0$1366,0$857,0$1204,0$827,0$1219,0$770,0$950,0$1082,0$203,0$1308,0$718,0$593,0$1384,0$819,0$1223,0$1361,0$1146,0$nan$853,0$nan$1316,0$363,0$1103,0$920,0$694,0$1429,0$531,0$621,0$806,0$604,0$325,0$1030,0$524,0$943,0$1009,0$607,0$817,0$nan$nan$930,0$1147,0$652,0$337,0$971,0$1136,0$663,0$326,0$377,0$1038,0$1490,0$1246,0$224,0$441,0$898,0$nan$607,0$1313,0$1015,0$nan$426,0$1387,0$1302,0$1329,0$602,0$772,0$809,0$nan$786,0$921,0$530,0$717,0$1370,0$1312,0$979,0$1283,0$953,0$244,0$196,0$303,0$1180,0$1092,0$261,0$589,0$1422,0$nan$1137,0$511,0$1396,0$1096,0$nan$1169,0$1239,0$157,0$954,0$754,0$1303,0$697,0$1395,0$501,0$116,0$557,0$582,0$704,0$nan$613,0$419,0$477,0$1211,0$857,0$505,0$269,0$553,0$1296,0$1277,0$302,0$427,0$975,0$621,0$1318,0$249,0$881,0$1137,0$466,0$945,0$509,0$1376,0$469,0$994,0$433,0$991,0$482,0$1083,0$136,0$333,0$685,0$1462,0$722,0$nan$1277,0$1131,0$300,0$1023,0$153,0$1065,0$1115,0$1195,0$1316,0$665,0$1479,0$304,0$1368,0$828,0$799,0$142,0$446,0$691,0$1246,0$254,0$1410,0$883,0$304,0$121,0$950,0$1343,0$827,0$1107,0$490,0$1400,0$1179,0$1117,0$1332,0$583,0$465,0$1413,0$1200,0$977,0$530,0$1093,0$1102,0$809,0$1328,0$548,0$1214,0$nan$120,0$240,0$441,0$841,0$1033,0$nan$1452,0$120,0$1268,0$nan$121,0$nan$813,0$307,0$228,0$797,0$555,0$1377,0$210,0$483,0$921,0$575,0$1311,0$575,0$759,0$891,0$1082,0$319,0$532,0$317,0$144,0$688,0$195,0$622,0$646,0$719,0$1012,0$930,0$1278,0$1222,0$1231,0$1383,0$462,0$231,0$nan$1089,0$1082,0$1240,0$1262,0$439,0$528,0$647,0$1097,0$942,0$147,0$966,0$168,0$nan$1089,0$299,0$999,0$1498,0$1439,0$1111,0$1485,0$258,0$1174,0$172,0$nan$401,0$866,0$1099,0$571,0$676,0$950,0$1342,0$995,0$118,0$990,0$241,0$nan$1470,0$492,0$1040,0$1320,0$146,0$163,0$1153,0$185,0$750,0$1297,0$337,0$217,0$1064,0$1351,0$930,0$468,0$nan$1218,0$1273,0$1141,0$1488,0$653,0$989,0$1225,0$530,0$1217,0$nan$142,0$1127,0$1189,0$1427,0$303,0$334,0$702,0$nan$926,0$878,0$334,0$896,0$1401,0$675,0$1070,0$496,0$1117,0$1476,0$390,0$1220,0$928,0$1124,0$746,0$1046,0$1448,0$1400,0$1303,0$587,0$635,0$267,0$443,0$703,0$1287,0$798,0$1420,0$562,0$650,0$141,0$715,0$nan$376,0$571,0$691,0$932,0$471,0$180,0$638,0$141,0$1265,0$1075,0$1086,0$152,0$974,0$853,0$217,0$715,0$1234,0$788,0$124,0$921,0$192,0$nan$477,0$852,0$1372,0$1329,0$150,0$nan$nan$1167,0$206,0$nan$736,0$322,0$683,0$1475,0$560,0$nan$nan$115,0$nan$507,0$575,0$nan$888,0$580,0$671,0$567,0$311,0$148,0$1395,0$1278,0$581,0$1082,0$371,0$855,0$548,0$429,0$1181,0$1253,0$616,0$370,0$1194,0$572,0$355,0$1480,0$783,0$1354,0$682,0$1103,0$136,0$155,0$771,0$508,0$557,0$642,0$1382,0$1037,0$878,0$1120,0$374,0$1194,0$287,0$nan$nan$591,0$670,0$1346,0$103,0$334,0$371,0$673,0$699,0$nan$705,0$1459,0$nan$890,0$1434,0$1443,0$1142,0$664,0$397,0$nan$1153,0$432,0$1353,0$489,0$807,0$1420,0$1280,0$957,0$809,0$542,0$216,0$1150,0$nan$364,0$201,0$1256,0$691,0$440,0$1157,0$713,0$nan$140,0$629,0$328,0$1084,0$nan$472,0$905,0$1136,0$1151,0$644,0$1005,0$992,0$1147,0$147,0$1323,0$818,0$515,0$1431,0$976,0$1327,0$832,0$1199,0$916,0$1247,0$685,0$128,0$1158,0$996,0$728,0$688,0$1449,0$636,0$444,0$889,0$691,0$106,0$nan$723,0$1157,0$1482,0$738,0$1192,0$1309,0$544,0$641,0$756,0$593,0$895,0$408,0$1283,0$1469,0$1261,0$329,0$1362,0$1371,0$201,0$821,0$1381,0$313,0$1473,0$329,0$688,0$192,0$1490,0$296,0$1349,0$986,0$408,0$1009,0$1125,0$413,0$988,0$1474,0$1368,0$232,0$1034,0$1474,0$nan$538,0$622,0$1236,0$1112,0$204,0$1343,0$1315,0$nan$991,0$913,0$1115,0$885,0$810,0$817,0$693,0$1179,0$316,0$381,0$217,0$933,0$775,0$804,0$1090,0$346,0$310,0$725,0$575,0$182,0$829,0$384,0$921,0$1111,0$1325,0$118,0$1258,0$890,0$1041,0$702,0$829,0$625,0$nan$661,0$nan$986,0$nan$583,0$1418,0$1269,0$395,0$341,0$nan$821,0$500,0$1454,0$617,0$995,0$1122,0$1198,0$188,0$1219,0$335,0$461,0$1134,0$138,0$1206,0$622,0$109,0$277,0$549,0$1055,0$802,0$265,0$1038,0$342,0$1186,0$430,0$769,0$1176,0$1277,0$1091,0$654,0$895,0$618,0$1017,0$970,0$984,0$793,0$1182,0$1003,0$603,0$874,0$367,0$199,0$nan$718,0$1457,0$805,0$1421,0$1450,0$116,0$1212,0$1010,0$nan$131,0$791,0$735,0$193,0$640,0$266,0$848,0$nan$1138,0$nan$256,0$nan$935,0$458,0$882,0$329,0$nan$793,0$662,0$693,0$nan$541,0$1200,0$499,0$1372,0$nan$nan$1462,0$200,0$949,0$nan$182,0$329,0$383,0$1255,0$nan$1398,0$523,0$1221,0$1107,0$981,0$1495,0$1467,0$1496,0$1193,0$1229,0$467,0$271,0$410,0$495,0$561,0$1142,0$1157,0$1242,0$1288,0$903,0$1108,0$1351,0$437,0$nan$977,0$1302,0$653,0$457,0$634,0$715,0$559,0$546,0$1176,0$711,0$448,0$365,0$763,0$486,0$591,0$1329,0$469,0$711,0$301,0$1141,0$894,0$392,0$nan$1225,0$727,0$1351,0$528,0$1441,0$427,0$390,0$585,0$741,0$552,0$506,0$1456,0$160,0$897,0$600,0$1003,0$1054,0$428,0$461,0$661,0$511,0$942,0$589,0$849,0$343,0$nan$1125,0$1217,0$723,0$1216,0$350,0$207,0$280,0$414,0$nan$836,0$592,0$219,0$967,0$nan$335,0$1079,0$735,0$471,0$nan$1251,0$1206,0$976,0$1168,0$1444,0$nan$571,0$977,0$1154,0$1490,0$581,0$267,0$234,0$501,0$688,0$1092,0$529,0$1322,0$1199,0$410,0$1167,0$884,0$155,0$924,0$852,0$1082,0$nan$1450,0$134,0$635,0$1017,0$1311,0$1302,0$1091,0$755,0$253,0$922,0$1365,0$306,0$970,0$970,0$nan$1169,0$1271,0$618,0$1469,0$625,0$1404,0$1391,0$216,0$1333,0$1464,0$1330,0$773,0$1485,0$902,0$645,0$654,0$107,0$504,0$326,0$634,0$1448,0$896,0$1358,0$nan$824,0$329,0$1313,0$594,0$734,0$234,0$766,0$920,0$431,0$1213,0$238,0$1146,0$1404,0$1313,0$1373,0$1324,0$1358,0$1123,0$391,0$125,0$895,0$813,0$nan$1240,0$1357,0$nan$1229,0$626,0$1097,0$836,0$1339,0$318,0$132,0$193,0$111,0$541,0$827,0$871,0$1040,0$nan$1031,0$922,0$528,0$nan$nan$1169,0$1145,0$602,0$303,0$556,0$1261,0$1180,0$896,0$142,0$854,0$1411,0$252,0$504,0$833,0$529,0$1210,0$1463,0$322,0$920,0$506,0$nan$566,0$812,0$1162,0$1001,0$1309,0$810,0$1062,0$530,0$1332,0$845,0$350,0$1144,0$156,0$202,0$464,0$1305,0$555,0$1300,0$1490,0$489,0$210,0$983,0$548,0$534,0$1306,0$1094,0$775,0$471,0$1495,0$823,0$448,0$558,0$959,0$782,0$362,0$1216,0$160,0$nan$1362,0$727,0$1225,0$168,0$1396,0$1225,0$268,0$167,0$1243,0$1092,0$805,0$720,0$nan$252,0$443,0$342,0$300,0$868,0$1252,0$374,0$781,0$177,0$1427,0$546,0$1272,0$nan$1184,0$867,0$658,0$419,0$129,0$1069,0$289,0$1479,0$906,0$955,0$287,0$1097,0$265,0$804,0$1141,0$nan$660,0$1144,0$415,0$1334,0$1323,0$309,0$1009,0$nan$697,0$130,0$188,0$247,0$167,0$982,0$862,0$1256,0$111,0$906,0$1184,0$984,0$458,0$1098,0$969,0$1329,0$715,0$1320,0$265,0$373,0$599,0$603,0$968,0$364,0$1291,0$1124,0$703,0$nan$202,0$1377,0$592,0$383,0$990,0$660,0$381,0$830,0$289,0$1423,0$516,0$1089,0$1210,0$598,0$992,0$104,0$479,0$474,0$884,0$264,0$1059,0$1349,0$563,0$544,0$991,0$1112,0$1206,0$1495,0$1259,0$nan$240,0$369,0$793,0$543,0$964,0$176,0$181,0$211,0$1079,0$590,0$305,0$833,0$807,0$478,0$337,0$1294,0$1239,0$1128,0$1431,0$359,0$430,0$1318,0$726,0$1142,0$352,0$nan$1172,0$945,0$391,0$482,0$170,0$746,0$1475,0$1443,0$867,0$605,0$419,0$1337,0$1404,0$1373,0$1276,0$309,0$543,0$105,0$638,0$1146,0$1440,0$1103,0$136,0$147,0$119,0$1413,0$452,0$334,0$1132,0$nan$982,0$480,0$1099,0$1038,0$342,0$1186,0$430,0$984,0$793,0$1182,0$1003,0$603,0$874,0$367,0$199,0$nan$718,0$1457,0$244,0$196,0$303,0$1180,0$1092,0$261,0$589,0$1422,0$nan$1137,0$511,0$1396,0$1096,0$nan$691,0$106,0$nan$723,0$1157,0$1482,0$738,0$1192,0$1309,0$544,0$641,0$756,0$593,0$895,0$408,0$1283,0$1469,0$1261,0$329,0$1362,0$1371,0$201,0$821,0$1381,0$313,0$1473,0$329,0$688,0$192,0$1385,0$666,0$1315,0$442,0$147,0$950,0$218,0$nan$691,0$1018,0$nan$559,0$318,0$1130,0$144,0$1319,0$933,0$1171,0$143,0$1178,0$1107,0$906,0$645,0$1485,0$1116,0$1276,0$163,0$561,0$426,0$509,0$258,0$525,0$nan$238,0$482,0$397,0$243,0$806,0$1442,0$408,0$929,0$827,0$nan$1018,0$703,0$294,0$314,0$654,0$427,0$501,0$nan$954,0$1434,0$657,0$240,0$791,0$570,0$676,0$nan$436,0$760,0$nan$661,0$1085,0$154,0$1283,0$616,0$498,0$530,0$805,0$903,0$1229,0$566,0$' to numeric

In [None]:
# ▶️ Change nulls for median in numerical variable when we have a 0-10% of nulls in numerical category and distribution is atypical

# if we want to change more than one column , put a list
columns_modify = []
abc_data.change_null_for_mean(["column"])
abc_coorp_df["column"].isnull().sum()

In [None]:
# ▶️ Change nulls for distribution KNN in numerical variable when we have more 10% of nulls in numerical category 

# if we want to change more than one column , put a list
columns_modify = []
abc_data.change_null_for_mean(["column"])
abc_coorp_df["column"].isnull().sum()

In [4]:
# ▶️ Eliminate duplicated "employee_number"

duplicate_employeenumbers = abc_coorp_df['employeenumber'].value_counts()

abc_coorp_df['employeenumber'].isnull().sum()

duplicate_employeenumbers

employeenumber
482,0     2
530,0     2
507,0     2
517,0     2
522,0     2
         ..
161,0     1
164,0     1
190,0     1
194,0     1
2040,0    1
Name: count, Length: 1079, dtype: int64

In [7]:
#employeenumber`: Variable categórica nominal, al tratarse de un identificador único para cada empleado (número). 
# Pero estos números están en formato de texto y posiblemente con un formato de decimal que en realidad no es necesario. 
# Hay 534 duplicados en esta columna. Los valores deben ser unicos.  Hablamos con César. debemos analizar los duplicados mejor 
# y tomar decisiones en base a las conclusiones. Por lo que vimos parece que la diferencia entre los duplicados está en Remote Work, 
# puede que sea un error en la carga de datos o que se haya hecho mas de una vez la encuesta. Explorar mas.

duplicados= abc_coorp_df.loc[abc_coorp_df['employeenumber'].duplicated(keep=False), :].sort_values(by='employeenumber')

    
# Identify duplicates
abc_coorp_df.loc[abc_coorp_df['employeenumber'].duplicated(keep=False), :].sort_values(by='employeenumber').sample(100)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
1555,29,No,travel_rarely,"144,0$",,10,1,,1,,4,1,39,2,2,SAles EXecuTiVe,2,Divorced,,11866,1,Y,Yes,14,30,1,,2,70,2,30,7,,1,7,,1994,1000000000$,,,Yes
526,45,No,,"1316,0$",,29,3,,1,,3,0,83,3,1,rESEArch SciEntIST,4,Single,34520,9752,5,,,13,30,2,,0,90,2,20,6,,0,3,34520,1978,1000000000$,,,1
22,36,No,,"1223,0$",,8,3,,1,,3,1,59,3,3,hEaltHCarE ReprESENTAtIvE,3,Divorced,,8202,1,Y,,13,,2,,3,170,2,30,17,,12,8,,1987,1000000000$,,,1
937,29,No,,"1090,0$",Sales,10,3,,1,,4,0,83,3,1,SaLes rePrESENTaTivE,2,,22970,17967,1,Y,No,14,30,4,800,2,,2,30,2,,2,2,22970,1994,1000000000$,SaLes rePrESENTaTivE - Sales,,0
781,28,No,non-travel,"1103,0$",,16,3,Medical,1,19470,3,0,49,3,1,reSeArch sCieNTiSt,3,Single,21440,2122,1,,No,14,30,3,,0,50,3,20,5,,1,4,21440,1995,1000000000$,,,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1580,39,No,,"408,0$",,2,4,Technical Degree,1,7210,4,1,80,2,2,HeaLTHcARE reprESenTAtIve,3,Single,,20978,1,,No,11,30,1,,0,200,4,30,20,70,11,10,,1984,1000000000$,,,0
878,37,No,,"1192,0$",,5,2,Medical,1,4600,4,0,61,3,2,ManUFaCTURing dIRectOR,4,,63470,23177,7,Y,No,16,30,3,,2,80,2,20,6,,0,4,63470,1986,1000000000$,,,Yes
329,31,No,travel_rarely,"525,0$",,6,4,Medical,1,6530,1,0,66,4,2,SALes EXEcUTiVe,4,Divorced,,6219,4,Y,No,22,40,4,,2,130,4,40,7,,5,7,,1992,1000000000$,,,1
1495,35,No,travel_frequently,nan$,,4,4,Other,1,11850,4,0,47,2,1,laboRatORY teCHnICIaN,4,Married,23760,26537,1,,,13,30,2,,1,20,2,40,2,,2,2,23760,1988,1000000000$,,,True


In [6]:
# Al verificar los nulos, comprobamos que el número ímpar se da por esa razón y subtraímos las filas nulas de las filas totales para saber cuantos duplicados hay.
# el resultado de la suma tiene que ser un número par.
duplicados.shape[0] - duplicados['employeenumber'].isnull().sum()

208

In [8]:
# Concluímos que tenemos 208 empleados con el ID de empleado duplicado. 
# Ahora vamos a remover estos nulos de nuestro df de duplicados para poder tratar los duplicados. 
duplicados.dropna(subset='employeenumber', inplace=True)

In [11]:
# Ahora borramos todos los duplicados que tengan el menor indice, ya que suponemos que los de indice mayor son los registros mas actualizados. 
duplicados.index.name = 'index'  # Agregar nombre al índice
duplicados.shape

(208, 41)

In [10]:
# Ordenar el DataFrame por 'employeenumber' y por el índice en orden descendente
df_sorted = duplicados.sort_values(by=['employeenumber', duplicados.index.name], ascending=[True, False])
df_sorted

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
1483,33,No,non-travel,"1038,0$",Sales,8,1,Life Sciences,1,10440,2,1,88,2,1,sales repResENTaTiVE,4,,,21437,0,Y,,19,30,4,,0,,2,20,2,,2,2,,1990,1000000000$,sales repResENTaTiVE - Sales,,0
987,33,No,non-travel,"1038,0$",Sales,8,1,Life Sciences,1,10440,2,1,Not Available,2,1,sALES ReprEsentatiVE,4,,,21437,0,Y,,19,30,4,,0,,2,20,2,,2,2,,1990,1000000000$,sALES ReprEsentatiVE - Sales,,False
1484,26,Yes,,"342,0$",,2,3,Life Sciences,1,10530,1,0,57,3,1,reSEArcH SCiEnTIst,1,Married,20420,15346,6,Y,,14,30,2,,1,60,2,30,3,,1,2,20420,1997,1000000000$,,,Yes
988,26,Yes,,"342,0$",,2,3,Life Sciences,1,10530,1,0,57,3,1,rEsEaRCH SCIenTiSt,1,Married,20420,15346,6,Y,,14,30,2,,1,60,2,30,3,,1,2,20420,1997,1000000000$,,,1
1486,46,No,,"430,0$",,1,4,Medical,1,10690,4,0,40,3,5,rESEArCh DIrecTOR,4,,,21445,9,,No,17,30,4,800,2,230,0,30,2,,2,2,,1977,1000000000$,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,36,Yes,travel_rarely,"530,0$",,3,1,Life Sciences,1,9670,3,0,51,2,3,SALes EXECuTiVe,4,Married,103250,5518,1,Y,,11,,1,,1,,6,30,16,,3,7,103250,1987,1000000000$,,,0
1610,45,No,non-travel,"805,0$",,4,2,,1,9720,3,0,57,3,2,LAboRaTOry tECHNiCIAn,2,,44470,23163,1,,,12,30,2,,0,,5,20,9,,0,8,44470,1978,1000000000$,,,1
367,45,No,non-travel,"805,0$",,-47,2,,1,9720,47,0,57,3,2,lABORaTory tecHNiCiAN,2,,44470,23163,1,,,12,30,2,,0,,5,20,9,,0,8,44470,1978,1000000000$,,,1
1612,36,No,non-travel,"1229,0$",,8,4,Technical Degree,1,9900,1,0,84,3,2,SaLes ExecUtIVe,4,Divorced,,25952,4,,No,13,,4,,2,120,3,30,7,,0,7,,1987,1000000000$,,,True


In [11]:
# Eliminar duplicados conservando el primer registro encontrado (índice más alto)
df_unique = df_sorted.drop_duplicates(subset='employeenumber', keep='first')
df_unique # nos hemos quedado con 104 filas

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
1483,33,No,non-travel,"1038,0$",Sales,8,1,Life Sciences,1,10440,2,1,88,2,1,sales repResENTaTiVE,4,,,21437,0,Y,,19,30,4,,0,,2,20,2,,2,2,,1990,1000000000$,sales repResENTaTiVE - Sales,,0
1484,26,Yes,,"342,0$",,2,3,Life Sciences,1,10530,1,0,57,3,1,reSEArcH SCiEnTIst,1,Married,20420,15346,6,Y,,14,30,2,,1,60,2,30,3,,1,2,20420,1997,1000000000$,,,Yes
1486,46,No,,"430,0$",,1,4,Medical,1,10690,4,0,40,3,5,rESEArCh DIrecTOR,4,,,21445,9,,No,17,30,4,800,2,230,0,30,2,,2,2,,1977,1000000000$,,,1
1487,21,No,,"984,0$",,-25,1,,1,11310,4,1,70,2,1,REsEaRch scIenTIsT,2,Single,20700,25326,1,Y,Yes,11,30,3,800,0,20,6,40,2,,2,2,20700,2002,1000000000$,,,1
1488,31,No,travel_frequently,"793,0$",,20,3,,1,11350,3,0,67,4,1,saLES RePRESenTAtIve,4,Married,27910,21981,0,,No,12,,1,,1,30,4,30,2,,2,2,27910,1992,1000000000$,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,26,No,travel_frequently,"1283,0$",Sales,1,3,Medical,1,9560,3,0,52,2,2,sAleS EXECUtIvE,1,,42940,11148,1,Y,,12,30,2,,0,,2,30,7,,0,7,42940,1997,1000000000$,sAleS EXECUtIvE - Sales,,1
1608,32,No,,"498,0$",,3,4,Medical,1,9660,3,1,93,3,2,MANUfaCtuRiNG DiReCtor,1,Married,67250,13554,1,,,12,30,3,,1,80,2,40,8,,6,3,67250,1991,1000000000$,,,1
1609,36,Yes,travel_rarely,"530,0$",,3,1,Life Sciences,1,9670,3,0,51,2,3,saLEs ExeCUTiVe,4,Married,103250,5518,1,Y,,11,,1,,1,,6,30,16,,3,7,103250,1987,1000000000$,,,0
1610,45,No,non-travel,"805,0$",,4,2,,1,9720,3,0,57,3,2,LAboRaTOry tECHNiCIAn,2,,44470,23163,1,,,12,30,2,,0,,5,20,9,,0,8,44470,1978,1000000000$,,,1


In [12]:
# Restaurar el orden original 
df_unique = df_unique.sort_index()
# Remover el nombre del indice
df_unique.index.name = None 

In [13]:
df_unique['employeenumber'].duplicated().sum()

0

In [1]:
df_unique # Este es el df resultante que tiene las filas que queremos mantener en el df completo. 

NameError: name 'df_unique' is not defined

## Dirty code below ⬇️⬇️

In [None]:
def change_commas(string):
    """
    Replaces commas with periods in a given string that represents a decimal number
    in international format (with commas as thousand separators and period as decimal separator).

    Note:
        If an error occurs during the replacement process (e.g., if the argument is not a string),
        the function will return np.nan (Not a Number) to indicate an invalid or unavailable value.
    """

    try:
        # Replace commas with periods in the string
        return float(string.replace(",", "."))
    
    except:
        # If an error occurs (e.g., if the argument is not a string),
        # return np.nan (Not a Number) to indicate an invalid or unavailable value.
        return np.nan


In [None]:
def categorize_age(number):
    """
    Categorizes age into specific groups.

    This function takes an input number representing age and returns a specific category
    based on the age range.
    """
    if number >= 17 and number <= 25:
        return "Young Adults"
    
    elif number >= 26 and number <= 39:
        return "Young Adults"

    elif number >= 40 and number <= 59:
        return "Middle-aged"
    
    else:
        return "Older Adults"

# We have already created the function and verified that it works.
# The next step is to apply it to our entire DataFrame using the `apply()` method.
# This will return a Series, but we haven't stored this result in a variable.
# So, the next thing we'll do is create a new column in the DataFrame with the result of this apply.
df["age_category"] = df["age"].apply(categorize_age)
