In [None]:
# Psssss READ ME! 👋👓
# The objective of the following code it to clean and transform ABC Coorporation data to make it available for a subsequent analysis. 
# The transformation methods of the data_transformer were developed based on the conclusions from the preliminary Exploratory Data Analysis of the raw data provided by the HR department of BC Coorporation.
# The highlevel code structure is as follows: 
    # 1) Import of packages and data
    # 2) Definition of the DataTransformer class, this is the core of the transformation
    # 3) Execution of the transformation with follows this structure:
        # 3.1) Creation of an object from DataTransformer class
        # 3.2) Rename all columns
        # 3.3) Drop unnecessary columns
        # 3.4) Transform data per column
# Testing code: 
    # For each step of the process the program prints the result so that it can be reviewed and confirm that is working correctly



In [70]:
# Imports 📥

# Packages
#-----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # to see all columns in DataFrames


# Data 
# -----------------------------------------------------------------------
df = pd.read_csv("HR RAW DATA.csv", index_col=0)

In [None]:
# print for testing

df.head()

In [64]:
# DataTransformer class definition ✍️

class DataTransformer:
    def __init__(self, dataframe):
        self.df = dataframe

    def rename_columns (self): 
        """Transform columns' names to snake format. E.g. 'EducationField' to 'education_field' """
        columns_names = {
            'Age': 'age',
            'Attrition': 'attrition',
            'BusinessTravel': 'business_travel',
            'DailyRate': 'daily_rate',
            'Department': 'department',
            'DistanceFromHome': 'distance_from_home',
            'Education': 'education',
            'EducationField': 'education_field',
            'employeecount': 'employee_count',
            'employeenumber': 'employee_number',
            'EnvironmentSatisfaction': 'environment_satisfaction',
            'Gender': 'gender',
            'HourlyRate': 'hourly_rate',
            'JobInvolvement': 'job_involvement',
            'JobLevel': 'job_level',
            'JobRole': 'job_role',
            'JobSatisfaction': 'job_satisfaction',
            'MaritalStatus': 'marital_status',
            'MonthlyIncome': 'monthly_income',
            'MonthlyRate': 'monthly_rate',
            'NUMCOMPANIESWORKED': 'num_companies_worked',
            'Over18': 'over_18',
            'OverTime': 'over_time',
            'PercentSalaryHike': 'percent_salary_hike',
            'PerformanceRating': 'performance_rating',
            'RelationshipSatisfaction': 'relationship_satisfaction',
            'StandardHours': 'standard_hours',
            'StockOptionLevel': 'stock_option_level',
            'TOTALWORKINGYEARS': 'total_working_years',
            'TrainingTimesLastYear': 'training_times_last_year',
            'WORKLIFEBALANCE': 'work_life_balance',
            'YearsAtCompany': 'years_at_company',
            'YearsInCurrentRole': 'years_in_current_role',
            'YearsSinceLastPromotion': 'years_since_last_promotion',
            'YEARSWITHCURRMANAGER': 'years_with_curr_manager',
            'SameAsMonthlyIncome': 'same_as_monthly_income',
            'DateBirth': 'date_birth',
            'Salary': 'salary',
            'RoleDepartament': 'role_departament',
            'NUMBERCHILDREN': 'number_children',
            'RemoteWork': 'remote_work'
        }
        self.df = self.df.rename(columns=columns_names)

    def replace_gender_values(self):
        """Replaces the values in the Gender column with 'Male' and 'Female'."""
        self.df['gender'] = self.df['gender'].replace({0: 'Male', 1: 'Female'})
    
    def convert_age_to_numbers(self):
        """Converts numbers written in letters (e.g: 'thirty-two') into numbers written in numbers ('32'). It does NOT change the data type!"""
        # create a dictionary with numbers written in letters as keys and numbers in numbers as values
        conversion_dictionary = {
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9,
        "ten": 10,
        "eleven": 11,
        "twelve": 12,
        "thirteen": 13,
        "fourteen": 14,
        "fifteen": 15,
        "sixteen": 16,
        "seventeen": 17,
        "eighteen": 18,
        "nineteen": 19,
        "twenty": 20,
        "twenty-one": 21,
        "twenty-two": 22,
        "twenty-three": 23,
        "twenty-four": 24,
        "twenty-five": 25,
        "twenty-six": 26,
        "twenty-seven": 27,
        "twenty-eight": 28,
        "twenty-nine": 29,
        "thirty": 30,
        "thirty-one": 31,
        "thirty-two": 32,
        "thirty-three": 33,
        "thirty-four": 34,
        "thirty-five": 35,
        "thirty-six": 36,
        "thirty-seven": 37,
        "thirty-eight": 38,
        "thirty-nine": 39,
        "forty": 40,
        "forty-one": 41,
        "forty-two": 42,
        "forty-three": 43,
        "forty-four": 44,
        "forty-five": 45,
        "forty-six": 46,
        "forty-seven": 47,
        "forty-eight": 48,
        "forty-nine": 49,
        "fifty": 50,
        "fifty-one": 51,
        "fifty-two": 52,
        "fifty-three": 53,
        "fifty-four": 54,
        "fifty-five": 55,
        "fifty-six": 56,
        "fifty-seven": 57,
        "fifty-eight": 58,
        "fifty-nine": 59,
        "sixty": 60,
        "sixty-one": 61,
        "sixty-two": 62,
        "sixty-three": 63,
        "sixty-four": 64,
        "sixty-five": 65,
        "sixty-six": 66,
        "sixty-seven": 67,
        "sixty-eight": 68,
        "sixty-nine": 69,
        "seventy": 70,
        "seventy-one": 71,
        "seventy-two": 72,
        "seventy-three": 73,
        "seventy-four": 74,
        "seventy-five": 75,
        "seventy-six": 76,
        "seventy-seven": 77,
        "seventy-eight": 78,
        "seventy-nine": 79,
        "eighty": 80,
        "eighty-one": 81,
        "eighty-two": 82,
        "eighty-three": 83,
        "eighty-four": 84,
        "eighty-five": 85,
        "eighty-six": 86,
        "eighty-seven": 87,
        "eighty-eight": 88,
        "eighty-nine": 89,
        "ninety": 90,
        "ninety-one": 91,
        "ninety-two": 92,
        "ninety-three": 93,
        "ninety-four": 94,
        "ninety-five": 95,
        "ninety-six": 96,
        "ninety-seven": 97,
        "ninety-eight": 98,
        "ninety-nine": 99,
        "one hundred": 100
        }
        for index, value in enumerate(self.df["age"].values): 
            if value in conversion_dictionary:
                self.df["age"][index] = conversion_dictionary[value]
                print (f"Value '{value}' was transformed into '{conversion_dictionary[value]}'")
                
    def convert_to_numeric(self, column_name, downcast):
        """Converts a column to numeric type. Downcast = 'integer' or 'float'"""
        self.df[column_name] = pd.to_numeric(self.df[column_name], errors='raise', downcast=downcast)

    def fix_negative_distances(self):
        """Corrects negative values in the DistanceFromHome column."""
        self.df['distance_from_home'] = self.df['distance_from_home'].abs()

    def drop_redundant_columns(self, columns):
            """Drops redundant columns like 'same_as_monthly_income'. Param columns is a list of columns to de dropped"""
            self.df.drop(columns=columns, inplace=True, errors='ignore')

    def correct_env_satisfaction_values(self):
        """Transform values higher than 4 into NaN"""
        self.df['environment_satisfaction'] = self.df['environment_satisfaction'].apply(lambda x: x if x <= 4 else np.nan)

    def correct_hourly_rate(self):
        """Transform all values from the column into into numeric and 'Not Available' into NaN"""
        
        def transform_hourly_rate_individual (value):
            """Transform a value into numeric and 'Not Available' into NaN"""
            if value == "Not Available":
                value = np.nan
            return float(value)

        self.df['hourly_rate'] = self.df['hourly_rate'].apply(transform_hourly_rate_individual)

    def transform_to_float(self,column_name):
        """Transform strings with format '3579,0' in float"""

        self.df[column_name] = self.df[column_name].str.replace(',','.').astype(float)

    def quick_check(self,column_name):
        """ This function is for testiing purposes, to quicky check data type and unique values of a column"""
        print (f"Column name: {column_name}")
        print (f"Data type: {self.df[column_name].dtype}")
        print (f"Unique values: {self.df[column_name].unique()}")
        print (f"Not null count: {self.df[column_name].notnull().sum()}")
        print (f"Null count: {self.df[column_name].isnull().sum()}")
        print (f"Duplicated values: {self.df[column_name].duplicated().sum()}")

    def correct_typographical_errors(self):
        """Corrects typographical errors in the marital_status column."""
        self.df['marital_status'] = self.df['marital_status'].replace({'Marreid': 'Married','divorced':'Divorced'})

    def get_dataframe(self):
        """Returns the transformed DataFrame."""
        return self.df

    def remove_duplicates(self): # In progress (NOT FINISHED)
        self.df.drop_duplicates(subset=['employee_number'], keep='last', inplace=True)

In [71]:
# ▶️ Instantiating a class object 🐣

abc_data = DataTransformer(df)

# print for testing
abc_data.df.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
0,51,No,,"684,0$",,6,3,,1,1620,1,0,51,3,5,resEArch DIREcToR,3,,195370,6462,7,Y,No,13,30,3,,0,,5,30,20,,15,15,195370,1972,1000000000$,,,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,1,2590,3,0,65,2,5,ManAGeR,3,,199990,5678,0,,,14,30,1,,1,340.0,5,30,33,,11,9,199990,1971,1000000000$,,,1


In [81]:
# ▶️ Rename columns


abc_data.rename_columns()

# print for checking

abc_data.df.head(1)

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,same_as_monthly_income,date_birth,salary,role_departament,number_children,remote_work
0,51,No,,"684,0$",,6,3,,1,1620,1,0,51,3,5,resEArch DIREcToR,3,,195370,6462,7,Y,No,13,30,3,,0,,5,30,20,,15,15,195370,1972,1000000000$,,,Yes


In [None]:
# ▶️ Drop un-needed coulumns

columns_to_delete = ["employee_count", "same_as_monthly_income", "salary", "number_children", "standard_hours", "role_departament"]

abc_data.drop_redundant_columns(columns_to_delete)

# print for testing

for i in range(len(columns_to_delete)):
    try:
        print(abc_data.df[i])
    except: 
        print(f"Column '{columns_to_delete[i]}' deleted")



In [None]:
# ▶️ Transform "age" column data

column_name = "age"
# convert number written in letters into into numbers written in numbers

abc_data.convert_age_to_numbers()

# convert column to "int"

abc_data.convert_to_numeric (column_name,"integer")

# print for testing

abc_data.quick_check(column_name)

In [None]:
# ▶️ Transform "distance_from_home" column data

# Corrects negative values in the distance_from_home column

abc_data.fix_negative_distances()

# print for testing
print("Printing 'distance_from_home' colum unique values for testing:")
print(abc_data.df["distance_from_home"].unique())


In [None]:
# ▶️ Transform "environment_satisfaction" column data

abc_data.correct_env_satisfaction_values()

# print for testing

abc_data.quick_check("environment_satisfaction")

acceptance_criteria = "Data type is float and there isn't any value different than 1,2,3 or 4"
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "gender" column data

# Replace the values in the Gender column with 'Male' and 'Female'

abc_data.replace_gender_values()

# print for checking

abc_data.quick_check("gender")

acceptance_criteria = "Unique values are male or female"
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "hourly_rate" column data

abc_data.correct_hourly_rate()


# print for testing
abc_data.quick_check("hourly_rate")

acceptance_criteria = ""
print (f"\nAcceptance criteria: {acceptance_criteria}")



In [None]:
# ▶️ Transform "monthly_income" column data

abc_data.transform_to_float("monthly_income")

# print for testing

abc_data.quick_check("monthly_income")

acceptance_criteria = ""
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "performance_rating" column data

abc_data.transform_to_float("performance_rating")


# print for testing

abc_data.quick_check("performance_rating")

acceptance_criteria = ""
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "total_working_years" column data

abc_data.transform_to_float("total_working_years")

abc_data.quick_check("total_working_years")

acceptance_criteria = "Cambiar tipo a `int64`."
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "work_life_balance" column data

abc_data.transform_to_float("work_life_balance")

# print for testing
abc_data.quick_check("work_life_balance")

acceptance_criteria = "Cambiar tipo a `int64`."
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "marital_status" data

abc_data.correct_typographical_errors()

# print for testing

abc_data.quick_check("marital_status")
acceptance_criteria = "Corregir errores tipográficos."
print (f"\nAcceptance criteria: {acceptance_criteria}")

# CODE IN PROGRESS 

In [73]:
abc_data.quick_check("employee_number")

Column name: employee_number
Data type: object
Unique values: ['162,0' '259,0' '319,0' ... '2012,0' '2023,0' '2040,0']
Not null count: 1183
Null count: 431
Duplicated values: 534


In [68]:
# ▶️ Remove duplicates

abc_data.remove_duplicates()

In [69]:
abc_data.quick_check("employee_number")

Column name: employee_number
Data type: object
Unique values: ['162,0' '259,0' '319,0' ... '972,0' '990,0' nan]
Not null count: 1079
Null count: 1
Duplicated values: 0


In [55]:
duplicates_non_null = abc_data.df[
    abc_data.df["employee_number"].duplicated(keep=False) & abc_data.df["employee_number"].notna()
]
duplicates_non_null_sorted = duplicates_non_null.sort_values(by='employee_number')
print("Registros duplicados con valores no nulos en 'employee_number':")

duplicates_non_null_sorted.head(10)

Registros duplicados con valores no nulos en 'employee_number':


Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,date_birth,remote_work
987,33,No,non-travel,"1038,0$",Sales,8,1,Life Sciences,10440,2.0,Female,,2,1,sALES ReprEsentatiVE,4,,,21437,0,Y,,19,3.0,4,0,,2,2.0,2,,2,2,1990,False
1483,33,No,non-travel,"1038,0$",Sales,8,1,Life Sciences,10440,2.0,Female,88.0,2,1,sales repResENTaTiVE,4,,,21437,0,Y,,19,3.0,4,0,,2,2.0,2,,2,2,1990,0
1484,26,Yes,,"342,0$",,2,3,Life Sciences,10530,1.0,Male,57.0,3,1,reSEArcH SCiEnTIst,1,Married,2042.0,15346,6,Y,,14,3.0,2,1,6.0,2,3.0,3,,1,2,1997,Yes
988,26,Yes,,"342,0$",,2,3,Life Sciences,10530,1.0,Male,57.0,3,1,rEsEaRCH SCIenTiSt,1,Married,2042.0,15346,6,Y,,14,3.0,2,1,6.0,2,3.0,3,,1,2,1997,1
990,46,No,,"430,0$",,47,4,Medical,10690,,Male,40.0,3,5,RESeArCH dIrEcTOR,4,,,21445,9,,No,17,3.0,4,2,23.0,0,3.0,2,,2,2,1977,True
1486,46,No,,"430,0$",,1,4,Medical,10690,4.0,Male,40.0,3,5,rESEArCh DIrecTOR,4,,,21445,9,,No,17,3.0,4,2,23.0,0,3.0,2,,2,2,1977,1
1000,21,No,,"984,0$",,1,1,,11310,4.0,Female,70.0,2,1,ResEarCH scIeNtiST,2,Single,2070.0,25326,1,Y,Yes,11,3.0,3,0,2.0,6,4.0,2,,2,2,2002,True
1487,21,No,,"984,0$",,25,1,,11310,4.0,Female,70.0,2,1,REsEaRch scIenTIsT,2,Single,2070.0,25326,1,Y,Yes,11,3.0,3,0,2.0,6,4.0,2,,2,2,2002,1
1001,31,No,travel_frequently,"793,0$",,20,3,,11350,3.0,Male,67.0,4,1,SALEs REPREsEntatIve,4,Married,2791.0,21981,0,,No,12,,1,1,3.0,4,3.0,2,,2,2,1992,1
1488,31,No,travel_frequently,"793,0$",,20,3,,11350,3.0,Male,67.0,4,1,saLES RePRESenTAtIve,4,Married,2791.0,21981,0,,No,12,,1,1,3.0,4,3.0,2,,2,2,1992,False


In [76]:
duplicates_null = abc_data.df[
    abc_data.df["employee_number"].duplicated(keep=False) & abc_data.df["employee_number"].isna()
]
duplicates_null_sorted = duplicates_null.sort_values(by='employee_number')
print("Registros duplicados con valores no nulos en 'employee_number':")

duplicates_null_sorted.head(10)

Registros duplicados con valores no nulos en 'employee_number':


Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_count,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_18,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,standard_hours,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_since_last_promotion,years_with_curr_manager,same_as_monthly_income,date_birth,salary,role_departament,number_children,remote_work
3,47,No,travel_rarely,"359,0$",,2,4,Medical,1,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,Y,,19,30.0,2,,2,,2,,20,,5,6,171690.0,1976,1000000000$,,,False
4,46,No,,"1319,0$",,3,3,Technical Degree,1,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30.0,4,,1,,5,30.0,19,,2,8,,1977,1000000000$,,,0
8,41,No,,"1276,0$",,2,5,,1,,2,1,91,3,4,mANAGEr,1,Married,165950.0,5626,7,,No,16,30.0,2,,1,220.0,2,30.0,18,,11,8,165950.0,1982,1000000000$,,,True
11,56,No,travel_rarely,"1369,0$",,23,3,Life Sciences,1,,4,0,68,3,4,MANUfACturing DIReCtOr,2,Married,134020.0,18235,4,,Yes,12,,1,,1,330.0,0,30.0,19,,15,9,134020.0,1967,1000000000$,,,False
12,38,No,,"201,0$",,10,3,Medical,1,,2,1,99,1,3,REseaRCh dirEcToR,3,Married,132060.0,3376,3,Y,No,12,30.0,1,,1,,3,30.0,18,,1,11,132060.0,1985,1000000000$,,,1
14,55,No,,"692,0$",,14,4,Medical,1,,3,0,61,4,5,RESeaRCH dirEctor,2,Single,,13339,8,,No,11,30.0,4,,0,,3,30.0,24,,2,15,,1968,1000000000$,,,True
17,46,No,travel_rarely,"1402,0$",,2,3,,1,,3,1,69,3,4,MAnaGER,1,,,24097,8,,,23,40.0,1,,0,280.0,2,30.0,26,,15,9,,1977,1000000000$,,,False
18,35,No,,"819,0$",,-13,5,Life Sciences,1,,2,0,48,4,2,REseArcH sciENtiST,1,Married,,26312,1,,,11,30.0,4,,0,,2,30.0,16,,1,10,,1988,1000000000$,,,False
19,40,No,travel_rarely,"884,0$",,15,3,Life Sciences,1,,1,1,80,2,3,MAnUFaCtUring dIRecTOr,3,,,25800,1,Y,,13,30.0,4,,2,180.0,2,,18,,14,12,,1983,1000000000$,,,True
22,36,No,,"1223,0$",,8,3,,1,,3,1,59,3,3,hEaltHCarE ReprESENTAtIvE,3,Divorced,,8202,1,Y,,13,,2,,3,170.0,2,30.0,17,,12,8,,1987,1000000000$,,,1


In [80]:
# buscar duplicados en los no nulos
duplicates_null_sorted.duplicated().sum()
 

0

In [None]:
column_name = "employee_number"

print (f"Column name: {column_name}")
print (f"Data type: {duplicates_non_null_sorted[column_name].dtype}")
print (f"Not null count: {duplicates_non_null_sorted[column_name].notnull().sum()}")
print (f"Null count: {duplicates_non_null_sorted[column_name].isnull().sum()}")
print (f"Duplicated values: {duplicates_non_null_sorted[column_name].duplicated().sum()}")
print (f"Unique values: {duplicates_non_null_sorted[column_name].unique()}")
print (f"Unique values: {duplicates_non_null_sorted[column_name].value_counts()}")
