In [1]:
# Imports 📥

# Packages
#-----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # to see all columns in DataFrames


# Data 
# -----------------------------------------------------------------------
abc_coorp_df = pd.read_csv("HR RAW DATA.csv", index_col=0)

In [51]:
# DataTransformer class definition ✍️

class DataTransformer:
    def __init__(self, dataframe):
        self.df = dataframe

    def replace_gender_values(self):
        """Replaces the values in the Gender column with 'Male' and 'Female'."""
        self.df['Gender'] = self.df['Gender'].replace({0: 'Male', 1: 'Female'})

    def convert_object_to_float_eliminate_dolar(self, column_name):
        """
        Converts a column with object type values containing $ signs at the end
        and commas as decimal separators to numeric float values.
        """
        if column_name in self.df.columns:
            # Remove dollar signs from the end
            self.df[column_name] = self.df[column_name].str.replace('$', '', regex=False)
            
            # Replace commas with dots for decimal points
            self.df[column_name] = self.df[column_name].str.replace(',', '.', regex=False)
            
            # Convert the cleaned column to float type
            self.df[column_name] = self.df[column_name].astype(float)
        else:
            print(f"Column {column_name} doesn't exist in the DataFrame.")
        

    def convert_role_to_department(self):
        # First change de type of data to be capitalize and the same way
        self.df['JobRole'] = self.df['JobRole'].str.title()
        self.df['Department'] = self.df['Department'].str.title()
       # Clean empty spaces
        self.df['JobRole'] = self.df['JobRole'].str.strip()
        self.df['Department'] = self.df['Department'].str.strip()
        
        conversion_dictionary = {
        'Healthcare Representative': 'Research & Development',
        'Sales Executive': 'Sales',
        'Healthcare Representative': 'Research & Development',
        'Laboratory Technician': 'Research & Development',
        'Manufacturing Director': 'Research & Development',
        'Research Scientist': 'Research & Development',
        'Sales Executive': 'Sales',
        'Sales Representative':'Sales',
        'Research Director': 'Research & Development',
        'Human Resources': 'Human Resources',
         }
        # Iterate over the rows of the DataFrame
        for index, row in self.df.iterrows():
            job_role = row['JobRole']
            # Assign the corresponding value to Department using the dictionary
            if job_role in conversion_dictionary:
                self.df.at[index, 'Department'] = conversion_dictionary[job_role]
                print(f"Value '{job_role}' was transformed into '{conversion_dictionary[job_role]}'")

    

In [52]:
# Instantiating a class object 🐣

abc_data = DataTransformer(abc_coorp_df)

In [53]:
# Llamar a la función para convertir la columna DailyRate
abc_data.convert_role_to_department()

print(abc_data)

Value 'Research Director' was transformed into 'Research & Development'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Sales Executive' was transformed into 'Sales'
Value 'Sales Executive' was transformed into 'Sales'
Value 'Sales Executive' was transformed into 'Sales'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Manufacturing Director' was transformed into 'Research & Development'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Research Director' was transformed into 'Research & Development'
Value 'Research Scientist' was transformed into 'Research & Development'
Value 'Manufacturing Director' was transformed into 'Research & Development'
Value 'Healthcare Representative' was 

In [54]:
abc_data.df["JobRole"].sample(50)

1064              Sales Executive
368               Sales Executive
1021        Laboratory Technician
1164    Healthcare Representative
877         Laboratory Technician
497                       Manager
7                 Sales Executive
831            Research Scientist
1390         Sales Representative
1061         Sales Representative
527            Research Scientist
12              Research Director
1280              Sales Executive
814             Research Director
588               Sales Executive
1247         Sales Representative
106               Sales Executive
516         Laboratory Technician
1114              Human Resources
683            Research Scientist
536                       Manager
89         Manufacturing Director
596               Sales Executive
235     Healthcare Representative
401        Manufacturing Director
594         Laboratory Technician
500                       Manager
168     Healthcare Representative
1031           Research Scientist
1458       Man

In [55]:
abc_data.df["JobRole"].unique()

array(['Research Director', 'Manager', 'Sales Executive',
       'Manufacturing Director', 'Research Scientist',
       'Healthcare Representative', 'Laboratory Technician',
       'Sales Representative', 'Human Resources'], dtype=object)

In [56]:
abc_data.df["Department"].value_counts()

Department
Research & Development    1001
Sales                      464
Human Resources             58
Name: count, dtype: int64

In [58]:
abc_data.df[["JobRole","Department"]].sample(50)

Unnamed: 0,JobRole,Department
1243,Research Scientist,Research & Development
971,Laboratory Technician,Research & Development
517,Research Director,Research & Development
210,Healthcare Representative,Research & Development
670,Research Director,Research & Development
1152,Laboratory Technician,Research & Development
243,Sales Executive,Sales
57,Manufacturing Director,Research & Development
128,Manufacturing Director,Research & Development
1235,Laboratory Technician,Research & Development


In [59]:
abc_coorp_df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
0,51,No,,684.0,Research & Development,6,3,,1,1620,1,0,51,3,5,Research Director,3,,195370,6462,7,Y,No,13,30,3,,0,,5,30,20,,15,15,195370,1972,1000000000$,,,Yes
1,52,No,,699.0,,1,4,Life Sciences,1,2590,3,0,65,2,5,Manager,3,,199990,5678,0,,,14,30,1,,1,340,5,30,33,,11,9,199990,1971,1000000000$,,,1
2,42,No,travel_rarely,532.0,Research & Development,4,2,Technical Degree,1,3190,3,0,58,3,5,Manager,4,Married,192320,4933,1,,No,11,30,4,,0,220,3,,22,,11,15,192320,1981,1000000000$,Manager - Research & Development,,1
3,47,No,travel_rarely,359.0,Research & Development,2,4,Medical,1,,1,1,82,3,4,Research Director,3,Married,171690,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,171690,1976,1000000000$,,,False
4,46,No,,1319.0,Sales,3,3,Technical Degree,1,,1,1,45,4,4,Sales Executive,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30,19,,2,8,,1977,1000000000$,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,36,Yes,travel_rarely,530.0,Sales,3,1,Life Sciences,1,9670,3,0,51,2,3,Sales Executive,4,Married,103250,5518,1,Y,,11,,1,,1,,6,30,16,,3,7,103250,1987,1000000000$,,,0
1610,45,No,non-travel,805.0,Research & Development,4,2,,1,9720,3,0,57,3,2,Laboratory Technician,2,,44470,23163,1,,,12,30,2,,0,,5,20,9,,0,8,44470,1978,1000000000$,,,1
1611,39,No,travel_rarely,903.0,Sales,-13,5,,1,,13,0,41,4,3,Sales Executive,3,Single,,2560,0,,No,18,30,4,,0,90,3,30,8,,0,7,,1984,1000000000$,,,Yes
1612,36,No,non-travel,1229.0,Sales,8,4,Technical Degree,1,9900,1,0,84,3,2,Sales Executive,4,Divorced,,25952,4,,No,13,,4,,2,120,3,30,7,,0,7,,1987,1000000000$,,,True


## Dirty code below ⬇️⬇️

In [None]:
# Method to replace $ 
#  iterate through all the columns of the DataFrame, convert each to lowercase, and apply a replace to remove the dots.
new_columns = {column: column.lower().replace(".", "") for column in df.columns}

# Check that we have created the dictionary correctly
new_columns

# Once we have the dictionary, let's apply the rename method
df.rename(columns = new_columns, inplace = True)

# Check that the column names have been changed
df.head()

In [None]:
def change_commas(string):
    """
    Replaces commas with periods in a given string that represents a decimal number
    in international format (with commas as thousand separators and period as decimal separator).

    Note:
        If an error occurs during the replacement process (e.g., if the argument is not a string),
        the function will return np.nan (Not a Number) to indicate an invalid or unavailable value.
    """

    try:
        # Replace commas with periods in the string
        return float(string.replace(",", "."))
    
    except:
        # If an error occurs (e.g., if the argument is not a string),
        # return np.nan (Not a Number) to indicate an invalid or unavailable value.
        return np.nan


In [None]:
def categorize_age(number):
    """
    Categorizes age into specific groups.

    This function takes an input number representing age and returns a specific category
    based on the age range.
    """
    if number >= 17 and number <= 25:
        return "Young Adults"
    
    elif number >= 26 and number <= 39:
        return "Young Adults"

    elif number >= 40 and number <= 59:
        return "Middle-aged"
    
    else:
        return "Older Adults"

# We have already created the function and verified that it works.
# The next step is to apply it to our entire DataFrame using the `apply()` method.
# This will return a Series, but we haven't stored this result in a variable.
# So, the next thing we'll do is create a new column in the DataFrame with the result of this apply.
df["age_category"] = df["age"].apply(categorize_age)
