In [None]:
# Psssss READ ME! 👋👓
# The objective of the following code is to clean and transform ABC Corporation data to make it available for subsequent analysis.
# The transformation was developed based on the conclusions from the preliminary Exploratory Data Analysis of the raw data provided by the HR department of ABC Corporation.
# The high-level code structure is as follows:
    # 1) Import of packages and data
    # 2) Definition of the DataTransformer class, this is the core of the transformation
    # 3) Execution of the transformation, which follows this structure:
        # 3.1) Creation of an object from DataTransformer class
        # 3.2) Rename all columns
        # 3.3) Drop unnecessary columns
        # 3.4) Transform data per column
# Testing code: 
    # For each step of the process, the program prints the result so that it can be reviewed to confirm that it is working correctly



In [None]:
# Imports 📥

# Packages
#-----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Settings
# -----------------------------------------------------------------------
# Remove the column-display limit so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None) # to see all columns in DataFrames


# Data
# -----------------------------------------------------------------------
# Load the raw HR export; the first CSV column is used as the row index.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv("HR RAW DATA.csv", index_col=0)

In [None]:
# Preview the first rows of the raw data to confirm the CSV was loaded.

df.head()

In [None]:
# DataTransformer class definition ✍️

class DataTransformer:
    """Cleaning steps for the HR dataset, applied to the DataFrame held in ``self.df``.

    Every public method performs one transformation and stores the result back
    in ``self.df``. All methods except ``rename_columns`` refer to the
    snake_case column names, so ``rename_columns`` has to be called first.
    """

    # Raw CSV header -> snake_case name used by the rest of the class.
    _COLUMN_NAMES = {
        'Age': 'age',
        'Attrition': 'attrition',
        'BusinessTravel': 'business_travel',
        'DailyRate': 'daily_rate',
        'Department': 'department',
        'DistanceFromHome': 'distance_from_home',
        'Education': 'education',
        'EducationField': 'education_field',
        'employeecount': 'employee_count',
        'employeenumber': 'employee_number',
        'EnvironmentSatisfaction': 'environment_satisfaction',
        'Gender': 'gender',
        'HourlyRate': 'hourly_rate',
        'JobInvolvement': 'job_involvement',
        'JobLevel': 'job_level',
        'JobRole': 'job_role',
        'JobSatisfaction': 'job_satisfaction',
        'MaritalStatus': 'marital_status',
        'MonthlyIncome': 'monthly_income',
        'MonthlyRate': 'monthly_rate',
        'NUMCOMPANIESWORKED': 'num_companies_worked',
        'Over18': 'over_18',
        'OverTime': 'over_time',
        'PercentSalaryHike': 'percent_salary_hike',
        'PerformanceRating': 'performance_rating',
        'RelationshipSatisfaction': 'relationship_satisfaction',
        'StandardHours': 'standard_hours',
        'StockOptionLevel': 'stock_option_level',
        'TOTALWORKINGYEARS': 'total_working_years',
        'TrainingTimesLastYear': 'training_times_last_year',
        'WORKLIFEBALANCE': 'work_life_balance',
        'YearsAtCompany': 'years_at_company',
        'YearsInCurrentRole': 'years_in_current_role',
        'YearsSinceLastPromotion': 'years_since_last_promotion',
        'YEARSWITHCURRMANAGER': 'years_with_curr_manager',
        'SameAsMonthlyIncome': 'same_as_monthly_income',
        'DateBirth': 'date_birth',
        'Salary': 'salary',
        'RoleDepartament': 'role_departament',
        'NUMBERCHILDREN': 'number_children',
        'RemoteWork': 'remote_work',
    }

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Store ``dataframe`` (by reference, no copy is made) as ``self.df``."""
        self.df = dataframe

    @staticmethod
    def _number_words() -> dict:
        """Return a dict mapping English number words ('one' ... 'one hundred') to ints.

        Compound numbers are hyphenated ('thirty-two'); 100 is 'one hundred'.
        """
        units = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
        teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                 "sixteen", "seventeen", "eighteen", "nineteen"]
        tens = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        words = {word: number for number, word in enumerate(units, start=1)}
        words.update({word: number for number, word in enumerate(teens, start=10)})
        for tens_digit, tens_word in enumerate(tens, start=2):
            words[tens_word] = tens_digit * 10
            for unit_value, unit_word in enumerate(units, start=1):
                words[f"{tens_word}-{unit_word}"] = tens_digit * 10 + unit_value
        words["one hundred"] = 100
        return words

    def rename_columns(self) -> None:
        """Rename the raw columns to snake_case, e.g. 'EducationField' -> 'education_field'.

        Columns that are not listed in the mapping keep their current name.
        """
        self.df = self.df.rename(columns=self._COLUMN_NAMES)

    def replace_gender_values(self) -> None:
        """Replace the codes in the 'gender' column: 0 -> 'Male', 1 -> 'Female'."""
        self.df['gender'] = self.df['gender'].replace({0: 'Male', 1: 'Female'})

    def convert_age_to_numbers(self) -> None:
        """Replace ages written as words (e.g. 'thirty-two') with the integer (32).

        Only string values that match a known number word are replaced; the
        match ignores letter case and surrounding whitespace. All other values
        are left untouched, so the column can still hold a mix of types; call
        ``convert_to_numeric('age', ...)`` afterwards to get a numeric dtype.
        A message is printed for every value that is replaced.
        """
        number_words = self._number_words()

        def to_number(value):
            if not isinstance(value, str):
                return value
            key = value.strip().lower()
            if key not in number_words:
                return value
            print(f"Value '{value}' was transformed into '{number_words[key]}'")
            return number_words[key]

        # Build the new column in one pass and assign it as a whole. Writing
        # through self.df["age"][position] is chained assignment: it addresses
        # rows by index *label*, not position, and does not modify the
        # DataFrame at all when pandas Copy-on-Write is active.
        self.df["age"] = self.df["age"].map(to_number)

    def convert_to_numeric(self, column_name: str, downcast) -> None:
        """Convert ``column_name`` to a numeric dtype.

        ``downcast`` is forwarded to ``pandas.to_numeric`` ('integer' or 'float').
        Values that cannot be parsed raise an error (``errors='raise'``).
        """
        self.df[column_name] = pd.to_numeric(self.df[column_name], errors='raise', downcast=downcast)

    def fix_negative_distances(self) -> None:
        """Replace every value in 'distance_from_home' with its absolute value."""
        self.df['distance_from_home'] = self.df['distance_from_home'].abs()

    def drop_redundant_columns(self, columns) -> None:
        """Drop the columns named in the list ``columns`` (e.g. 'same_as_monthly_income').

        Names that are not present are ignored. The result is stored as a new
        DataFrame in ``self.df``, so the DataFrame object originally passed to
        the constructor is not modified by this call.
        """
        self.df = self.df.drop(columns=columns, errors='ignore')

    def correct_env_satisfaction_values(self) -> None:
        """Turn 'environment_satisfaction' values greater than 4 into NaN."""
        satisfaction = self.df['environment_satisfaction']
        # .where keeps the values that satisfy the condition and puts NaN elsewhere.
        self.df['environment_satisfaction'] = satisfaction.where(satisfaction <= 4)

    def correct_hourly_rate(self) -> None:
        """Convert 'hourly_rate' to floats, mapping the text 'Not Available' to NaN."""

        def to_float(value):
            """Return NaN for 'Not Available', otherwise ``float(value)``."""
            return np.nan if value == "Not Available" else float(value)

        self.df['hourly_rate'] = self.df['hourly_rate'].apply(to_float)

    def transform_to_float(self, column_name: str) -> None:
        """Convert strings that use a decimal comma (e.g. '3579,0') to float.

        Only string cells are rewritten; cells that are already numeric (or NaN)
        are kept as they are, so the method also works on a mixed column and can
        be run again on a column that has already been converted.
        """

        def with_decimal_point(value):
            return value.replace(',', '.') if isinstance(value, str) else value

        self.df[column_name] = self.df[column_name].map(with_decimal_point).astype(float)

    def quick_check(self, column_name: str) -> None:
        """Testing helper: print the name, dtype and unique values of a column."""
        print (f"Column name: {column_name}")
        print (f"Data type: {self.df[column_name].dtype}")
        print (f"Unique values: {self.df[column_name].unique()}")

    def correct_typographical_errors(self) -> None:
        """Fix known misspellings in 'marital_status' ('Marreid', 'divorced')."""
        self.df['marital_status'] = self.df['marital_status'].replace({'Marreid': 'Married','divorced':'Divorced'})

    def get_dataframe(self) -> pd.DataFrame:
        """Return the DataFrame in its current state of transformation."""
        return self.df



In [None]:
# ▶️ Instantiating a class object 🐣
# The transformer stores a reference to `df`; its __init__ does not copy it.

abc_data = DataTransformer(df)

# Preview the wrapped DataFrame before any transformation is applied.
abc_data.df.head(2)

In [None]:
# ▶️ Rename columns
# Must run before the per-column steps below, which use the snake_case names.


abc_data.rename_columns()

# Show one row so the new column names can be checked.

abc_data.df.head(1)

In [None]:
# ▶️ Drop unneeded columns

columns_to_delete = ["employee_count", "same_as_monthly_income", "salary", "number_children", "standard_hours", "role_departament"]

abc_data.drop_redundant_columns(columns_to_delete)

# print for testing
# Check each requested name against the DataFrame's columns. The previous loop
# indexed the DataFrame with the integer loop counter inside a bare `except`,
# so it reported "deleted" regardless of whether the column was actually gone.
for column in columns_to_delete:
    if column in abc_data.df.columns:
        print(f"Column '{column}' is still present")
    else:
        print(f"Column '{column}' deleted")



In [None]:
# ▶️ Transform "age" column data

column_name = "age"
# Step 1: replace ages written as words (e.g. 'thirty-two') with digits.
# The order matters: the numeric conversion below raises on unparsable values.

abc_data.convert_age_to_numbers()

# Step 2: convert the column to an integer dtype (downcast="integer").

abc_data.convert_to_numeric (column_name,"integer")

# Print dtype and unique values for review.

abc_data.quick_check(column_name)

In [None]:
# ▶️ Transform "distance_from_home" column data

# Replace negative values in the distance_from_home column with their absolute value.

abc_data.fix_negative_distances()

# Print the unique values for review; none should be negative after the step above.
print("Printing 'distance_from_home' colum unique values for testing:")
print(abc_data.df["distance_from_home"].unique())


In [None]:
# ▶️ Transform "environment_satisfaction" column data
# Values greater than 4 are replaced with NaN.

abc_data.correct_env_satisfaction_values()

# Print dtype and unique values for review.

abc_data.quick_check("environment_satisfaction")

# The criterion is only printed for the reader; it is not checked by code.
acceptance_criteria = "Data type is float and there isn't any value different than 1,2,3 or 4"
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "gender" column data

# Replace the codes in the gender column: 0 -> 'Male', 1 -> 'Female'.

abc_data.replace_gender_values()

# Print dtype and unique values for review.

abc_data.quick_check("gender")

# The criterion is only printed for the reader; it is not checked by code.
# NOTE(review): the text says lower-case "male or female", while
# replace_gender_values writes the capitalised 'Male' / 'Female'.
acceptance_criteria = "Unique values are male or female"
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "hourly_rate" column data
# 'Not Available' becomes NaN and every other value is converted to float.

abc_data.correct_hourly_rate()


# Print dtype and unique values for review.
abc_data.quick_check("hourly_rate")

# TODO: no acceptance criterion has been written for this column yet (empty string).
acceptance_criteria = ""
print (f"\nAcceptance criteria: {acceptance_criteria}")



In [None]:
# ▶️ Transform "monthly_income" column data
# Converts decimal-comma strings (format '3579,0') to float.

abc_data.transform_to_float("monthly_income")

# Print dtype and unique values for review.

abc_data.quick_check("monthly_income")

# TODO: no acceptance criterion has been written for this column yet (empty string).
acceptance_criteria = ""
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "performance_rating" column data
# Converts decimal-comma strings to float (presumably the raw values look
# like '3,0' — verify against the data).

abc_data.transform_to_float("performance_rating")


# Print dtype and unique values for review.

abc_data.quick_check("performance_rating")

# TODO: no acceptance criterion has been written for this column yet (empty string).
acceptance_criteria = ""
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "total_working_years" column data
# Converts decimal-comma strings to float.

abc_data.transform_to_float("total_working_years")

# Print dtype and unique values for review.
abc_data.quick_check("total_working_years")

# NOTE(review): the criterion below (Spanish: "change type to int64") does not
# match this cell, which leaves the column as float — confirm which is intended.
acceptance_criteria = "Cambiar tipo a `int64`."
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "work_life_balance" column data
# Converts decimal-comma strings to float.

abc_data.transform_to_float("work_life_balance")

# Print dtype and unique values for review.
abc_data.quick_check("work_life_balance")

# NOTE(review): the criterion below (Spanish: "change type to int64") does not
# match this cell, which leaves the column as float — confirm which is intended.
acceptance_criteria = "Cambiar tipo a `int64`."
print (f"\nAcceptance criteria: {acceptance_criteria}")

In [None]:
# ▶️ Transform "marital_status" data
# Fixes the known misspellings 'Marreid' -> 'Married' and 'divorced' -> 'Divorced'.

abc_data.correct_typographical_errors()

# Print dtype and unique values for review.

abc_data.quick_check("marital_status")
# The criterion text is Spanish for "Correct typographical errors."
acceptance_criteria = "Corregir errores tipográficos."
print (f"\nAcceptance criteria: {acceptance_criteria}")

## Dirty code below (DELETE) ⬇️⬇️

In [None]:
# Scratch cell: transform the string values of one column to lower case.
# NOTE(review): this works on the raw `df`, not on `abc_data.df`, and "name" is
# not among the columns listed in DataTransformer.rename_columns — presumably
# this raises KeyError unless the CSV has a "name" column; verify or delete.

column_name = "name"

# Overwrite the column with its lower-cased values.
df[column_name] = df[column_name].str.lower()

print(f"Values in column '{column_name}' were transformed into lowercase 🔠->🔡\n")

# Show the first rows to check that the values were transformed successfully.
df.head()

In [None]:
# Scratch cell: lower-case every column name of the raw `df` and remove dots.
# Build a mapping {current name: lower-cased name with "." removed}.
# Note that this yields names without separators (e.g. 'businesstravel'),
# unlike the snake_case names produced by DataTransformer.rename_columns.
new_columns = {column: column.lower().replace(".", "") for column in df.columns}

# Intended as a check of the dictionary. With default notebook settings only the
# last expression of a cell is displayed, so this line shows nothing.
new_columns

# Apply the mapping; inplace=True modifies `df` itself.
df.rename(columns = new_columns, inplace = True)

# Show the first rows to check that the column names have been changed.
df.head()