# Creating synthetic DEI HR records with the Python Faker library

Composition of HR records for DEI purposes:
- unique 8-digit employee number, starting at 10000000, so that up to 10M records can potentially be generated for machine learning purposes
- start dates & termination dates (or none if employee is still active) + tenure
- birth dates & age
- department (HR, Finance, Marketing, Sales, IT, Customer Service, Legal, Project Management)
- province of employment
- hierarchy level (79% individual contributor, 16% manager/director, 5% senior leadership) per this [ratio of individual contributors/managers/directors](https://ravio.com/blog/effective-management-structures-how-to-know-if-your-company-is-too-top-heavy#)
- gender (male, female, non-binary)
- ethnicity (caucasian, african descent, indigenous, bi-racial, hispanic, pacific islander, middle eastern, asian)

In [1]:
# importing libraries
from faker import Faker
import pandas as pd

# creating a Faker instance in Canada
fake = Faker(locale='en_CA')

# importing dynamic provider for weighted choices
from faker.providers import DynamicProvider
import random
from random import choices

# importing datetime to convert strings into dates if necessary
from datetime import datetime, timedelta
import calendar

In [2]:
# hierarchy levels mapped to their sampling weights
_hierarchy_weights = {
    "Individual contributor": 0.79,
    "Manager/Director": 0.16,
    "Senior Leadership": 0.05,
}
hierarchy_pool = list(_hierarchy_weights)
h_wts = list(_hierarchy_weights.values())

# genders mapped to their sampling weights
_gender_weights = {
    "male": 0.49,
    "female": 0.49,
    "non-binary": 0.02,
}
gender_pool = list(_gender_weights)
g_wts = list(_gender_weights.values())

# ethnicities mapped to their sampling weights
_ethnicity_weights = {
    "caucasian": 0.61,
    "african descent": 0.04,
    "indigenous": 0.06,
    "bi-racial": 0.03,
    "hispanic": 0.03,
    "pacific islander": 0.01,
    "middle eastern": 0.02,
    "asian": 0.2,
}
ethnicity_pool = list(_ethnicity_weights)
e_wts = list(_ethnicity_weights.values())

# departments (no weights: every department is equally likely)
department_pool = [
    "HR",
    "Finance",
    "Marketing",
    "Sales",
    "IT",
    "Customer Service",
    "Legal",
    "Project Management",
]

In [3]:
# picking a random point in time between two dates, in whole-day steps
def random_between_dates(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The offset is drawn uniformly from 0 up to the number of whole days
    separating ``start`` and ``end`` (both bounds included), so the time of
    day of ``start`` is carried over to the result.

    Raises ``ValueError`` (from ``random.randint``) when ``end`` is earlier
    than ``start``.
    """
    span_in_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_in_days))
    return start + offset

# computing an age in completed years at a given reference date
def calcul_age(birth_date, reference_date):
    """Return the number of full years between ``birth_date`` and ``reference_date``.

    The year difference is reduced by one when the birthday (month, day) has
    not yet been reached in the reference year.
    """
    years = reference_date.year - birth_date.year
    reference_day = (reference_date.month, reference_date.day)
    birthday = (birth_date.month, birth_date.day)
    if reference_day < birthday:
        years -= 1
    return years

# defining constants shared by the date helpers and the record generator
# average length of a year in days (365.25 accounts for leap years);
# used to convert between day counts and years
_year = 365.25 # days, as we're taking into account leap years
# reference "today": captured once when this cell runs, so every record
# is generated against the same instant (naive datetime, no timezone)
_current_date = datetime.now()

In [4]:
# creating a helper to draw one coherent set of dates for an employee
def _draw_employee_dates():
    """Return a consistent ``(birth_date, start_date, term_date)`` triple.

    - The birth date is drawn from the 100 years preceding the current date.
    - The start date is drawn so that the employee is between 25 and 60 years
      old when joining, and never later than the current date. Birth dates
      that leave no valid hiring window (the employee would be younger than
      25 today) are redrawn.
    - The termination date is drawn between the start date (the employee
      left the day they started) and the earliest of: the current date,
      40 years of tenure, or 70 years of age. A termination date equal to
      the current date means "still active" and becomes ``None``.
    - To keep enough active employees, the drawn termination date is then
      kept with a weight of 20 and discarded with a weight of 80; it is only
      discarded when the employee is still within the retirement limits
      today (tenure <= 40 and age <= 70, in whole years).
    """
    while True:
        birth_date = random_between_dates(
            _current_date - timedelta(days=_year * 100),
            _current_date)
        min_start_date = birth_date + timedelta(days=_year * 25)
        max_start_date = min(_current_date,
                             birth_date + timedelta(days=_year * 60))
        # an empty hiring window means the person is too young: draw again
        if min_start_date <= max_start_date:
            break

    start_date = random_between_dates(min_start_date, max_start_date)

    max_term_date = min(_current_date,
                        start_date + timedelta(days=_year * 40),
                        birth_date + timedelta(days=_year * 70))
    term_date = random_between_dates(start_date, max_term_date)
    if term_date == _current_date:
        term_date = None

    # weights 20/80: keep the drawn termination date vs. try to void it
    keep_term = random.choices([True, False],
                               weights=[20, 80],
                               k=1)[0]
    if not keep_term:
        potential_tenure = (_current_date - start_date).days // _year
        potential_age = (_current_date - birth_date).days // _year
        # only employees still within retirement limits can remain active
        if potential_tenure <= 40 and potential_age <= 70:
            term_date = None

    return birth_date, start_date, term_date


# creating a function to generate employee records
def create_employees(num_employees):
    """Generate ``num_employees`` synthetic HR records as a DataFrame.

    Each record holds an employee number (10000000 + row position), start and
    termination dates (termination is ``None`` for active employees), tenure
    and age in years rounded to one decimal (measured at the termination date
    when there is one, otherwise at the current date), plus a randomly drawn
    department, province, hierarchy level, gender and ethnicity.

    The returned DataFrame always has the same columns, in the same order,
    even when ``num_employees`` is 0.
    """
    columns = ['ee#', 'start_date', 'term_date', 'tenure', 'birth_date',
               'age', 'department', 'province', 'level', 'gender',
               'ethnicity']
    employee_list = []
    for i in range(num_employees):
        birth_date, start_date, term_date = _draw_employee_dates()

        # tenure and age are measured at departure if terminated, else today
        reference_date = _current_date if term_date is None else term_date
        tenure = round((reference_date - start_date).days / _year, 1)
        age = round((reference_date - birth_date).days / _year, 1)

        # instancing random choices for DEI variables
        level = random.choices(hierarchy_pool, weights=h_wts, k=1)[0]
        gender = random.choices(gender_pool, weights=g_wts, k=1)[0]
        ethnicity = random.choices(ethnicity_pool, weights=e_wts, k=1)[0]
        department = random.choice(department_pool)

        # storing variables in columns
        employee_list.append({
            'ee#': 10000000 + i,
            'start_date': start_date,
            'term_date': term_date,
            'tenure': tenure,
            'birth_date': birth_date,
            'age': age,
            'department': department,
            'province': fake.province(),
            'level': level,
            'gender': gender,
            'ethnicity': ethnicity,
        })
    return pd.DataFrame(employee_list, columns=columns)

In [5]:
# generating 5000 synthetic employee records into a dataframe,
# then displaying 5 randomly sampled rows to check the output visually
records = create_employees(5000)
records.sample(5)

Unnamed: 0,ee#,start_date,term_date,tenure,birth_date,age,department,province,level,gender,ethnicity
2926,10002926,2012-11-16 23:03:52.535979,NaT,11.7,1973-09-12 17:03:52.535979,50.9,Sales,Alberta,Individual contributor,male,asian
3742,10003742,2014-12-11 23:03:52.535979,2021-03-03 23:03:52.535979,6.2,1975-07-10 17:03:52.535979,45.6,IT,Yukon Territory,Individual contributor,male,caucasian
590,10000590,2013-06-06 23:03:52.535979,NaT,11.2,1986-05-23 17:03:52.535979,38.2,Project Management,Manitoba,Individual contributor,female,caucasian
2752,10002752,1997-04-14 23:03:52.535979,NaT,27.3,1965-04-21 17:03:52.535979,59.3,Finance,Northwest Territories,Manager/Director,male,bi-racial
2468,10002468,2024-03-11 23:03:52.535979,NaT,0.4,1991-11-09 17:03:52.535979,32.7,Project Management,Manitoba,Individual contributor,male,african descent


In [6]:
# exporting the dataframe to 'records.csv' in the working directory;
# index=False leaves the dataframe's row index out of the file
records.to_csv('records.csv', index=False)