# Creating synthetic DEI HR records with the Python Faker library

Composition of HR records for DEI purposes:
- unique 4 digits employee number, starting at 1000
- unique employee name
- start dates & termination dates
- department (HR, Finance, Marketing, Sales, IT, Customer Service, Legal, Project Management)
- province of employment
- hierarchy level (79% individual contributor, 16% manager/director, 5% senior leadership) per this [ratio of individual contributors/managers/directors](https://ravio.com/blog/effective-management-structures-how-to-know-if-your-company-is-too-top-heavy#)
- age (between 19 to 70)
- gender (male, female, non-binary)
- ethnicity (caucasian, african descent, indigenous, bi-racial, hispanic, pacific islander, middle eastern, asian)

In [1]:
# importing libraries
from faker import Faker
import pandas as pd

# creating a Faker instance in Canada
fake = Faker(locale='en_CA')

# importing dynamic provider for weighted choices
from faker.providers import DynamicProvider
import random
from random import choices

# importing datetime to convert strings into dates if necessary
from datetime import datetime, timedelta
import calendar

In [2]:
# creating the list of hierarchy levels with their weights
hierarchy_pool = ["Individual contributor",
                  "Manager/Director",
                  "Senior Leadership"]
h_wts=[0.79,
       0.16,
       0.05]

# instancing the dynamic provider
hierarchy_level = DynamicProvider(provider_name="level",\
                                  elements=choices(hierarchy_pool,
                                                   weights=h_wts,\
                                                    k=len(hierarchy_pool)))

# adding the new provider to the Faker instance
fake.add_provider(hierarchy_level)

# creating the list of genders with their weights
gender_pool = ["male",
               "female",
               "non-binary"]
g_wts=[0.49,
       0.49,
       0.02]

# instancing the dynamic provider
gender = DynamicProvider(provider_name="gender",\
                                  elements=choices(gender_pool,
                                                   weights=g_wts,\
                                                    k=len(gender_pool)))

# adding the new provider to the Faker instance
fake.add_provider(gender)

# creating the list of ethnicities with their weights
ethnicity_pool = ["caucasian", "african descent", 
                  "indigenous", "bi-racial",
                  "hispanic", "pacific islander", 
                  "middle eastern", "asian"]
e_wts=[0.61, 0.04,
       0.06, 0.03,
       0.03, 0.01,
       0.02,0.2]

# instancing the dynamic provider
ethnicity = DynamicProvider(provider_name="ethnicity",\
                                  elements=choices(ethnicity_pool,
                                                   weights=e_wts,\
                                                    k=len(ethnicity_pool)))

# adding the new provider to the Faker instance
fake.add_provider(ethnicity)

In [3]:
# defining a function to generate a random in a given year
def random_date_in_year(year):
    month = random.randint(1, 12)
    day = random.randint(1, calendar.monthrange(year, month)[1])
    return datetime(year, month, day)

# defining a function to calculate age either at time of departure or now
def calcul_age(birth_date, reference_date):
    return reference_date.year - birth_date.year - \
            ((reference_date.month, reference_date.day) < \
             (birth_date.month, birth_date.day))

In [4]:
# creating a function to generate employee records
def create_employees(num_employees):
    employee_list = []
    for i in range(num_employees):
        employee = {}
        employee['ee#'] = 10000000+i
        # generating appropriate start, and birth years
        # setting appropriate years
        current_year = datetime.now().year
        start_year = random.randint(current_year - 50, 
                                    current_year)
        birth_year = start_year - random.randint(25, 65)
        # now generating dates within these years
        start_date = random_date_in_year(start_year)
        birth_date = random_date_in_year(birth_year)

        # ensuring employee age at start date is between 25 and 65
        while calcul_age(birth_date, start_date) < 25 \
        or calcul_age(birth_date, start_date) > 65:
            birth_year = start_year - random.randint(25, 65)
            birth_date = random_date_in_year(birth_year)
        
        # having a ratio of 60/40 active/terminated employees
        generate_term = random.choices([True, False],
                                       weights= [60,40],
                                       k= 1)[0]
        if generate_term:
            term_year = start_year + random.randint(0, 40)
            term_date = random_date_in_year(term_year)

            if term_date < start_date:
                term_date = start_date \
                            + timedelta(days= random.randint(0, 40*365.25))
            elif term_date > datetime.now():
                term_date = datetime.now() - timedelta(days= 1)

            # ensuring age at termination isn't past 70
            while calcul_age(birth_date, term_date) > 70:
                term_year = start_year + random.randint(0, 40)
                term_date = random_date_in_year(term_year)
                if term_date < start_date:
                    term_date = start_date \
                            + timedelta(days= random.randint(0, 40*365.25))
                elif term_date > datetime.now():
                    term_date = datetime.now() - timedelta(days= 1)
                
        else:
            term_date = None

        # setting a forced termination date if age >= 70 and term is None
        if term_date is None \
        and calcul_age(birth_date, datetime.now()) >= 70:
            term_date = birth_date + timedelta(days= 70*365.25)

        # calculating tenure
        if term_date is None:
            tenure = (datetime.now() - start_date).days // 365.25
        else:
            tenure = (term_date - start_date).days // 365.25
        
        # calculating age at time of departure if terminated
        if term_date is None:
            age = (datetime.now() - birth_date).days // 365.25
        else:
            age = (term_date - birth_date).days // 365.25

        # storing variables in columns
        employee['start_date'] = start_date
        employee['term_date'] = term_date
        employee['tenure'] = tenure
        employee['birth_date'] = birth_date
        employee['age'] = age
        employee['department'] = fake.random_element(\
            elements=("HR","Finance","Marketing","Sales",\
                      "IT","Customer Service","Legal",\
                      "Project Management"))
        employee['province'] = fake.province()
        employee['level'] = fake.level()
        employee['gender'] = fake.gender()
        employee['ethnicity'] = fake.ethnicity()
        employee_list.append(employee)
    return pd.DataFrame(employee_list)

In [5]:
# creating a dataframe to hold the output of the function
# and visualize it to check for correct output
records = create_employees(5000)
records.sample(5)

Unnamed: 0,ee#,start_date,term_date,tenure,birth_date,age,department,province,level,gender,ethnicity
4570,10004570,2019-10-23,NaT,4.0,1994-06-02,30.0,Finance,Alberta,Individual contributor,male,caucasian
1384,10001384,1997-07-29,NaT,27.0,1966-07-01,58.0,IT,New Brunswick,Senior Leadership,male,caucasian
239,10000239,2003-08-05,2007-10-20 00:00:00,4.0,1942-06-01,65.0,IT,Northwest Territories,Individual contributor,male,hispanic
3380,10003380,2020-06-26,NaT,4.0,1959-05-02,65.0,Customer Service,Newfoundland and Labrador,Individual contributor,male,asian
4724,10004724,1978-01-31,2005-11-06 12:00:00,27.0,1935-11-07,69.0,Marketing,Saskatchewan,Individual contributor,male,caucasian


In [6]:
# export the dataframe to .csv file
records.to_csv('records.csv',index=False)