# Creating synthetic HR records with the Python Faker library

Composition of HR records for DEI purposes:
- unique 4 digits employee number, starting at 1000
- unique employee name
- start dates & termination dates
- department (HR, Finance, Marketing, Sales, IT, Customer Service, Legal, Project Management)
- province of employment
- hierarchy level (79% individual contributor, 16% manager/director, 5% senior leadership) per this [ratio of individual contributors/managers/directors](https://ravio.com/blog/effective-management-structures-how-to-know-if-your-company-is-too-top-heavy#)
- age (between 19 to 70)
- gender (male, female, non-binary)
- ethnicity (caucasian, african descent, indigenous, bi-racial, hispanic, pacific islander, middle eastern, asian)

In [1]:
# importing libraries
from faker import Faker
import pandas as pd

# creating a Faker instance in Canada
fake = Faker(locale='en_CA')

# importing dynamic provider for weighted choices
from faker.providers import DynamicProvider
from random import choices

# importing datetime to convert strings into dates if necessary
from datetime import datetime

In [2]:
# creating the list of hierarchy levels with their weights
hierarchy_pool = ["Individual contributor","Manager/Director","Senior Leadership"]
h_wts=[0.79,0.16,0.05]

# instancing the dynamic provider
hierarchy_level = DynamicProvider(provider_name="level",\
                                  elements=choices(hierarchy_pool,weights=h_wts,\
                                    k=len(hierarchy_pool)))

# adding the new provider to the Faker instance
fake.add_provider(hierarchy_level)

In [3]:
# creating the list of genders with their weights
gender_pool = ["male","female","non-binary"]
g_wts=[0.49,0.49,0.02]

# instancing the dynamic provider
gender = DynamicProvider(provider_name="gender",\
                                  elements=choices(gender_pool,weights=g_wts,\
                                    k=len(gender_pool)))

# adding the new provider to the Faker instance
fake.add_provider(gender)

In [4]:
# creating the list of ethnicities with their weights
ethnicity_pool = ["caucasian", "african descent", "indigenous", "bi-racial",\
                  "hispanic", "pacific islander", "middle eastern", "asian"]
e_wts=[0.61,0.04,0.06,0.03,0.03,0.01,0.02,0.2]

# instancing the dynamic provider
ethnicity = DynamicProvider(provider_name="ethnicity",\
                                  elements=choices(ethnicity_pool,weights=e_wts,\
                                    k=len(ethnicity_pool)))

# adding the new provider to the Faker instance
fake.add_provider(ethnicity)

In [5]:
# creating a function to generate employee records
def create_employees(num_employees):
    employee_list = []
    for i in range(1, num_employees):
        employee = {}
        employee['ee#'] = 1000+i
        employee['employee_name'] = fake.unique.name()
        start_date = fake.date()
        employee['start_date'] = start_date
        employee['term_date'] = fake.date_between_dates(\
            date_start=datetime.strptime(start_date,"%Y-%m-%d"))
        employee['department'] = fake.random_element(\
            elements=("HR","Finance","Marketing","Sales",\
                      "IT","Customer Service","Legal",\
                      "Project Management"))
        employee['province'] = fake.province()
        employee['level'] = fake.level()
        employee['age'] = fake.random_int(min=22,max=70,step=1)
        employee['gender'] = fake.gender()
        employee['ethnicity'] = fake.ethnicity()
        employee_list.append(employee)
    return pd.DataFrame(employee_list)

In [6]:
# creating a dataframe to hold the output of the function
# and visualize it to check for correct output
records = create_employees(5000)
records

Unnamed: 0,ee#,employee_name,start_date,term_date,department,province,level,age,gender,ethnicity
0,1001,Lindsay Buckley,2006-07-11,2013-07-14,Project Management,New Brunswick,Manager/Director,48,female,asian
1,1002,Hunter Gray,1999-11-29,2011-04-20,Marketing,Ontario,Individual contributor,45,female,asian
2,1003,Angela Howe,2015-09-27,2022-07-13,Legal,Yukon Territory,Individual contributor,25,male,middle eastern
3,1004,Angela Vincent,2020-08-12,2022-01-30,Legal,Newfoundland and Labrador,Individual contributor,49,female,asian
4,1005,Michael Parker,1989-12-28,2020-06-23,IT,Yukon Territory,Manager/Director,32,female,middle eastern
...,...,...,...,...,...,...,...,...,...,...
4994,5995,Herbert Armstrong,2020-05-06,2020-08-14,IT,Manitoba,Manager/Director,39,male,caucasian
4995,5996,Michele George,1980-05-14,1998-12-07,Sales,Saskatchewan,Individual contributor,64,male,asian
4996,5997,Amy Davidson,1979-08-12,1997-08-15,Finance,Prince Edward Island,Individual contributor,67,female,asian
4997,5998,Jennifer Santiago,2002-07-07,2015-05-03,Project Management,Nova Scotia,Individual contributor,38,male,asian


In [None]:
# export the dataframe to .csv file
records.to_csv('records.csv')