In [1]:
from faker import Faker
import pandas as pd
import random

fake = Faker()
gen_num = 10000

csvDF = pd.read_csv("./employees.csv")
country_dist = {
    "India": 0.76,
    "China": 0.12,
    "Canada": 0.01,
    "South Korea": 0.009,
    "Philippines": 0.006,
    "Taiwan": 0.006,
    "Mexico": 0.006
}
department_dist = {
    "Legal": 0.05,
    "Marketing": 0.10,
    "Administrative": 0.10,
    "Operations": 0.20,
    "Sales": 0.10,
    "Finance": 0.05,
    "I/T": 0.10,
    "Product": 0.20,
    "Human Resource": 0.10
}


In [2]:
headers = csvDF.columns.to_list()
headers

['First Name',
 'Last Name',
 'Email',
 'Phone',
 'Gender',
 'Age',
 'Job Title',
 'Years Of Experience',
 'Salary',
 'Department']

In [3]:
departments = csvDF["Department"].drop_duplicates().to_list()
departments += ["Legal", "Marketing", "Administrative", "Operations", "Sales", "Finance", "I/T"]

In [4]:
job_titles = csvDF["Job Title"].drop_duplicates().to_list()
job_titles.remove("HR Manager")

job_titles

['Project Manager',
 'Machine Learning Engineer',
 'Web Developer',
 'Mobile Developer',
 'DevOps Engineer',
 'Designer',
 'Tester']

In [5]:
num_rows = len(csvDF)
csvDF.insert(10, "SSN", ["0"] * num_rows)
csvDF.insert(11, "Languages", ["English"] * num_rows)
csvDF.insert(12, "Languages Count", [1] * num_rows)
csvDF.insert(13, "US Citizen", [True] * num_rows)

In [6]:
def generate_salaries(department):
    salary_ranges = {
        "Legal": (80000, 150000),
        "Marketing": (50000, 120000),
        "Administrative": (60000, 200000),
        "Operations": (50000, 150000),
        "Sales": (50000, 180000),
        "Finance": (71000, 130000),
        "I/T": (80000, 180000),
        "Product": (70000, 150000),
        "Human Resource": (80000, 150000)
    }
    return random.randint(*salary_ranges[department])

In [7]:
def makeData():
    fname = fake.first_name_nonbinary()
    lname = fake.last_name_nonbinary()
    email = fake.email()
    number = fake.phone_number()
    gender = random.choice(['male', 'female'])
    age = random.randint(18,66)
    job_title = ""
    years_of_work = random.randint(0, age-18)
    sal = random.randint(30000, 200000)
    department = random.choice(departments)
    if department == "Human Resource":
        job_title = "HR Manager"
    else:
        job_title = random.choice(job_titles)
    ssn = fake.ssn()
    languages_spoken = random.choices(
        ["Spanish", "Chinese", "Hindi", "Arabic", "French", "German"], k=random.randint(0, 2)
    )
    languages_count = len(languages_spoken)+1
    us_citizen = random.choices([True, False], weights=[0.60, 0.40])[0]
    if not us_citizen:
        country = random.choices(list(country_dist.keys()), weights=country_dist.values())[0]
    else:
        country = "USA"
    return [fname, lname, email, number, gender, age, job_title, years_of_work, 
            sal, department, ssn, 
            "English" + (", " + ", ".join(languages_spoken) if languages_count > 0 else ""), 
            languages_count, us_citizen]

In [8]:
new_data = [makeData() for _ in range(gen_num)]
ndDF = pd.DataFrame(new_data)
ndDF.columns = headers + ["SSN", "Languages", "Languages Count", "US Citizen"]
num_rows = len(csvDF)

In [9]:
ndDF.to_csv("./faker-employees.csv")
ndDF

Unnamed: 0,First Name,Last Name,Email,Phone,Gender,Age,Job Title,Years Of Experience,Salary,Department,SSN,Languages,Languages Count,US Citizen
0,Andrea,Anderson,wardpatrick@example.com,430.331.6300x5172,male,56,HR Manager,27,123015,Human Resource,773-52-8188,"English, French",2,True
1,Barry,Gibson,mathewsmichael@example.net,(940)617-7508,female,48,Project Manager,24,134240,Legal,778-58-3849,"English,",1,True
2,Christine,Higgins,toddbooker@example.com,222-479-9471x231,male,57,DevOps Engineer,20,131123,Finance,533-67-3462,"English, Hindi, Arabic",3,True
3,Kevin,Maxwell,phillipskatherine@example.com,968-240-6801,male,47,Machine Learning Engineer,27,44305,Administrative,233-94-3870,"English, German",2,True
4,Jennifer,Blair,ismith@example.com,(400)607-3040x9651,male,42,Mobile Developer,4,138126,Operations,370-33-9381,"English,",1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,James,Moore,johnny24@example.com,497.834.4304,female,43,Project Manager,8,131198,Marketing,308-93-6184,"English, Arabic",2,True
9996,Laura,Mcguire,scottjoseph@example.net,001-982-220-1726x4269,female,27,Tester,8,136591,Legal,618-67-3884,"English, German",2,True
9997,Daniel,Reed,triley@example.com,573.821.7133x1991,female,44,DevOps Engineer,21,134639,I/T,796-14-4462,"English, German, Arabic",3,True
9998,Linda,Chase,marilyngibson@example.com,+1-800-969-0844x971,male,33,Mobile Developer,10,147824,Product,528-59-6094,"English, German, Hindi",3,False
