In [10]:
from faker import Faker
import pandas as pd
import random

fake = Faker()
gen_num = 10000

csvDF = pd.read_csv("./employees.csv")
country_dist = {
    "India": 0.76,
    "China": 0.12,
    "Canada": 0.01,
    "South Korea": 0.009,
    "Philippines": 0.006,
    "Taiwan": 0.006,
    "Mexico": 0.006
}
department_dist = {
    "Legal": 0.05,
    "Marketing": 0.10,
    "Administrative": 0.10,
    "Operations": 0.20,
    "Sales": 0.10,
    "Finance": 0.05,
    "I/T": 0.10,
    "Product": 0.20,
    "Human Resource": 0.10
}


In [11]:
headers = csvDF.columns.to_list()
headers

['First Name',
 'Last Name',
 'Email',
 'Phone',
 'Gender',
 'Age',
 'Job Title',
 'Years Of Experience',
 'Salary',
 'Department']

In [12]:
departments = csvDF["Department"].drop_duplicates().to_list()
departments += ["Legal", "Marketing", "Administrative", "Operations", "Sales", "Finance", "I/T"]

In [13]:
job_titles = csvDF["Job Title"].drop_duplicates().to_list()
job_titles.remove("HR Manager")

job_titles

['Project Manager',
 'Machine Learning Engineer',
 'Web Developer',
 'Mobile Developer',
 'DevOps Engineer',
 'Designer',
 'Tester']

In [14]:
num_rows = len(csvDF)
csvDF.insert(10, "SSN", ["0"] * num_rows)
csvDF.insert(11, "Languages", ["English"] * num_rows)
csvDF.insert(12, "Languages Count", [1] * num_rows)
csvDF.insert(13, "US Citizen", [True] * num_rows)

In [15]:
def generate_salaries(department):
    salary_ranges = {
        "Legal": (80000, 150000),
        "Marketing": (50000, 120000),
        "Administrative": (60000, 200000),
        "Operations": (50000, 150000),
        "Sales": (50000, 180000),
        "Finance": (71000, 130000),
        "I/T": (80000, 180000),
        "Product": (70000, 150000),
        "Human Resource": (80000, 150000)
    }
    return random.randint(*salary_ranges[department])

In [16]:
def makeData():
    fname = fake.first_name_nonbinary()
    lname = fake.last_name_nonbinary()
    email = fake.email()
    number = fake.phone_number()
    gender = random.choice(['male', 'female'])
    age = random.randint(18,66)
    job_title = ""
    years_of_work = random.randint(0, age-18)
    sal = random.randint(30000, 200000)
    department = random.choice(departments)
    if department == "Human Resource":
        job_title = "HR Manager"
    else:
        job_title = random.choice(job_titles)
    ssn = fake.ssn()
    languages_spoken = random.choices(
        ["Spanish", "Chinese", "Hindi", "Arabic", "French", "German"], k=random.randint(0, 2)
    )
    languages_count = len(languages_spoken)+1
    us_citizen = random.choices([True, False], weights=[0.60, 0.40])[0]
    if not us_citizen:
        country = random.choices(list(country_dist.keys()), weights=country_dist.values())[0]
    else:
        country = "USA"
    return [fname, lname, email, number, gender, age, job_title, years_of_work, 
            sal, department, ssn, 
            "English" + (", " + ", ".join(languages_spoken) if languages_count > 0 else ""), 
            languages_count, us_citizen]

In [17]:
new_data = [makeData() for _ in range(gen_num)]
ndDF = pd.DataFrame(new_data)
ndDF.columns = headers + ["SSN", "Languages", "Languages Count", "US Citizen"]
num_rows = len(csvDF)

In [18]:
ndDF.to_csv("./faker-employees.csv")
ndDF

Unnamed: 0,First Name,Last Name,Email,Phone,Gender,Age,Job Title,Years Of Experience,Salary,Department,SSN,Languages,Languages Count,US Citizen
0,Charles,Williams,xross@example.net,(775)415-5498,male,65,HR Manager,15,139048,Human Resource,001-80-6268,"English,",1,False
1,Matthew,Young,harveycrystal@example.com,001-951-794-1556x768,female,40,HR Manager,12,49136,Human Resource,161-90-3450,"English,",1,True
2,Gregory,Dalton,erik41@example.net,001-585-545-9608,female,53,HR Manager,3,63025,Human Resource,714-83-7522,"English,",1,True
3,Kevin,Moore,anna62@example.org,+1-556-319-1891,female,37,Tester,8,67248,Finance,561-72-8198,"English,",1,True
4,Kevin,Payne,melinda06@example.com,964.903.8076,female,41,Project Manager,15,51668,Product,364-24-2440,"English, German",2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Heather,Green,matthewrobinson@example.org,7212966669,male,45,Web Developer,7,141771,Finance,153-99-4455,"English, Chinese, Chinese",3,False
9996,Jackie,Woods,elittle@example.net,+1-305-969-1630x26542,male,39,Designer,16,192201,Product,534-42-1627,"English,",1,False
9997,Daisy,Miller,jerry26@example.net,983-355-8587x30261,female,64,DevOps Engineer,42,190599,Product,221-99-5044,"English, French, French",3,True
9998,Melissa,Roberts,dhodge@example.org,+1-631-971-8082x5039,male,20,Tester,2,37621,Marketing,409-19-7915,"English,",1,True
