In [None]:
# Necessary Libraries
import pandas as pd
import numpy as np
import random
from datetime import timedelta, date
from scipy.stats import norm, skewnorm
from faker import Faker

In [None]:
# For reproducibility (remove or change the three seed calls below for different output on each run)
fake = Faker()
# Faker draws from its own generator, which random.seed()/np.random.seed() do not reach,
# so it has to be seeded separately for the generated names to repeat between runs.
Faker.seed(10)
np.random.seed(10)
random.seed(10)

In [None]:
# Context constants

# Date constants
# Last day covered by the generated records; its year is the upper bound of the operating years
max_date = date(year=2024, month=12, day=31)


# States and cities (with at least 100,000 population)
# Layout: {state name: (state abbreviation, [(city, population), ...])}
# States without a city above 100,000 list only their largest city (marked inline).
states_cities_dict = {
    'Alabama': ('AL', [
        ('Huntsville', 221933),
        ('Montgomery', 196986),
        ('Birmingham', 196910),
        ('Mobile', 183289),
        ('Tuscaloosa', 110602),
    ]),
    'Alaska': ('AK', [
        ('Anchorage', 287145),
    ]),
    'Arizona': ('AZ', [
        ('Phoenix', 1644000),
        ('Tucson', 546574),
        ('Mesa', 512498),
        ('Chandler', 280711),
        ('Gilbert', 275346),
        ('Glendale', 252136),
        ('Scottsdale', 243050),
        ('Peoria', 197866),
        ('Tempe', 185950),
        ('Surprise', 154198),
    ]),
    'Arkansas': ('AR', [
        ('Little Rock', 202864),
    ]),
    'California': ('CA', [
        ('Los Angeles', 3822000),
        ('San Diego', 1381000),
        ('San Jose', 971233),
        ('San Francisco', 808437),
        ('Fresno', 545567),
        ('Sacramento', 528001),
        ('Long Beach', 451307),
        ('Oakland', 430553),
        ('Bakersfield', 410647),
        ('Anaheim', 344461),
        ('Stockton', 321819),
        ('Irvine', 313685),
        ('Santa Ana', 308189),
        ('Chula Vista', 279170),
        ('Fremont', 223871),
        ('Santa Clarita', 221345),
        ('San Bernardino', 220328),
        ('Modesto', 218069),
        ('Fontana', 212475),
        ('Moreno Valley', 211924),
        ('Oxnard', 200415),
        ('Huntington Beach', 194310),
        ('Glendale', 189221),
        ('Santa Rosa', 177181),
        ('Rancho Cucamonga', 176336),
        ('Oceanside', 172199),
        ('Palmdale', 163463),
        ('Sunnyvale', 153091),
        ('Escondido', 150270),
        ('Pomona', 146017),
        ('Visalia', 143966),
        ('Fullerton', 140541),
        ('Victorville', 137221),
        ('Pasadena', 134211),
        ('Santa Clara', 126930),
        ('Simi Valley', 124398),
        ('Thousand Oaks', 124265),
        ('Berkeley', 118950),
        ('Carlsbad', 114160),
        ('Temecula', 111752),
        ('Santa Maria', 110125),
        ('Ventura', 109527),
        ('Costa Mesa', 109521),
        ('Burbank', 103920),
        ('Inglewood', 103621),
        ('Vacaville', 101918),
        ('San Mateo', 100984),
    ]),
    'Colorado': ('CO', [
        ('Denver', 713252),
        ('Colorado Springs', 486248),
        ('Aurora', 393537),
        ('Fort Collins', 169249),
        ('Lakewood', 156120),
        ('Thornton', 143282),
        ('Arvada', 121414),
        ('Westminster', 114875),
        ('Greeley', 112609),
        ('Pueblo', 111077),
        ('Boulder', 102000),
    ]),
    'Connecticut': ('CT', [
        ('Bridgeport', 148377),
        ('New Haven', 138915),
        ('Stamford', 136188),
        ('Hartford', 120686),
        ('Waterbury', 115016),
    ]),
    'Delaware': ('DE', [
        ('Wilmington', 71569),  # Does not have a city above 100,000 so only the largest city is included
    ]),
    'Florida': ('FL', [
        ('Jacksonville', 971319),
        ('Miami', 449514),
        ('Tampa', 398173),
        ('Orlando', 316081),
        ('St. Petersburg', 261256),
        ('Hialeah', 220292),
        ('Port St. Lucie', 231790),
        ('Cape Coral', 216992),
        ('Tallahassee', 201731),
        ('Fort Lauderdale', 183146),
        ('Pembroke Pines', 169876),
        ('Gainesville', 145214),
        ('Miramar', 137228),
        ('Coral Springs', 133369),
        ('Palm Bay', 129234),
        ('West Palm Beach', 120932),
        ('Clearwater', 117027),
        ('Pompano Beach', 112302),
        ('Miami Gardens', 110497),
    ]),
    'Georgia': ('GA', [
        ('Atlanta', 499127),
        ('Columbus', 202616),
        ('Augusta', 202096),
        ('Savannah', 148004),
        ('Athens', 128561),
    ]),
    'Hawaii': ('HI', [
        ('Honolulu', 349913),
    ]),
    'Idaho': ('ID', [
        ('Boise', 236634),
        ('Meridian', 129736),
    ]),
    'Illinois': ('IL', [
        ('Chicago', 2665000),
        ('Aurora', 177866),
        ('Joliet', 150033),
        ('Naperville', 149936),
        ('Rockford', 146713),
        ('Springfield', 113273),
        ('Elgin', 113177),
        ('Peoria', 111021),
    ]),
    'Indiana': ('IN', [
        ('Indianapolis', 880621),
        ('Fort Wayne', 267927),
        ('Evansville', 115749),
        ('South Bend', 103110),
        ('Fishers', 101966),
    ]),
    'Iowa': ('IA', [
        ('Des Moines', 211034),
        ('Cedar Rapids', 136429),
        ('Davenport', 100486),
    ]),
    'Kansas': ('KS', [
        ('Wichita', 396192),
        ('Overland Park', 197726),
        ('Kansas City', 153345),
        ('Olathe', 145616),
        ('Topeka', 125449),
    ]),
    'Kentucky': ('KY', [
        ('Louisville', 624444),
        ('Lexington', 320347),
    ]),
    'Louisiana': ('LA', [
        ('New Orleans', 369749),
        ('Baton Rouge', 221453),
        ('Shreveport', 180153),
        ('Metairie', 140046),
        ('Lafayette', 121389),
    ]),
    'Maine': ('ME', [
        ('Portland', 68424),  # Does not have a city above 100,000 so only the largest city is included
    ]),
    'Maryland': ('MD', [
        ('Baltimore', 569931),
    ]),
    'Massachusetts': ('MA', [
        ('Boston', 650706),
        ('Worcester', 205319),
        ('Springfield', 154064),
        ('Cambridge', 118488),
        ('Lowell', 113608),
    ]),
    'Michigan': ('MI', [
        ('Detroit', 620376),
        ('Grand Rapids', 196908),
        ('Warren', 137107),
        ('Sterling Heights', 132567),
        ('Ann Arbor', 119875),
        ('Lansing', 112537),
    ]),
    'Minnesota': ('MN', [
        ('Minneapolis', 425096),
        ('Saint Paul', 303176),
        # NOTE(review): same figure as Rochester, New York further down -- possibly a copy/paste slip; verify
        ('Rochester', 209352),
    ]),
    'Mississippi': ('MS', [
        ('Jackson', 145995),
    ]),
    'Missouri': ('MO', [
        ('Kansas City', 509297),
        ('St. Louis', 286578),
        ('Springfield', 170067),
        ('Columbia', 128555),
        ('Independence', 121202),
    ]),
    'Montana': ('MT', [
        ('Billings', 119960),
    ]),
    'Nebraska': ('NE', [
        ('Omaha', 485153),
        ('Lincoln', 292627),
    ]),
    'Nevada': ('NV', [
        ('Las Vegas', 656274),
        ('Henderson', 331415),
        ('Reno', 273448),
        ('North Las Vegas', 280543),
        ('Spring Valley', 220114),
        ('Sunrise Manor', 198325),
        ('Paradise', 189733),
        ('Sparks', 109226),
    ]),
    'New Hampshire': ('NH', [
        ('Manchester', 115141),
    ]),
    'New Jersey': ('NJ', [
        ('Newark', 305344),
        ('Jersey City', 286670),
        ('Paterson', 156661),
        ('Elizabeth', 134283),
        ('Lakewood', 104157),
        ('Edison', 100693),
        ('Woodbridge', 100450),
    ]),
    'New Mexico': ('NM', [
        ('Albuquerque', 561008),
        ('Las Cruces', 113888),
        ('Rio Rancho', 110660),
    ]),
    'New York': ('NY', [
        ('New York City', 8336000),
        ('Hempstead', 768103),
        ('Brookhaven', 482436),
        ('Islip', 330914),
        ('Buffalo', 276486),
        ('Rochester', 209352),
        ('Huntington', 201546),
        ('Yonkers', 208121),
        ('Syracuse', 144451),
        ('Smithtown', 116384),
    ]),
    'North Carolina': ('NC', [
        ('Charlotte', 897720),
        ('Raleigh', 476587),
        ('Greensboro', 301115),
        ('Durham', 291928),
        ('Winston-Salem', 251350),
        ('Fayetteville', 208873),
        ('Cary', 180388),
        ('Wilmington', 120324),
        ('High Point', 115067),
    ]),
    'North Dakota': ('ND', [
        ('Fargo', 131444),
    ]),
    'Ohio': ('OH', [
        ('Columbus', 907971),
        ('Cleveland', 361607),
        ('Cincinnati', 309513),
        ('Toledo', 266301),
        ('Akron', 188509),
        ('Dayton', 135944),
    ]),
    'Oklahoma': ('OK', [
        ('Oklahoma City', 694800),
        ('Tulsa', 411867),
        ('Norman', 129627),
        ('Broken Arrow', 117911),
    ]),
    'Oregon': ('OR', [
        ('Portland', 635067),
        ('Salem', 177487),
        ('Eugene', 177923),
        ('Gresham', 111621),
        ('Hillsboro', 107299),
    ]),
    'Pennsylvania': ('PA', [
        ('Philadelphia', 1567000),
        ('Pittsburgh', 302898),
        ('Allentown', 125094),
    ]),
    'Rhode Island': ('RI', [
        ('Providence', 189563),
    ]),
    'South Carolina': ('SC', [
        ('Charleston', 153672),
        ('Columbia', 139698),
        ('North Charleston', 118608),
    ]),
    'South Dakota': ('SD', [
        ('Sioux Falls', 202078),
    ]),
    'Tennessee': ('TN', [
        ('Nashville', 692587),
        ('Memphis', 651700),
        ('Knoxville', 187574),
        ('Chattanooga', 184086),
        ('Clarksville', 176974),
        ('Murfreesboro', 162398),
    ]),
    'Texas': ('TX', [
        ('Houston', 2303000),
        ('San Antonio', 1473000),
        ('Dallas', 1300000),
        ('Austin', 974447),
        ('Fort Worth', 956709),
        ('El Paso', 677456),
        ('Arlington', 394602),
        ('Corpus Christi', 316239),
        ('Plano', 289547),
        ('Lubbock', 263930),
        ('Laredo', 256187),
        ('Garland', 240854),
        ('Grand Prairie', 201843),
        ('Amarillo', 201291),
        ('McKinney', 207507),
        ('Frisco', 219587),
        ('Brownsville', 189382),
        ('Killeen', 159172),
        ('Denton', 150353),
        ('Mesquite', 147899),
        ('McAllen', 144579),
        ('Waco', 143984),
        ('Carrollton', 133820),
        ('Lewisville', 131215),
        ('Abilene', 127385),
        ('Pearland', 126949),
        ('Round Rock', 126697),
        ('College Station', 124319),
        ('League City', 115418),
        ('Sugar Land', 109414),
        ('Wichita Falls', 102664),
    ]),
    'Utah': ('UT', [
        ('Salt Lake City', 204657),
        ('West Valley City', 136650),
        ('West Jordan', 116664),
        ('Provo', 113523),
    ]),
    'Vermont': ('VT', [
        ('Burlington', 44595),  # Does not have a city above 100,000 so only the largest city is included
    ]),
    'Virginia': ('VA', [
        ('Virginia Beach', 455618),
        ('Chesapeake', 252488),
        ('Norfolk', 232995),
        ('Richmond', 229395),
        ('Newport News', 184306),
        ('Alexandria', 155525),
        ('Hampton', 138037),
    ]),
    'Washington': ('WA', [
        ('Seattle', 749256),
        ('Spokane', 230160),
        ('Tacoma', 221776),
        ('Vancouver', 194512),
        ('Bellevue', 152767),
        ('Everett', 111337),
        ('Renton', 104047),
    ]),
    'West Virginia': ('WV', [
        ('Charleston', 47129),  # Does not have a city above 100,000 so only the largest city is included
    ]),
    'Wisconsin': ('WI', [
        ('Milwaukee', 563305),
        ('Madison', 272903),
        ('Green Bay', 106095),
    ]),
    'Wyoming': ('WY', [
        ('Cheyenne', 64610),  # Does not have a city above 100,000 so only the largest city is included
    ]),
}

# One (city, state, state_abbr, population) row per city, in dictionary order
cities_states_flat = [
    (city, state, abbr, pop)
    for state, (abbr, cities) in states_cities_dict.items()
    for city, pop in cities
]

# Same rows as a DataFrame
cities_states_df = pd.DataFrame(cities_states_flat, columns=['city', 'state', 'state_abbr', 'population'])


# Probability of each employee gender at the company, ordered as [Female, Male, Non Conforming]
gender_prob = [0.46, 0.46, 0.08]


# Constants for choose_possible_locations()
# Ten (start, end) pairs; each pair starts where the previous one ends.
# NOTE(review): presumably rank ranges over the city list -- confirm against choose_possible_locations()
bin_levels = list(zip(
    (1, 28, 56, 84, 112, 140, 168, 196, 224, 252),
    (28, 56, 84, 112, 140, 168, 196, 224, 252, 284),
))


# Possible age ranges with the 2022 employment rate of each bracket as reference (rates from Statista)
# 'Range' holds the bracket's two age bounds; 'Probability' holds the employment rate in percent.
age_range_probabilities = {
    '16-24': {'Range': (16, 24), 'Probability': 41.5},
    '25-29': {'Range': (25, 29), 'Probability': 79.1},
    '30-34': {'Range': (30, 34), 'Probability': 80.7},
    '35-44': {'Range': (35, 44), 'Probability': 80.5},
    '45-54': {'Range': (45, 54), 'Probability': 79.1},
    '55 and Over': {'Range': (55, 75), 'Probability': 41.1},
}

# Bracket weights rescaled to sum to 1, in the same order as the dictionary keys.
# Built as a float array up front so the in-place division is a real array operation
# rather than relying on NumPy coercing a plain list.
age_probabilities = np.array(
    [bracket['Probability'] for bracket in age_range_probabilities.values()],
    dtype=float,
)
age_probabilities /= np.sum(age_probabilities)


# Race and ethnicity demographics using census bureau article as reference
# Values are relative weights per category; they are rescaled to sum to 1 below.
race_or_ethnicity_prob = {
    "White": 57.8,
    "Hispanic and/or Latino": 18.7,
    "Black and/or African American": 12.1,
    "Asian": 6.4,
    "Multi": 3.4,
    "American Indian and/or Alaska Native": 1.3,
    "Native Hawaiian and/or Other Pacific Islander alone": 0.3,
}

# Category labels and their normalized weights, kept in matching order
possible_races = list(race_or_ethnicity_prob)

race_or_ethnicity_prob_array = np.fromiter(race_or_ethnicity_prob.values(), dtype=float)
race_or_ethnicity_prob_array = race_or_ethnicity_prob_array / race_or_ethnicity_prob_array.sum()


# Highest education attained, split by age group (25 and above / below 25),
# using census bureau articles as reference. Values are relative weights.
highest_education = {
    "Above 25": {
        "Less Than High School": 8.9,
        "High School": 27.9,
        "Some College but No Degree": 14.9,
        "Associate's": 10.5,
        "Bachelor's": 23.5,
        "Master's": 10.4,
        "Professional Degree": 2,
        "PhD": 2,
    },
    "Below 25": {
        "Less Than High School": 8.9,
        "High School": 33.1,
        "Some College but No Degree": 17.67,
        "Associate's": 12.46,
        "Bachelor's": 27.87,
    },
}

education_above25 = highest_education["Above 25"]
education_below25 = highest_education["Below 25"]

# Labels and weights for each age group, in matching order; weights are rescaled to sum to 1
possible_education_above25 = list(education_above25)
education_above25_prob = np.fromiter(education_above25.values(), dtype=float)
education_above25_prob = education_above25_prob / sum(education_above25_prob)

possible_education_below25 = list(education_below25)
education_below25_prob = np.fromiter(education_below25.values(), dtype=float)
education_below25_prob = education_below25_prob / sum(education_below25_prob)

# Numeric code for each education level, for later manipulation
education_level_num = dict(zip(
    (
        "Less Than High School",
        "High School",
        "Some College but No Degree",
        "Associate's",
        "Bachelor's",
        "Master's",
        "Professional Degree",
        "PhD",
    ),
    range(1, 9),
))


# Dictionary for department, job title, salary, and education requirement
# Layout: {department: {job title: {'Salary Range': (a, b), 'Education Requirement': [x, y]}}}
# The education entries are numeric education-level codes; how the pair is interpreted
# (inclusive range vs. two accepted levels) is up to the consuming code, not visible here.

def _job_spec(salary_first, salary_second, edu_first, edu_second):
  '''Builds one job entry; every call returns fresh dict/tuple/list objects.'''
  return {
      'Salary Range': (salary_first, salary_second),
      'Education Requirement': [edu_first, edu_second],
  }


department_jobtitle_salary_education = {
    'Research and Development': {
        'Laboratory Assistant': _job_spec(28000, 40000, 1, 2),
        'Laboratory Technician': _job_spec(30000, 45000, 2, 3),
        'Research Assistant': _job_spec(35000, 55000, 3, 4),
        'Junior Researcher': _job_spec(45000, 70000, 4, 5),
        'Project Manager': _job_spec(70000, 120000, 5, 6),
        'Research Scientist': _job_spec(80000, 150000, 6, 8),
        'Senior Research Scientist': _job_spec(110000, 170000, 7, 8),
        'Director of R&D': _job_spec(150000, 250000, 7, 8),
    },
    'IT': {
        'IT Support Specialist': _job_spec(40000, 60000, 2, 3),
        'Junior Developer': _job_spec(45000, 65000, 3, 4),
        'Network Technician': _job_spec(50000, 75000, 3, 4),
        'Systems Administrator': _job_spec(60000, 90000, 4, 5),
        'Software Developer': _job_spec(70000, 110000, 5, 6),
        'Data Analyst': _job_spec(60000, 90000, 5, 6),
        'Cybersecurity Analyst': _job_spec(85000, 120000, 5, 6),
        'IT Manager': _job_spec(95000, 150000, 6, 7),
        'DevOps Engineer': _job_spec(95000, 140000, 6, 7),
        'Chief Information Officer (CIO)': _job_spec(150000, 300000, 7, 8),
    },
    'HR': {
        'HR Assistant': _job_spec(35000, 50000, 2, 3),
        'HR Coordinator': _job_spec(40000, 60000, 3, 4),
        'Recruitment Specialist': _job_spec(45000, 70000, 4, 5),
        'HR Generalist': _job_spec(50000, 80000, 4, 5),
        'HR Manager': _job_spec(75000, 110000, 5, 6),
        'Compensation and Benefits Specialist': _job_spec(60000, 90000, 5, 6),
        'Training and Development Manager': _job_spec(80000, 120000, 6, 7),
        'Director of Human Resources': _job_spec(100000, 180000, 6, 7),
        'Chief Human Resources Officer (CHRO)': _job_spec(150000, 250000, 7, 8),
    },
    'Finance': {
        'Accounting Clerk': _job_spec(35000, 50000, 2, 3),
        'Accounts Payable Specialist': _job_spec(40000, 60000, 3, 4),
        'Bookkeeper': _job_spec(45000, 65000, 3, 4),
        'Staff Accountant': _job_spec(55000, 80000, 4, 5),
        'Financial Analyst': _job_spec(70000, 100000, 5, 6),
        'Finance Manager': _job_spec(90000, 130000, 6, 7),
        'Controller': _job_spec(100000, 150000, 6, 7),
        'Director of Finance': _job_spec(130000, 200000, 6, 8),
        'Chief Financial Officer (CFO)': _job_spec(150000, 300000, 7, 8),
    },
    'Marketing': {
        'Marketing Assistant': _job_spec(35000, 50000, 2, 3),
        'Social Media Coordinator': _job_spec(40000, 60000, 3, 4),
        'Content Creator': _job_spec(45000, 65000, 3, 4),
        'SEO Specialist': _job_spec(55000, 85000, 4, 5),
        'Marketing Specialist': _job_spec(55000, 80000, 4, 5),
        'Marketing Manager': _job_spec(80000, 120000, 5, 6),
        'Brand Manager': _job_spec(90000, 140000, 5, 6),
        'Product Marketing Manager': _job_spec(95000, 150000, 6, 7),
        'Chief Marketing Officer (CMO)': _job_spec(150000, 300000, 7, 8),
    },
    'Sales': {
        'Sales Associate': _job_spec(30000, 45000, 1, 2),
        'Sales Representative': _job_spec(35000, 50000, 2, 3),
        'Inside Sales Representative': _job_spec(40000, 60000, 3, 4),
        'Account Manager': _job_spec(50000, 80000, 4, 5),
        'Sales Manager': _job_spec(80000, 120000, 5, 6),
        'Regional Sales Director': _job_spec(100000, 150000, 5, 6),
        'Vice President of Sales': _job_spec(150000, 250000, 6, 7),
    },
    'Operations': {
        'Warehouse Worker': _job_spec(30000, 40000, 1, 2),
        'Forklift Operator': _job_spec(35000, 45000, 2, 3),
        'Logistics Coordinator': _job_spec(40000, 60000, 3, 4),
        'Inventory Specialist': _job_spec(45000, 65000, 3, 4),
        'Production Planner': _job_spec(55000, 80000, 4, 5),
        'Operations Manager': _job_spec(70000, 110000, 5, 6),
        'Supply Chain Manager': _job_spec(90000, 140000, 6, 7),
        'Director of Operations': _job_spec(120000, 180000, 6, 8),
    },
    'Customer Service': {
        'Customer Service Representative': _job_spec(30000, 40000, 2, 3),
        'Help Desk Technician': _job_spec(35000, 50000, 3, 4),
        'Call Center Supervisor': _job_spec(40000, 55000, 4, 5),
        'Customer Support Manager': _job_spec(50000, 70000, 4, 5),
        'Customer Experience Manager': _job_spec(70000, 100000, 5, 6),
        'Director of Customer Service': _job_spec(90000, 130000, 6, 7),
    },
    'Legal': {
        'Legal Assistant': _job_spec(35000, 50000, 2, 3),
        'Paralegal': _job_spec(40000, 60000, 3, 4),
        'Legal Secretary': _job_spec(45000, 65000, 3, 4),
        'Compliance Officer': _job_spec(90000, 140000, 5, 6),
        'Contract Manager': _job_spec(80000, 130000, 5, 6),
        'Corporate Lawyer': _job_spec(100000, 180000, 7, 8),
        'General Counsel': _job_spec(180000, 300000, 7, 8),
    },
    'Administration': {
        'Office Assistant': _job_spec(28000, 40000, 1, 2),
        'Administrative Assistant': _job_spec(35000, 50000, 2, 3),
        'Office Manager': _job_spec(50000, 70000, 3, 4),
        'Executive Assistant': _job_spec(60000, 90000, 4, 5),
        'Facilities Manager': _job_spec(70000, 100000, 4, 5),
        'Administrative Manager': _job_spec(80000, 120000, 5, 6),
        'Chief Administrative Officer': _job_spec(120000, 200000, 6, 8),
    },
    'Procurement': {
        'Procurement Clerk': _job_spec(35000, 50000, 2, 3),
        'Purchasing Assistant': _job_spec(40000, 60000, 3, 4),
        'Buyer': _job_spec(50000, 75000, 4, 5),
        'Procurement Specialist': _job_spec(60000, 90000, 5, 6),
        'Supply Chain Analyst': _job_spec(70000, 110000, 5, 6),
        'Purchasing Manager': _job_spec(80000, 120000, 6, 7),
        'Director of Procurement': _job_spec(120000, 180000, 6, 8),
    },
    'Health and Safety': {
        'Safety Officer': _job_spec(40000, 60000, 2, 3),
        'Environmental Technician': _job_spec(45000, 65000, 3, 4),
        'Occupational Health Specialist': _job_spec(50000, 75000, 4, 5),
        'Health and Safety Coordinator': _job_spec(60000, 90000, 4, 5),
        'Environmental Health and Safety Manager': _job_spec(80000, 120000, 5, 6),
        'Director of Environmental Health and Safety': _job_spec(100000, 150000, 6, 7),
    },
    'Education and Training': {
        'Training Assistant': _job_spec(30000, 45000, 2, 3),
        'Corporate Trainer': _job_spec(40000, 60000, 3, 4),
        'Instructional Designer': _job_spec(55000, 80000, 5, 6),
        'Training Coordinator': _job_spec(50000, 75000, 4, 5),
        'Training and Development Manager': _job_spec(80000, 120000, 6, 7),
        'Director of Training': _job_spec(120000, 180000, 6, 8),
    },
}

# The builder is only needed while the table is constructed; drop it from the namespace
del _job_spec


# Performance ratings' probabilities
# NOTE(review): these five values add up to 1.03, not 1 -- confirm the consumer
# normalizes them (numpy's choice(p=...) rejects weights that do not sum to 1).
performance_rating_prob = {
    'Outstanding': 0.15,
    'Very Satisfactory': 0.25,
    'Satisfactory': 0.30,
    'Unsatisfactory': 0.22,
    'Poor': 0.11,
}

# Multiplier per satisfaction level (how it is applied is not visible in this cell)
penalty_coefficients = {
    'Very Dissatisfied': 1.5,
    'Dissatisfied': 1.25,
    'Neutral': 1.0,
    'Satisfied': 0.75,
    'Very Satisfied': 0.5,
}


# Performance adjustments for termination, keyed by performance rating
performance_adjustment_prob = {
    'Outstanding': 0,
    'Very Satisfactory': 0.05,
    'Satisfactory': 0.10,
    'Unsatisfactory': 0.20,
    'Poor': 0.40,
}


# The probability distribution of firings within the company based on year
# NOTE(review): the values add up to 0.81 and the keys start at 2004 -- presumably
# re-normalized over the chosen operating years by the consumer; verify.
years_firings_prob = {
    2004: 0.01,
    2005: 0.02,
    2006: 0.02,
    2007: 0.11,  # Financial crisis has the largest firings
    2008: 0.17,  # Financial crisis has the largest firings
    2009: 0.05,
    2010: 0.04,
    2011: 0.02,
    2012: 0.03,
    2013: 0.02,
    2014: 0.03,
    2015: 0.02,
    2016: 0.01,
    2017: 0.01,
    2018: 0.02,
    2019: 0.03,
    2020: 0.10,  # Covid firings
    2021: 0.04,
    2022: 0.03,
    2023: 0.02,
    2024: 0.01,
}

# The probability distribution of termination reasons
termination_prob = {
    'Voluntary Resignation': 0.4,
    'Involuntary Termination': 0.2,
    'Retirement': 0.15,
    'End of Contract': 0.1,
    'Layoff': 0.1,
    'Other': 0.05,
}

In [None]:
# Creating functions for data generation

def get_num_records():
  '''
  Prompts the user for how many employee records to generate

  Keeps asking until a whole number from 1 to 100,000 is entered

  Parameters:
    None

  Returns:
    num_records(int): number of employee records to generate
  '''
  while True:
    answer = input("Enter the number of records to generate (max 100000):  ")

    # Anything that is not a whole number is treated like an out-of-range value
    try:
      num_records = int(answer)
    except ValueError:
      num_records = None

    if num_records is not None and 0 < num_records <= 100000:
      return num_records

    print("Invalid input: Please enter a number between 1 and 100,000")


def get_op_years():
  '''
  Asks the user which year the company records should start in

  The records always run through max_date's year, so the accepted
  start years are 2000 up to and including that year

  Parameters:
    None

  Returns:
    op_years(array): every operating year from the chosen start year through max_date's year
  '''
  while True:
    print("By default the end date of records 2024-12-31")
    answer = input("Enter the year the records should begin as a whole number from 2000 -- 2024:  ")

    # Non-numeric answers are handled the same way as out-of-range years
    try:
      start_year = int(answer)
    except ValueError:
      start_year = None

    if start_year is not None and 2000 <= start_year <= max_date.year:
      return np.arange(start_year, max_date.year + 1)

    print("Invalid input: your input did not follow the guidelines")


def generate_unique_emp_id():
  '''
  Generates an employee ID that has not been handed out before

  IDs are five-digit, zero-padded strings ("00000" -- "99999"), i.e. 100,000
  distinct values, enough for the largest run get_num_records allows.
  Issued IDs are remembered on the function itself
  (generate_unique_emp_id.issued_ids) so that uniqueness holds across calls;
  clear that set, or re-run this definition, to start a fresh batch.

  Parameters:
    None

  Returns:
    emp_id(str): the unique employee ID

  Raises:
    RuntimeError: if all 100,000 possible IDs have already been issued
  '''
  issued_ids = generate_unique_emp_id.issued_ids

  # Without this guard the retry loop below would never end once the pool is used up
  if len(issued_ids) >= 100000:
    raise RuntimeError(
        "All 100,000 employee IDs are in use; "
        "clear generate_unique_emp_id.issued_ids to start over"
    )

  while True:
    emp_id = f"{random.randint(0, 99999):05d}"
    if emp_id not in issued_ids:
      issued_ids.add(emp_id)
      return emp_id


# Registry of IDs handed out so far; it must live outside the function body,
# otherwise every call starts from an empty set and duplicates slip through
generate_unique_emp_id.issued_ids = set()


def choose_name(gender):
  '''
  Picks a random first and last name, matching the first name to the gender

  'Female' and 'Male' use the gender-specific first-name generators;
  any other value uses the generic first-name generator

  Parameters:
    gender(str): the gender generated from a list of [Female, Male, Non Conforming]

  Return:
    first_name(str), last_name(str): the employee's first and last name
  '''
  generator_by_gender = {
      'Female': 'first_name_female',
      'Male': 'first_name_male',
  }
  generator_name = generator_by_gender.get(gender, 'first_name')

  # The first name is drawn before the last name
  first_name = getattr(fake, generator_name)()
  return first_name, fake.last_name()


def generate_dob(age_probabilities):
  '''
  Draws a date of birth that matches the company's age distribution

  An age bracket is sampled first, then a birth year that puts the employee
  inside that bracket at max_date, then a random day within that year

  Parameters:
    age_probabilities(array): predefined age probabilities, one per key of age_range_probabilities

  Returns:
    dob(date): the employee's date of birth
  '''
  bracket = np.random.choice(list(age_range_probabilities), p = age_probabilities)
  youngest, oldest = age_range_probabilities[bracket]['Range']

  # The oldest allowed age gives the earliest birth year and vice versa
  birth_year = random.randint(max_date.year - oldest, max_date.year - youngest)

  first_day = date(birth_year, 1, 1).toordinal()
  last_day = date(birth_year, 12, 31).toordinal()

  return date.fromordinal(random.randint(first_day, last_day))


def choose_education(age):
  '''
  Picks the employee's highest education level for their age group

  Under 18 is always 'Less Than High School'; ages 18-24 and ages 25+
  each sample from their own predefined options and probabilities

  Parameters:
    age(int): the employee's age

  Returns:
    education(str): the employee's highest education level
  '''
  if age < 18:
    return 'Less Than High School'

  if age >= 25:
    options, weights = possible_education_above25, education_above25_prob
  else:
    options, weights = possible_education_below25, education_below25_prob

  return np.random.choice(options, p= weights)


def choose_possible_locations(num_records):
  '''
  Samples the cities in which the company has employees

  Cities are drawn without replacement, weighted by population
  How many cities are drawn depends on the number of records: each block of
  10,000 records moves up one entry in bin_levels (capped at entry 9), and the
  count is a random integer between that entry's two bounds

  Parameters:
    num_records(int): the number of records users have chosen to generate

  Returns:
    possible_locations(dataframe): the rows of cities_states_df where employees can be located
  '''
  level = min(num_records // 10000, 9)
  fewest, most = bin_levels[level][0], bin_levels[level][1]
  city_count = random.randint(fewest, most)

  return cities_states_df.sample(
      n = city_count,
      weights= cities_states_df['population'],
      replace = False
  )


def choose_city_state(possible_locations):
  '''
  Picks the location an employee works at

  One row is drawn at random (uniformly) from the company's locations

  Parameters:
    possible_locations(dataframe): the possible cities which the company has locations,
      with city, state and state_abbr columns

  Returns:
    city(str), state(str), state_abbr(str): the city, state, and state abbreviation where employee is located
  '''
  location = possible_locations.sample(n = 1).iloc[0]

  return location.city, location.state, location.state_abbr


def generate_hired_date():
  '''
  Generates the employee hired date based on assumed Gaussian distribution of company growth-to-hiring process

  The hire year is sampled from op_years, weighted by a normal density centred
  on the mean operating year with the operating years' standard deviation;
  the day is then uniform within that year
  When the operating years have no spread (e.g. a single year) every year is
  equally likely, because a zero-width Gaussian cannot be evaluated

  Parameters:
    None

  Returns:
    hired_date(date): the date the employee was hired

  Raises:
    ValueError: if op_years is empty
  '''
  years = np.asarray(op_years)
  if years.size == 0:
    raise ValueError("op_years must contain at least one year")

  spread = np.std(years)

  if spread > 0:
    year_weights = norm.pdf(years, loc= np.mean(years), scale= spread)
    year_weights = year_weights / np.sum(year_weights)
  else:
    # norm.pdf with scale 0 yields NaN, which np.random.choice rejects
    year_weights = np.full(years.size, 1 / years.size)

  hired_year = int(np.random.choice(years, p= year_weights))
  possible_start_date = date(day=1, month = 1, year= hired_year).toordinal()
  possible_end_date = date(day=31, month = 12, year= hired_year).toordinal()

  hired_date = date.fromordinal(random.randint(possible_start_date, possible_end_date))

  return hired_date


def choose_job(education):
  '''
  Chooses the employee's department and job title based on their education

  A job is eligible when the employee's numeric education level lies inside
  the job's 'Education Requirement' [minimum, maximum] range; one eligible
  job is then picked uniformly at random

  Parameters:
    education(str): the education of the employee

  Returns:
    department(str), job_title(str): the employee's job title and accompanying department

  Raises:
    ValueError: if the education label is unknown, or no job accepts that education level
  '''
  education_level = education_level_num.get(education)
  if education_level is None:
    raise ValueError(f"Unknown education level: {education!r}")

  eligible_jobs = [
      (department, job_title)
      for department, jobs in department_jobtitle_salary_education.items()
      for job_title, job_info in jobs.items()
      if job_info['Education Requirement'][0] <= education_level <= job_info['Education Requirement'][1]
  ]

  # Fail loudly: callers unpack two values, so returning nothing here would
  # only surface later as an unrelated-looking unpacking error
  if not eligible_jobs:
    raise ValueError(f"No job accepts education level {education!r}")

  return random.choice(eligible_jobs)


def choose_salary(department, job_title):
  '''
  Draws a salary from the salary range configured for a job title

  Parameters:
    department(str): the department the job title belongs to in the dictionary
    job_title(str): the name of the job title

  Returns:
    salary(int): the employee's salary, a random whole number within the job's
      salary range (both ends included); None when the job is not configured
  '''
  jobs_in_department = department_jobtitle_salary_education.get(department, {})
  job_info = jobs_in_department.get(job_title, None)

  if not job_info:
    return None

  lowest, highest = job_info['Salary Range'][0], job_info['Salary Range'][1]
  return random.randint(lowest, highest)


def calculate_overtime_pay_yearly(salary, overtime):
  '''
  Calculates the employee's yearly overtime hours and pay
  Based on their salary and whether they did overtime

  Overtime is paid at 1.5 times the hourly wage, where the hourly wage is the
  salary spread over 2080 hours a year; the number of overtime hours is a
  random whole number from 1 to 260

  Parameters:
    salary(int): the employee's salary
    overtime(str): whether the employee did overtime (Yes/No)

  Returns:
    overtime_hours_total(int): the total number of overtime hours for the year
    total_overtime_pay(int): the total yearly overtime pay, rounded to a whole number
  '''
  if overtime != 'Yes':
    return 0, 0

  overtime_hours_total = random.randint(1, 260)

  # True division keeps the cents in the hourly wage; flooring it to whole
  # dollars first would underpay every overtime hour
  hourly_wage = salary / 2080
  total_overtime_pay = round(hourly_wage * 1.5 * overtime_hours_total)

  return overtime_hours_total, total_overtime_pay


def choose_performance_rating(satisfaction_rating):
  '''
  Samples a performance rating whose odds depend on the employee's satisfaction

  The satisfaction rating selects a penalty coefficient: the weights of the
  three lower ratings are multiplied by it and the weights of the two top
  ratings by its reciprocal, then all weights are rescaled to sum to 1

  Parameters:
    satisfaction_rating(str): 1 of 5 choices for the employees satisfaction rating

  Returns:
    performance_rating(str): the employee's performance rating (1 of 5 choices)
  '''
  rating_weights = dict(performance_rating_prob)
  penalty = penalty_coefficients[satisfaction_rating]
  boost = 1 / penalty

  multipliers = {
      'Outstanding': boost,
      'Very Satisfactory': boost,
      'Satisfactory': penalty,
      'Unsatisfactory': penalty,
      'Poor': penalty,
  }
  for rating, factor in multipliers.items():
    rating_weights[rating] *= factor

  weight_total = sum(rating_weights.values())
  ratings = list(rating_weights)
  probabilities = [rating_weights[rating] / weight_total for rating in ratings]

  return np.random.choice(ratings, p = probabilities)


def choose_employment_status(hired_date_result, performance_rating_result):
  '''
  Decides whether the employee is still employed or has been terminated

  The termination probability is the yearly value for the employee's hire
  year plus the adjustment for their performance rating; a year or rating
  without an entry contributes 0

  Parameters:
    hired_date_result(date): the date employee was hired
    performance_rating_result(str): the employees's performance rating

  Returns:
    employment_status(str): the employee's employment status(Employed/Terminated)
  '''
  yearly_risk = years_firings_prob.get(hired_date_result.year, 0)
  rating_risk = performance_adjustment_prob.get(performance_rating_result, 0)
  terminated_probability = yearly_risk + rating_risk

  outcomes = ['Terminated', 'Employed']
  outcome_probabilities = [terminated_probability, 1 - terminated_probability]

  return np.random.choice(outcomes, p=outcome_probabilities)


def choose_term_date(hired_date_result, employment_status_result):
  '''
  Chooses the employee's termination date based on hiring date and whether they were terminated

  A terminated employee always gets a date: a random day from the hire date up
  to the day before max_date. If the hire date leaves no room for that (hired
  on the last day or two of the records), the termination date is the hire date

  Parameters:
    hired_date_result(date): the date employee was hired
    employment_status_result(str): the employees's employment status

  Returns:
    term_date(date): the employee's termination date otherwise None
  '''
  if employment_status_result != 'Terminated':
    return None

  # Number of days between the hire date and the day before max_date
  days_available = (max_date - hired_date_result).days - 1

  if days_available <= 0:
    # No later day is available; a terminated record must still carry a date
    return hired_date_result

  return hired_date_result + timedelta(days = random.randint(0, days_available))


def calculate_years_employed(hired_date, term_date, employment_status):
  '''
  Calculates the number of full years the employee has worked at the company

  Tenure runs from the hire date to max_date for employed staff and to the
  termination date otherwise. Only completed years count: the year difference
  is reduced by one when the end date falls before the hire anniversary.
  A terminated employee without a termination date is reported as 0 years

  Parameters:
    hired_date(date): the day the employee was hired
    term_date(date): the day the employee was terminated if terminated
    employment_status(str): whether the employee is employed or terminated

  Returns:
    years_employed(int): the number of years an employee has worked at the company
  '''
  end_date = max_date if employment_status == "Employed" else term_date

  if end_date is None:
    return 0

  years_employed = end_date.year - hired_date.year

  # The anniversary has not been reached yet in the final calendar year
  if (end_date.month, end_date.day) < (hired_date.month, hired_date.day):
    years_employed -= 1

  return max(years_employed, 0)


def choose_overtime(employment_status):
  '''
  Decides whether the employee worked overtime this year

  Only currently employed staff can have overtime: they get 'Yes' with
  probability .20 and 'No' with probability .80; everyone else gets 'No'

  Parameters:
    employment_status(str): the status of the employee's employment

  Returns:
    overtime(str): Yes or No regarding employee's overtime
  '''
  if employment_status != 'Employed':
    return 'No'

  return np.random.choice(['Yes', 'No'], p = [.20, .80])


def choose_satisfaction_score(satisfaction_distribution):
  '''
  Chooses a satisfaction score from a (possibly skewed) Gaussian distribution

  The raw draw is centred on 50 with scale 25, then clipped to 0 -- 100 and
  rounded to a whole number

  Parameters:
    satisfaction_distribution(str): the chosen distribution
      ('Left Skewed', 'Right Skewed' or 'Normal')

  Return:
    satisfaction_score(int): the employee's satisfaction score

  Raises:
    ValueError: if satisfaction_distribution is not one of the three known names
  '''
  # NOTE(review): in scipy's skewnorm a positive shape parameter produces a
  # right (positive) skew, so these two labels look swapped -- confirm intent
  # before changing; the draws are kept exactly as they were
  if satisfaction_distribution == 'Left Skewed':
    raw_score = skewnorm.rvs(10, loc = 50, scale = 25)
  elif satisfaction_distribution == 'Right Skewed':
    raw_score = skewnorm.rvs(-5, loc = 50, scale = 25)
  elif satisfaction_distribution == 'Normal':
    raw_score = norm.rvs(loc = 50, scale = 25)
  else:
    # Name the bad value instead of failing later on an unassigned variable
    raise ValueError(f"Unknown satisfaction distribution: {satisfaction_distribution!r}")

  clipped_score = min(max(raw_score, 0), 100)

  return int(round(clipped_score))


def choose_satisfaction_rating(satisfaction_score):
  '''
  Converts a satisfaction score into its satisfaction rating

  Bands: up to 49 'Very Dissatisfied', 50-59 'Dissatisfied', 60-69 'Neutral',
  70-79 'Satisfied', 80-100 'Very Satisfied'

  Parameters:
    satisfaction_score(int): the employee's satisfaction score

  Returns:
    satisfaction_rating(str): the employee's satisfaction rating;
      None when the score falls in none of the bands (e.g. above 100)
  '''
  # (lowest score, highest score, rating); the first band has no lower limit
  rating_bands = (
      (None, 49, 'Very Dissatisfied'),
      (50, 59, 'Dissatisfied'),
      (60, 69, 'Neutral'),
      (70, 79, 'Satisfied'),
      (80, 100, 'Very Satisfied'),
  )

  for lowest, highest, rating in rating_bands:
    above_floor = lowest is None or lowest <= satisfaction_score
    if above_floor and satisfaction_score <= highest:
      return rating

  return None


def calculate_term_prob(satisfaction_rating, performance_rating, age):
  '''
  Builds the employee-specific probabilities of each termination reason

  Starts from the baseline termination_prob weights (left unmodified) and scales:
    - 'Voluntary Resignation' by 1.5 for dissatisfied, by .5 for satisfied employees
    - 'Involuntary Termination' by 1.5 for poor/unsatisfactory, by .5 for
      outstanding/very satisfactory performers
    - 'Retirement' by 2 from age 65, by .5 below age 50
  The result is rescaled so the probabilities sum to 1

  Parameters:
    satisfaction_rating(str): the employee's satisfaction rating (1 of 5 choices)
    performance_rating(str): the employee's performance rating (1 of 5 choices)
    age(int): the employee's age

  Returns:
    normalized_term_probs(dict): termination reason -> probability for this employee

  '''
  scalings = []

  if satisfaction_rating in ('Very Dissatisfied', 'Dissatisfied'):
    scalings.append(('Voluntary Resignation', 1.5))
  elif satisfaction_rating in ('Very Satisfied', 'Satisfied'):
    scalings.append(('Voluntary Resignation', .5))

  if performance_rating in ('Poor', 'Unsatisfactory'):
    scalings.append(('Involuntary Termination', 1.5))
  elif performance_rating in ('Outstanding', 'Very Satisfactory'):
    scalings.append(('Involuntary Termination', .5))

  if age >= 65:
    scalings.append(('Retirement', 2))
  elif age < 50:
    scalings.append(('Retirement', .5))

  # Work on a copy so the shared baseline stays untouched
  reason_weights = dict(termination_prob)
  for reason, factor in scalings:
    reason_weights[reason] *= factor

  weight_total = sum(reason_weights.values())

  return {reason: weight / weight_total for reason, weight in reason_weights.items()}


def choose_term_reason(employment_status, satisfaction_rating, performance_rating, age):
  '''
  Picks why a terminated employee left the company

  The reason is sampled using the employee-specific probabilities from
  calculate_term_prob; employees who were not terminated get 'Still Employed'

  Parameters:
    employment_status(str): the employment status of the employee (either 'Employed' or 'Terminated')
    satisfaction_rating(str): the employee's satisfaction rating (1 of 5 choices)
    performance_rating(str): the employee's performance rating (1 of 5 choices)
    age(int): the employee's age

  Returns:
    term_reason(str): the employee's termination reason (1 of 6 choices), or 'Still Employed'

  '''
  if employment_status != 'Terminated':
    return 'Still Employed'

  reason_probabilities = calculate_term_prob(satisfaction_rating, performance_rating, age)
  reasons = list(reason_probabilities)
  weights = [reason_probabilities[reason] for reason in reasons]

  return np.random.choice(reasons, p = weights)

In [None]:
# User input and the run-wide settings derived from it
# (this cell prompts interactively; the statement order also fixes the order
# in which the seeded random generators are consumed)

# How many employee records to build
num_records = get_num_records()
# Operating years of the company; generate_hired_date reads this name as a global
op_years = get_op_years()


# The cities the company operates in, sampled once for the whole run
possible_locations = choose_possible_locations(num_records)


# One satisfaction distribution is picked for the entire dataset and then
# passed to choose_satisfaction_score for every employee
satisfaction_distribution = random.choice([
    'Left Skewed', 'Right Skewed', 'Normal'
])

In [None]:
# Build one dictionary per employee, then turn the list into a DataFrame once
data =[]

# Each field is generated in dependency order: later fields reuse earlier ones
# (education needs age, job needs education, salary needs job, and so on)
# gender_prob, age_probabilities, possible_races and race_or_ethnicity_prob_array
# are expected to be defined in an earlier cell
for i in range(num_records):
  # Identity and demographics
  emp_id = generate_unique_emp_id()
  gender = np.random.choice(['Female', 'Male', 'Non Conforming'], p= gender_prob)
  first_name, last_name = choose_name(gender)
  dob = generate_dob(age_probabilities)
  # Age is the difference between the year of max_date and the birth year
  age = max_date.year - dob.year
  race_or_ethnicity = np.random.choice(possible_races, p=race_or_ethnicity_prob_array)
  education = choose_education(age)
  # Location and job
  city, state, state_abbr = choose_city_state(possible_locations)
  hired_date = generate_hired_date()
  department, job_title = choose_job(education)
  salary = choose_salary(department, job_title)
  # Satisfaction drives the performance rating, which in turn drives termination
  last_satisfaction_score = choose_satisfaction_score(satisfaction_distribution)
  last_satisfaction_rating = choose_satisfaction_rating(last_satisfaction_score)
  last_performance_rating = choose_performance_rating(last_satisfaction_rating)
  employment_status = choose_employment_status(hired_date, last_performance_rating)
  term_date = choose_term_date(hired_date, employment_status)
  term_reason = choose_term_reason(employment_status, last_satisfaction_rating, last_performance_rating, age)
  years_employed = calculate_years_employed(hired_date, term_date, employment_status)
  # Overtime
  overtime_this_year = choose_overtime(employment_status)
  overtime_hours_this_year, overtime_pay_this_year = calculate_overtime_pay_yearly(salary, overtime_this_year)

  # The keys below become the column names of the output table
  data.append({
      'employee_id': emp_id,
      'first_name': first_name,
      'last_name': last_name,
      'gender': gender,
      'dob': dob,
      'age': age,
      'race_or_ethnicity': race_or_ethnicity,
      'education': education,
      'city': city,
      'state': state,
      'state_abbr': state_abbr,
      'hired_date': hired_date,
      'department': department,
      'job_title': job_title,
      'salary': salary,
      'employment_status': employment_status,
      'term_date': term_date,
      'term_reason': term_reason,
      'years_employed': years_employed,
      'overtime_this_year': overtime_this_year,
      'overtime_hours_this_year': overtime_hours_this_year,
      'overtime_pay_this_year': overtime_pay_this_year,
      'last_performance_rating': last_performance_rating,
      'last_satisfaction_score': last_satisfaction_score,
      'last_satisfaction_rating': last_satisfaction_rating
  }
  )



# One row per generated employee
synthetic_data_df = pd.DataFrame(data)

# Preview the first 30 rows as the cell output
synthetic_data_df.head(30)

In [None]:
# Write the generated records to synthetic_data.csv (path relative to the
# working directory), leaving out the DataFrame's index column
synthetic_data_df.to_csv('synthetic_data.csv', index= False)