In [41]:
import pandas as pd
import numpy as np
import os
import pymysql
import random
import datetime
import math
import requests
import pyspark

from pyspark.sql import SparkSession
from random import randrange, randint
from datetime import timedelta

In [42]:
word_site = "https://www.mit.edu/~ecprice/wordlist.10000"

response = requests.get(word_site)
words = response.content.decode("utf-8").splitlines()
spark = SparkSession.builder.appName("Employee_Data").getOrCreate() 
spark

In [43]:
minimum_wage_df = pd.read_csv("Minimum_Wage_List.csv")
employees_df = spark.read.csv("Employee_Dummy.csv", header=True)
employees_df.columns

['first_name',
 'last_name',
 'company_name',
 'address',
 'city',
 'county',
 'state',
 'zip',
 'phone1',
 'phone2',
 'email',
 'web']

In [44]:
def repair_employees_df(employees_df, words):
    employees = employees_df.drop("phone2", "company_name", "county")
    employees = generate_unique_values(employees.toPandas(), words)
    
    employees = employees.rename({"address": "street", "phone1": "phone", "web": "website"}, axis=1)
    employee_id = [*range(1, 4500, 1)]
    employees.insert(0, "employee_id", employee_id)
    return employees

In [45]:
def generate_unique_values(employees, words):
    counter = 0
    
    
    for rows, columns in employees.iterrows():
        worker = (rows, columns)[1]
        
        street_numb = random.randint(1, 999)
        street = random.choice(words)
        street_ending = ["Ave", "St", "Blvd", "Ct", "Mnr", "RD", "Drive",
                     "Pkwy", "Terrace", "Hwy"]
        email_ending = ["@gmail.com", "@hotmail.com", "@yahoo.com", "@mail.com", "@venere.org", "@cox.net"]
    
        line_one = str(street_numb) + " " + street.capitalize() + " " + random.choice(street_ending)
        email = worker["first_name"][0] + worker["last_name"] + random.choice(email_ending)
        website = "http://" + worker["first_name"].lower() + random.choice(words) + ".com"
        
        if worker["address"] is None: worker["address"] = line_one
        if worker["zip"] is None: worker["zip"] = random.randint(10000, 99999)
        if worker["phone1"] is None: worker["phone1"] = "-".join([str(i) for i in random.sample(range(100, 999), 3)])
        if worker["email"] is None: worker["email"] = email
        if worker["web"] is None: worker["web"] = website
        
    return employees

In [46]:
def random_date(start, end): 
    between_dates = end - start
    days_between = between_dates.days

    random_numb_days = random.randrange(days_between)
    random_date = start + datetime.timedelta(days=random_numb_days)
    return random_date.date()

In [47]:
def repair_minimum_wage(minimum_wage):
    minimum_wage = minimum_wage.loc[:,["Minimum Wage Rate", "USPS Abbreviation"]]
    fixed_hourly = []
    
    for items in minimum_wage["Minimum Wage Rate"]:
        new_items = (items[1:-7])
        fixed_hourly.append(float(new_items))
        
    minimum_wage["Minimum Wage Rate"] = fixed_hourly
    return minimum_wage

In [61]:
def create_employee_attributes(employees, minimum_wage, start, end, database_starttime):
    employee_states = employees["state"].to_numpy()
    department_codes = [100, 200, 300, 400, 500]
    job_codes = [27345, 47893, 36289, 11532, 22634]
    pay = ["minimum", 60000, 100000, 120000, 110000]
    departments = ["Customer Service", "Manager", "Leadership", "IT", "IT"]
    work_benefits = ["Enrolled", "Not Enrolled"]

    jobs = ["Team Member", "Department Leader", "Store Leader", "Software Engineer", "Data Analyst"]
    job_desc = ["Help customers with complaints and questions, give customers information about products and services, take orders, and process returns",
                "Accomplishes department objectives by managing staff; planning and evaluating department activities. Maintains staff by recruiting, selecting, orienting, and training employees. Ensures a safe, secure, and legal work environment. Develops personal growth opportunities",
                "Manage everything related to the store. Store leaders manage the store's employees. They are directly involved in hiring, training, and even firing employees.",
                "Collects and stores data on sales numbers, market research, logistics, linguistics, or other behaviors. They bring technical expertise to ensure the quality and accuracy of that data, then process, design, and present it in ways to help people, businesses, and organizations make better decisions.",
                "Designs, develops and maintains computer software at a company. They use their creativity and technical skills and apply the principles of software engineering to help solve new and ongoing problems for an organization."]



    department_id = []
    job_id = []
    department = []
    job_title = []
    job_description = []
    start_date = []
    experience = []
    benefits = []
    status = []
    salary = []

    counter = 0

    for items in range(4499):
        code = random.choice(department_codes)
        index = department_codes.index(code)

        department_id.append(code)
        job_id.append(job_codes[index])
        department.append(departments[index])
        job_title.append(jobs[index])
        job_description.append(job_desc[index])
        benefits.append(random.choice(work_benefits))
        status.append("Currently Employed")

        employment_date = random_date(start, end)
        start_date.append(employment_date)

        years = math.trunc((database_starttime - employment_date).days / 365)
        experience.append(years)

        if pay[index] == "minimum":
            state = employee_states[counter]
            hourly_rate = minimum_wage.loc[minimum_wage["USPS Abbreviation"]==state]
            annual = round((hourly_rate["Minimum Wage Rate"].values[0] + years) * 40 * 52)
            salary.append(annual)
        else:
            annual = pay[index] + (10000 * years)
            salary.append(annual)


        counter += 1

    employees = employees.assign(
        job_id = job_id,
        department_id = department_id,
        department = department,
        job_title = job_title,
        job_description = job_description,
        start_date = start_date,
        years_with_company = experience,
        benefits = benefits,
        salary = salary,
        work_status = status


    )
    
    employees["department_id"].replace({500 : 400}, inplace=True)
    return employees

In [62]:
def employees_table(worker, cur, connection):
    sql = """INSERT INTO employees (employee_id, 
                                        first_name, 
                                        last_name, 
                                        street, 
                                        city, 
                                        state, 
                                        zip)
                                    
         values (%s, %s, %s, %s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["employee_id"], worker["first_name"], worker["last_name"], 
                      worker["street"], worker["city"], worker["state"], worker["zip"]))
    
    connection.commit()

In [63]:
def contact_table(worker, cur, connection):
    sql = """INSERT INTO contact (employee_id, 
                                    phone, 
                                    email, 
                                    website)
                                    
         values (%s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["employee_id"], worker["phone"], worker["email"], 
                      worker["website"]))
    
    connection.commit()

In [64]:
def identification_table(worker, cur, connection):
    sql = """INSERT INTO identification_codes (employee_id, 
                                                job_id, 
                                                department_id)
                                    
         values (%s, %s, %s) 
    """
    cur.execute(sql, (worker["employee_id"], worker["job_id"], worker["department_id"]))
    
    connection.commit()

In [65]:
def department_table(worker, cur, connection):
    sql = """INSERT INTO department (department_id, 
                                         department)
                                    
         values (%s, %s) 
    """
    cur.execute(sql, (worker["department_id"], worker["department"]))
    
    connection.commit()

In [66]:
def job_profile_table(worker, cur, connection):
    sql = """INSERT INTO job_profile (job_id, 
                                    job_title, 
                                    job_description, 
                                    start_date,
                                    years_company)
                                    
         values (%s, %s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["job_id"], worker["job_title"], worker["job_description"], 
                      worker["start_date"], worker["years_with_company"]))
    
    connection.commit()

In [67]:
def payment_information_table(worker, cur, connection):
    sql = """INSERT INTO payment_information (job_id, 
                                                job_salary, 
                                                benefits, 
                                                employment_status)
                                    
         values (%s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["job_id"], worker["salary"], worker["benefits"], 
                      worker["work_status"]))
    
    connection.commit()

In [68]:
def main():
    database_starttime = datetime.datetime.strptime("1/1/2020", "%m/%d/%Y").date()
    start = datetime.datetime.strptime("1/1/2010", "%m/%d/%Y")
    end = datetime.datetime.strptime("1/1/2020", "%m/%d/%Y")
    
    minimum_wage = repair_minimum_wage(minimum_wage_df)
    cleaned_employees = repair_employees_df(employees_df, words)
    employees = create_employee_attributes(cleaned_employees, minimum_wage, start, end, database_starttime)
    connection = pymysql.connect(host="localhost",
                     user="root",
                     password="Immalegacy5",
                     charset="utf8",
                     db="payroll_db")
    
    cur = connection.cursor()
    counter = 0
    for rows, columns in employees.iterrows():
        if counter % 100 == 0 and counter != 0: print("Inserted Total Rows of Data: ", counter)
        counter += 1
        worker = (rows, columns)[1]
        employees_table(worker, cur, connection)
        contact_table(worker, cur, connection)
        identification_table(worker, cur, connection)
        department_table(worker, cur, connection)
        job_profile_table(worker, cur, connection)
        payment_information_table(worker, cur, connection)
    print("Data Transferred Successfully")
    return employees

In [69]:
main()

Current Row Count:  100
Current Row Count:  200
Current Row Count:  300
Current Row Count:  400
Current Row Count:  500
Current Row Count:  600
Current Row Count:  700
Current Row Count:  800
Current Row Count:  900
Current Row Count:  1000
Current Row Count:  1100
Current Row Count:  1200
Current Row Count:  1300
Current Row Count:  1400
Current Row Count:  1500
Current Row Count:  1600
Current Row Count:  1700
Current Row Count:  1800
Current Row Count:  1900
Current Row Count:  2000
Current Row Count:  2100
Current Row Count:  2200
Current Row Count:  2300
Current Row Count:  2400
Current Row Count:  2500
Current Row Count:  2600
Current Row Count:  2700
Current Row Count:  2800
Current Row Count:  2900
Current Row Count:  3000
Current Row Count:  3100
Current Row Count:  3200
Current Row Count:  3300
Current Row Count:  3400
Current Row Count:  3500
Current Row Count:  3600
Current Row Count:  3700
Current Row Count:  3800
Current Row Count:  3900
Current Row Count:  4000
Current R

Unnamed: 0,employee_id,first_name,last_name,street,city,state,zip,phone,email,website,job_id,department_id,department,job_title,job_description,start_date,years_with_company,benefits,salary,work_status
0,1,James,Butt,6649 N Blue Gum St,New Orleans,LA,70116,504-621-8927,jbutt@gmail.com,http://www.bentonjohnbjr.com,36289,300,Leadership,Store Leader,Manage everything related to the store. Store ...,2018-06-28,1,Not Enrolled,110000,Currently Employed
1,2,Josephine,Darakjy,4 B Blue Ridge Blvd,Brighton,MI,48116,810-292-9388,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com,36289,300,Leadership,Store Leader,Manage everything related to the store. Store ...,2010-07-04,9,Not Enrolled,190000,Currently Employed
2,3,Art,Venere,8 W Cerritos Ave #54,Bridgeport,NJ,8014,856-636-8749,art@venere.org,http://www.chemeljameslcpa.com,22634,400,IT,Data Analyst,"Designs, develops and maintains computer softw...",2015-08-27,4,Enrolled,150000,Currently Employed
3,4,Lenna,Paprocki,639 Main St,Anchorage,AK,99501,907-385-4412,lpaprocki@hotmail.com,http://www.feltzprintingservice.com,27345,100,Customer Service,Team Member,"Help customers with complaints and questions, ...",2016-12-11,3,Not Enrolled,27747,Currently Employed
4,5,Donette,Foller,34 Center St,Hamilton,OH,45011,513-570-1893,donette.foller@cox.net,http://www.printingdimensions.com,11532,400,IT,Software Engineer,"Collects and stores data on sales numbers, mar...",2012-01-21,7,Enrolled,190000,Currently Employed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4494,4495,Brande,Rader,389 Plenty Ct,Sharpsburg,GA,67611,743-440-806,BRader@mail.com,http://brandereductions.com,22634,400,IT,Data Analyst,"Designs, develops and maintains computer softw...",2017-11-12,2,Enrolled,130000,Currently Employed
4495,4496,Heath,Brito,71 Ecommerce Ct,San Diego,CA,81984,950-189-904,HBrito@venere.org,http://heathlexus.com,22634,400,IT,Data Analyst,"Designs, develops and maintains computer softw...",2018-05-27,1,Not Enrolled,120000,Currently Employed
4496,4497,Scottie,Benites,453 Punk Hwy,Dallas,TX,17951,155-318-533,SBenites@cox.net,http://scottiefootage.com,36289,300,Leadership,Store Leader,Manage everything related to the store. Store ...,2016-08-20,3,Not Enrolled,130000,Currently Employed
4497,4498,Kori,Messinger,158 Barely RD,New Market,TN,95003,824-157-718,KMessinger@hotmail.com,http://korifc.com,47893,200,Manager,Department Leader,Accomplishes department objectives by managing...,2014-11-08,5,Not Enrolled,110000,Currently Employed
