In [None]:
import pandas as pd
import numpy as np
import os
import pymysql
import random
import datetime
import math
import requests
import pyspark

from pyspark.sql import SparkSession
from random import randrange, randint
from datetime import timedelta

In [None]:
minimum_wage_df = pd.read_csv("Minimum_Wage_List.csv")
employees_df = spark.read.csv("Employee_Dummy.csv", header=True)
employees_df.columns

In [None]:
spark = SparkSession.builder.appName("Employee_Data").getOrCreate() 
spark

In [None]:
employees_df = spark.read.csv("Employee_Dummy.csv", header=True)
employees_df.columns

In [None]:
def repair_employees_df(employees_df, words):
    employees = employees_df.drop("phone2", "company_name", "county")
    employees = generate_unique_values(employees.toPandas(), words)
    
    employees = employees.rename({"address": "street", "phone1": "phone", "web": "website"}, axis=1)
    employee_id = [*range(1, 4500, 1)]
    employees.insert(0, "employee_id", employee_id)
    return employees

In [None]:
def generate_unique_values(employees, words):
    counter = 0
    
    
    for rows, columns in employees.iterrows():
        worker = (rows, columns)[1]
        
        street_numb = random.randint(1, 999)
        street = random.choice(words)
        street_ending = ["Ave", "St", "Blvd", "Ct", "Mnr", "RD", "Drive",
                     "Pkwy", "Terrace", "Hwy"]
        email_ending = ["@gmail.com", "@hotmail.com", "@yahoo.com", "@mail.com", "@venere.org", "@cox.net"]
    
        line_one = str(street_numb) + " " + street.capitalize() + " " + random.choice(street_ending)
        email = worker["first_name"][0] + worker["last_name"] + random.choice(email_ending)
        website = "http://" + worker["first_name"].lower() + random.choice(words) + ".com"
        
        if worker["address"] is None: worker["address"] = line_one
        if worker["zip"] is None: worker["zip"] = random.randint(10000, 99999)
        if worker["phone1"] is None: worker["phone1"] = "-".join([str(i) for i in random.sample(range(100, 999), 3)])
        if worker["email"] is None: worker["email"] = email
        if worker["web"] is None: worker["web"] = website
        
    return employees

In [None]:
def random_date(start, end): 
    between_dates = end - start
    days_between = between_dates.days

    random_numb_days = random.randrange(days_between)
    random_date = start + datetime.timedelta(days=random_numb_days)
    return random_date.date()

In [None]:
def repair_minimum_wage(minimum_wage):
    minimum_wage = minimum_wage.loc[:,["Minimum Wage Rate", "USPS Abbreviation"]]
    fixed_hourly = []
    
    for items in minimum_wage["Minimum Wage Rate"]:
        new_items = (items[1:-7])
        fixed_hourly.append(float(new_items))
        
    minimum_wage["Minimum Wage Rate"] = fixed_hourly
    return minimum_wage

In [None]:
def create_employee_attributes(employees, minimum_wage, start, end, database_starttime):
    employee_states = employees["state"].to_numpy()
    print(len(employee_states))
    department_codes = [100, 200, 300, 400, 500]
    job_codes = [27345, 47893, 36289, 11532, 22634]
    pay = ["minimum", 60000, 100000, 120000, 110000]
    departments = ["Customer Service", "Manager", "Leadership", "IT", "IT"]
    work_benefits = ["Enrolled", "Not Enrolled"]

    jobs = ["Team Member", "Department Leader", "Store Leader", "Software Engineer", "Data Analyst"]
    job_desc = ["Help customers with complaints and questions, give customers information about products and services, take orders, and process returns",
                "Accomplishes department objectives by managing staff; planning and evaluating department activities. Maintains staff by recruiting, selecting, orienting, and training employees. Ensures a safe, secure, and legal work environment. Develops personal growth opportunities",
                "Manage everything related to the store. Store leaders manage the store's employees. They are directly involved in hiring, training, and even firing employees.",
                "Collects and stores data on sales numbers, market research, logistics, linguistics, or other behaviors. They bring technical expertise to ensure the quality and accuracy of that data, then process, design, and present it in ways to help people, businesses, and organizations make better decisions.",
                "Designs, develops and maintains computer software at a company. They use their creativity and technical skills and apply the principles of software engineering to help solve new and ongoing problems for an organization."]



    department_id = []
    job_id = []
    department = []
    job_title = []
    job_description = []
    start_date = []
    experience = []
    benefits = []
    status = []
    salary = []

    counter = 0

    for items in range(4499):
        code = random.choice(department_codes)
        index = department_codes.index(code)

        department_id.append(code)
        job_id.append(job_codes[index])
        department.append(departments[index])
        job_title.append(jobs[index])
        job_description.append(job_desc[index])
        benefits.append(random.choice(work_benefits))
        status.append("Currently Employed")

        employment_date = random_date(start, end)
        start_date.append(employment_date)

        years = math.trunc((database_starttime - employment_date).days / 365)
        experience.append(years)

        if pay[index] == "minimum":
            state = employee_states[counter]
            hourly_rate = minimum_wage.loc[minimum_wage["USPS Abbreviation"]==state]
            print(counter, state, hourly_rate)
            annual = round((hourly_rate["Minimum Wage Rate"].values[0] + years) * 40 * 52)
            salary.append(annual)
        else:
            annual = pay[index] + (10000 * years)
            salary.append(annual)


        counter += 1

    employees = employees.assign(
        job_id = job_id,
        department_id = department_id,
        department = department,
        job_title = job_title,
        job_description = job_description,
        start_date = start_date,
        years_with_company = experience,
        benefits = benefits,
        salary = salary,
        work_status = status


    )
    
    employees["department_id"].replace({500 : 400}, inplace=True)
    return employees

In [None]:
def employees_table(worker, cur, connection):
    sql = """INSERT INTO employees (employee_id, 
                                        first_name, 
                                        last_name, 
                                        street, 
                                        city, 
                                        state, 
                                        zip)
                                    
         values (%s, %s, %s, %s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["employee_id"], worker["first_name"], worker["last_name"], 
                      worker["street"], worker["city"], worker["state"], worker["zip"]))
    
    connection.commit()

In [None]:
def contact_table(worker, cur, connection):
    sql = """INSERT INTO contact (employee_id, 
                                    phone, 
                                    email, 
                                    website)
                                    
         values (%s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["employee_id"], worker["phone"], worker["email"], 
                      worker["website"]))
    
    connection.commit()

In [None]:
def identification_table(worker, cur, connection):
    sql = """INSERT INTO identification_codes (employee_id, 
                                                job_id, 
                                                department_id)
                                    
         values (%s, %s, %s) 
    """
    cur.execute(sql, (worker["employee_id"], worker["job_id"], worker["department_id"]))
    
    connection.commit()

In [None]:
def department_table(worker, cur, connection):
    sql = """INSERT INTO department (department_id, 
                                         department)
                                    
         values (%s, %s) 
    """
    cur.execute(sql, (worker["department_id"], worker["department"]))
    
    connection.commit()

In [None]:
def job_profile_table(worker, cur, connection):
    sql = """INSERT INTO job_profile (job_id, 
                                    job_title, 
                                    job_description, 
                                    start_date,
                                    years_company)
                                    
         values (%s, %s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["job_id"], worker["job_title"], worker["job_description"], 
                      worker["start_date"], worker["years_with_company"]))
    
    connection.commit()

In [None]:
def payment_information_table(worker, cur, connection):
    sql = """INSERT INTO payment_information (job_id, 
                                                job_salary, 
                                                benefits, 
                                                employment_status)
                                    
         values (%s, %s, %s, %s) 
    """
    cur.execute(sql, (worker["job_id"], worker["salary"], worker["benefits"], 
                      worker["work_status"]))
    
    connection.commit()

In [133]:
def main():
    database_starttime = datetime.datetime.strptime("1/1/2020", "%m/%d/%Y").date()
    start = datetime.datetime.strptime("1/1/2010", "%m/%d/%Y")
    end = datetime.datetime.strptime("1/1/2020", "%m/%d/%Y")
    
    minimum_wage = repair_minimum_wage(minimum_wage_df)
    cleaned_employees = repair_employees_df(employees_df, words)
    employees = create_employee_attributes(cleaned_employees, minimum_wage, start, end, database_starttime)
    connection = pymysql.connect(host="localhost",
                     user="root",
                     password="Immalegacy5",
                     charset="utf8",
                     db="payroll_db")
    
    cur = connection.cursor()
    counter = 0
    for rows, columns in employees.iterrows():
        worker = (rows, columns)[1]
        employees_table(worker, cur, connection)
        contact_table(worker, cur, connection)
        identification_table(worker, cur, connection)
        department_table(worker, cur, connection)
        job_profile_table(worker, cur, connection)
        payment_information_table(worker, cur, connection)
    print("Data Transferred Successfully")
    return employees

In [None]:
main()

4499
0 LA     Minimum Wage Rate USPS Abbreviation
17               7.25                LA
3 AK    Minimum Wage Rate USPS Abbreviation
1              10.34                AK
11 NY     Minimum Wage Rate USPS Abbreviation
31               13.2                NY
15 AZ    Minimum Wage Rate USPS Abbreviation
2               12.8                AZ
33 NJ     Minimum Wage Rate USPS Abbreviation
29               13.0                NJ
36 NY     Minimum Wage Rate USPS Abbreviation
31               13.2                NY
40 OR     Minimum Wage Rate USPS Abbreviation
36              12.75                OR
41 KS     Minimum Wage Rate USPS Abbreviation
15               7.25                KS
43 FL    Minimum Wage Rate USPS Abbreviation
8               10.0                FL
44 AK    Minimum Wage Rate USPS Abbreviation
1              10.34                AK
46 MA     Minimum Wage Rate USPS Abbreviation
20              14.25                MA
50 NY     Minimum Wage Rate USPS Abbreviation
31           

34                9.3                OH
508 NY     Minimum Wage Rate USPS Abbreviation
31               13.2                NY
514 NH     Minimum Wage Rate USPS Abbreviation
28               7.25                NH
515 NC     Minimum Wage Rate USPS Abbreviation
32               7.25                NC
516 PA     Minimum Wage Rate USPS Abbreviation
37               7.25                PA
520 TX     Minimum Wage Rate USPS Abbreviation
42               7.25                TX
523 NJ     Minimum Wage Rate USPS Abbreviation
29               13.0                NJ
531 CO    Minimum Wage Rate USPS Abbreviation
5              12.56                CO
532 IL     Minimum Wage Rate USPS Abbreviation
12               12.0                IL
541 MD     Minimum Wage Rate USPS Abbreviation
19               12.5                MD
542 MO     Minimum Wage Rate USPS Abbreviation
24              11.15                MO
543 MI     Minimum Wage Rate USPS Abbreviation
21               9.87                MI
556 S

43               7.25                UT
1115 VA     Minimum Wage Rate USPS Abbreviation
45               11.0                VA
1116 VA     Minimum Wage Rate USPS Abbreviation
45               11.0                VA
1120 FL    Minimum Wage Rate USPS Abbreviation
8               10.0                FL
1126 FL    Minimum Wage Rate USPS Abbreviation
8               10.0                FL
1127 NE     Minimum Wage Rate USPS Abbreviation
26                9.0                NE
1135 MA     Minimum Wage Rate USPS Abbreviation
20              14.25                MA
1138 FL    Minimum Wage Rate USPS Abbreviation
8               10.0                FL
1148 VA     Minimum Wage Rate USPS Abbreviation
45               11.0                VA
1149 TX     Minimum Wage Rate USPS Abbreviation
42               7.25                TX
1151 MA     Minimum Wage Rate USPS Abbreviation
20              14.25                MA
1152 TX     Minimum Wage Rate USPS Abbreviation
42               7.25                T

21               9.87                MI
1762 TX     Minimum Wage Rate USPS Abbreviation
42               7.25                TX
1766 MD     Minimum Wage Rate USPS Abbreviation
19               12.5                MD
1770 GA    Minimum Wage Rate USPS Abbreviation
9               7.25                GA
1774 WA     Minimum Wage Rate USPS Abbreviation
46              14.49                WA
1783 IL     Minimum Wage Rate USPS Abbreviation
12               12.0                IL
1787 SC     Minimum Wage Rate USPS Abbreviation
39               7.25                SC
1790 AR    Minimum Wage Rate USPS Abbreviation
3               11.0                AR
1792 AK    Minimum Wage Rate USPS Abbreviation
1              10.34                AK
1794 NE     Minimum Wage Rate USPS Abbreviation
26                9.0                NE
1806 MS     Minimum Wage Rate USPS Abbreviation
23               7.25                MS
1817 SD     Minimum Wage Rate USPS Abbreviation
40               9.95                S

8               10.0                FL
2262 OH     Minimum Wage Rate USPS Abbreviation
34                9.3                OH
2264 CA    Minimum Wage Rate USPS Abbreviation
4               14.0                CA
2273 MO     Minimum Wage Rate USPS Abbreviation
24              11.15                MO
2281 OK     Minimum Wage Rate USPS Abbreviation
35               7.25                OK
2292 TX     Minimum Wage Rate USPS Abbreviation
42               7.25                TX
2294 AL    Minimum Wage Rate USPS Abbreviation
0               7.25                AL
2297 OR     Minimum Wage Rate USPS Abbreviation
36              12.75                OR
2300 MN     Minimum Wage Rate USPS Abbreviation
22              10.33                MN
2312 LA     Minimum Wage Rate USPS Abbreviation
17               7.25                LA
2314 IL     Minimum Wage Rate USPS Abbreviation
12               12.0                IL
2320 UT     Minimum Wage Rate USPS Abbreviation
43               7.25                

30               11.5                NM
2807 GA    Minimum Wage Rate USPS Abbreviation
9               7.25                GA
2810 KS     Minimum Wage Rate USPS Abbreviation
15               7.25                KS
2813 WI     Minimum Wage Rate USPS Abbreviation
48               7.25                WI
2833 NE     Minimum Wage Rate USPS Abbreviation
26                9.0                NE
2834 MI     Minimum Wage Rate USPS Abbreviation
21               9.87                MI
2837 OH     Minimum Wage Rate USPS Abbreviation
34                9.3                OH
2838 SC     Minimum Wage Rate USPS Abbreviation
39               7.25                SC
2840 CO    Minimum Wage Rate USPS Abbreviation
5              12.56                CO
2855 IL     Minimum Wage Rate USPS Abbreviation
12               12.0                IL
2857 MA     Minimum Wage Rate USPS Abbreviation
20              14.25                MA
2858 NY     Minimum Wage Rate USPS Abbreviation
31               13.2               

24              11.15                MO
3386 IL     Minimum Wage Rate USPS Abbreviation
12               12.0                IL
3390 MO     Minimum Wage Rate USPS Abbreviation
24              11.15                MO
3393 ME     Minimum Wage Rate USPS Abbreviation
18              12.75                ME
3395 IA     Minimum Wage Rate USPS Abbreviation
14               7.25                IA
3404 KS     Minimum Wage Rate USPS Abbreviation
15               7.25                KS
3415 MD     Minimum Wage Rate USPS Abbreviation
19               12.5                MD
3431 NC     Minimum Wage Rate USPS Abbreviation
32               7.25                NC
3433 PA     Minimum Wage Rate USPS Abbreviation
37               7.25                PA
3443 CA    Minimum Wage Rate USPS Abbreviation
4               14.0                CA
3450 MA     Minimum Wage Rate USPS Abbreviation
20              14.25                MA
3459 VT     Minimum Wage Rate USPS Abbreviation
44              12.55             

4               14.0                CA
3940 TX     Minimum Wage Rate USPS Abbreviation
42               7.25                TX
3941 AK    Minimum Wage Rate USPS Abbreviation
1              10.34                AK
3947 VA     Minimum Wage Rate USPS Abbreviation
45               11.0                VA
3949 FL    Minimum Wage Rate USPS Abbreviation
8               10.0                FL
3957 IN     Minimum Wage Rate USPS Abbreviation
13               7.25                IN
3965 VA     Minimum Wage Rate USPS Abbreviation
45               11.0                VA
3972 LA     Minimum Wage Rate USPS Abbreviation
17               7.25                LA
3973 AL    Minimum Wage Rate USPS Abbreviation
0               7.25                AL
3975 KY     Minimum Wage Rate USPS Abbreviation
16               7.25                KY
3976 IA     Minimum Wage Rate USPS Abbreviation
14               7.25                IA
3979 OK     Minimum Wage Rate USPS Abbreviation
35               7.25                OK

5              12.56                CO
4436 NC     Minimum Wage Rate USPS Abbreviation
32               7.25                NC
4437 NJ     Minimum Wage Rate USPS Abbreviation
29               13.0                NJ
4438 CA    Minimum Wage Rate USPS Abbreviation
4               14.0                CA
4447 NM     Minimum Wage Rate USPS Abbreviation
30               11.5                NM
4451 WV     Minimum Wage Rate USPS Abbreviation
47               8.75                WV
4452 PA     Minimum Wage Rate USPS Abbreviation
37               7.25                PA
4453 CA    Minimum Wage Rate USPS Abbreviation
4               14.0                CA
4455 NY     Minimum Wage Rate USPS Abbreviation
31               13.2                NY
4461 GA    Minimum Wage Rate USPS Abbreviation
9               7.25                GA
4466 KY     Minimum Wage Rate USPS Abbreviation
16               7.25                KY
4467 RI     Minimum Wage Rate USPS Abbreviation
38              12.25                RI