In [1]:
# Directory that holds the raw Human Resources CSV exports.
base_path = "../raw-files/Human Resources/"

# One CSV per HumanResources table; every file name follows the pattern
# 'HumanResources <Table>.csv', so only the table part is spelled out.
file_names = [
    f'HumanResources {table_name}.csv'
    for table_name in (
        'Department',
        'Employee',
        'EmployeeDepartmentHistory',
        'EmployeePayHistory',
        'JobCandidate',
        'Shift',
    )
]

In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName('HR').getOrCreate()

# base_path may or may not end with a slash; strip it once so the path built
# below never contains a doubled separator (".../Human Resources//<file>").
source_dir = base_path.rstrip('/')

# Read each file into a DataFrame, keyed by a normalized name, e.g.
# 'HumanResources Department.csv' -> 'humanresources_department'.
# No schema / inferSchema option is set here, so column types are whatever
# the CSV reader assigns by default.
dataframes = {}
for file_name in file_names:
    df_name = file_name.replace(' ', '_').replace('.csv', '').lower()
    dataframes[df_name] = (
        spark.read.format('csv')
        .option('header', 'true')  # first row of each file holds the column names
        .load(f'{source_dir}/{file_name}')
    )

# Accessing individual DataFrames
department_df = dataframes['humanresources_department']
employee_df = dataframes['humanresources_employee']
employeeDepartmentHistory_df = dataframes['humanresources_employeedepartmenthistory']
employeePayHistory_df = dataframes['humanresources_employeepayhistory']
jobCandidate_df = dataframes['humanresources_jobcandidate']
shift_df = dataframes['humanresources_shift']

24/06/24 13:33:49 WARN Utils: Your hostname, Joshs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.100.2 instead (on interface en0)
24/06/24 13:33:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/24 13:33:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
def rename_columns(df, rename_mappings):
    """Return ``df`` with columns renamed according to ``rename_mappings``.

    Args:
        df: Object exposing ``withColumnRenamed(old, new)`` that returns a new
            frame (a Spark DataFrame in this notebook).
        rename_mappings: Dict of ``{current_column_name: new_column_name}``.
            Renames are applied one at a time, in the dict's iteration order.

    Returns:
        The frame produced by the last rename, or ``df`` itself when the
        mapping is empty.
    """
    renamed = df
    for current_name in rename_mappings:
        renamed = renamed.withColumnRenamed(current_name, rename_mappings[current_name])
    return renamed

# Column renames per DataFrame, keyed by the notebook variable name.
# Every table has a 'ModifiedDate' column and two of them have 'Name'; each
# gets a table-specific prefix so the names stay distinct in the combined frame.
rename_mappings = dict(
    department_df={
        'Name': 'DepartmentName',
        'ModifiedDate': 'DepartmentModifiedDate',
    },
    employee_df={
        'ModifiedDate': 'EmployeeModifiedDate',
    },
    employeeDepartmentHistory_df={
        'ModifiedDate': 'EmployeeDepartmentHistoryModifiedDate',
    },
    employeePayHistory_df={
        'ModifiedDate': 'EmployeePayHistoryModifiedDate',
    },
    jobCandidate_df={
        'ModifiedDate': 'JobCandidateModifiedDate',
    },
    shift_df={
        'Name': 'ShiftName',
        'ModifiedDate': 'ShiftModifiedDate',
    },
)

# Apply each table's renames; every variable is rebound to the renamed frame.
department_df = rename_columns(
    department_df, rename_mappings['department_df']
)
employee_df = rename_columns(
    employee_df, rename_mappings['employee_df']
)
employeeDepartmentHistory_df = rename_columns(
    employeeDepartmentHistory_df, rename_mappings['employeeDepartmentHistory_df']
)
employeePayHistory_df = rename_columns(
    employeePayHistory_df, rename_mappings['employeePayHistory_df']
)
jobCandidate_df = rename_columns(
    jobCandidate_df, rename_mappings['jobCandidate_df']
)
shift_df = rename_columns(
    shift_df, rename_mappings['shift_df']
)

In [4]:
# Build the denormalized frame as a single chain of left joins, starting from
# the employee/department history rows.
# NOTE(review): the last three joins all key on BusinessEntityID alone; if that
# key is not unique in the joined tables, rows will multiply — confirm this
# fan-out is intended.
human_resources_details_df = (
    employeeDepartmentHistory_df
    # Department and shift lookups
    .join(department_df, on="DepartmentID", how="left")
    .join(shift_df, on="ShiftID", how="left")
    # Employee-level tables
    .join(employee_df, on="BusinessEntityID", how="left")
    .join(jobCandidate_df, on="BusinessEntityID", how="left")
    .join(employeePayHistory_df, on="BusinessEntityID", how="left")
)



In [6]:
from pathlib import Path

# save to csv file
output_path = "../denormalized-files/human_resources.csv"

# pandas' to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Convert Spark DataFrame to pandas DataFrame.
# toPandas() brings the whole result into local memory; that is what lets
# pandas write one plain CSV file instead of Spark's directory of part files.
human_resources_details_pd_df = human_resources_details_df.toPandas()

# Save to CSV using pandas, ensuring it's a single file
human_resources_details_pd_df.to_csv(output_path, index=False, header=True)