In [0]:
from pyspark.sql.functions import *
from cryptography.fernet import Fernet
from pyspark.sql.types import StringType
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.functions import current_timestamp
from delta.tables import DeltaTable
from pyspark.sql.types import *

In [0]:
%run "./UTIL_FUNCTIONS"

In [0]:
%run "./CONSTANTS"

In [0]:
# Job parameters supplied through the notebook widgets.
data_source = dbutils.widgets.get("DATA_SOURCE")
integration_id = dbutils.widgets.get("INTEGRATION_ID")
load_type = dbutils.widgets.get("LOAD_TYPE")
run_id = dbutils.widgets.get("RUN_ID")
target_table = dbutils.widgets.get("TARGET_TABLE")

# INTEGRATION_ID is a comma-separated list of key columns. Trim each entry and
# drop blanks so that "EmpID, ADEmail" or a trailing comma does not produce
# keys such as " ADEmail" or "" that match no column name.
key_column_list = [key.strip() for key in integration_id.split(',') if key.strip()]

In [0]:
# Local string aliases for the lake locations. data_landing_path and
# data_curated_path are not defined in this notebook (presumably they come
# from the CONSTANTS notebook that is run above -- verify).
landing_path = "{}".format(data_landing_path)
curated_path = "{}".format(data_curated_path)

In [0]:
# Explicit schema for the employee extract. Every column is nullable; all are
# strings except the three date columns (StartDate, ExitDate, DOB).
employee_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("EmpID", StringType),
        ("FirstName", StringType),
        ("LastName", StringType),
        ("StartDate", DateType),
        ("ExitDate", DateType),
        ("Title", StringType),
        ("Supervisor", StringType),
        ("ADEmail", StringType),
        ("BusinessUnit", StringType),
        ("EmployeeStatus", StringType),
        ("EmployeeType", StringType),
        ("PayZone", StringType),
        ("EmployeeClassificationType", StringType),
        ("TerminationType", StringType),
        ("TerminationDescription", StringType),
        ("DepartmentType", StringType),
        ("Division", StringType),
        ("DOB", DateType),
        ("State", StringType),
        ("JobFunctionDescription", StringType),
        ("GenderCode", StringType),
        ("LocationCode", StringType),
        ("RaceDesc", StringType),
        ("MaritalDesc", StringType),
        ("PerformanceScore", StringType),
        ("CurrentEmployeeRating", StringType),
    )
])

# Load the employee CSV from the landing area using the schema above; the
# file's first line is treated as a header row.
df = (
    spark.read
    .format('csv')
    .schema(employee_schema)
    .option('header', True)
    .load(f'{landing_path}/employee_data.csv')
)

In [0]:
# Run every data-quality check against the loaded frame and collect the
# results in the order they should appear in the combined output.
quality_df = [
    # duplicate check on the employee id
    check_duplicates(df, 'EmpID'),
    # null checks, one result per column
    *(
        check_to_nulls(df, column_name)
        for column_name in (
            'EmpID', 'ADEmail', 'StartDate', 'FirstName', 'LastName',
            'EmployeeStatus', 'EmployeeType', 'BusinessUnit', 'Supervisor',
            'DepartmentType', 'Division',
        )
    ),
    # allowed-value checks
    check_for_valid_state(df, 'GenderCode', ['Male', 'Female']),
    check_for_valid_state(df, 'EmployeeType', ['Contract', 'Full-Time', 'Part-Time']),
    # e-mail format check
    check_for_email_format(df, 'ADEmail'),
    # start date / exit date consistency check
    check_logical_date(df, 'StartDate', 'ExitDate'),
]

# Stack the individual check results into one frame, matching columns by name.
union_df = reduce(DataFrame.unionByName, quality_df)

In [0]:
# Persist the combined data-quality results under the curated area as
# "<target_table>_DQ". The path has to be an f-string: as a plain string the
# '{curated_path}' / '{target_table}' placeholders are passed through
# literally. The save mode is spelled 'overwrite' ('owerwrite' is not a valid
# Spark save mode).
# NOTE(review): write_to_data_lake is defined outside this notebook -- confirm
# it forwards the mode string to the Spark writer unchanged.
write_to_data_lake(union_df, 'delta', 'overwrite', f'{curated_path}/{target_table}_DQ', '')

In [0]:
# Replace the two performance columns with the output of encrypt_data.
# The column names must match the explicit schema the file was read with
# (PerformanceScore / CurrentEmployeeRating, no spaces); 'Performance Score'
# does not exist on df, so col() could not resolve it.
# NOTE: this cell rebinds df, so running it twice applies encrypt_data to the
# already-processed values.
df = (
    df
    .withColumn('PerformanceScore', encrypt_data(col('PerformanceScore')))
    .withColumn('CurrentEmployeeRating', encrypt_data(col('CurrentEmployeeRating')))
)

In [0]:
# Build the frame to be written: pass df through update_dw_columns (with the
# key columns, run id and data source) and then through convert_date_columns.
# Both helpers are defined outside this notebook.
df_final = convert_date_columns(
    update_dw_columns(df, key_column_list, run_id, data_source)
)

In [0]:
# Write the prepared frame to the curated table in 'scd2' mode, keyed on the
# integration id. df_final (the output of update_dw_columns and
# convert_date_columns) is written rather than the intermediate df, otherwise
# that preparation is discarded. The path is an f-string so that curated_path
# and target_table are interpolated instead of being passed as literal
# '{...}' text.
write_to_data_lake(df_final, 'delta', 'scd2', f'{curated_path}/{target_table}', integration_id)