In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%run "./UTIL_FUNCTIONS"

In [0]:
%run "./CONSTANTS"

In [0]:
# Notebook parameters, read from the Databricks widgets in a fixed order.
# Each widget value is bound to the variable in the matching position.
data_source, integration_id, load_type, run_id, target_table = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("DATA_SOURCE", "INTEGRATION_ID", "LOAD_TYPE", "RUN_ID", "TARGET_TABLE")
)

# INTEGRATION_ID is a comma-separated string; split it into a list of key columns.
# NOTE(review): entries are not whitespace-trimmed — confirm the widget value has no spaces.
key_column_list = integration_id.split(',')

In [0]:
# Source and destination locations for this load.
# data_landing_path / data_curated_path are not defined in this notebook
# (presumably they come from the CONSTANTS notebook — confirm).
landing_path = "{}".format(data_landing_path)
# The curated location is the curated root followed by the target table name and a trailing slash.
curated_path = "{}{}/".format(data_curated_path, target_table)

In [0]:
# Explicit schema for the training CSV; every field is nullable.
training_schema = (
    StructType()
    .add("Employee ID", StringType(), True)                # kept as a string in case IDs have mixed formats
    .add("Training Date", DateType(), True)                # date the training took place
    .add("Training Program Name", StringType(), True)
    .add("Training Type", StringType(), True)
    .add("Training Outcome", StringType(), True)
    .add("Location", StringType(), True)
    .add("Trainer", StringType(), True)
    .add("Training Duration(Days)", IntegerType(), True)   # whole number of days
    .add("Training Cost", DoubleType(), True)              # numeric cost
)

# Read the landing file with the schema above; the first row is treated as the header.
# NOTE(review): no dateFormat option is set, so "Training Date" is parsed with the
# reader's default date pattern — confirm the file matches it.
df = (
    spark.read
    .format('csv')
    .schema(training_schema)
    .option('header', True)
    .load(f'{landing_path}/training_and_development_data.csv')
)



In [0]:
# Data-quality checks on the raw training data. Each helper call contributes one
# entry to `quality_df`; the entries are then unioned by column name into `union_df`.
# check_to_nulls / check_for_valid_state are not defined in this notebook
# (presumably they come from UTIL_FUNCTIONS and return DataFrames — confirm).

# Imported explicitly so this cell does not depend on what the wildcard imports or
# the %run notebooks happen to expose under the names `reduce` and `DataFrame`.
import functools

from pyspark.sql import DataFrame

# Columns that must be populated. The names must match the read schema exactly.
required_columns = ['Employee ID', 'Training Date', 'Training Program Name', 'Trainer', 'Training Cost']

# Columns restricted to a fixed set of allowed values.
allowed_values = {
    'Training Type': ['External', 'Internal'],
    'Training Outcome': ['Passed', 'Completed', 'Failed', 'Incomplete'],
}

quality_df = []

# check for nulls
quality_df.extend(check_to_nulls(df, column_name) for column_name in required_columns)

# check for valid state
quality_df.extend(
    check_for_valid_state(df, column_name, valid_states)
    for column_name, valid_states in allowed_values.items()
)

# Fold all check results into one DataFrame, matching columns by name.
union_df = functools.reduce(DataFrame.unionByName, quality_df)

In [0]:
# Persist the data-quality results next to the curated table, replacing any previous run.
# The path must be an f-string so curated_path / target_table are actually substituted;
# curated_path already ends with '/', so it is stripped to avoid a double slash.
# write_to_data_lake is defined outside this notebook (presumably UTIL_FUNCTIONS — confirm
# that it accepts 'overwrite' as the write mode and an empty key string for non-merge writes).
write_to_data_lake(union_df, 'delta', 'overwrite', f"{curated_path.rstrip('/')}/{target_table}_DQ", '')

In [0]:
# Build the frame to be written: first pass the raw data through update_dw_columns
# (with the key columns, run id and data source), then through convert_date_columns.
# Both helpers are defined outside this notebook (presumably UTIL_FUNCTIONS — confirm).
df_final = convert_date_columns(
    update_dw_columns(df, key_column_list, run_id, data_source)
)

In [0]:
# Merge the prepared data into the curated Delta location, using integration_id as the key spec.
# The path must be an f-string so curated_path / target_table are actually substituted;
# curated_path already ends with '/', so it is stripped to avoid a double slash.
# write_to_data_lake is defined outside this notebook (presumably UTIL_FUNCTIONS — confirm).
write_to_data_lake(df_final, 'delta', 'merge', f"{curated_path.rstrip('/')}/{target_table}", integration_id)