In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def dim_course_progress(catalog_name):
    """Build the course-progress dimension and save it as a Delta table.

    Reads ``{catalog_name}.cleaned_data.analytics_course_completions``,
    renames the source columns, assigns a sequential ``course_user_id``
    surrogate key, appends a ``-1`` fallback member and overwrites the table
    ``{catalog_name}.schema.dim_course_progress``.

    Relies on a ``spark`` session being available in the enclosing scope;
    it is not defined inside this function.

    Args:
        catalog_name: Catalog name used as the prefix of both the source and
            the target table identifiers.

    Returns:
        The lazily evaluated DataFrame that was written: the keyed source
        rows plus the fallback row.
    """
    source_table = f"{catalog_name}.cleaned_data.analytics_course_completions"
    completions = spark.read.table(source_table)

    # Natural key "<Course ID>_<User ID>"; used as the leading sort key when
    # numbering rows below.
    completions = completions.withColumn(
        "course_user_id_str",
        F.concat_ws(
            "_",
            F.col("Course ID").cast("string"),
            F.col("User ID").cast("string"),
        ),
    )

    # Select and rename the relevant columns.
    renamed = completions.select(
        "course_user_id_str",
        F.col("Course ID").alias("course_id"),
        F.col("User ID").alias("user_id"),
        F.col("Course Name").alias("course_name"),
        F.col("Course Category").alias("course_category"),
        F.col("Completion Status").alias("completion_status"),
        F.col("Time enrolled").alias("time_enrolled"),
        F.col("Time started").alias("time_started"),
        F.col("Time completed").alias("time_completed"),
    )

    attribute_columns = [
        "course_id", "user_id", "course_name", "course_category",
        "completion_status", "time_enrolled", "time_started", "time_completed",
    ]
    output_columns = ["course_user_id", *attribute_columns]

    # row_number() numbers tied rows in arbitrary order. Ordering by the
    # natural key alone therefore gives unstable keys whenever several rows
    # share a (course, user) pair, and the lazily returned DataFrame could then
    # disagree with what was saved. The natural key stays the leading sort key
    # (rows with distinct keys are numbered exactly as before); every remaining
    # column is added as a tie-breaker so only fully identical rows can tie.
    # NOTE: a window without partitionBy is evaluated on a single partition;
    # Spark logs a warning about this for large inputs.
    key_order = Window.orderBy("course_user_id_str", *attribute_columns)
    keyed = renamed.withColumn("course_user_id", F.row_number().over(key_order))

    # Fallback ("unknown") member with default values. The schema is spelled
    # out rather than inferred; the types match what inference produced for
    # these literals (bigint for the ids, string for the rest).
    fallback_row = spark.createDataFrame(
        [(-1, -1, -1, "N/A", "N/A", "N/A", "1900-01-01", "1900-01-01", "1900-01-01")],
        schema=(
            "course_user_id: bigint, course_id: bigint, user_id: bigint, "
            "course_name: string, course_category: string, "
            "completion_status: string, time_enrolled: string, "
            "time_started: string, time_completed: string"
        ),
    )

    # Cast the fallback date columns to the date type.
    for date_column in ("time_enrolled", "time_started", "time_completed"):
        fallback_row = fallback_row.withColumn(
            date_column, F.col(date_column).cast("date")
        )

    # Append the fallback member to the keyed rows; the helper natural-key
    # column is dropped by the select.
    final_df = keyed.select(*output_columns).unionByName(fallback_row)

    # Overwrite the target Delta table. This is a table identifier, not a
    # filesystem path.
    # NOTE(review): "schema" is used literally as the schema name here;
    # confirm that is intended.
    target_table = f"{catalog_name}.schema.dim_course_progress"
    final_df.write.format("delta").mode("overwrite").saveAsTable(target_table)

    # Returned for further processing or testing.
    return final_df