In [0]:
%python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, regexp_replace, when
from pyspark.sql.types import LongType

def clean_course_completions(catalog_name):
    """Clean the course completions CSV export and save it as a Delta table.

    Reads the report CSV from the catalog's raw-data volume (header row
    enabled, no explicit schema given to the reader), converts the date,
    flag and identifier columns to typed columns, previews the result with
    ``display`` and overwrites the table
    ``<catalog_name>.cleaned_data.analytics_course_completions``.

    Args:
        catalog_name: Catalog name; used to build both the source volume
            path and the fully qualified target table name.

    Returns:
        None. Any exception raised while reading, transforming, previewing
        or writing is caught and reported with ``print`` instead of being
        propagated to the caller.
    """
    session = SparkSession.builder.getOrCreate()

    # Columns grouped by the type they are converted to.
    timestamp_columns = ("Time enrolled", "Time started", "Time completed", "Completion Due Date")
    flag_columns = ("Include in dashboard",)
    id_columns = ("User ID", "Course ID")

    target_table = f"{catalog_name}.cleaned_data.analytics_course_completions"
    source_csv = f"/Volumes/{catalog_name}/raw_data/{catalog_name}_data/kineo_analytics___course_completions_report_report.csv"

    try:
        reader = session.read.format("csv").option("header", "true")
        frame = reader.load(source_csv)

        # Commas are stripped from the raw value before it is parsed with
        # the day-first timestamp pattern.
        for name in timestamp_columns:
            without_commas = regexp_replace(col(name), ",", "")
            frame = frame.withColumn(
                name, to_timestamp(without_commas, "dd/MM/yyyy HH:mm:ss")
            )

        # Backfill a missing enrolment time from the completion time when
        # the latter is present; otherwise keep the enrolment value as is.
        enrolled = col("Time enrolled")
        completed = col("Time completed")
        enrolled_missing_but_completed = enrolled.isNull() & completed.isNotNull()
        frame = frame.withColumn(
            "Time enrolled",
            when(enrolled_missing_but_completed, completed).otherwise(enrolled),
        )

        # Only the value "1" maps to True; every other value takes the
        # otherwise branch and becomes False.
        for name in flag_columns:
            is_one = col(name) == "1"
            frame = frame.withColumn(name, when(is_one, True).otherwise(False))

        bigint = LongType()
        for name in id_columns:
            frame = frame.withColumn(name, col(name).cast(bigint))

        # Notebook preview of the cleaned data before it is persisted.
        display(frame)

        # Replace the target table's contents, with the Delta column
        # mapping mode option set to "name".
        writer = frame.write.format("delta")
        writer = writer.option("delta.columnMapping.mode", "name")
        writer = writer.mode("overwrite")
        writer.saveAsTable(target_table)

        print("Successfully processed")
    except Exception as error:
        print(f"Data cleaning failed: {error}")