# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, regexp_replace, when
from pyspark.sql.types import LongType, StringType

def clean_users(catalog_name):
    """Load the users report CSV, cast its ID columns, and overwrite the cleaned Delta table.

    The CSV is read with a header row from the catalog's raw-data volume,
    checked against the expected column set, its ID columns are cast to
    bigint, and the result is written to
    ``<catalog_name>.cleaned_data.analytics_users`` in overwrite mode.

    Args:
        catalog_name: Catalog name used to build both the source CSV path
            and the target table name.

    Returns:
        True if the table was written, False if any step failed. Failures
        are printed rather than raised, so callers that need to stop a
        pipeline on error should check the return value.
    """
    spark = SparkSession.builder.getOrCreate()

    csv_path = f"/Volumes/{catalog_name}/raw_data/{catalog_name}_data/kineo_analytics___users_report_report.csv"
    table_path = f"{catalog_name}.cleaned_data.analytics_users"

    # This report currently has no datetime or boolean columns; the lists are
    # kept so the transform loops below can be enabled by adding names here.
    columns_to_transform_to_datetime = []
    columns_to_transform_to_boolean = []

    # Expected columns of the report and the type each should have in the
    # cleaned table. This is the single source of truth for validation and
    # for deciding which columns get cast.
    expected_schema = {
        "User ID": LongType(),
        "User's Fullname": StringType(),
        "User First Name": StringType(),
        "User Last Name": StringType(),
        "User's Country": StringType(),
        "User's City": StringType(),
        "User Status": StringType(),
        "Assignment ID": LongType(),
        "Assignment Name": StringType(),
        "Organisation ID": LongType(),
        "Organisation Name": StringType(),
        "Position ID": LongType(),
        "Position Name": StringType(),
        "Manager ID": LongType(),
        "Manager Name": StringType(),
        "Appraiser ID": LongType(),
        "Appraiser Name": StringType(),
    }

    # Derive the bigint columns from the schema instead of maintaining a
    # second hand-written list that could drift out of sync with it.
    columns_to_transform_to_bigint = [
        name for name, dtype in expected_schema.items() if isinstance(dtype, LongType)
    ]

    try:
        # Read CSV; without schema inference every column arrives as a string.
        df = spark.read.format("csv").option("header", "true").load(csv_path)

        # Fail early, naming every missing column at once, rather than
        # hitting an unresolved-column error on the first cast or silently
        # writing a table that lacks expected columns. The comparison is
        # case-insensitive so headers that differ only in case still pass.
        present = {name.lower() for name in df.columns}
        missing = [name for name in expected_schema if name.lower() not in present]
        if missing:
            raise ValueError(f"CSV is missing expected columns: {missing}")

        # Transform datetime columns (thousands-style commas are stripped
        # before parsing with the dd/MM/yyyy HH:mm:ss pattern).
        for col_name in columns_to_transform_to_datetime:
            df = df.withColumn(
                col_name,
                to_timestamp(regexp_replace(col(col_name), ",", ""), "dd/MM/yyyy HH:mm:ss")
            )

        # Transform boolean columns: only the literal "1" maps to True;
        # every other value, including null, maps to False.
        for col_name in columns_to_transform_to_boolean:
            df = df.withColumn(
                col_name,
                when(col(col_name) == "1", True).otherwise(False)
            )

        # Transform bigint columns
        for col_name in columns_to_transform_to_bigint:
            df = df.withColumn(
                col_name,
                col(col_name).cast(LongType())
            )

        # Optional preview. `display` is provided by the notebook runtime and
        # is not a regular import, so its absence must not abort the write.
        try:
            display(df)
        except NameError:
            pass

        # Write to Delta Table. Column mapping by name is requested because
        # the report's column names contain spaces and apostrophes.
        df.write.format("delta") \
            .option("delta.columnMapping.mode", "name") \
            .mode("overwrite") \
            .saveAsTable(table_path)

        print(f"Successfully processed {csv_path} -> {table_path}")
        return True

    except Exception as e:
        # Top-level boundary of the ETL step: report the failure and signal
        # it through the return value instead of propagating.
        print(f"ETL process failed: {e}")
        return False