In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Names of the catalog and of the two schemas referenced throughout the notebook.
CATALOG = "music_demo"
BRONZE = "bronze"
SILVER = "silver"

# Make sure the catalog and the silver schema exist, then select them as the
# session defaults. The statements run strictly in this order: each one relies
# on the object created or selected by the previous one.
for statement in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"USE CATALOG {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {SILVER}",
    f"USE {SILVER}",
):
    spark.sql(statement)

def to_int(col):
    """Build a Column that casts the string column ``col`` to ``int``.

    Values that are empty once trimmed are replaced by NULL before the cast.
    Every other value is handed to ``cast("int")`` unchanged.

    NOTE(review): the original comment states that non-numeric text also
    becomes NULL. That relies on the cast returning NULL for invalid input,
    which presumably depends on the session's ANSI setting -- confirm.

    Args:
        col: Name of the source column.

    Returns:
        A ``pyspark.sql.Column`` expression; the caller supplies the alias.
    """
    source = F.col(col)
    is_blank = F.trim(source) == ""
    blank_as_null = F.when(is_blank, None).otherwise(source)
    return blank_as_null.cast("int")

def to_bigint(col):
    """Build a Column that casts the string column ``col`` to ``bigint``.

    Values that are empty once trimmed are replaced by NULL before the cast.
    Every other value is handed to ``cast("bigint")`` unchanged.

    NOTE(review): whether unparseable text yields NULL or an error presumably
    depends on the session's ANSI setting -- confirm.

    Args:
        col: Name of the source column.

    Returns:
        A ``pyspark.sql.Column`` expression; the caller supplies the alias.
    """
    source = F.col(col)
    is_blank = F.trim(source) == ""
    blank_as_null = F.when(is_blank, None).otherwise(source)
    return blank_as_null.cast("bigint")

def to_date(col, fmt="yyyy-MM-dd"):
    """Build a Column that parses the string column ``col`` into a date.

    The value is trimmed first. If nothing is left it becomes NULL; otherwise
    the trimmed text is parsed with ``fmt``.

    Args:
        col: Name of the source column.
        fmt: Datetime pattern passed to ``F.to_date``.

    Returns:
        A ``pyspark.sql.Column`` expression; the caller supplies the alias.
    """
    trimmed = F.trim(F.col(col))
    # Parse the *trimmed* text, not the raw column: the blank check already
    # ignores surrounding spaces, and parsing with an explicit pattern does not
    # strip them, so a space-padded date would otherwise fail to parse.
    cleaned = F.when(trimmed == "", None).otherwise(trimmed)
    return F.to_date(cleaned, fmt)


In [0]:
# Source: bronze artists table (every column arrives as a string).
bronze_art = spark.table(f"{CATALOG}.{BRONZE}.artists")

# Pass the descriptive columns through untouched, convert debut_year to int,
# and keep a single row per artist_id.
dim_artist = bronze_art.select(
    *[F.col(name) for name in ("artist_id", "artist_name", "genre", "country_of_origin")],
    to_int("debut_year").alias("debut_year"),
).dropDuplicates(["artist_id"])

# Replace the silver Delta table with the freshly built frame, then preview it.
(
    dim_artist.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{CATALOG}.{SILVER}.dim_artist")
)
display(spark.sql(f"SELECT * FROM {CATALOG}.{SILVER}.dim_artist LIMIT 5"))


In [0]:
# Source: bronze tracks table.
bronze_tr = spark.table(f"{CATALOG}.{BRONZE}.tracks")

# Keep the identifying columns as-is, parse release_date into a date
# (blank values become NULL), and keep a single row per track_id.
dim_track = bronze_tr.select(
    *[F.col(name) for name in ("track_id", "artist_id", "track_title")],
    to_date("release_date").alias("release_date"),
    F.col("primary_genre"),
).dropDuplicates(["track_id"])

# Replace the silver Delta table with the freshly built frame, then preview it.
(
    dim_track.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{CATALOG}.{SILVER}.dim_track")
)
display(spark.sql(f"SELECT * FROM {CATALOG}.{SILVER}.dim_track LIMIT 5"))


In [0]:
# Source: bronze daily metrics table.
bronze_m = spark.table(f"{CATALOG}.{BRONZE}.daily_metrics")

# Type the columns: date -> date, counters -> bigint, rank_estimate -> int.
# The dimension/key columns are passed through unchanged. Column order matches
# the layout of the silver table.
fact_metrics = bronze_m.select(
    to_date("date").alias("date"),
    *[F.col(name) for name in ("platform", "region", "artist_id", "track_id")],
    *[
        to_bigint(name).alias(name)
        for name in ("streams", "views", "likes", "comments", "shares", "followers_gained")
    ],
    to_int("rank_estimate").alias("rank_estimate"),
)

# Replace the silver Delta table with the freshly built frame.
(
    fact_metrics.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{CATALOG}.{SILVER}.fact_metrics")
)

# Sanity checks: total row count, then the ten most recent rows.
display(spark.sql(f"SELECT COUNT(*) rows FROM {CATALOG}.{SILVER}.fact_metrics"))
display(spark.sql(f"""
SELECT date, platform, region, streams, views
FROM {CATALOG}.{SILVER}.fact_metrics
ORDER BY date DESC
LIMIT 10
"""))
