# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType, DateType
from pyspark.sql.functions import trim, col, lit

# Read from Bronze Table

In [0]:
# Load the raw Spotify albums from the bronze layer and preview them.
bronze_albums_table = "workspace.bronze.spotify_albums_raw"
df = spark.read.table(bronze_albums_table)
display(df)

# Silver Transformations

### Expand Columns

In [0]:
def _flattened(prefix, fields, alias_prefix):
    """Select `prefix.field` for every field, aliased `alias_prefix_field`.

    Dots inside `field` become underscores in the alias, e.g.
    ("tracks.items", "artists.id", "tracks") -> "tracks_artists_id".
    """
    return [
        col(f"{prefix}.{field}").alias(f"{alias_prefix}_{field.replace('.', '_')}")
        for field in fields
    ]


# Album-level attributes, in the order they should appear in the result.
album_head_columns = [
    col("album_type"),
    col("total_tracks"),
    col("available_markets"),
    col("external_urls.spotify").alias("external_urls_spotify"),
    col("href"),
    col("id"),
    # Only the first entry of images.url is kept.
    F.get(col("images.url"), 0).alias("image_url"),
    col("name"),
    col("release_date"),
    col("release_date_precision"),
    col("restrictions"),
    col("type"),
    col("uri"),
]

# Album artists: artists.<field> -> artists_<field>
album_artist_columns = _flattened(
    "artists",
    ["external_urls.spotify", "href", "id", "name", "type", "uri"],
    "artists",
)

# Album tracks: tracks.items.<field> -> tracks_<field> (the "items" level is
# dropped from the alias).
album_track_columns = _flattened(
    "tracks.items",
    [
        "id",
        "name",
        "track_number",
        "disc_number",
        "duration_ms",
        "explicit",
        "type",
        "uri",
        "is_local",
        "is_playable",
        "preview_url",
        "restrictions",
        "artists.id",
        "artists.name",
        "artists.type",
        "artists.uri",
        "artists.href",
        "artists.external_urls.spotify",
        "available_markets",
        "external_urls",
        "linked_from.id",
        "linked_from.type",
        "linked_from.uri",
        "linked_from.href",
        "linked_from.external_urls.spotify",
    ],
    "tracks",
)

# Remaining album-level attributes.
album_tail_columns = [
    *_flattened("copyrights", ["text", "type"], "copyrights"),
    *_flattened("external_ids", ["isrc", "ean", "upc"], "external_ids"),
    col("genres"),
    col("label"),
    col("popularity"),
    col("processed_at"),
]

df = df.select(
    *album_head_columns,
    *album_artist_columns,
    *album_track_columns,
    *album_tail_columns,
)

### Trimming

In [0]:
def _bottoms_out_in_string(data_type):
    """Return True for StringType, or an array (at any nesting depth) of StringType."""
    while isinstance(data_type, ArrayType):
        data_type = data_type.elementType
    return isinstance(data_type, StringType)


def _trim_expr(column, data_type):
    """Build an expression that trims every string in `column`.

    Arrays are walked recursively with `F.transform`, so nested arrays such as
    array<array<string>> are trimmed as well, not only flat array<string>.
    """
    if isinstance(data_type, ArrayType):
        element_type = data_type.elementType
        return F.transform(column, lambda item: _trim_expr(item, element_type))
    return trim(column)


# Collect every trimmed column first and apply them in ONE projection.
# Calling withColumn once per column inside a loop stacks a projection per
# call, which bloats the query plan on wide frames like this one.
trimmed_columns = {
    field.name: _trim_expr(col(field.name), field.dataType)
    for field in df.schema.fields
    if _bottoms_out_in_string(field.dataType)
}

# Nothing to do when the frame has no string-typed columns.
if trimmed_columns:
    df = df.withColumns(trimmed_columns)

### Remove All NULL Columns

In [0]:
# One aggregation pass: F.count(column) counts the NON-NULL values of each
# column, and F.count(lit(1)) counts the rows themselves.
row_count_alias = "__row_count__"
null_counts = df.select(
    F.count(F.lit(1)).alias(row_count_alias),
    *[F.count(c).alias(c) for c in df.columns],
).collect()[0].asDict()
total_rows = null_counts.pop(row_count_alias)

# A column is "all NULL" when it has zero non-null values. On an EMPTY frame
# every column trivially has zero non-null values, which would drop every
# column and break the cells below, so the drop is skipped in that case.
cols_to_drop = [k for k, v in null_counts.items() if v == 0] if total_rows > 0 else []
if cols_to_drop:
    df = df.drop(*cols_to_drop)

# NOTE(review): later cells reference fixed column names; if one of those
# columns is entirely NULL in a batch it is dropped here and those cells will
# fail. Confirm whether such columns should be protected from this drop.
print(f"Colunas removidas: {cols_to_drop}")

### Remove Unnecessary Columns

In [0]:
# Columns that are not carried forward, grouped by the entity they describe.
dropped_album_columns = ["external_urls_spotify", "href", "uri"]

dropped_album_artist_columns = [
    "artists_external_urls_spotify",
    "artists_href",
    "artists_name",
    "artists_type",
    "artists_uri",
]

# "tracks_duration_ms" is deliberately kept: the durations could be used to
# compute some kind of average.
dropped_track_columns = [
    "tracks_name",
    "tracks_track_number",
    "tracks_disc_number",
    "tracks_explicit",
    "tracks_type",
    "tracks_uri",
    "tracks_is_local",
    "tracks_is_playable",
    "tracks_preview_url",
    "tracks_restrictions",
]

dropped_track_artist_columns = [
    "tracks_artists_name",
    "tracks_artists_type",
    "tracks_artists_uri",
    "tracks_artists_href",
    "tracks_artists_external_urls_spotify",
]

dropped_track_misc_columns = ["tracks_available_markets", "tracks_external_urls"]

dropped_linked_from_columns = [
    "tracks_linked_from_type",
    "tracks_linked_from_uri",
    "tracks_linked_from_href",
    "tracks_linked_from_external_urls_spotify",
]

columns_to_drop = (
    dropped_album_columns
    + dropped_album_artist_columns
    + dropped_track_columns
    + dropped_track_artist_columns
    + dropped_track_misc_columns
    + dropped_linked_from_columns
    + ["copyrights_type"]
)

df = df.drop(*columns_to_drop)

### Handle Type Column

In [0]:
# Keep only rows whose `type` equals "album"; the column carries a single
# value afterwards, so it is dropped.
is_album = col("type") == "album"
df = df.where(is_album).drop("type")

In [0]:
# Ad-hoc inspection: rows whose artists_id array contains one specific artist
# ID, sorted by release_date descending.
artist_id_to_inspect = "06HL4z0CvFAxyc27GXpf02"
albums_for_artist = df.filter(F.array_contains(col("artists_id"), artist_id_to_inspect))
albums_for_artist.sort(col("release_date").desc()).display()

### Convert Date Columns to the right type

In [0]:
# release_date arrives in one of three textual shapes, told apart by length:
# 4 chars ("yyyy"), 7 chars ("yyyy-MM") or 10 chars ("yyyy-MM-dd").
# Any other length yields NULL.
release_date_length = F.length(col("release_date"))
parsed_release_date = (
    F.when(release_date_length == 4, F.to_timestamp(col("release_date"), "yyyy"))
    .when(release_date_length == 7, F.to_timestamp(col("release_date"), "yyyy-MM"))
    .when(release_date_length == 10, F.to_timestamp(col("release_date"), "yyyy-MM-dd"))
    .otherwise(F.lit(None))
)

# NOTE(review): 'Z' is matched as a literal character here, so the value is
# interpreted in the session time zone rather than explicitly as UTC; confirm
# the session time zone is what is intended.
parsed_processed_at = F.to_timestamp(col("processed_at"), "yyyy-MM-dd'T'HH:mm:ss'Z'")

df = df.withColumns({
    "release_date": parsed_release_date,
    "processed_at": parsed_processed_at,
})

### Handle NULL Values

In [0]:
# Columns grouped by the placeholder that replaces a NULL value.
text_columns = [
    "album_type",
    "image_url",
    "name",
    "release_date_precision",
    "external_ids_upc",
    "label",
]
numeric_columns = ["total_tracks", "popularity"]
array_columns = [
    "available_markets",
    "artists_id",
    "tracks_id",
    "tracks_duration_ms",
    "tracks_artists_id",
    "tracks_linked_from_id",
    "copyrights_text",
    "genres",
]

null_replacements = {
    # Text -> "n/a"
    **{name: F.coalesce(col(name), lit("n/a")) for name in text_columns},
    # Numbers -> 0
    **{name: F.coalesce(col(name), lit(0)) for name in numeric_columns},
    # Arrays -> empty array
    **{name: F.coalesce(col(name), F.array()) for name in array_columns},
    # Timestamps: sentinel date for release_date, "now" for processed_at.
    "release_date": F.coalesce(col("release_date"), F.to_timestamp(F.lit("1900-01-01"))),
    "processed_at": F.coalesce(col("processed_at"), F.current_timestamp()),
}

df = df.withColumns(null_replacements)

### Handle Track Artists IDs Column

In [0]:
# tracks_artists_id holds one inner array of artist IDs per track. Collapse it
# into a single de-duplicated array of artist IDs for the album.
#
# flatten() returns NULL for the whole value as soon as ONE inner array is
# NULL, which would silently discard the IDs of every other track. NULL inner
# arrays are therefore filtered out before flattening.
df = df.withColumn(
    "tracks_artists_id",
    F.array_distinct(
        F.flatten(
            F.filter(
                col("tracks_artists_id"),
                lambda track_artist_ids: track_artist_ids.isNotNull(),
            )
        )
    ),
)

### Custom Column Order

In [0]:
# Final column layout, assembled from themed groups. The concatenation order
# below is the exact order of the output columns.
identity_columns = ["id", "name"]
relation_columns = ["artists_id", "tracks_artists_id", "tracks_id"]
descriptive_columns = ["popularity", "genres", "album_type", "total_tracks", "image_url"]
release_columns = ["release_date", "release_date_precision"]
track_detail_columns = ["tracks_duration_ms", "tracks_linked_from_id"]
distribution_columns = ["available_markets", "copyrights_text", "external_ids_upc", "label"]
audit_columns = ["processed_at"]

column_order = [
    *identity_columns,
    *relation_columns,
    *descriptive_columns,
    *release_columns,
    *track_detail_columns,
    *distribution_columns,
    *audit_columns,
]

df = df.select(*column_order)

### Rename Columns

In [0]:
# Old column name -> new column name.
RENAME_MAP = {
    "artists_id": "album_artists_ids",
    "tracks_artists_id": "track_artists_ids",
    "tracks_id": "tracks_ids"
}

# Rename everything in a single call instead of one withColumnRenamed per
# entry. Names that are not present in the frame are ignored, as before.
df = df.withColumnsRenamed(RENAME_MAP)

## Check Dataframe

In [0]:
# Preview the final silver frame before it is written.
display(df)

## Save Silver Table

In [0]:
# Append the transformed albums to the silver Delta table.
# NOTE(review): with mode "append", re-running the notebook on the same bronze
# data adds the same rows again; confirm that upstream data is incremental or
# that duplicates are handled elsewhere.
silver_albums_table = "workspace.silver.spotify_albums"
(
    df.write
    .format("delta")
    .mode("append")
    .saveAsTable(silver_albums_table)
)

## Check Silver Table

In [0]:

%sql
-- Read back every row and column of the silver albums table.
SELECT albums.*
FROM workspace.silver.spotify_albums AS albums