# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import trim, col, lit, array
from datetime import datetime

# Read from Bronze Table

In [0]:
# Load the raw Spotify tracks table from the bronze schema and preview it.
BRONZE_TABLE = "workspace.bronze.spotify_tracks_raw"

df = spark.read.table(BRONZE_TABLE)
display(df)

# Silver Transformations

### Expand Columns

In [0]:
# Flatten the nested bronze record into top-level columns. Nested fields are
# exposed under a name built from their path (e.g. album.href -> album_href);
# fields that are already top-level keep their name.
df = df.select(

    # Album
    col("album.album_type").alias("album_album_type"),
    col("album.total_tracks").alias("album_total_tracks"),
    col("album.available_markets").alias("album_available_markets"),
    col("album.external_urls.spotify").alias("album_external_urls_spotify"),
    col("album.href").alias("album_href"),
    col("album.id").alias("album_id"),
    # Keep only the first image URL of the album.
    F.get(col("album.images.url"), 0).alias("album_image_url"),
    col("album.name").alias("album_name"),
    col("album.release_date").alias("album_release_date"),
    col("album.release_date_precision").alias("album_release_date_precision"),
    col("album.type").alias("album_type"),
    col("album.uri").alias("album_uri"),

    # Album artists
    col("album.artists.external_urls.spotify").alias("album_artists_external_urls_spotify"),
    col("album.artists.href").alias("album_artists_href"),
    col("album.artists.id").alias("album_artists_id"),
    col("album.artists.name").alias("album_artists_name"),
    col("album.artists.type").alias("album_artists_type"),
    col("album.artists.uri").alias("album_artists_uri"),

    # Artists
    col("artists.external_urls.spotify").alias("artists_external_urls_spotify"),
    col("artists.href").alias("artists_href"),
    col("artists.id").alias("artists_id"),
    col("artists.name").alias("artists_name"),
    col("artists.type").alias("artists_type"),
    col("artists.uri").alias("artists_uri"),

    # Track details
    col("available_markets"),
    col("disc_number"),
    col("duration_ms"),
    col("explicit"),
    col("external_ids.isrc").alias("external_ids_isrc"),
    col("external_ids.ean").alias("external_ids_ean"),
    col("external_ids.upc").alias("external_ids_upc"),
    col("external_urls.spotify").alias("external_urls_spotify"),
    col("href"),
    col("id"),
    col("is_playable"),
    col("linked_from.external_urls.spotify").alias("linked_from_external_urls_spotify"),
    col("linked_from.href").alias("linked_from_href"),
    col("linked_from.id").alias("linked_from_id"),
    col("linked_from.type").alias("linked_from_type"),
    col("linked_from.uri").alias("linked_from_uri"),
    col("restrictions.reason").alias("restrictions_reason"),
    col("name"),
    col("popularity"),
    col("preview_url"),
    col("track_number"),
    col("type"),
    col("uri"),
    col("is_local"),
    col("processed_at")
)

### Trimming

In [0]:
# Strip leading/trailing whitespace from every top-level string column.
# All replacements go through one withColumns call so the plan gains a single
# projection instead of one projection per string column.
trimmed_string_cols = {
    field.name: trim(col(field.name))
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
}

# Nothing to do when the DataFrame has no string columns.
if trimmed_string_cols:
    df = df.withColumns(trimmed_string_cols)

### Remove Fully NULL Columns

In [0]:
# Count, in one aggregation pass, the non-NULL values of every column plus the
# total number of rows (F.count over a literal counts every row).
ROW_COUNT_KEY = "__row_count__"
null_counts = (
    df.select(
        F.count(F.lit(1)).alias(ROW_COUNT_KEY),
        *[F.count(c).alias(c) for c in df.columns],
    )
    .collect()[0]
    .asDict()
)
total_rows = null_counts.pop(ROW_COUNT_KEY)

# A column is fully NULL only when rows exist and none of them holds a value.
# On an empty DataFrame every count is 0, so without this guard the whole
# schema would be dropped and the following cells would fail.
if total_rows > 0:
    cols_to_drop = [k for k, v in null_counts.items() if v == 0]
else:
    cols_to_drop = []
df = df.drop(*cols_to_drop)

print(f"Colunas removidas: {cols_to_drop}")

In [0]:
# Preview the current state of the DataFrame.
display(df)

### Remove Unnecessary Columns

In [0]:
# Columns that are not carried into the silver table, grouped by prefix.
# Names that are absent from the DataFrame are ignored by drop().
album_prefixed_cols = [
    "album_album_type", "album_artists_id", "album_type_uri",
    "album_total_tracks", "album_available_markets",
    "album_external_urls_spotify", "album_href", "album_image_url",
    "album_name", "album_release_date", "album_release_date_precision",
    "album_type", "album_uri",
    "album_artists_external_urls_spotify", "album_artists_href",
    "album_artists_name", "album_artists_type", "album_artists_uri",
]
artists_prefixed_cols = [
    "artists_external_urls_spotify", "artists_href", "artists_name",
    "artists_type", "artists_uri",
]
track_link_cols = ["href", "uri"]

columns_to_drop = [*album_prefixed_cols, *artists_prefixed_cols, *track_link_cols]

df = df.drop(*columns_to_drop)

### Handle Type Column
- Keep only rows whose `type` is `track`, then drop the `type` column.

In [0]:
# Keep only the rows typed as "track"; the column is then dropped.
is_track = df.type == "track"
df = df.where(is_track).drop("type")

In [0]:
# Preview the DataFrame after the type filter.
display(df)

### Handle NULL Values

In [0]:
# Replacement used for each column when its value is NULL.
null_defaults = {
    "album_id": lit("n/a"),
    "artists_id": array(),
    "available_markets": array(),
    "disc_number": lit(0),
    "duration_ms": lit(0),
    "explicit": lit(False),
    "external_ids_isrc": lit("n/a"),
    "external_urls_spotify": lit("n/a"),
    "name": lit("n/a"),
    "popularity": lit(0),
    "track_number": lit(0),
    "is_local": lit(False),
    "processed_at": F.current_timestamp(),
}

# Overwrite every listed column with coalesce(column, default) in one call.
df = df.withColumns({
    column_name: F.coalesce(col(column_name), default_value)
    for column_name, default_value in null_defaults.items()
})

### Reorder Columns

In [0]:
# Final column layout of the silver table. Columns that are not listed here
# are not kept by the select below.
column_order = [
    # Identifiers
    "id", "album_id", "artists_id",
    # Track attributes
    "name", "duration_ms", "popularity", "explicit",
    "track_number", "disc_number", "is_local",
    # Availability and external references
    "available_markets", "external_urls_spotify", "external_ids_isrc",
    # Metadata
    "processed_at",
]

df = df.select(column_order)

## Check Dataframe

In [0]:
# Preview the final DataFrame before it is written.
display(df)

## Save Silver Table

In [0]:
# Append the transformed rows to the silver Delta table.
# NOTE(review): the read cell loads the whole bronze table and this write uses
# append mode, so re-running the notebook presumably adds the same rows again.
# Confirm whether that is intended.
silver_writer = df.write.format("delta").mode("append")
silver_writer.saveAsTable("workspace.silver.spotify_tracks")

## Check Silver Table

In [0]:
%sql
-- Show every row and column currently stored in the silver tracks table.
SELECT * FROM workspace.silver.spotify_tracks