# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import trim, col, lit, array
from datetime import datetime

# Read from Bronze Table

In [0]:
# Load the raw Spotify tracks from the bronze layer and preview them.
BRONZE_TABLE = "workspace.bronze.spotify_tracks_raw"
df = spark.read.table(BRONZE_TABLE)
df.display()

# Silver Transformations

### Trimming

In [0]:
# Strip leading/trailing whitespace from every string-typed column.
# All trims are applied in one withColumns projection instead of one
# withColumn call per column, which would stack an extra projection in the
# query plan for each string column.
trimmed_string_columns = {
    field.name: trim(col(field.name))
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
}
# Leave df untouched when the table has no string columns.
if trimmed_string_columns:
    df = df.withColumns(trimmed_string_columns)

### Remove Unnecessary Columns
- *URI* not necessary, ID is enough
- *href*, *external_urls_spotify* and *external_ids_isrc* are also not important for this project

In [0]:
# Columns that are not carried over to the silver layer.
unneeded_columns = [
    "uri",
    "href",
    "external_urls_spotify",
    "external_ids_isrc",
]
df = df.drop(*unneeded_columns)

### Handling Type Column
- Should be '*track*': rows with any other type are filtered out, and the column is then dropped

In [0]:
# Keep only rows typed as "track"; rows with a NULL type are removed as well,
# because the equality test is not true for them.
is_track = F.col("type") == "track"
df = df.where(is_track)
# Every remaining row has the same type, so the column carries no information.
df = df.drop("type")

### Handle NULL Values

In [0]:
# Replacement value used wherever a column is NULL, keyed by column name.
# NOTE(review): available_markets gets a string default while artist_ids gets
# an empty array — confirm available_markets really is a string column.
null_defaults = {
    "name": F.lit("n/a"),
    "duration_ms": F.lit(0),
    "available_markets": F.lit("n/a"),
    "explicit": F.lit(False),
    "popularity": F.lit(0),
    "track_number": F.lit(0),
    "disc_number": F.lit(0),
    "is_local": F.lit(False),
    "album_id": F.lit("n/a"),
    "artist_ids": F.array(),
    # Rows without a processing timestamp are stamped with the current time.
    "processed_at": F.current_timestamp(),
}

# coalesce keeps the existing value and only falls back to the default on NULL.
df = df.withColumns({
    column_name: F.coalesce(F.col(column_name), default_value)
    for column_name, default_value in null_defaults.items()
})

## Check Dataframe

In [0]:
# Preview the transformed DataFrame before it is written to the silver table.
display(df)

## Save Silver Table

In [0]:
# Append the transformed rows to the silver Delta table.
# NOTE(review): in "append" mode, re-running this notebook over the same bronze
# rows adds them to the silver table again — confirm that is intended.
SILVER_TABLE = "workspace.silver.spotify_tracks"
(
    df.write
    .format("delta")
    .mode("append")
    .saveAsTable(SILVER_TABLE)
)

## Check Silver Table

In [0]:
%sql
-- Preview the rows currently stored in the silver table.
SELECT * FROM workspace.silver.spotify_tracks