In [0]:
# Restart this notebook's Python process through the Databricks utilities.
# NOTE(review): no library installation is visible before this call — confirm
# the restart is still required.
dbutils.library.restartPython()

# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import trim, col, lit

# Read from Bronze Table

In [0]:
# Load the raw streaming history from the bronze table and preview it.
df = spark.table("workspace.bronze.spotify_streaming_history_raw")
df.display()

# Silver Transformations

### Drop columns that are 100% null

In [0]:
# Audiobook and podcast-episode fields that are removed from the frame.
columns_to_drop = [
    "audiobook_chapter_title",
    "audiobook_chapter_uri",
    "audiobook_title",
    "audiobook_uri",
    "episode_name",
    "episode_show_name",
    "spotify_episode_uri",
]

# DataFrame.drop takes the column names as separate positional arguments.
df = df.drop(*columns_to_drop)

### Remove rows where Track ID is null
- These rows correspond to podcast episodes rather than music tracks

In [0]:
# Keep only rows that carry a track URI.
df = df.na.drop(subset=["spotify_track_uri"])

### Trimming

In [0]:
# Collect the names of all string-typed columns up front, then strip leading
# and trailing whitespace from each of them in place.
string_columns = [
    schema_field.name
    for schema_field in df.schema.fields
    if isinstance(schema_field.dataType, StringType)
]

for column_name in string_columns:
    df = df.withColumn(column_name, trim(col(column_name)))

### Replace Track URI with Track ID

In [0]:
# Split the URI on ":" and keep the third segment as the track id
# (presumably URIs look like "spotify:track:<id>" — verify against the data),
# then discard the original URI column.
uri_parts = F.split(F.col("spotify_track_uri"), ":")
df = df.withColumn("track_id", uri_parts.getItem(2)).drop("spotify_track_uri")

### Delete Track, Album and Artist Columns
- This information will be retrieved through the star schema instead, so it is not needed in this table

In [0]:
# Remove the artist, album and track name columns.
df = df.drop(
    "master_metadata_album_artist_name",
    "master_metadata_album_album_name",
    "master_metadata_track_name",
)

### Convert Date Columns to Timestamp

In [0]:
# Parse the "ts" column into a timestamp using Spark's default parsing rules
# (no explicit format is supplied).
df = df.withColumn("ts", F.to_timestamp("ts"))

### Handle Offline Column Values

In [0]:
# Keep the raw offline timestamp only for plays flagged as offline that carry a
# positive value; every other row becomes NULL (a when() without otherwise()
# yields NULL for unmatched rows).
df = df.withColumn(
    "offline_timestamp",
    F.when(
        (F.col("offline") == True) & (F.col("offline_timestamp") > 0),
        F.col("offline_timestamp"),
    ),
)

# The raw value is an epoch in either milliseconds (13+ digits) or seconds
# (10 digits). Values of at least 10**12 are treated as milliseconds and scaled
# down to seconds; smaller values are used as they are.
#
# The magnitude is compared numerically rather than through the length of the
# string form, so the check also holds if the column is a floating-point type
# (whose string form may use scientific notation).
#
# timestamp_seconds() converts the epoch directly to a timestamp. Unlike
# from_unixtime() followed by a cast, it does not go through a formatted
# session-time-zone string, so it keeps sub-second precision and cannot be
# shifted by ambiguous local times around DST changes.
MILLIS_THRESHOLD = 10**12  # smallest 13-digit number
offline_epoch = F.col("offline_timestamp").cast("double")
offline_epoch_seconds = (
    F.when(offline_epoch >= MILLIS_THRESHOLD, offline_epoch / 1000)
    .otherwise(offline_epoch)
)
df = df.withColumn("offline_at", F.timestamp_seconds(offline_epoch_seconds))

### Rename Column(s)

In [0]:
# Mapping of existing column name -> name used in the silver table.
RENAME_MAP = {
    "conn_country": "country_code",
    "ip_addr": "ip_address",
    "ts": "played_at",
}

# Apply the renames one at a time, in the order they are declared above.
for source_column in RENAME_MAP:
    df = df.withColumnRenamed(source_column, RENAME_MAP[source_column])

## Check Dataframe

In [0]:
# Render the transformed DataFrame for a visual check.
display(df)

# Save Silver Table

In [0]:
# Write the result as a Delta table in the silver schema, replacing whatever
# the table currently contains.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.spotify_streaming_history")
)