# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType, DateType
from pyspark.sql.functions import trim, col, lit

# Read from Bronze Table

In [0]:
# Fully-qualified name of the bronze-layer table this notebook reads from.
BRONZE_TABLE = "workspace.bronze.spotify_recently_played_raw"

# Load the table as-is; every later cell reassigns `df` step by step.
df = spark.read.table(BRONZE_TABLE)
display(df)

# Silver Transformations

### Flattening

In [0]:
def _flat(path, alias):
    """Lift the nested field at `path` to a top-level column named `alias`."""
    return F.col(path).alias(alias)


def _joined(path, alias):
    """Join the array at `path` into one ", "-separated string column named `alias`."""
    return F.array_join(F.col(path), ", ").alias(alias)


# One expression per output column, in output order.
flattened_columns = [
    F.col("played_at"),
    # --- context.* ---
    _flat("context.type", "context_type"),
    _flat("context.href", "context_href"),
    _flat("context.uri", "context_uri"),
    _flat("context.external_urls.spotify", "context_external_url"),
    # --- track.* ---
    _flat("track.id", "track_id"),
    _flat("track.name", "track_name"),
    _flat("track.uri", "track_uri"),
    _flat("track.duration_ms", "track_duration_ms"),
    _flat("track.explicit", "track_explicit"),
    _flat("track.popularity", "track_popularity"),
    _flat("track.preview_url", "track_preview_url"),
    _flat("track.track_number", "track_number"),
    _flat("track.disc_number", "track_disc_number"),
    _flat("track.is_local", "track_is_local"),
    _flat("track.available_markets", "track_available_markets"),
    _joined("track.artists.name", "track_artists_names"),
    _flat("track.artists.id", "track_artists_ids"),
    # --- track.album.* ---
    _flat("track.album.id", "album_id"),
    _flat("track.album.name", "album_name"),
    _flat("track.album.album_type", "album_type"),
    _flat("track.album.release_date", "album_release_date"),
    _flat("track.album.release_date_precision", "album_release_date_precision"),
    _flat("track.album.total_tracks", "album_total_tracks"),
    _flat("track.album.uri", "album_uri"),
    _flat("track.album.external_urls.spotify", "album_external_url"),
    # Only the first entry of the images array is kept.
    # NOTE(review): indexing [0] on an empty array may raise when ANSI mode is
    # enabled -- confirm against the cluster configuration.
    F.col("track.album.images")[0].getItem("url").alias("album_image_url"),
    _joined("track.album.artists.name", "album_artists_names"),
]

df = df.select(*flattened_columns)

### Trimming

In [0]:
def _trim_expr(field):
    """Build the select expression for one schema field.

    - String columns are trimmed.
    - Arrays *of strings* have every element trimmed.
    - Anything else (including arrays of non-string elements) passes through
      untouched, so its type is never implicitly changed by `trim`.

    The returned Column always keeps the original column name.
    """
    column = col(field.name)
    if isinstance(field.dataType, StringType):
        return trim(column).alias(field.name)
    if isinstance(field.dataType, ArrayType) and isinstance(
        field.dataType.elementType, StringType
    ):
        # Explicit one-argument lambda: F.transform inspects the callable's
        # arity, so passing `trim` directly is not safe across Spark versions.
        return F.transform(column, lambda element: trim(element)).alias(field.name)
    return column


# A single select keeps column order and adds one projection to the plan,
# instead of one projection per column as repeated withColumn calls would.
df = df.select(*[_trim_expr(field) for field in df.schema.fields])

### Remove Unnecessary Columns

In [0]:
# Columns removed from the frame, grouped by the prefix they carry.
_context_columns = [
    "context_type",
    "context_uri",
    "context_href",
    "context_external_url",
]

_track_columns = [
    "track_name",
    "track_uri",
    "track_duration_ms",
    "track_explicit",
    "track_popularity",
    "track_preview_url",
    "track_number",
    "track_disc_number",
    "track_is_local",
    "track_available_markets",
    "track_artists_names",
]

_album_columns = [
    "album_name",
    "album_type",
    "album_release_date",
    "album_release_date_precision",
    "album_total_tracks",
    "album_uri",
    "album_external_url",
    "album_image_url",
    "album_artists_names",
]

columns_to_drop = _context_columns + _track_columns + _album_columns

df = df.drop(*columns_to_drop)

### Rename Columns

In [0]:
# Current column name -> new column name.
RENAME_MAP = {"track_artists_ids": "artists_ids"}

# Renames are applied one after another, in the mapping's insertion order.
for current_name in RENAME_MAP:
    df = df.withColumnRenamed(current_name, RENAME_MAP[current_name])

### Column Order

In [0]:
# Final column layout: exactly these columns, in this order.
column_order = ["track_id", "artists_ids", "album_id", "played_at"]

df = df.select(column_order)