# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import trim, col, lit
from datetime import datetime

# Read from Bronze Table

In [0]:
# Load the raw artist records from the bronze layer and preview them.
df = spark.table("workspace.bronze.spotify_artists_raw")
display(df)

# Silver Transformations

### Trimming

In [0]:
# Strip leading and trailing whitespace from every top-level string column.
# All columns are rewritten in one withColumns call rather than one
# withColumn call per column, because each withColumn adds another
# projection to the query plan.
df = df.withColumns({
    field.name: trim(col(field.name))
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
})

### Remove URI and href Columns
- Only ID is necessary
- href isn't necessary for this project

In [0]:
# Discard the columns this project does not use: the ID already identifies
# each artist, and href is not needed.
df = df.drop(*["uri", "href"])

### Keep only rows where type is "artist"

In [0]:
# Keep only the rows whose type is exactly "artist"; everything else is dropped.
df = df.where(F.col("type") == "artist")

### Remove the type column
- After keeping only the artists, the column holds a single value and is no longer necessary

In [0]:
# Drop the "type" column from the DataFrame.
df = df.drop("type")

### Handle null or empty genres

In [0]:
# Keep the genres array only when it is present and non-empty; a null or
# empty array is replaced by the single-element array ["other"].
df = df.withColumn(
    "genres",
    F.when(
        F.col("genres").isNotNull() & (F.size(F.col("genres")) > 0),
        F.col("genres"),
    ).otherwise(F.array(F.lit("other"))),
)

### Remove column '*external_urls_spotify*' because it is entirely null

In [0]:
# Drop the "external_urls_spotify" column (reported to be entirely null in
# the bronze data -- TODO confirm if the source changes).
df = df.drop("external_urls_spotify")

### Handle the total followers column

In [0]:
# Overwrite the nested followers column with its "total" field, then rename
# it. Overwriting before renaming keeps the column in its original position;
# selecting followers.total under an alias and dropping followers would
# instead append the new column at the end.
df = (
    df.withColumn("followers", F.col("followers.total"))
    .withColumnRenamed("followers", "total_followers")
)

### Handle image URLs
- Keep only the URL of the first image

In [0]:
# Replace the images array with the URL of its first element. With no
# otherwise() clause, rows whose array is null or empty evaluate to null.
df = df.withColumn(
    "images",
    F.when(
        F.col("images").isNotNull() & (F.size(F.col("images")) > 0),
        F.col("images")[0].getItem("url"),
    ),
)

### Handle NULL Values

In [0]:
# Replace remaining nulls with explicit defaults: "n/a" for text columns,
# 0 for numeric counters, and the current time for processed_at.
df = df.withColumns({
    "name": F.coalesce(col("name"), lit("n/a")),
    "popularity": F.coalesce(col("popularity"), lit(0)),
    "total_followers": F.coalesce(col("total_followers"), lit(0)),
    "images": F.coalesce(col("images"), lit("n/a")),
    # current_timestamp() is evaluated by Spark when the query runs, rather
    # than freezing a naive, driver-local Python datetime into the plan at
    # the moment this cell is defined.
    "processed_at": F.coalesce(col("processed_at"), F.current_timestamp()),
})

### Check Silver Dataframe

In [0]:
# Preview the transformed DataFrame before it is written to the silver table.
display(df)

### Save Silver Table

In [0]:
# Persist the result as a Delta table, replacing any existing contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.spotify_artists")
)

## Check Silver Table

In [0]:
%sql
-- Read back every row of the saved silver table to verify the write.
SELECT
  *
FROM
  workspace.silver.spotify_artists