# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, ArrayType, DateType
from pyspark.sql.functions import trim, col, lit, array
from datetime import datetime

# Read from Bronze Table

In [0]:
# Load the raw artist records from the bronze layer; every later cell
# transforms this DataFrame step by step.
df = spark.table("workspace.bronze.spotify_artists_raw")

# Preview the untouched bronze data before any transformation.
display(df)

# Silver Transformations

### Expand Columns

In [0]:
# Flatten the nested bronze payload into plain top-level columns.
# NOTE(review): the dotted paths assume `external_urls` and `followers` are
# structs and `images` is an array of structs -- confirm against the bronze schema.
expanded_columns = [
    col("external_urls.spotify").alias("external_url_spotify"),
    col("followers.href").alias("followers_href"),
    col("followers.total").alias("followers_total"),
    col("genres"),
    col("href"),
    col("id"),
    # Keep only the first image URL of each artist.
    F.get(col("images.url"), 0).alias("image_url"),
    col("name"),
    col("popularity"),
    col("type"),
    col("uri"),
    col("processed_at"),
]

df = df.select(*expanded_columns)

In [0]:
# Preview the flattened columns produced by the previous cell.
display(df)

### Trimming

In [0]:
# Strip leading/trailing whitespace from every string column and from every
# element of array<string> columns.
#
# The expressions are collected first and applied with a single `withColumns`
# call: calling `withColumn` once per column in a loop adds one projection per
# iteration to the query plan, which gets expensive as the schema grows.
trimmed_columns = {}

for field in df.schema.fields:
    if isinstance(field.dataType, StringType):
        trimmed_columns[field.name] = trim(col(field.name))
    elif isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StringType):
        # Keep the explicit one-argument lambda so `transform` is always
        # handed a unary function, whatever signature `trim` itself has.
        trimmed_columns[field.name] = F.transform(col(field.name), lambda x: trim(x))

# Nothing to do when the schema has no string-like columns.
if trimmed_columns:
    df = df.withColumns(trimmed_columns)

### Remove Fully NULL Columns

In [0]:
# Drop columns that contain no non-null value at all.
#
# Columns that later cells reference by name are never dropped, even when the
# current batch happens to be empty or entirely NULL for them; otherwise the
# filter, null-handling, rename and final select steps would fail with an
# unresolved-column error.
PROTECTED_COLUMNS = {
    "id",
    "name",
    "followers_total",
    "genres",
    "popularity",
    "processed_at",
    "type",
}

# `F.count(c)` counts NON-null values, so despite its name `null_counts`
# maps each column to its number of non-null rows; 0 means "entirely NULL".
# A global aggregate always returns exactly one row, even on an empty DataFrame.
null_counts = df.select([F.count(c).alias(c) for c in df.columns]).first().asDict()

cols_to_drop = [
    name
    for name, non_null in null_counts.items()
    if non_null == 0 and name not in PROTECTED_COLUMNS
]
df = df.drop(*cols_to_drop)

print(f"Colunas removidas: {cols_to_drop}")

### Remove Unnecessary Columns

In [0]:
# Link/URI style columns that are not kept in the silver table.
unneeded_columns = ["external_url_spotify", "uri", "href", "image_url"]

df = df.drop(*unneeded_columns)

### Handle Column *Type*

In [0]:
# Keep only rows whose `type` is "artist"; once filtered, the column carries
# a single constant value, so it is dropped.
is_artist = col("type") == "artist"

df = df.where(is_artist).drop("type")

### Handle NULL Values

In [0]:
# Replace NULLs with a default value per column.
#
# `followers_total` falls back to 0 cast to the column's own type. The earlier
# string fallback ("n/a") would either coerce the whole count column to string
# or fail the cast under ANSI mode. 0 matches the default used for `popularity`.
# NOTE(review): 0 is indistinguishable from "really has no followers" --
# confirm this is acceptable for downstream consumers.
followers_type = df.schema["followers_total"].dataType

df = df.withColumns({
    "followers_total": F.coalesce(col("followers_total"), lit(0).cast(followers_type)),
    # Missing genres become an empty array instead of NULL.
    "genres": F.coalesce(col("genres"), array()),
    "name": F.coalesce(col("name"), lit("n/a")),
    "popularity": F.coalesce(col("popularity"), lit(0)),
    # `current_timestamp()` is already a Column, so no `lit()` wrapper is needed.
    "processed_at": F.coalesce(col("processed_at"), F.current_timestamp()),
})

### Rename Columns

In [0]:
# Mapping of current column name -> name used in the silver table.
RENAME_MAP = {"followers_total": "total_followers"}

# Apply every rename in one call; names missing from the DataFrame are ignored.
df = df.withColumnsRenamed(RENAME_MAP)

### Handle Column Order

In [0]:
# Final column layout of the silver table. Selecting by this list both orders
# the columns and discards any column that is not listed.
column_order = ["id", "name", "total_followers", "genres", "popularity", "processed_at"]

df = df.select(column_order)

## Check Silver Dataframe

In [0]:
# Preview the fully transformed DataFrame before it is written to the silver table.
display(df)

## Save Silver Table

In [0]:
# Upsert the transformed artists into the silver Delta table.
#
# - If the table already exists, MERGE on `id`: matching rows are updated and
#   new rows are inserted.
# - If it does not exist yet, create it from the current DataFrame.
#
# The previous version called `spark.table(spark, ...)`, which always raises
# (`table` takes only the table name), and a bare `except:` swallowed that
# error. As a result every run overwrote the silver table and the merge never
# ran. Existence is now checked explicitly, and real failures (schema mismatch,
# permissions, ...) surface instead of triggering an overwrite.
SILVER_TABLE = "workspace.silver.spotify_artists"
SOURCE_VIEW = "spotify_artists_silver_source"

if spark.catalog.tableExists(SILVER_TABLE):
    # Expose the DataFrame to SQL so MERGE can be used without extra imports.
    # NOTE(review): MERGE fails if `df` holds several rows with the same `id`;
    # confirm the bronze data is unique per artist or deduplicate upstream.
    df.createOrReplaceTempView(SOURCE_VIEW)

    spark.sql(f"""
        MERGE INTO {SILVER_TABLE} AS target
        USING {SOURCE_VIEW} AS source
        ON target.id = source.id
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """)
else:
    df.write.mode("overwrite").format("delta").saveAsTable(SILVER_TABLE)

## Check Silver Table

In [0]:
%sql
-- Inspect the persisted silver table to verify the write above.
SELECT *
FROM workspace.silver.spotify_artists