In [0]:
%sql
-- Make spotify_etl the current catalog and silver the current schema
-- for the statements that follow in this session.
USE CATALOG spotify_etl;
USE SCHEMA silver

In [0]:
%sql
-- Create the volume spotify_etl.silver.spotify_tracks_cleaned unless it already exists.
-- A later cell writes Delta files under /Volumes/spotify_etl/silver/spotify_tracks_cleaned.
CREATE VOLUME IF NOT EXISTS spotify_etl.silver.spotify_tracks_cleaned;

In [0]:
from pyspark.sql.functions import col, when, round, explode, split

# Location of the bronze-layer Delta data read by this notebook.
bronze_path = "/Volumes/spotify_etl/bronze/spotify/tracks_info"

# Cleaning, step 1: load the bronze Delta data.
bronze_tracks = spark.read.format("delta").load(bronze_path)

# Cleaning, step 2: keep a single row per track id.
# NOTE(review): which of several duplicate rows survives is not controlled here.
deduplicated_tracks = bronze_tracks.dropDuplicates(["id"])

# Cleaning, step 3: remove the "_C0" column.
# presumably a leftover index column from the original export; verify against the bronze schema
df_silver = deduplicated_tracks.drop("_C0")

In [0]:
# Transformation: derive the duration in minutes and the note name of each track's key.
# NOTE: `round` is the Column function imported from pyspark.sql.functions,
# which shadows the Python builtin of the same name in this notebook.
duration_min = round(col("duration_ms") / 60000, 2)  # milliseconds -> minutes, 2 decimals

# Note names indexed by the integer held in the "key" column (0 -> "C", ..., 11 -> "B").
note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# Build the CASE WHEN chain one branch per note, in ascending key order.
key_note = when(col("key") == 0, note_names[0])
for key_value, note_name in enumerate(note_names[1:], start=1):
    key_note = key_note.when(col("key") == key_value, note_name)
# Any value not matched by a branch above is labelled "Unknown".
key_note = key_note.otherwise("Unknown")

df_silver = (
    df_silver
    .withColumn("duration_min", duration_min)
    .withColumn("key_note", key_note)
)

# Preview the first rows of the transformed data.
df_silver.limit(5).display()

id,artist_names,name,genres,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artists_ids,duration_min,key_note
spotify:track:2eSnW4d3A4SyEVhVPmBffa,"Anitta, Becky G",Banana,"latin pop,pop nacional,pagode baiano,latin viral pop,rap latina,reggaeton,pop,funk pop,urbano latino,trap latino,funk rj",0.0195,0.752,195619.0,0.681,0.0,11,0.0353,-6.684,0,0.299,73.233,4,0.648,"7FNnA9vBm6EKceENgCGRMb,4obzFoKoKRHIphyHzJ35G3",3.26,B
spotify:track:2OUffBn30YvuPbZ0SmBgNs,LUDMILLA,Jogando sujo,"funk carioca,pop nacional,pagode baiano,funk pop,funk rj",0.111,0.904,167695.0,0.75,0.0,9,0.127,-3.823,0,0.0584,128.023,4,0.79,3CDoRporvSjdzTrm99a3gi,2.79,A
spotify:track:4BLyDt62gonjl7b2k41d6Y,Zé Neto & Cristiano,Bebida Na Ferida - Acústico,"agronejo,sertanejo universitario,sertanejo,arrocha",0.503,0.658,165291.0,0.606,0.0,1,0.17,-6.089,1,0.0376,125.997,4,0.426,487N2T9nIPEHrlTZLL3SQs,2.75,C#
spotify:track:301UKq8uNAv7Zj7MQkQWyd,"MC Rick, Mc Leozin, Dj Caio Vieira",Terminei Com A Ex,"funk bh,funk carioca,funk mtg",0.443,0.932,110769.0,0.472,0.0,5,0.0916,-7.408,0,0.328,129.939,4,0.893,"1mvpEXClANunyiHFtAXCxt,6mKwmTMzDrye9elc8JWiw3,4Kv35Xnw1ODjO2nIHLhdBY",1.85,F
spotify:track:1ELsaYdj2NlxLvkx9g24xZ,"Guilherme & Benuto, Hugo & Guilherme",Haja Colírio (feat. Hugo & Guilherme) - Ao Vivo,"agronejo,arrocha,sertanejo universitario,sertanejo pop,sertanejo",0.59,0.714,173367.0,0.748,0.0,11,0.626,-4.605,1,0.0679,105.031,4,0.698,"6m6e7D2TnV0aYMllFFwMxu,1LIuN7ov1IBQDdLsU83ojl",2.89,B


In [0]:
# Enrichment: add readable labels and boolean flags derived from the audio features.

# "menor" when mode == 0; every other value is labelled "maior".
mode_label = when(col("mode") == 0, "menor").otherwise("maior")

# Three-way label from valence: > 0.66 "feliz", < 0.33 "triste", anything else "neutra".
sentiment_label = (
    when(col("valence") > 0.66, "feliz")
    .when(col("valence") < 0.33, "triste")
    .otherwise("neutra")
)

df_silver = (
    df_silver
    .withColumn("modo", mode_label)
    .withColumn("sentimento", sentiment_label)
    # Boolean flags: true when the respective feature exceeds 0.88.
    .withColumn("is_instrumental", col("instrumentalness") > 0.88)
    .withColumn("is_energy", col("energy") > 0.88)
    .withColumn("is_acoustic", col("acousticness") > 0.88)
)


# Persist the enriched DataFrame as a Delta table, replacing previous contents and schema.
# Despite its name, silver_path holds a three-level table name, not a filesystem path.
silver_path = "spotify_etl.silver.spotify_tracks_cleaned"

silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable(silver_path)


In [0]:
# Store the silver data as Delta files under the volume path, then read it back as df_songs.
silver_volume_path = "/Volumes/spotify_etl/silver/spotify_tracks_cleaned"

(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    # Allow the overwrite to replace the stored schema as well. Without this option,
    # re-running the notebook after df_silver's columns change fails with a schema
    # mismatch. This matches the other Delta writes in this notebook.
    .option("overwriteSchema", "true")
    .save(silver_volume_path)
)

df_songs = spark.read.format("delta").load(silver_volume_path)

In [0]:
# Genre sub-table: one row per (track, genre), obtained by splitting the
# comma-separated "genres" string and exploding the resulting array.
genre_column = explode(split("genres", ",")).alias("genre")
df_genres = df_songs.select("id", "name", "duration_min", genre_column)

(
    df_genres.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("spotify_etl.silver.genre")
)

In [0]:
# Acoustic sub-table: identifying columns of the tracks whose is_acoustic flag is true.
acoustic_columns = ["id", "artist_names", "name", "is_acoustic"]
df_acoustic_info = df_silver.where(col("is_acoustic")).select(*acoustic_columns)

(
    df_acoustic_info.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("spotify_etl.silver.acoustic_tracks_info")
)

In [0]:
# Instrumental sub-table: identifying columns of the tracks whose is_instrumental flag is true.
instrumental_tracks = df_silver.where(col("is_instrumental"))
df_instrumental_info = instrumental_tracks.select(
    "id", "artist_names", "name", "is_instrumental"
)

(
    df_instrumental_info.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("spotify_etl.silver.instrumental_tracks_info")
)

In [0]:
# Energetic sub-table: identifying columns of the tracks whose is_energy flag is true.
energetic_tracks = df_silver.where(col("is_energy"))
df_energy_info = energetic_tracks.select("id", "artist_names", "name", "is_energy")

(
    df_energy_info.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("spotify_etl.silver.energy_tracks_info")
)

In [0]:
# Sentiment sub-table: the "sentimento" label of every track next to its identifying columns.
sentiment_columns = ["id", "artist_names", "name", "sentimento"]
df_sentiment_info = df_silver.select(*sentiment_columns)

(
    df_sentiment_info.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("spotify_etl.silver.sentimento_tracks_info")
)

In [0]:
# Key sub-table: the "key_note" label of every track next to its identifying columns.
keymode_columns = ["id", "artist_names", "name", "key_note"]
df_keymode_info = df_silver.select(*keymode_columns)

(
    df_keymode_info.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("spotify_etl.silver.keymode_tracks_info")
)

# Preview the first rows of the sub-table.
df_keymode_info.limit(5).display()

id,artist_names,name,key_note
spotify:track:2eSnW4d3A4SyEVhVPmBffa,"Anitta, Becky G",Banana,B
spotify:track:2OUffBn30YvuPbZ0SmBgNs,LUDMILLA,Jogando sujo,A
spotify:track:4BLyDt62gonjl7b2k41d6Y,Zé Neto & Cristiano,Bebida Na Ferida - Acústico,C#
spotify:track:301UKq8uNAv7Zj7MQkQWyd,"MC Rick, Mc Leozin, Dj Caio Vieira",Terminei Com A Ex,F
spotify:track:1ELsaYdj2NlxLvkx9g24xZ,"Guilherme & Benuto, Hugo & Guilherme",Haja Colírio (feat. Hugo & Guilherme) - Ao Vivo,B
