In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,explode
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import StringType
from pyspark.sql.functions import split, explode, trim


In [2]:
# Silver-layer artist/genre table inspected by the checks below.
hdfs_uri = "hdfs://namenode:8020/silver_layer/silver_artists_genres.parquet"

# Spark session shared by the following cells.
spark = (
    SparkSession.builder
    .appName("Check Genres")
    .getOrCreate()
)

In [3]:
# Load the artist/genre Parquet table from the path stored in hdfs_uri.
df_genres = spark.read.parquet(hdfs_uri)

In [4]:
# Print the column names, types and nullability of the artist/genre table.
df_genres.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- genre: string (nullable = true)



In [5]:
# Number of rows left after exact-duplicate rows are collapsed.
df_genres.distinct().count()

5459

In [8]:
# Bronze-layer feature table: load it and display its row count.
hdfs_uri_feature_bronze = "hdfs://namenode:8020/bronze_layer/feature_music.parquet"
df_feature_bronze = spark.read.parquet(hdfs_uri_feature_bronze)
df_feature_bronze.count()

2000

In [9]:
# Peek at the raw genre strings stored for a single artist.
(
    df_feature_bronze
    .select("artist", "genre")
    .where(col("artist") == "Drake")
    .show(5, truncate=False)
)

+------+-----------------+
|artist|genre            |
+------+-----------------+
|Drake |hip hop, pop, R&B|
|Drake |hip hop, pop, R&B|
|Drake |hip hop, pop, R&B|
|Drake |hip hop, pop, R&B|
|Drake |hip hop, pop, R&B|
+------+-----------------+
only showing top 5 rows



In [12]:
# Split each comma-separated genre string into one row per (artist, genre)
# pair and drop repeated pairs.
# The pattern is a raw string so that "\s" reaches Spark's regex engine
# verbatim instead of being an invalid Python escape sequence (which raises a
# DeprecationWarning / SyntaxWarning on recent Python versions).
artists_feature_genres = (
    df_feature_bronze
    .select("artist", "genre")
    .withColumn("genre", explode(split(col("genre"), r",\s*")))
    .distinct()
)

In [13]:
# Exploded (artist, genre) pairs restricted to a single artist.
df_Drake_feature = (
    artists_feature_genres
    .select("artist", "genre")
    .where(col("artist") == "Drake")
)

In [14]:
# Compare row counts: the silver artist/genre table versus the distinct
# (artist, genre) pairs exploded from the bronze feature table.
print("df ở genres artists",df_genres.count())
print("genres ở Feature", artists_feature_genres.count())

df ở genres artists 5459
genres ở Feature 1507


In [18]:
# Print the schema of the filtered (artist, genre) frame.
df_Drake_feature.printSchema()

root
 |-- artist: string (nullable = true)
 |-- genre: string (nullable = false)



<h1> Prepare Gold Layer </h1>

In [22]:
# Silver-layer Parquet inputs used to build the gold tables.
artists = "hdfs://namenode:8020/silver_layer/silver_artists.parquet"
albums = "hdfs://namenode:8020/silver_layer/silver_album.parquet"
artists_genres = "hdfs://namenode:8020/silver_layer/silver_artists_genres.parquet"
feature_music = "hdfs://namenode:8020/silver_layer/silver_feature_music.parquet"
tracks = "hdfs://namenode:8020/silver_layer/tracks_data.parquet"

# NOTE(review): getOrCreate() returns an already-running session when one
# exists (one is created earlier in this notebook), so this appName may not
# take effect — confirm.
spark = SparkSession.builder.appName("Gold_Layer_Processing").getOrCreate()

In [23]:
# Load each silver-layer table into its own DataFrame.
artists_df = spark.read.parquet(artists)
albums_df = spark.read.parquet(albums)
artists_genres_df = spark.read.parquet(artists_genres)
feature_music_df = spark.read.parquet(feature_music)
tracks_df = spark.read.parquet(tracks)

In [24]:
# Preview two records from every silver table, one field per line and with
# values left untruncated.
artists_df.show(2, vertical=True, truncate=False)
albums_df.show(2, vertical=True, truncate=False)
artists_genres_df.show(2, vertical=True, truncate=False)
feature_music_df.show(2, vertical=True, truncate=False)
tracks_df.show(2, vertical=True, truncate=False)

-RECORD 0---------------------------------------------------------------------------------
 id                    | 00FQb4jTyendYWaN8pK0wa                                           
 name                  | Lana Del Rey                                                     
 popularity            | 91                                                               
 type                  | artist                                                           
 uri                   | spotify:artist:00FQb4jTyendYWaN8pK0wa                            
 artist_id             | 00FQb4jTyendYWaN8pK0wa                                           
 external_urls_artists | https://open.spotify.com/artist/00FQb4jTyendYWaN8pK0wa           
 followers number      | 45985411                                                         
 images_artists        | https://i.scdn.co/image/ab6761610000e5ebb99cacf8acd5378206767261 
-RECORD 1---------------------------------------------------------------------------------

In [27]:
# Show the first two artist records, one field per line, untruncated.
artists_df.show(2, vertical=True, truncate=False)

-RECORD 0---------------------------------------------------------------------------------
 id                    | 00FQb4jTyendYWaN8pK0wa                                           
 name                  | Lana Del Rey                                                     
 popularity            | 91                                                               
 type                  | artist                                                           
 uri                   | spotify:artist:00FQb4jTyendYWaN8pK0wa                            
 artist_id             | 00FQb4jTyendYWaN8pK0wa                                           
 external_urls_artists | https://open.spotify.com/artist/00FQb4jTyendYWaN8pK0wa           
 followers number      | 45985411                                                         
 images_artists        | https://i.scdn.co/image/ab6761610000e5ebb99cacf8acd5378206767261 
-RECORD 1---------------------------------------------------------------------------------

<h1> gold_track_metadata </h1>

In [28]:
# printSchema() writes the schema to stdout itself and returns None, so
# wrapping it in print(label, ...) emitted the schema first and then
# "<label> None". Print the label, then let printSchema() emit the schema.
for label, frame in (
    ("artists_DataFrame Schema ", artists_df),
    ("albums_DataFrame Schema ", albums_df),
    ("artists_genres_DataFrame Schema ", artists_genres_df),
    ("feature_music_df_DataFrame Schema ", feature_music_df),
    ("tracks_df_DataFrame Schema ", tracks_df),
):
    print(label)
    frame.printSchema()


root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- uri: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- external_urls_artists: string (nullable = true)
 |-- followers number: integer (nullable = true)
 |-- images_artists: string (nullable = true)

artists_DataFrame Schema  None
root
 |-- album_type: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- release_date: string (nullable = true)
 |-- release_date_precision: string (nullable = true)
 |-- total_tracks: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- uri: string (nullable = true)
 |-- album_id: string (nullable = true)
 |-- external_urls_albums: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- image_album: string (nullable = tru

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_set, concat_ws


def gold_track_metadata(spark: SparkSession):
    """Build and persist the gold ``gold_track_metadata`` table.

    Reads the silver tracks, albums, artists and artist-genre tables, collapses
    each artist's genres into a single comma-separated string, inner-joins
    everything onto the tracks, keeps one row per ``track_id`` and overwrites
    the gold Parquet output.

    Args:
        spark: Active SparkSession used to read the silver inputs.

    Returns:
        The de-duplicated DataFrame that was written, so callers can inspect it
        (``result_df = gold_track_metadata(spark)``) instead of relying on a
        name that only exists inside this function.
    """
    tracks_df = spark.read.parquet("hdfs://namenode:8020/silver_layer/tracks_data.parquet")
    albums_df = spark.read.parquet("hdfs://namenode:8020/silver_layer/silver_album.parquet")
    artists_df = spark.read.parquet("hdfs://namenode:8020/silver_layer/silver_artists.parquet")
    artists_genres_df = spark.read.parquet("hdfs://namenode:8020/silver_layer/silver_artists_genres.parquet")

    # One row per artist: the distinct genres joined into "a, b, c".
    artists_genres_df = (
        artists_genres_df.groupBy("artist_id")
        .agg(concat_ws(", ", collect_set("genre")).alias("genres"))
    )

    # Rename the generic columns (id, name, popularity, type, uri) so they stay
    # distinguishable from the album and track columns after the joins.
    artists_df = artists_df.selectExpr(
        "artist_id",
        "id as artist_spotify_id",
        "name as artist_name",
        "popularity as popularity_artist",
        "`followers number`",
        "type as type_artist",
        "uri as uri_artist",
        "external_urls_artists",
        "images_artists"
    )

    albums_df = albums_df.selectExpr(
        "album_id",
        "artist_id as album_artist_id",
        "album_name",
        "album_type",
        "release_date",
        "label",
        "popularity as popularity_album",
        "image_album",
        "external_urls_albums"
    )

    tracks_df = tracks_df.selectExpr(
        "track_id",
        "album_id as track_album_id",
        "artist_id as track_artist_id",
        "name as track_name",
        "duration_ms",
        "explicit",
        "popularity as popularity_track",
        "uri as uri_track",
        "external_urls_tracks"
    )

    # Inner joins: a track is kept only when its album, its artist and that
    # artist's genres are all present.
    df = (
        tracks_df
        .join(albums_df, tracks_df.track_album_id == albums_df.album_id, how="inner")
        .join(artists_df, tracks_df.track_artist_id == artists_df.artist_id, how="inner")
        .join(artists_genres_df, tracks_df.track_artist_id == artists_genres_df.artist_id, how="inner")
    )

    # dropDuplicates keeps a single (arbitrary) row for each track_id.
    result_df = df.select(
        "track_id", "track_name", "duration_ms", "explicit", "popularity_track", "uri_track", "external_urls_tracks",
        "album_name", "album_type", "release_date", "label", "popularity_album", "image_album", "external_urls_albums",
        "artist_name", "popularity_artist", "followers number", "artist_spotify_id", "type_artist", "uri_artist", "external_urls_artists", "images_artists",
        "genres"
    ).dropDuplicates(["track_id"])

    result_df.write.mode("overwrite").parquet("hdfs://namenode:8020/gold_layer/gold_track_metadata.parquet")

    # Hand the frame back; previously the function returned None and the
    # result was unreachable from the calling cell.
    return result_df


[Row(track_id='0007aPK8VmXN4ycL2OcBFa', track_name='Bodhisattva - Live', duration_ms=295266, explicit='false', popularity_track=23, uri_track='open.spotify.com/track/0007aPK8VmXN4ycL2OcBFa', external_urls_tracks='https://open.spotify.com/track/0007aPK8VmXN4ycL2OcBFa', album_name='Live in Amsterdam (25th Anniversary)', album_type='album', release_date='2003-03-25', label='earMUSIC Classics', popularity_album=40, image_album='https://i.scdn.co/image/ab67616d0000b2732a4acfa030224c7b50b4988f', external_urls_albums='https://open.spotify.com/album/49R9ye71gtBwAbiHMcAunV', artist_name='TOTO', popularity_artist=76, followers number=3397853, artist_spotify_id='0PFtn5NtBbbUNbU9EAmIWF', type_artist='artist', uri_artist='spotify:artist:0PFtn5NtBbbUNbU9EAmIWF', external_urls_artists='https://open.spotify.com/artist/0PFtn5NtBbbUNbU9EAmIWF', images_artists='https://i.scdn.co/image/ab6761610000e5eba59a5bcab211f964fe9bfb06', genres='soft rock'),
 Row(track_id='000N4CJL8IjQ0f2I4grgBO', track_name='Girl 

In [40]:
# Convert the Spark result into a pandas DataFrame held in driver memory.
# NOTE(review): no visible cell assigns `result_df` at notebook scope (it is a
# local inside gold_track_metadata) — confirm where it comes from before a
# fresh Restart & Run All.
pandas_df = result_df.toPandas()

In [41]:
# Display the first five rows of the pandas copy.
pandas_df.head(5)

Unnamed: 0,track_id,track_name,duration_ms,explicit,popularity_track,uri_track,external_urls_tracks,album_name,album_type,release_date,...,external_urls_albums,artist_name,popularity_artist,followers number,artist_spotify_id,type_artist,uri_artist,external_urls_artists,images_artists,genres
0,0007aPK8VmXN4ycL2OcBFa,Bodhisattva - Live,295266,False,23,open.spotify.com/track/0007aPK8VmXN4ycL2OcBFa,https://open.spotify.com/track/0007aPK8VmXN4yc...,Live in Amsterdam (25th Anniversary),album,2003-03-25,...,https://open.spotify.com/album/49R9ye71gtBwAbi...,TOTO,76,3397853,0PFtn5NtBbbUNbU9EAmIWF,artist,spotify:artist:0PFtn5NtBbbUNbU9EAmIWF,https://open.spotify.com/artist/0PFtn5NtBbbUNb...,https://i.scdn.co/image/ab6761610000e5eba59a5b...,soft rock
1,000N4CJL8IjQ0f2I4grgBO,Girl Next Door,180682,False,58,open.spotify.com/track/000N4CJL8IjQ0f2I4grgBO,https://open.spotify.com/track/000N4CJL8IjQ0f2...,Girl Next Door,single,2023-05-11,...,https://open.spotify.com/album/1FLS0YAOPY3Mcb0...,Tyla,78,2844698,3SozjO3Lat463tQICI9LcE,artist,spotify:artist:3SozjO3Lat463tQICI9LcE,https://open.spotify.com/artist/3SozjO3Lat463t...,https://i.scdn.co/image/ab6761610000e5eba1fca1...,afrobeats
2,001c1daR1pFdqfpe1vAWzh,It's Only Rock 'N' Roll (But I Like It),322346,False,43,open.spotify.com/track/001c1daR1pFdqfpe1vAWzh,https://open.spotify.com/track/001c1daR1pFdqfp...,It's Only Rock 'N' Roll (But I Like It),single,2022-11-04,...,https://open.spotify.com/album/1GCjwU2CejKwooZ...,Brothers Osborne,64,1266529,39NR3AUhpbbqKM33vWn2fp,artist,spotify:artist:39NR3AUhpbbqKM33vWn2fp,https://open.spotify.com/artist/39NR3AUhpbbqKM...,https://i.scdn.co/image/ab6761610000e5eb3d875b...,"country, country rock"
3,004t0MwHjdcKZSJ2PAICy7,Ángela,171040,False,20,open.spotify.com/track/004t0MwHjdcKZSJ2PAICy7,https://open.spotify.com/track/004t0MwHjdcKZSJ...,"12 Mejores Éxitos con Los Caminantes, Vol. 3",album,2025-04-01,...,https://open.spotify.com/album/10cIakFsWgQkzsP...,Los Caminantes,69,2318753,6ziEnj8UydSg8dr68C3aut,artist,spotify:artist:6ziEnj8UydSg8dr68C3aut,https://open.spotify.com/artist/6ziEnj8UydSg8d...,https://i.scdn.co/image/7aa2d60ae801f2a8f9a3c5...,"cumbia, grupera"
4,004vyNAXRqeEp6TpXmSkky,Something About You,293973,False,17,open.spotify.com/track/004vyNAXRqeEp6TpXmSkky,https://open.spotify.com/track/004vyNAXRqeEp6T...,Coming up to Breathe,album,2006-04-25,...,https://open.spotify.com/album/4UlF8KoLFaLRQQu...,MercyMe,65,1977839,6APm8EjxOHSYM5B4i3vT3q,artist,spotify:artist:6APm8EjxOHSYM5B4i3vT3q,https://open.spotify.com/artist/6APm8EjxOHSYM5...,https://i.scdn.co/image/ab6761610000e5ebd9f0ad...,"christian pop, ccm, worship, christian alterna..."


In [42]:
# Non-null value count per column; columns with a lower count contain nulls.
pandas_df.count()

track_id                 304177
track_name               304177
duration_ms              304177
explicit                 304177
popularity_track         304177
uri_track                304177
external_urls_tracks     304177
album_name               304177
album_type               304177
release_date             304177
label                    304177
popularity_album         304177
image_album              304151
external_urls_albums     304177
artist_name              304177
popularity_artist        304177
followers number         304177
artist_spotify_id        304177
type_artist              304177
uri_artist               304177
external_urls_artists    304177
images_artists           304176
genres                   304177
dtype: int64

In [31]:
# Print the schema of the silver feature table.
feature_music_df.printSchema()

root
 |-- acousticness: double (nullable = true)
 |-- artist: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- duration_ms: long (nullable = true)
 |-- energy: double (nullable = true)
 |-- explicit: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- liveness: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- song: string (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- track: string (nullable = true)



In [52]:
# Silver-layer inputs and gold-layer output for the audio-feature matrix.
feature_path = "hdfs://namenode:8020/silver_layer/silver_feature_music.parquet"
tracks_path = "hdfs://namenode:8020/silver_layer/tracks_data.parquet"
output_path = "hdfs://namenode:8020/gold_layer/gold_feature_matrix.parquet"

# Audio features; "track" is renamed so it lines up with the track-name column
# used on the tracks side of the join.
features_df = (
    spark.read.parquet(feature_path)
    .withColumnRenamed("track", "track_name")
)

# Track ids with their names and artist ids.
tracks_df = (
    spark.read.parquet(tracks_path)
    .selectExpr("track_id", "name as track_name", "artist_id as track_artist_id")
)

# Artist id -> artist name lookup.
artists_df = (
    spark.read.parquet("hdfs://namenode:8020/silver_layer/silver_artists.parquet")
    .selectExpr("artist_id", "name as artist_name")
)

# Attach the artist name to every track; the left join keeps tracks that have
# no matching artist row.
artist_id_matches = tracks_df["track_artist_id"] == artists_df["artist_id"]
tracks_joined = (
    tracks_df
    .join(artists_df, artist_id_matches, how="left")
    .select("track_id", "track_name", "artist_name")
)

# Match feature rows to tracks on the (track name, artist name) pair.
title_matches = features_df["track_name"] == tracks_joined["track_name"]
artist_matches = features_df["artist"] == tracks_joined["artist_name"]
df_joined = features_df.join(
    tracks_joined,
    on=title_matches & artist_matches,
    how="inner",
)

# Track id plus the numeric audio features kept in the gold matrix.
feature_cols = [
    'track_id',
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration_ms',
]

# One row per track id, restricted to the feature columns.
final_df = (
    df_joined
    .select(feature_cols)
    .dropDuplicates(["track_id"])
)

final_df.write.mode("overwrite").parquet(output_path)

final_df.show(5)


+--------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+-----------+
|            track_id|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|
+--------------------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+-----------+
|0W4NhJhcqKCqEP2GI...|       0.688| 0.519|  -4.285|     0.0283|       0.064|             0.0|     0.1|  0.318|116.714|     255333|
|0iGR60UzkFoyAZ1uN...|       0.495| 0.894|  -4.814|     0.0441|     0.00453|         5.96E-4|   0.103|  0.213| 126.03|     239894|
|0qcjuYtMWhBjXg0Xw...|        0.94| 0.633|   -3.56|     0.0467|      0.0581|         4.04E-5|   0.281|  0.962|121.003|     248680|
|232puZVLpayvhEMel...|       0.729| 0.675|  -6.003|     0.0312|       0.175|         1.58E-6|    0.55|  0.779|119.968|     245866|
|2aibwv5hGXSgw7Yru...|       0.427|   0.9|  -3.674|     0.0499|       0.116|       

In [55]:
# Number of distinct track ids in the feature matrix.
final_df.select("track_id").distinct().count()

3981

In [45]:
# Reload the silver feature table on its own to check its size.
feature_path = "hdfs://namenode:8020/silver_layer/silver_feature_music.parquet"
df_feature = spark.read.parquet(feature_path)

In [46]:
# Row count of the frame currently bound to df_feature.
df_feature.count()

1941

In [47]:
# Load the gold track search index.
# NOTE(review): this rebinds df_feature, which held the silver feature table in
# the preceding cells, to a different dataset.
search_matrix  = "hdfs://namenode:8020/gold_layer/gold_track_search_index.parquet"
df_feature = spark.read.parquet(search_matrix)

In [48]:
# Row count of the frame currently bound to df_feature.
df_feature.count()

1758

In [None]:
# Warehouse-layer Parquet path for the track dashboard table.
dashboard_warehouse = "hdfs://namenode:8020/warehouse_layer/warehouse_track_dashboard.parquet"
