# Bronze Layer

- Bronze Layer - Automated Ingestion Script
- Reads the streaming history files from **Unity Catalog Volumes**, pulls recently played tracks and track/artist/album metadata from the Spotify API, and writes each as a **Delta Table** in the **Bronze Layer**

In [0]:
# Restart the notebook's Python process before any other cell runs
# (presumably so that libraries installed for this session become importable — confirm).
dbutils.library.restartPython()

## Initialization

In [0]:
from bronze_config import INGESTION_CONFIG, metadata_configs
import json
import pyspark.sql.functions as F
from pyspark.sql.types import *
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
from datetime import datetime

## Spotify API Authentication

In [0]:
# Pull variables from a .env file into the process environment. SpotifyOAuth is
# given no client credentials explicitly below, so it presumably reads them from
# the environment — confirm.
load_dotenv()
cache_path = "/Workspace/Users/pg52694@alunos.uminho.pt/spotify-data-streaming-project/.spotify_token_cache"
scope = "user-read-recently-played user-read-playback-state user-read-currently-playing"

# open_browser=False: the authorization URL is printed instead of opened.
auth_manager = SpotifyOAuth(scope=scope, open_browser=False, cache_path=cache_path)
sp = spotipy.Spotify(auth_manager=auth_manager)

# --- Token check ---
# Reuse the token stored at cache_path when there is one; otherwise walk the
# user through the manual authorization flow.
token_info = auth_manager.get_cached_token()

if not token_info:
    authorize_url = auth_manager.get_authorize_url()
    print(f"\n1. Open this link: {authorize_url}")

    redirected_url = input("2. Paste the full URL here after the redirect: ")
    auth_code = auth_manager.parse_response_code(redirected_url)

    try:
        token_info = auth_manager.get_access_token(auth_code, as_dict=False)
    except Exception as exc:
        # The failure is only reported; execution continues with no token.
        print(f"‚ùå Error obtaining token: {exc}")
    else:
        print("‚úÖ Authentication successful and token cached!")

## Read from JSON files and the Spotify API to write Bronze Tables

In [0]:
def _build_recently_played_schema():
    """Build the Spark schema applied to each recently-played item.

    The result is passed as ``schema=`` when the items returned by
    ``sp.current_user_recently_played`` are turned into a DataFrame.
    Every field is declared nullable.
    """

    def string(name):
        return StructField(name, StringType(), True)

    def integer(name):
        return StructField(name, IntegerType(), True)

    def boolean(name):
        return StructField(name, BooleanType(), True)

    def string_list(name):
        return StructField(name, ArrayType(StringType()), True)

    def struct(name, *fields):
        return StructField(name, StructType(list(fields)), True)

    def struct_list(name, *fields):
        return StructField(name, ArrayType(StructType(list(fields))), True)

    def external_urls():
        # Object with a single "spotify" string field.
        return struct("external_urls", string("spotify"))

    def artists():
        # In this schema artists carry only these four string fields.
        return struct_list(
            "artists",
            string("id"),
            string("name"),
            string("type"),
            string("uri"),
        )

    context = struct(
        "context",
        string("type"),
        string("href"),
        string("uri"),
        external_urls(),
    )

    album = struct(
        "album",
        string("album_type"),
        integer("total_tracks"),
        string_list("available_markets"),
        external_urls(),
        string("href"),
        string("id"),
        struct_list("images", string("url"), integer("height"), integer("width")),
        string("name"),
        string("release_date"),
        string("release_date_precision"),
        string("type"),
        string("uri"),
        artists(),
    )

    track = struct(
        "track",
        album,
        artists(),
        string_list("available_markets"),
        integer("disc_number"),
        integer("duration_ms"),
        boolean("explicit"),
        string("id"),
        string("name"),
        integer("popularity"),
        string("preview_url"),
        integer("track_number"),
        string("type"),
        string("uri"),
        boolean("is_local"),
    )

    # played_at is kept as a plain string in this layer.
    return StructType([string("played_at"), context, track])


recently_played_schema = _build_recently_played_schema()

In [0]:
# Ingest every configured source into its Bronze Delta table.
# Each INGESTION_CONFIG entry is read for "table", "format" and "mode"
# (plus "path" for JSON sources).
for item in INGESTION_CONFIG:
    print(f"Processing {item['table']}...")

    if item["format"] == "json":
        # multiLine lets a JSON document span several lines
        # (presumably the export files are pretty-printed — confirm).
        df = spark.read.option("multiLine", "True").json(item["path"])

    elif item["format"] == "api_call":
        response = sp.current_user_recently_played(limit=50)
        items = response["items"]
        if not items:
            # Nothing returned by the API: skip the write for this entry.
            continue
        df = spark.createDataFrame(items, schema=recently_played_schema)

    else:
        # Without this branch `df` would be undefined on the first iteration,
        # or still hold the previous source's data, which would then be
        # written to this entry's table.
        raise ValueError(
            f"Unsupported ingestion format {item['format']!r} for table {item['table']}"
        )

    # Stamp each row with the ingestion time.
    df_final = df.withColumn("processed_at", F.current_timestamp())

    (
        df_final.write.format("delta")
        .mode(item["mode"])
        .option("mergeSchema", "true")
        .saveAsTable(item["table"])
    )

print("Bronze Tables saved with success!")

## Get Metadata

In [0]:
# Load the two Bronze tables that reference tracks.
df_streaming_history = spark.read.table("workspace.bronze.spotify_streaming_history_raw")
df_recently_played = spark.read.table("workspace.bronze.spotify_recently_played_raw")

# 1. Recently played: the track ID is the nested field `track.id`.
#    Keep the non-null IDs, once each.
df_recently_played_track_ids = (
    df_recently_played
    .select(F.col("track.id").alias("id"))
    .where(F.col("id").isNotNull())
    .distinct()
)

# 2. Streaming history: the URI has the form spotify:track:ID, so the ID is
#    the third ":"-separated element.
uri_parts = F.split(F.col("spotify_track_uri"), ":")
df_streaming_history_track_ids = (
    df_streaming_history
    .where(F.col("spotify_track_uri").isNotNull())
    .select(F.element_at(uri_parts, 3).alias("id"))
    .distinct()
)

# 3. Merge both sources and de-duplicate across them.
df_track_ids = df_streaming_history_track_ids.union(df_recently_played_track_ids).distinct()

# 4. Bring the IDs to the driver as a plain Python list.
track_ids_list = [row["id"] for row in df_track_ids.collect()]

print(f"Total Unique Track IDs: {len(track_ids_list)}")

In [0]:
def save_metadata_to_bronze(data_list, table_name, schema):
    """Append raw Spotify API objects to a Bronze Delta table.

    Each non-empty item is serialized to a JSON string, parsed back with
    ``schema`` through ``from_json``, stamped with ``processed_at`` and
    appended to ``table_name`` with ``mergeSchema`` enabled.

    Args:
        data_list: JSON-serializable API objects; falsy entries such as
            ``None`` are ignored. May itself be empty or ``None``.
        table_name: Target table name handed to ``saveAsTable``.
        schema: Spark schema handed to ``from_json``.

    Returns:
        None. Nothing is written when no non-empty item is left.
    """
    # 1. Serialize the raw API objects. Falsy placeholders are dropped *before*
    # deciding whether there is work to do: a list holding only None would
    # otherwise reach createDataFrame with zero rows, from which no schema
    # can be inferred.
    json_rows = [(json.dumps(item),) for item in (data_list or []) if item]
    if not json_rows:
        return

    df_raw = spark.createDataFrame(json_rows, ["json_string"])

    # 2. Parse the JSON string with the supplied schema and flatten the
    # top-level fields into columns.
    df_parsed = df_raw.select(
        F.from_json(F.col("json_string"), schema).alias("data")
    ).select("data.*")

    # Convenience ID arrays: track IDs from a nested `tracks.items` (present in
    # album payloads) and artist IDs from `artists`.
    if "tracks" in df_parsed.columns:
        df_parsed = df_parsed.withColumn("tracks_ids", F.col("tracks.items.id"))

    if "artists" in df_parsed.columns:
        df_parsed = df_parsed.withColumn("artists_ids", F.col("artists.id"))

    # Adds the ingestion time, replacing any column of the same name.
    df_final = df_parsed.withColumn("processed_at", F.current_timestamp())

    df_final.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable(table_name)
    # Report the rows actually written, not the unfiltered input length.
    print(f"‚úÖ {len(json_rows)} records added to {table_name}")

In [0]:
def _build_track_schema():
    """Build the Spark schema used to parse track objects.

    The result is handed to ``save_metadata_to_bronze`` together with the
    objects returned by ``sp.tracks``. Every field is declared nullable.
    """

    def string(name):
        return StructField(name, StringType(), True)

    def integer(name):
        return StructField(name, IntegerType(), True)

    def boolean(name):
        return StructField(name, BooleanType(), True)

    def string_list(name):
        return StructField(name, ArrayType(StringType()), True)

    def struct(name, *fields):
        return StructField(name, StructType(list(fields)), True)

    def struct_list(name, *fields):
        return StructField(name, ArrayType(StructType(list(fields))), True)

    def external_urls():
        # Object with a single "spotify" string field.
        return struct("external_urls", string("spotify"))

    def artists():
        # Same artist layout for the album's artists and the track's artists.
        return struct_list(
            "artists",
            external_urls(),
            string("href"),
            string("id"),
            string("name"),
            string("type"),
            string("uri"),
        )

    # --- Album info (object) ---
    album = struct(
        "album",
        string("album_type"),
        integer("total_tracks"),
        string_list("available_markets"),
        external_urls(),
        string("href"),
        string("id"),
        struct_list("images", string("url"), integer("height"), integer("width")),
        string("name"),
        string("release_date"),
        string("release_date_precision"),
        string("type"),
        string("uri"),
        artists(),
    )

    linked_from = struct(
        "linked_from",
        external_urls(),
        string("href"),
        string("id"),
        string("type"),
        string("uri"),
    )

    return StructType([
        album,
        # --- Track artists (array) ---
        artists(),
        # --- Track details ---
        string_list("available_markets"),
        integer("disc_number"),
        integer("duration_ms"),
        boolean("explicit"),
        struct("external_ids", string("isrc"), string("ean"), string("upc")),
        external_urls(),
        string("href"),
        string("id"),
        boolean("is_playable"),
        linked_from,
        struct("restrictions", string("reason")),
        string("name"),
        integer("popularity"),
        string("preview_url"),
        integer("track_number"),
        string("type"),
        string("uri"),
        boolean("is_local"),
    ])


track_schema = _build_track_schema()

In [0]:
# Spark schema used to parse artist objects; every field is nullable.
artist_schema = (
    StructType()
    # NOTE(review): declared as a plain string here, while the other schemas in
    # this notebook declare `external_urls` as a struct with a "spotify" field —
    # confirm this is intentional before changing it (an existing table may
    # already store this column as a string).
    .add("external_urls", StringType(), True)
    .add(
        "followers",
        StructType()
        .add("href", StringType(), True)
        .add("total", IntegerType(), True),
        True,
    )
    .add("genres", ArrayType(StringType()), True)
    .add("href", StringType(), True)
    .add("id", StringType(), True)
    .add(
        "images",
        ArrayType(
            StructType()
            .add("url", StringType(), True)
            .add("height", IntegerType(), True)
            .add("width", IntegerType(), True)
        ),
        True,
    )
    .add("name", StringType(), True)
    .add("popularity", IntegerType(), True)
    .add("type", StringType(), True)
    .add("uri", StringType(), True)
    # NOTE(review): save_metadata_to_bronze overwrites a column of this name
    # with current_timestamp(), so this field looks redundant — confirm.
    .add("processed_at", TimestampType(), True)
)

In [0]:
def _build_album_schema():
    """Build the Spark schema used to parse album objects.

    The result is handed to ``save_metadata_to_bronze`` together with the
    objects returned by ``sp.albums``. Every field is declared nullable.
    """

    def string(name):
        return StructField(name, StringType(), True)

    def integer(name):
        return StructField(name, IntegerType(), True)

    def boolean(name):
        return StructField(name, BooleanType(), True)

    def string_list(name):
        return StructField(name, ArrayType(StringType()), True)

    def struct(name, *fields):
        return StructField(name, StructType(list(fields)), True)

    def struct_list(name, *fields):
        return StructField(name, ArrayType(StructType(list(fields))), True)

    def external_urls():
        # Object with a single "spotify" string field.
        return struct("external_urls", string("spotify"))

    def restrictions():
        return struct("restrictions", string("reason"))

    def artists():
        # `artists` is a LIST of objects, both on the album and on each track.
        return struct_list(
            "artists",
            external_urls(),
            string("href"),
            string("id"),
            string("name"),
            string("type"),
            string("uri"),
        )

    # One entry of tracks.items. Unlike the album itself, no `href` field is
    # declared for a track entry.
    track_items = struct_list(
        "items",
        artists(),
        string_list("available_markets"),
        integer("disc_number"),
        integer("duration_ms"),
        boolean("explicit"),
        external_urls(),
        string("id"),
        boolean("is_playable"),
        struct(
            "linked_from",
            external_urls(),
            string("href"),
            string("id"),
            string("type"),
            string("uri"),
        ),
        restrictions(),
        string("name"),
        string("preview_url"),
        integer("track_number"),
        string("type"),
        string("uri"),
        boolean("is_local"),
    )

    # Paging wrapper around the album's tracks; `items` is a LIST.
    tracks = struct(
        "tracks",
        string("href"),
        integer("limit"),
        string("next"),
        integer("offset"),
        string("previous"),
        integer("total"),
        track_items,
    )

    return StructType([
        string("album_type"),
        integer("total_tracks"),
        string_list("available_markets"),
        external_urls(),
        string("href"),
        string("id"),
        struct_list("images", string("url"), integer("height"), integer("width")),
        string("name"),
        string("release_date"),
        string("release_date_precision"),
        restrictions(),
        string("type"),
        string("uri"),
        artists(),
        tracks,
        struct_list("copyrights", string("text"), string("type")),
        struct("external_ids", string("isrc"), string("ean"), string("upc")),
        string_list("genres"),
        string("label"),
        integer("popularity"),
    ])


album_schema = _build_album_schema()

In [0]:
# 1. Find out which tracks are already stored, so the API is not called for
# them again. Any failure while reading the table is treated as "no table yet"
# and every collected ID is considered new.
try:
    # Bind the table name first: nesting double quotes inside a double-quoted
    # f-string is a syntax error before Python 3.12.
    tracks_table = metadata_configs["track"]["table"]
    print(f"Verifying the existence of {tracks_table}...")
    df_existing = spark.read.table(tracks_table).select("id").distinct()
    existing_ids = [row.id for row in df_existing.collect()]
    # Set lookup keeps the filter linear instead of scanning the list per ID.
    existing_id_set = set(existing_ids)
    new_track_ids = [tid for tid in track_ids_list if tid not in existing_id_set]
    print(f"‚úÖ Table  found. Filtered {len(existing_ids)} IDs that already exist.")
except Exception as e:
    print(f"‚ö†Ô∏è Table not found or error reading (treating as new). Error: {str(e)[:100]}")
    new_track_ids = track_ids_list

In [0]:
# 2. Fetch metadata only for the track IDs that are not stored yet.
all_entities_data = {"track": []}
artist_ids_set = set()
album_ids_set = set()

tracks_config = metadata_configs["track"]
target_table = tracks_config["table"]

if new_track_ids:
    chunk_size = tracks_config["chunk"]
    print(f"Starting API calls for {len(new_track_ids)} new tracks...")

    for i in range(0, len(new_track_ids), chunk_size):
        chunk = new_track_ids[i : i + chunk_size]
        try:
            res = sp.tracks(chunk)
            tracks_chunk = res[tracks_config["key"]]

            for track in tracks_chunk:
                # Entries can be empty (the artist/album loop and the save helper
                # in this notebook already filter falsy items). Skipping them here
                # keeps one bad entry from raising and discarding the rest of
                # the batch.
                if not track:
                    continue

                all_entities_data["track"].append(track)

                # Remember the album and artist IDs for the artist/album phase;
                # missing IDs are left out so they are never sent to the API.
                album_id = (track.get("album") or {}).get("id")
                if album_id:
                    album_ids_set.add(album_id)
                for artist in track.get("artists") or []:
                    artist_id = artist.get("id") if artist else None
                    if artist_id:
                        artist_ids_set.add(artist_id)
        except Exception as e:
            # A failed batch is reported and skipped; later batches still run.
            print(f"Error on batch {i} of tracks: {e}")

    # 3. Write only when something new was fetched.
    if all_entities_data["track"]:
        save_metadata_to_bronze(all_entities_data["track"], target_table, track_schema)
else:
    print("‚ú® All trails are already documented in the Bronze table. Nothing to do.")

In [0]:
# --- PHASE 2: FETCH ARTISTS AND ALBUMS (CONFIG-DRIVEN BATCHES) ---
# IDs discovered while fetching tracks in phase 1, keyed by entity name.
id_map = {
    "artist": list(artist_ids_set),
    "album": list(album_ids_set)
}

# NOTE(review): not read anywhere in the visible notebook — confirm before removing.
flag = True

# Per-entity API call and parsing schema.
fetchers = {"artist": sp.artists, "album": sp.albums}
schemas = {"artist": artist_schema, "album": album_schema}

for entity in ("artist", "album"):
    conf = metadata_configs[entity]
    ids_to_process = id_map[entity]
    chunk_size = conf["chunk"]

    print(f"Processing {len(ids_to_process)} {entity}s...")

    entity_results = []
    for i in range(0, len(ids_to_process), chunk_size):
        chunk = ids_to_process[i : i + chunk_size]
        try:
            res = fetchers[entity](chunk)
            # Keep the raw, non-empty items; save_metadata_to_bronze does the parsing.
            entity_results.extend(filter(None, res[conf["key"]]))
        except Exception as e:
            # A failed batch is reported and skipped; later batches still run.
            print(f"Error on batch {i}: {e}")

    # NOTE(review): unlike the track phase, IDs already present in the target
    # table are not filtered out before this append — confirm that is intended.
    save_metadata_to_bronze(entity_results, conf["table"], schemas[entity])

print("\nüöÄ Metadata pipeline finished with success!")

## Check Bronze Tables

In [0]:
%sql
-- Inspect the raw streaming-history Bronze table.
SELECT *
FROM workspace.bronze.spotify_streaming_history_raw

In [0]:
%sql
-- Inspect the raw recently-played Bronze table.
SELECT *
FROM workspace.bronze.spotify_recently_played_raw

In [0]:
%sql
-- Inspect the raw track-metadata Bronze table.
SELECT *
FROM workspace.bronze.spotify_tracks_raw

In [0]:
%sql
-- Inspect the raw artist-metadata Bronze table.
SELECT *
FROM workspace.bronze.spotify_artists_raw

In [0]:
%sql
-- Inspect the raw album-metadata Bronze table.
SELECT *
FROM workspace.bronze.spotify_albums_raw