# Bronze Layer

- Bronze Layer - Automated Ingestion Script
- Reads all streaming history files from **Unity Catalog Volumes**, plus the recently played tracks returned by the Spotify API, and appends them to **Delta Tables** in the **Bronze Layer**

## Initialization

In [0]:
from bronze_config import INGESTION_CONFIG
import json
import pyspark.sql.functions as F
from pyspark.sql.types import *
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv

In [0]:
def get_spotify_api_data():
    """Fetch the current user's recently played tracks from the Spotify Web API.

    Calls ``load_dotenv()`` (presumably to expose the Spotify client
    credentials as environment variables -- confirm against the ``.env``
    file), builds a ``SpotifyOAuth`` manager backed by a token cache file and,
    when no cached token is found, walks the user through a manual
    copy/paste authorization: the authorize URL is printed and the redirect
    URL is read back with ``input()``.

    Returns:
        The ``items`` list of the recently-played response (requested with
        ``limit=50``), or an empty list when exchanging the authorization
        code for a token fails.
    """
    load_dotenv()

    oauth = SpotifyOAuth(
        scope="user-read-recently-played user-read-playback-state user-read-currently-playing",
        open_browser=False,
        cache_path="/Workspace/Users/pg52694@alunos.uminho.pt/spotify-data-streaming-project/.spotify_token_cache",
    )
    client = spotipy.Spotify(auth_manager=oauth)

    # --- Token check: only run the interactive flow when nothing is cached ---
    if not oauth.get_cached_token():
        print(f"\n1. Abre este link: {oauth.get_authorize_url()}")

        redirected_url = input("2. Cola aqui a URL completa após o redirecionamento: ")

        # Let the auth manager extract the authorization code from the pasted URL.
        auth_code = oauth.parse_response_code(redirected_url)

        try:
            # Exchange the authorization code for an access token; the return
            # value is not needed here because the client reuses `oauth`.
            oauth.get_access_token(auth_code, as_dict=False)
            print("✅ Autenticação realizada e token guardado na cache!")
        except Exception as exc:
            print(f"❌ Erro ao obter token: {exc}")
            # Empty list so the notebook's ingestion loop can skip this source.
            return []

    return client.current_user_recently_played(limit=50)['items']

In [0]:
# Explicit Spark schema for the flattened Spotify "recently played" records.
# Every column is nullable; the column order matches the table below.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        # Track attributes
        ("track_id", StringType()),
        ("track_name", StringType()),
        ("track_uri", StringType()),
        ("track_duration_ms", LongType()),
        ("track_popularity", IntegerType()),
        ("track_is_explicit", BooleanType()),
        ("track_number", IntegerType()),
        # Album attributes
        ("album_id", StringType()),
        ("album_name", StringType()),
        ("album_release_date", StringType()),
        ("album_type", StringType()),
        ("album_uri", StringType()),
        # Artists, kept as parallel arrays of names and ids
        ("track_artists_names", ArrayType(StringType())),
        ("track_artists_ids", ArrayType(StringType())),
        # Playback context
        ("context_type", StringType()),
        ("context_uri", StringType()),
        # Timestamps
        ("ts", StringType()),  # played_at
        ("processed_at", TimestampType()),
    ]
])

In [0]:
def flatten_api(item_raw):
    """Flatten one Spotify "recently played" item into a single-level dict.

    Args:
        item_raw: A dict with optional ``track``, ``context`` and
            ``played_at`` keys. ``track`` may in turn carry ``album`` and
            ``artists``.

    Returns:
        A dict with one key per output column. Fields that are absent from
        the input come back as ``None`` (or ``[]`` for the two artist
        lists). ``processed_at`` is always ``None`` here; it is meant to be
        filled in by the caller.
    """
    # `.get(key, default)` only covers a *missing* key; an explicit null in
    # the payload still yields None. `or {}` / `or []` guards both cases, the
    # same way `context` was already handled.
    track = item_raw.get('track') or {}
    album = track.get('album') or {}
    context = item_raw.get('context') or {}
    artists = track.get('artists') or []

    return {
        "track_id": track.get('id'),
        "track_name": track.get('name'),
        "track_uri": track.get('uri'),
        "track_duration_ms": track.get('duration_ms'),
        "track_popularity": track.get('popularity'),
        "track_is_explicit": track.get('explicit'),
        "track_number": track.get('track_number'),
        "album_id": album.get('id'),
        "album_name": album.get('name'),
        "album_release_date": album.get('release_date'),
        "album_type": album.get('album_type'),
        "album_uri": album.get('uri'),
        "track_artists_names": [a.get('name') for a in artists],
        "track_artists_ids": [a.get('id') for a in artists],
        "context_type": context.get('type'),
        "context_uri": context.get('uri'),
        "ts": item_raw.get('played_at'),
        "processed_at": None,
    }

## Read from JSON files and the Spotify API to write Bronze Tables

In [0]:
# Ingest every configured source into its Bronze Delta table.
# Each config entry provides "table" and "format"; "json" entries also
# provide "path".
for item in INGESTION_CONFIG:
    print(f"Processing {item['table']}...")

    source_format = item["format"]

    if source_format == "json":
        # multiLine lets Spark parse JSON records that span several lines.
        df = spark.read.option("multiLine", "True").json(item["path"])

    elif source_format == "api_call":
        # get_spotify_api_data() returns the response's `items` list, or []
        # when authentication fails.
        raw_items = get_spotify_api_data()

        if not raw_items:
            # Nothing to write for this source; move on to the next entry.
            continue

        # Flatten each item and build the DataFrame with the explicit schema.
        flat_data = [flatten_api(i) for i in raw_items]
        df = spark.createDataFrame(flat_data, schema=schema)

    else:
        # Without this branch `df` would be undefined, or would still hold the
        # previous entry's data and be appended to the wrong table.
        raise ValueError(
            f"Unsupported ingestion format {source_format!r} for table {item['table']}"
        )

    # Stamp the ingestion time and append to the target table.
    df_final = df.withColumn("processed_at", F.current_timestamp())

    df_final.write.format("delta") \
            .mode("append") \
            .option("mergeSchema", "true") \
            .saveAsTable(item["table"])

print("Bronze Tables saved with success!")

## Check Bronze Tables

In [0]:
%sql
-- Inspect every row and column of the Bronze streaming-history table.
SELECT *
FROM workspace.bronze.spotify_streaming_history_raw

In [0]:
%sql
-- Inspect every row and column of the Bronze recently-played-tracks table.
SELECT *
FROM workspace.bronze.spotify_api_recently_played_tracks_raw