In [0]:
import json
import spotipy
import os
from datetime import datetime
from spotipy.oauth2 import SpotifyOAuth
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import *
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): SpotifyOAuth receives no explicit client credentials below, so
# it presumably reads them from those environment variables — confirm.
load_dotenv()

# Shared base directory for both the OAuth token cache and the Delta output.
_spotify_data_root = "/Volumes/workspace/default/spotify-data"

cache_path = f"{_spotify_data_root}/.spotify_token_cache"
delta_history_path = f"{_spotify_data_root}/streaming_history/raw/delta/recently_played"

# Space-separated list of the OAuth scopes requested for the token.
scope = " ".join((
    "user-read-recently-played",
    "user-read-playback-state",
    "user-read-currently-playing",
))

# open_browser=False: do not launch a browser for the authorization step.
auth_manager = SpotifyOAuth(
    scope=scope,
    open_browser=False,
    cache_path=cache_path,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

def _nullable_field(name, dtype):
    """Return a nullable StructField; every column in this schema is nullable."""
    return StructField(name, dtype, True)


# Element type of the "images" array: one entry per image.
_image_struct = StructType([
    _nullable_field("height", IntegerType()),
    _nullable_field("url", StringType()),
    _nullable_field("width", IntegerType()),
])

# Element type of the "artists" array: all attributes are plain strings.
_artist_struct = StructType([
    _nullable_field(attr, StringType())
    for attr in ("external_urls", "href", "id", "name", "type", "uri")
])

# Flat row layout produced by flatten_recent_tracks(); field order matters
# and mirrors the order of the keys emitted there.
schema = StructType([
    # --- album ---
    _nullable_field("album_type", StringType()),
    _nullable_field("total_tracks", IntegerType()),
    _nullable_field("album_external_urls", StringType()),
    _nullable_field("album_href", StringType()),
    _nullable_field("album_id", StringType()),
    _nullable_field("images", ArrayType(_image_struct)),
    _nullable_field("album_name", StringType()),
    _nullable_field("album_release_date", StringType()),
    _nullable_field("album_release_date_precision", StringType()),
    _nullable_field("type", StringType()),
    _nullable_field("album_uri", StringType()),
    # --- track ---
    _nullable_field("artists", ArrayType(_artist_struct)),
    _nullable_field("disc_number", IntegerType()),
    _nullable_field("duration_ms", LongType()),
    _nullable_field("explicit", BooleanType()),
    _nullable_field("track_external_ids", StringType()),
    _nullable_field("track_external_urls", StringType()),
    _nullable_field("track_href", StringType()),
    _nullable_field("track_id", StringType()),
    _nullable_field("track_name", StringType()),
    _nullable_field("popularity", IntegerType()),
    _nullable_field("preview_url", StringType()),
    _nullable_field("track_number", IntegerType()),
    _nullable_field("track_type", StringType()),
    _nullable_field("track_uri", StringType()),
    _nullable_field("is_local", BooleanType()),
    # --- play event and its context ---
    _nullable_field("played_at", StringType()),
    _nullable_field("context_type", StringType()),
    _nullable_field("context_href", StringType()),
    _nullable_field("context_external_urls", StringType()),
    _nullable_field("context_uri", StringType()),
])

def _json_or_none(value):
    """Serialize *value* to a JSON string, passing ``None`` through unchanged.

    ``json.dumps(None)`` returns the text ``"null"``; returning ``None``
    instead lets an absent object end up as a real NULL in the string column.
    """
    return None if value is None else json.dumps(value)


def flatten_recent_tracks(item_raw):
    """Flatten one recently-played item into a dict keyed by schema column names.

    Args:
        item_raw: One element of the ``items`` list of the recently-played
            response: a dict with ``track``, ``played_at`` and ``context``.

    Returns:
        A flat dict with one entry per schema column. Scalar attributes are
        copied as-is (``None`` when absent). The ``external_urls`` and
        ``external_ids`` objects are stored as JSON strings, or ``None`` when
        absent. ``images`` is passed through unchanged, and ``artists``
        becomes a list of flat dicts (empty when the track has no artists).
    """
    # `or {}` / `or []` instead of a .get() default: the default only covers a
    # missing key, while these also cover a key that is present but null.
    track = item_raw.get('track') or {}
    album = track.get('album') or {}
    context = item_raw.get('context') or {}
    artists = track.get('artists') or []

    return {
        "album_type": album.get('album_type'),
        "total_tracks": album.get('total_tracks'),
        "album_external_urls": _json_or_none(album.get('external_urls')),
        "album_href": album.get('href'),
        "album_id": album.get('id'),
        "images": album.get('images'),
        "album_name": album.get('name'),
        "album_release_date": album.get('release_date'),
        "album_release_date_precision": album.get('release_date_precision'),
        "type": album.get('type'),
        "album_uri": album.get('uri'),
        "artists": [
            {
                "external_urls": _json_or_none(a.get('external_urls')),
                "href": a.get('href'),
                "id": a.get('id'),
                "name": a.get('name'),
                "type": a.get('type'),
                "uri": a.get('uri'),
            }
            for a in artists
        ],
        "disc_number": track.get('disc_number'),
        "duration_ms": track.get('duration_ms'),
        "explicit": track.get('explicit'),
        "track_external_ids": _json_or_none(track.get('external_ids')),
        "track_external_urls": _json_or_none(track.get('external_urls')),
        "track_href": track.get('href'),
        "track_id": track.get('id'),
        "track_name": track.get('name'),
        "popularity": track.get('popularity'),
        "preview_url": track.get('preview_url'),
        "track_number": track.get('track_number'),
        "track_type": track.get('type'),
        "track_uri": track.get('uri'),
        "is_local": track.get('is_local'),
        "played_at": item_raw.get('played_at'),
        "context_type": context.get('type'),
        "context_href": context.get('href'),
        "context_external_urls": _json_or_none(context.get('external_urls')),
        "context_uri": context.get('uri'),
    }

def fetch_and_save_recently_played():
    """Fetch recently played tracks and append the unseen ones to the Delta table.

    Requests up to 50 items through ``sp.current_user_recently_played``,
    flattens each with ``flatten_recent_tracks``, adds a ``processed_at``
    timestamp column, and appends to ``delta_history_path`` only the rows
    whose ``(track_id, played_at)`` pair is not already stored there.

    Uses the module/notebook globals ``spark``, ``dbutils``, ``sp``,
    ``schema`` and ``delta_history_path``.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        Any error other than an ``AnalysisException`` raised while opening the
        existing table now propagates instead of being treated as "first run".
    """
    # Local import keeps this block self-contained; pyspark is already a
    # dependency of this file.
    from pyspark.sql.utils import AnalysisException

    print(f"[{datetime.now().strftime('%H:%M:%S')}] A procurar músicas recentes...")

    # 1. MAKE SURE THE DIRECTORY EXISTS
    # Created up front to avoid a PATH_NOT_FOUND error later on.
    dbutils.fs.mkdirs(delta_history_path)

    recent_raw = sp.current_user_recently_played(limit=50)

    if not recent_raw or not recent_raw.get('items'):
        print("Nenhuma música encontrada.")
        return

    flat_list = [flatten_recent_tracks(item) for item in recent_raw['items']]

    df_new = spark.createDataFrame(flat_list, schema=schema)
    df_new = df_new.withColumn("processed_at", current_timestamp())

    # 2. DEDUPLICATION
    try:
        df_existing = spark.read.format("delta").load(delta_history_path)
        # Force schema resolution here so that a missing/empty table fails
        # inside this try even if the session resolves plans lazily.
        _ = df_existing.columns
    except AnalysisException:
        # Only "table cannot be resolved" (e.g. an empty directory without
        # Delta metadata) means everything is new. Any other failure must
        # surface: swallowing it would append the whole batch and duplicate
        # history that is already stored.
        print("ℹ️ Primeira execução ou tabela vazia detetada.")
        df_to_insert = df_new
    else:
        # Null-safe equality: with a plain `==`, rows whose track_id is NULL
        # never match their stored copy and would be re-inserted on every run.
        already_stored = (
            df_new["track_id"].eqNullSafe(df_existing["track_id"])
            & df_new["played_at"].eqNullSafe(df_existing["played_at"])
        )
        df_to_insert = df_new.join(df_existing, already_stored, "left_anti")

    # 3. WRITE
    count = df_to_insert.count()
    if count > 0:
        df_to_insert.write.format("delta") \
            .mode("append") \
            .option("mergeSchema", "true") \
            .save(delta_history_path)
        print(f"✅ Sucesso: {count} novas músicas guardadas.")
    else:
        print("ℹ️ Nenhuma música nova para adicionar.")


# Run a single fetch-and-append pass when this code executes.
fetch_and_save_recently_played()