In [0]:
# WARNING: destructive reset. This recursively deletes the raw Delta history
# directory, so every "Run All" starts from an empty table. Skip this cell when
# the accumulated history must be kept.
path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/recently_played/"

# dbutils.fs.rm returns a boolean; report what actually happened instead of
# always claiming success.
removed = dbutils.fs.rm(path, True)
if removed:
    print(f"Diretório {path} removido com sucesso.")
else:
    print(f"Diretório {path} não foi removido (pode não existir).")

In [0]:
import time
from datetime import datetime
from pathlib import Path

import spotipy
from dotenv import load_dotenv
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
from spotipy.oauth2 import SpotifyOAuth

In [0]:
# Load variables from a .env file into the process environment before the
# OAuth client is built (presumably the Spotify client id/secret/redirect URI
# are read from there; confirm against the .env file).
load_dotenv()

# Storage locations: OAuth token cache and the raw Delta history table.
cache_path = "/Volumes/workspace/default/spotify-data/.spotify_token_cache"
delta_history_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/recently_played"

# Space-separated list of OAuth scopes requested from Spotify.
scope = " ".join(
    (
        "user-read-recently-played",
        "user-read-playback-state",
        "user-read-currently-playing",
    )
)

# open_browser=False: do not try to launch a local browser for the OAuth flow.
auth_manager = SpotifyOAuth(
    scope=scope,
    open_browser=False,
    cache_path=cache_path,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [0]:
# Explicit schema for the flattened "recently played" records.
# Every field is nullable. Field names must match the keys of the dicts
# produced by flatten_recent_tracks: keys that are missing from this schema are
# silently dropped when the DataFrame is created.
schema = StructType([
    StructField("album_id", StringType(), True),
    StructField("album_image", StringType(), True),
    StructField("album_name", StringType(), True),
    StructField("album_popularity", IntegerType(), True),
    StructField("album_release_date", StringType(), True),
    StructField("album_release_date_precision", StringType(), True),
    StructField("album_total_tracks", IntegerType(), True),
    StructField("album_type", StringType(), True),
    StructField("album_uri", StringType(), True),
    # Album-level extras taken from the full album lookup; previously absent
    # from the schema and therefore never persisted.
    StructField("album_copyrights", ArrayType(StringType()), True),
    StructField("album_label", StringType(), True),
    StructField("album_tracks_names", ArrayType(StringType()), True),
    StructField("album_artists_uris", ArrayType(StringType()), True),
    StructField("album_artists_names", ArrayType(StringType()), True),
    StructField("track_artists_uris", ArrayType(StringType()), True),
    StructField("track_artists_names", ArrayType(StringType()), True),
    StructField("context_type", StringType(), True),
    StructField("context_uri", StringType(), True),
    StructField("ms_played", LongType(), True),
    StructField("offline", BooleanType(), True),
    StructField("offline_timestamp", LongType(), True),
    StructField("platform", StringType(), True),
    StructField("processed_at", TimestampType(), True),
    StructField("reason_end", StringType(), True),
    StructField("reason_start", StringType(), True),
    StructField("shuffle", BooleanType(), True),
    StructField("skipped", BooleanType(), True),
    StructField("track_disc_number", IntegerType(), True),
    StructField("track_duration_ms", LongType(), True),
    StructField("track_id", StringType(), True),
    StructField("track_is_explicit", BooleanType(), True),
    StructField("track_name", StringType(), True),
    StructField("track_popularity", IntegerType(), True),
    StructField("track_number", IntegerType(), True),
    StructField("track_type", StringType(), True),
    StructField("track_uri", StringType(), True),
    # Played-at timestamp, kept as the raw string returned by the API.
    StructField("ts", StringType(), True)
])

In [0]:
def flatten_recent_tracks(item_raw):
    """Flatten one "recently played" item into a single flat record.

    Besides the fields already present in the item, the full album payload is
    fetched with ``sp.album`` to obtain copyrights, label and the album's track
    names. This costs one extra API call per item, so it is slow.

    Args:
        item_raw: One element of the ``items`` list returned by
            ``sp.current_user_recently_played``: a dict that may contain
            ``track``, ``played_at`` and ``context``. Missing or ``None``
            sub-objects are tolerated and produce ``None`` / empty lists.

    Returns:
        dict: Flat record. ``processed_at`` is left as ``None`` for the caller
        to fill. ``offline``, ``offline_timestamp``, ``platform``,
        ``reason_start``, ``reason_end`` and ``skipped`` are fixed placeholder
        values, and ``ms_played`` is simply the track duration.
    """
    # `or {}` / `or []` (not a .get default) so that keys present with a None
    # value are handled the same way as missing keys.
    track = item_raw.get('track') or {}
    album = track.get('album') or {}
    track_artists = track.get('artists') or []
    album_artists = album.get('artists') or []
    context = item_raw.get('context') or {}
    images = album.get('images') or []

    # 1. Extra album info: only look it up when there is a URI to look up.
    album_uri = album.get('uri')
    album_details = (sp.album(album_uri) if album_uri else None) or {}

    # 2. Extract the extra fields.
    # Copyrights: list of dicts such as [{'text': '...', 'type': 'P'}].
    copyrights = [c.get('text') for c in album_details.get('copyrights') or []]

    # Label: plain string.
    label = album_details.get('label')

    # Track names live under tracks -> items.
    album_tracks = (album_details.get('tracks') or {}).get('items') or []
    tracks_names = [t.get('name') for t in album_tracks]

    # Prefer the value embedded in the item; fall back to the full album
    # payload, which is already fetched above (the embedded album object
    # normally carries no popularity; see the Spotify Web API reference).
    album_popularity = album.get('popularity')
    if album_popularity is None:
        album_popularity = album_details.get('popularity')

    return {
        "processed_at": None,
        "album_id": album.get('id'),
        "album_image": images[0].get('url') if images else None,
        "album_name": album.get('name'),
        "album_popularity": album_popularity,
        "album_release_date": album.get('release_date'),
        "album_release_date_precision": album.get('release_date_precision'),
        "album_total_tracks": album.get('total_tracks'),
        "album_type": album.get('album_type'),
        "album_uri": album_uri,

        # --- Extra album fields from the full album lookup ---
        "album_copyrights": copyrights,
        "album_label": label,
        "album_tracks_names": tracks_names,
        # -----------------------------------------------------

        "album_artists_uris": [alb.get('uri') for alb in album_artists],
        "album_artists_names": [alb.get('name') for alb in album_artists],
        "track_artists_names": [art.get('name') for art in track_artists],
        "track_artists_uris": [art.get('uri') for art in track_artists],
        "track_disc_number": track.get('disc_number'),
        "track_duration_ms": track.get('duration_ms'),
        "track_is_explicit": track.get('explicit'),
        "track_id": track.get('id'),
        "track_name": track.get('name'),
        "track_popularity": track.get('popularity'),
        "track_number": track.get('track_number'),
        "track_type": track.get('type'),
        "track_uri": track.get('uri'),
        "ts": item_raw.get('played_at'),
        "context_type": context.get('type'),
        "context_uri": context.get('uri'),
        # Approximation: the full track duration is used as the played time.
        "ms_played": track.get('duration_ms'),
        "offline": False,
        "offline_timestamp": 0,
        "platform": "Spotify API",
        "reason_end": "unknown",
        "reason_start": "unknown",
        # Heuristic: anything not played from an album context counts as shuffle.
        "shuffle": context.get('type') != "album",
        "skipped": False
    }

In [0]:
def fetch_and_save_recently_played():
    """Fetch the user's recently played tracks and append the new ones to Delta.

    Requests up to 50 recently played items, flattens them with
    ``flatten_recent_tracks``, removes rows whose ``(track_id, ts)`` pair is
    already stored at ``delta_history_path`` and appends the remainder.

    Returns:
        The DataFrame of rows selected for insertion, with its columns sorted
        by name, or ``None`` when the API returned no items.

    Raises:
        Any error other than an ``AnalysisException`` raised while reading the
        existing table is propagated instead of being treated as a first run.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] A procurar músicas recentes...")

    dbutils.fs.mkdirs(delta_history_path)
    recent_raw = sp.current_user_recently_played(limit=50)

    if not recent_raw or not recent_raw.get('items'):
        print("Nenhuma música encontrada.")
        return None

    flat_list = [flatten_recent_tracks(item) for item in recent_raw['items']]
    # processed_at is None in the flattened records; stamp it here.
    df_new = spark.createDataFrame(flat_list, schema=schema).withColumn("processed_at", current_timestamp())

    try:
        df_existing = spark.read.format("delta").load(delta_history_path)
        # Anti-join on track_id plus ts (the played-at column of the schema):
        # keep only the rows that are not stored yet.
        already_stored = (df_new.track_id == df_existing.track_id) & (df_new.ts == df_existing.ts)
        df_to_insert = df_new.join(df_existing, already_stored, "left_anti")
    except AnalysisException:
        # Only "no Delta table at this path yet" style failures count as a
        # first run. Catching every Exception would also hide transient read
        # errors and re-append the whole batch, duplicating rows.
        print("ℹ️ Primeira execução ou tabela vazia detetada.")
        df_to_insert = df_new

    count = df_to_insert.count()
    if count > 0:
        # mergeSchema lets the append add columns that the table lacks.
        df_to_insert.write.format("delta") \
            .mode("append") \
            .option("mergeSchema", "true") \
            .save(delta_history_path)
        print(f"✅ Sucesso: {count} novas músicas guardadas.")
    else:
        print("ℹ️ Nenhuma música nova para adicionar.")

    return df_to_insert.select(*sorted(df_to_insert.columns))

In [0]:
# Run the ingestion once. `df` is the DataFrame of rows selected for insertion,
# or None when the API returned no recently played items.
df = fetch_and_save_recently_played()

In [0]:
# `df` is None when no recently played items were returned; guard so that a
# full notebook run does not stop here with an AttributeError.
if df is not None:
    df.printSchema()

In [0]:
# `df` is None when no recently played items were returned; only show the
# column list when there is a DataFrame to inspect.
if df is not None:
    display(df.columns)