In [0]:
# WARNING (destructive): removes the recently-played Delta directory before anything else
# in the notebook runs. This is the same location as `delta_history_path` defined in a
# later cell (apart from the trailing slash), so after this cell the deduplication read in
# `fetch_recently_played_enriched` has no previously stored rows to compare against.
# NOTE(review): the second positional argument is presumably dbutils' `recurse` flag — confirm.
path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/recently_played/"
dbutils.fs.rm(path, True)
# Printed unconditionally: the return value of `rm` is not inspected.
print(f"Diret√≥rio {path} removido com sucesso.")

In [0]:
import spotipy
import time
from pathlib import Path
from datetime import datetime
import requests
from spotipy.oauth2 import SpotifyOAuth
from pyspark.sql.functions import current_timestamp, col, lit, when
from pyspark.sql.types import *
from dotenv import load_dotenv

In [0]:
# Load variables from a .env file into the process environment.
# NOTE(review): SpotifyOAuth below receives no client id / secret / redirect URI explicitly,
# so it presumably reads them from environment variables — confirm which ones are expected.
load_dotenv()
# Passed to SpotifyOAuth as the location of its token cache file.
cache_path = "/Volumes/workspace/default/spotify-data/.spotify_token_cache"
# NOTE(review): `flatten_recent_tracks` reads `country` from `sp.current_user()`. Spotify's
# documentation lists that field as requiring the `user-read-private` scope, which is not
# requested here, so `conn_country` may always come back empty — confirm before relying on it.
scope = "user-read-recently-played user-read-playback-state user-read-currently-playing"
# Delta location of the recently-played history: read for deduplication and appended to
# by the write cell further down.
delta_history_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/recently_played"

# open_browser=False: the auth flow does not try to launch a local browser.
auth_manager = SpotifyOAuth(scope=scope, open_browser=False, cache_path=cache_path)
# Module-level Spotify client used by the functions defined in the cells below.
sp = spotipy.Spotify(auth_manager=auth_manager)

In [0]:
# Column layout for flattened recently-played rows, as (name, Spark type) pairs.
# Every column is declared nullable when the StructType is assembled below.
_RECENTLY_PLAYED_COLUMNS = [
    # Album
    ("album_id", StringType()),
    ("album_image", StringType()),
    ("album_name", StringType()),
    ("album_popularity", IntegerType()),
    ("album_release_date", StringType()),
    ("album_release_date_precision", StringType()),
    ("album_total_tracks", IntegerType()),
    ("album_type", StringType()),
    ("album_uri", StringType()),
    ("album_artists_uris", ArrayType(StringType())),
    ("album_copyrights", ArrayType(StringType())),
    ("album_label", StringType()),
    ("album_tracks_names", ArrayType(StringType())),
    ("album_artists_names", ArrayType(StringType())),

    # Artists credited on the track
    ("track_artists_uris", ArrayType(StringType())),
    ("track_artists_ids", ArrayType(StringType())),
    ("track_artists_names", ArrayType(StringType())),
    ("main_artist_uri", StringType()),

    # Full details of a single artist
    ("artist_genres", ArrayType(StringType())),
    ("artist_id", StringType()),
    ("artist_uri", StringType()),
    ("artist_image", StringType()),
    ("artist_name", StringType()),
    ("artist_popularity", IntegerType()),
    ("artist_total_followers", LongType()),

    # Playback and track
    ("ip_addr", StringType()),
    ("context_type", StringType()),
    ("context_uri", StringType()),
    ("ms_played", LongType()),
    ("offline", BooleanType()),
    ("offline_timestamp", LongType()),
    ("platform", StringType()),
    ("processed_at", TimestampType()),
    ("reason_end", StringType()),
    ("reason_start", StringType()),
    ("shuffle", BooleanType()),
    ("skipped", BooleanType()),
    ("track_disc_number", IntegerType()),
    ("track_duration_ms", LongType()),
    ("track_id", StringType()),
    ("track_is_explicit", BooleanType()),
    ("track_name", StringType()),
    ("track_popularity", IntegerType()),
    ("track_number", IntegerType()),
    ("track_type", StringType()),
    ("track_uri", StringType()),
    ("ts", StringType()),
    ("conn_country", StringType()),
    ("incognito_mode", BooleanType()),
]

schema = StructType([
    StructField(column_name, spark_type, True)
    for column_name, spark_type in _RECENTLY_PLAYED_COLUMNS
])

In [0]:
def get_bulk_metadata(items, client=None):
    """Fetch full album and artist objects for a batch of recently-played items.

    Args:
        items: Iterable of recently-played item dicts. Each is expected to carry a
            ``track`` dict with ``album`` and ``artists`` entries; items whose track,
            album or ids are missing/None are skipped instead of raising.
        client: Optional object exposing ``albums(ids)`` and ``artists(ids)``.
            Defaults to the module-level ``sp`` client.

    Returns:
        Tuple ``(album_map, artist_map)`` mapping album id -> full album dict and
        artist id -> full artist dict. Entries the API returns as None are omitted.
    """
    api = sp if client is None else client
    album_batch_size = 20   # albums are requested in chunks of at most 20 ids
    artist_batch_size = 50  # artists are requested in chunks of at most 50 ids

    # 1. Collect ids, tolerating absent / None tracks, albums and ids.
    album_ids = []
    artist_ids = []
    for item in items or []:
        track = (item or {}).get('track') or {}
        album_id = (track.get('album') or {}).get('id')
        if album_id:
            album_ids.append(album_id)
        artist_ids.extend(
            a['id'] for a in (track.get('artists') or []) if a and a.get('id')
        )

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # request chunks are deterministic from run to run (a set would not be).
    album_ids = list(dict.fromkeys(album_ids))
    artist_ids = list(dict.fromkeys(artist_ids))

    # 2. Bulk album lookup.
    album_map = {}
    for start in range(0, len(album_ids), album_batch_size):
        response = api.albums(album_ids[start:start + album_batch_size]) or {}
        for album in response.get('albums') or []:
            if album:
                album_map[album['id']] = album

    # 3. Bulk artist lookup.
    artist_map = {}
    for start in range(0, len(artist_ids), artist_batch_size):
        response = api.artists(artist_ids[start:start + artist_batch_size]) or {}
        for artist in response.get('artists') or []:
            if artist:
                artist_map[artist['id']] = artist

    return album_map, artist_map

In [0]:
# Profile country looked up through `sp`, memoised so the profile endpoint is queried
# once per kernel session rather than once per flattened item.
_user_country_cache = {}


def _lookup_user_country():
    """Return the current user's `country`, calling `sp.current_user()` only on first use."""
    if 'country' not in _user_country_cache:
        profile = sp.current_user() or {}
        _user_country_cache['country'] = profile.get('country')
    return _user_country_cache['country']


def _first_image_url(images):
    """Return the `url` of the first entry of an images list, or None when empty/missing."""
    return images[0].get('url') if images else None


def flatten_recent_tracks(item_raw, album_map, artist_map, user_country=None):
    """Flatten one recently-played item into a single-level dict keyed by schema column.

    Args:
        item_raw: One item of the recently-played response (``track``, ``played_at``,
            ``context``).
        album_map: Album id -> full album dict, as built by ``get_bulk_metadata``.
        artist_map: Artist id -> full artist dict, as built by ``get_bulk_metadata``.
        user_country: Optional value for ``conn_country``. When None it is read from the
            current user's profile through ``sp`` (looked up once, then cached).

    Returns:
        A dict with one entry per output column. Values that cannot be resolved (album or
        artist absent from the maps, empty image/artist lists, no context) are None or
        empty lists rather than raising. ``artist_*`` fields describe the first artist
        credited on the track.
    """
    track = item_raw.get('track') or {}
    album_brief = track.get('album') or {}
    album_id = album_brief.get('id')
    track_artists = track.get('artists') or []
    album_artists = album_brief.get('artists') or []
    context = item_raw.get('context') or {}

    # The first credited artist is treated as the main artist; {} when none is credited.
    main_artist = (track_artists[0] if track_artists else None) or {}

    # Full metadata from the bulk lookups; {} keeps every .get() below safe when missing.
    album_full = album_map.get(album_id) or {}
    artist_full = artist_map.get(main_artist.get('id')) or {}

    if user_country is None:
        user_country = _lookup_user_country()

    return {
        "processed_at": None,  # filled in later with the Spark-side timestamp
        "album_id": album_id,
        "album_image": _first_image_url(album_full.get('images')),
        "album_name": album_full.get('name'),
        "album_popularity": album_full.get('popularity'),
        "album_release_date": album_full.get('release_date'),
        "album_release_date_precision": album_full.get('release_date_precision'),
        "album_total_tracks": album_full.get('total_tracks'),
        "album_type": album_full.get('album_type'),
        "album_uri": album_full.get('uri'),
        "album_copyrights": [c.get('text') for c in album_full.get('copyrights') or []],
        "album_label": album_full.get('label'),
        "album_tracks_names": [
            t.get('name') for t in (album_full.get('tracks') or {}).get('items') or []
        ],
        "album_artists_uris": [alb.get('uri') for alb in album_artists],
        "album_artists_names": [alb.get('name') for alb in album_artists],

        "artist_genres": artist_full.get('genres'),
        "artist_id": artist_full.get('id'),
        "artist_uri": artist_full.get('uri'),
        "artist_image": _first_image_url(artist_full.get('images')),
        "artist_name": artist_full.get('name'),
        "artist_popularity": artist_full.get('popularity'),
        "artist_total_followers": (artist_full.get('followers') or {}).get('total'),

        "track_artists_names": [art.get('name') for art in track_artists],
        "track_artists_uris": [art.get('uri') for art in track_artists],
        "track_artists_ids": [art.get('id') for art in track_artists],
        # Plain string (not a one-element list) to match the StringType schema column.
        "main_artist_uri": main_artist.get('uri'),
        "track_disc_number": track.get('disc_number'),
        "track_duration_ms": track.get('duration_ms'),
        "track_is_explicit": track.get('explicit'),
        "track_id": track.get('id'),
        "track_name": track.get('name'),
        "track_popularity": track.get('popularity'),
        "track_number": track.get('track_number'),
        "track_type": track.get('type'),
        "track_uri": track.get('uri'),
        "ts": item_raw.get('played_at'),
        "context_type": context.get('type'),
        "context_uri": context.get('uri'),
        # Placeholder values below: the item does not carry this information, so the
        # full track duration and fixed defaults are used.
        "ms_played": track.get('duration_ms'),
        "offline": False,
        "offline_timestamp": 0,
        "platform": "Spotify API",
        "reason_end": "unknown",
        "reason_start": "unknown",
        # Heuristic: anything not played from an album context is flagged as shuffled.
        "shuffle": context.get('type') != "album",
        "skipped": False,
        "conn_country": user_country,
        # `ip_addr` is the column declared in the schema; the old key is kept so any
        # consumer of the raw dict that still reads it keeps working.
        "ip_addr": "127.0.0.1",
        "ip_addr_decrypted": "127.0.0.1",
        "incognito_mode": False
    }

In [0]:
def fetch_recently_played_enriched():
    """Fetch, enrich and return the user's recently played tracks without writing to Delta.

    Steps:
      1. Request up to 50 recently played items through the module-level ``sp`` client.
      2. Resolve album and artist metadata in bulk with ``get_bulk_metadata``.
      3. Flatten every item with ``flatten_recent_tracks`` and build a DataFrame using
         ``schema``; ``processed_at`` is stamped and null ``reason_start`` /
         ``reason_end`` / ``track_type`` values are replaced by defaults.
      4. Anti-join on (track_id, ts) against the rows stored at ``delta_history_path``.

    Returns:
        A Spark DataFrame, columns in alphabetical order, holding only the plays that are
        not already stored; or None when the API returned no items. If the existing table
        cannot be read, all fetched rows are returned and a warning is printed.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] A processar m√∫sicas recentes...")

    # 1. Fetch from the API
    recent_raw = sp.current_user_recently_played(limit=50)
    items = (recent_raw or {}).get('items')
    if not items:
        print("Nenhuma m√∫sica encontrada.")
        return None

    # 2. Bulk metadata (albums and artists)
    album_map, artist_map = get_bulk_metadata(items)

    # 3. Flattening
    flat_list = [flatten_recent_tracks(item, album_map, artist_map) for item in items]

    # 4. Build the DataFrame and replace nulls for compatibility
    df_new = (
        spark.createDataFrame(flat_list, schema=schema)
        .withColumn("processed_at", current_timestamp())
        .withColumn("reason_start", when(col("reason_start").isNull(), lit("unknown")).otherwise(col("reason_start")))
        .withColumn("reason_end", when(col("reason_end").isNull(), lit("unknown")).otherwise(col("reason_end")))
        .withColumn("track_type", when(col("track_type").isNull(), lit("track")).otherwise(col("track_type")))
    )

    # 5. Deduplicate against the rows that are already stored
    try:
        df_existing = spark.read.format("delta").load(delta_history_path)
        # Touch the schema so a missing or unreadable table fails here, inside the try,
        # even on sessions that may analyse reads lazily.
        _ = df_existing.columns
        # eqNullSafe: with plain `==` a null track_id never compares equal, so such rows
        # would survive the anti-join and be appended again on every run.
        same_play = (
            df_new.track_id.eqNullSafe(df_existing.track_id)
            & df_new.ts.eqNullSafe(df_existing.ts)
        )
        df_final = df_new.join(df_existing, same_play, "left_anti")
    except Exception as exc:
        # Best-effort fallback (e.g. the table does not exist yet): keep every fetched
        # row, but report why instead of hiding the failure.
        print(f"Aviso: leitura de {delta_history_path} falhou ({type(exc).__name__}); sem deduplicar contra dados existentes.")
        df_final = df_new

    return df_final.select(*sorted(df_final.columns))

In [0]:
df_recentlyPlayed = fetch_recently_played_enriched()

# Count once and reuse the result: each count() is a separate Spark action over the lazy
# DataFrame. `is not None` makes the "no items" case explicit instead of relying on
# DataFrame truthiness.
count = df_recentlyPlayed.count() if df_recentlyPlayed is not None else 0

if count > 0:
    print(f"üíæ A gravar {count} novas m√∫sicas em: {delta_history_path}")

    # Shown before the write: the DataFrame is lazy, so evaluating it after the append
    # may re-run the anti-join against a table that already contains these rows and
    # render an empty result.
    display(df_recentlyPlayed)

    dbutils.fs.mkdirs(delta_history_path)

    df_recentlyPlayed.write.format("delta") \
        .mode("append") \
        .option("mergeSchema", "true") \
        .save(delta_history_path)

    print("‚úÖ Grava√ß√£o conclu√≠da com sucesso.")
else:
    print("‚ÑπÔ∏è Nenhuma m√∫sica nova detetada. Nada foi gravado.")

In [0]:
# `fetch_recently_played_enriched` returns None when the API has no items; skip the
# inspection in that case instead of failing with AttributeError.
if df_recentlyPlayed is not None:
    df_recentlyPlayed.printSchema()

In [0]:
# `fetch_recently_played_enriched` returns None when the API has no items; skip the
# inspection in that case instead of failing with AttributeError.
if df_recentlyPlayed is not None:
    display(df_recentlyPlayed.columns)