In [0]:
import json
import spotipy
import os
from datetime import datetime
from spotipy.oauth2 import SpotifyOAuth
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import *
import time
from dotenv import load_dotenv
load_dotenv()

# Locations under /Volumes: the OAuth token cache file and the Delta folder
# that receives the "currently playing" snapshots.
cache_path = "/Volumes/workspace/default/spotify-data/.spotify_token_cache"
delta_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/current_playing"

# Spotify permissions requested for this session; SpotifyOAuth receives them
# as one space-separated string.
scopes = [
    "user-library-read",
    "user-read-playback-state",
    "user-read-currently-playing",
    "user-follow-read",
    "user-read-recently-played",
    "user-top-read",
]
scope = " ".join(scopes)

# open_browser=False: the authorization URL is printed instead of opened,
# and the redirect URL is pasted back by hand (see the manual branch below).
auth_manager = SpotifyOAuth(
    scope=scope,
    open_browser=False,
    cache_path=cache_path,
)

token_info = auth_manager.get_cached_token()

if token_info:
    # A token was found in the cache file: no user interaction needed.
    print("Autenticado automaticamente via cache no Volume!")
else:
    # Manual authorization-code flow: show the URL, read back the redirect
    # URL, extract the code from it and exchange it for a token.
    auth_url = auth_manager.get_authorize_url()
    print(f"Acede a este link: {auth_url}")
    response_url = input("Cole a URL de redirecionamento: ")
    code = auth_manager.parse_response_code(response_url)
    # NOTE(review): presumably this call also persists the token to cache_path
    # (the message below says so) — confirm against the spotipy version in use.
    token_info = auth_manager.get_access_token(code)
    print("Autentica√ß√£o manual conclu√≠da e token guardado no Volume!")

# API client used by the rest of the notebook.
sp = spotipy.Spotify(auth_manager=auth_manager)

from pyspark.sql.types import *

# Column layout of one flattened "currently playing" snapshot, written as
# (column name, Spark type) pairs. The order here is the column order of the
# resulting schema.
_current_playing_columns = [
    ("timestamp", LongType()),
    ("progress_ms", LongType()),
    ("is_playing", BooleanType()),
    ("currently_playing_type", StringType()),
    # Device
    ("device_id", StringType()),
    ("device_is_active", BooleanType()),
    ("device_is_private_session", BooleanType()),
    ("device_is_restricted", BooleanType()),
    ("device_name", StringType()),
    ("device_type", StringType()),
    ("device_volume_percent", IntegerType()),
    ("device_supports_volume", BooleanType()),
    # Context
    ("context_type", StringType()),
    ("context_uri", StringType()),
    # Track item
    ("track_id", StringType()),
    ("track_name", StringType()),
    ("track_uri", StringType()),
    ("track_duration_ms", LongType()),
    ("track_popularity", IntegerType()),
    ("track_explicit", BooleanType()),
    ("track_number", IntegerType()),
    # Album
    ("album_id", StringType()),
    ("album_name", StringType()),
    ("album_release_date", StringType()),
    ("album_image_url", StringType()),
    # Artist: the main artist's name, plus all artist names in a single string column
    ("artist_main_name", StringType()),
    ("artist_all_names", StringType()),
    # Status
    ("repeat_state", StringType()),
    ("shuffle_state", BooleanType()),
]

# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in _current_playing_columns
])

def flatten_current_playing(raw_data):
    """Flatten a nested "currently playing" payload into a single-level dict.

    Args:
        raw_data: Dict as returned by the Spotify client (nested ``item``,
            ``device``, ``context`` sub-dicts), or ``None``.

    Returns:
        ``None`` when ``raw_data`` is empty or carries no ``item``; otherwise
        a dict whose keys match the columns of the current-playing schema.
        Any value missing from the payload is ``None``, except
        ``artist_all_names`` which is ``""`` when there are no artist names.

    Notes:
        Sub-objects and lists that are absent *or explicitly ``None``* are
        treated as empty, and artists without a name are skipped, so an
        incomplete payload never raises here.
        NOTE(review): ``device``, ``repeat_state`` and ``shuffle_state`` are
        only populated if the payload actually contains them — confirm that
        the endpoint used by the caller returns these fields.
    """
    if not raw_data or not raw_data.get('item'):
        return None

    # `or {}` / `or []` also covers keys that are present with a None value,
    # which a `.get(key, default)` would pass through unchanged.
    item = raw_data.get('item') or {}
    album = item.get('album') or {}
    device = raw_data.get('device') or {}
    context = raw_data.get('context') or {}
    artists = [artist for artist in (item.get('artists') or []) if artist]
    images = [image for image in (album.get('images') or []) if image]

    # Comma-separated list of every artist name; nameless entries are skipped
    # because str.join rejects None.
    artist_names = [artist.get('name') for artist in artists if artist.get('name')]
    all_artists = ", ".join(artist_names)

    return {
        "timestamp": raw_data.get('timestamp'),
        "progress_ms": raw_data.get('progress_ms'),
        "is_playing": raw_data.get('is_playing'),
        "currently_playing_type": raw_data.get('currently_playing_type'),
        # Device
        "device_id": device.get('id'),
        "device_is_active": device.get('is_active'),
        "device_is_private_session": device.get('is_private_session'),
        "device_is_restricted": device.get('is_restricted'),
        "device_name": device.get('name'),
        "device_type": device.get('type'),
        "device_volume_percent": device.get('volume_percent'),
        "device_supports_volume": device.get('supports_volume'),
        # Context
        "context_type": context.get('type'),
        "context_uri": context.get('uri'),
        # Track
        "track_id": item.get('id'),
        "track_name": item.get('name'),
        "track_uri": item.get('uri'),
        "track_duration_ms": item.get('duration_ms'),
        "track_popularity": item.get('popularity'),
        "track_explicit": item.get('explicit'),
        "track_number": item.get('track_number'),
        # Album
        "album_id": album.get('id'),
        "album_name": album.get('name'),
        "album_release_date": album.get('release_date'),
        "album_image_url": images[0].get('url') if images else None,
        # Artists
        "artist_main_name": artists[0].get('name') if artists else None,
        "artist_all_names": all_artists,
        # Status
        "repeat_state": raw_data.get('repeat_state'),
        "shuffle_state": raw_data.get('shuffle_state')
    }

# Polling loop: every POLL_INTERVAL_SECONDS ask Spotify what is playing and
# append one row to the Delta table each time the track changes.
#
# Runs until the cell is interrupted; cells below it are not reached while it
# is running.
POLL_INTERVAL_SECONDS = 30

print(f"Iniciando captura incremental em: {delta_path}")
last_saved_timestamp = None  # payload timestamp of the last row written
last_track_id = None         # identity of the last track written (None = nothing written yet)

while True:
    try:
        current_track_raw = sp.current_user_playing_track()

        if current_track_raw:
            flat_dict = flatten_current_playing(current_track_raw)

            # flatten_current_playing returns None when the payload has no
            # item, so it must be checked before any key is read.
            if flat_dict:
                # Fall back to the URI so tracks without an id still get a
                # usable identity for the change detection below.
                current_track_id = flat_dict['track_id'] or flat_dict['track_uri']

                # Same track as the one already written: skip. This no longer
                # depends on whether the Delta folder exists.
                already_exists = current_track_id == last_track_id

                if not already_exists:
                    df_new = spark.createDataFrame([flat_dict], schema=schema)
                    df_new = df_new.withColumn("processed_at", current_timestamp())

                    # 'append' mode creates the folder and table if they do not exist yet.
                    df_new.write.format("delta") \
                        .mode("append") \
                        .option("mergeSchema", "true") \
                        .save(delta_path)

                    # Only mark the track as saved after the write succeeded,
                    # so a failed write is retried on the next poll.
                    last_track_id = current_track_id
                    last_saved_timestamp = flat_dict['timestamp']
            else:
                print("Dados inv√°lidos (an√∫ncio ou pausa).")
        else:
            print("Nenhuma m√∫sica a tocar.")

    except Exception as e:
        # Deliberately broad: a transient API or write error must not stop
        # the capture; it is reported and the loop tries again next cycle.
        print(f"Erro na execu√ß√£o: {e}")

    time.sleep(POLL_INTERVAL_SECONDS)

In [0]:
import json
import spotipy
import os
from datetime import datetime
from spotipy.oauth2 import SpotifyOAuth
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import *
from dotenv import load_dotenv

# Authentication settings.
# This cell assigns `cache_path`, `scope`, `auth_manager`, `sp` and `schema`
# again, replacing the values set by the first cell.
load_dotenv()
cache_path = "/Volumes/workspace/default/spotify-data/.spotify_token_cache"
scope = "user-read-recently-played user-read-playback-state user-read-currently-playing"

auth_manager = SpotifyOAuth(
    scope=scope,
    open_browser=False,
    cache_path=cache_path,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

# Delta table path (overall listening history).
delta_history_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/recently_played"

# Flattened schema of one recently-played entry, as (column name, Spark type)
# pairs in column order.
_recently_played_columns = [
    ("played_at", StringType()),  # exact date/time of the playback, kept as received
    ("timestamp_ms", LongType()),
    ("track_id", StringType()),
    ("track_name", StringType()),
    ("track_uri", StringType()),
    ("track_duration_ms", LongType()),
    ("track_popularity", IntegerType()),
    ("track_explicit", BooleanType()),
    ("album_id", StringType()),
    ("album_name", StringType()),
    ("album_release_date", StringType()),
    ("album_image_url", StringType()),
    ("artist_main_name", StringType()),
    ("artist_all_names", StringType()),
    ("context_type", StringType()),
    ("context_uri", StringType()),
]

# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in _recently_played_columns
])

def _played_at_to_epoch_ms(played_at):
    """Convert a UTC ``played_at`` string (ending in ``Z``) to epoch milliseconds.

    Accepts the value with or without fractional seconds. Returns ``None``
    for an empty/missing value and raises ``ValueError`` for any other
    unrecognized format.
    """
    # Local import: the file only imports `datetime` from this module.
    from datetime import datetime, timedelta, timezone

    if not played_at:
        return None

    for fmt in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
        try:
            parsed = datetime.strptime(played_at, fmt)
        except ValueError:
            continue
        # The trailing "Z" means UTC. strptime returns a naive datetime, and
        # a naive .timestamp() would be interpreted in the machine's local
        # timezone, so the zone is attached explicitly.
        parsed = parsed.replace(tzinfo=timezone.utc)
        epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
        # Integer timedelta division avoids the float rounding of
        # int(timestamp() * 1000), which can truncate one millisecond.
        return (parsed - epoch) // timedelta(milliseconds=1)

    raise ValueError(f"Unrecognized played_at format: {played_at!r}")


def flatten_recent_tracks(item_raw):
    """Flatten one entry of the ``items`` list of the recently-played response.

    Args:
        item_raw: Dict with ``played_at``, a nested ``track`` (which itself
            nests ``album`` and ``artists``) and an optional ``context``.

    Returns:
        A dict whose keys match the columns of the recently-played schema.
        Missing values are ``None``; ``artist_all_names`` is ``""`` when no
        artist names are available. ``timestamp_ms`` is ``played_at``
        converted to UTC epoch milliseconds.

    Raises:
        ValueError: if ``played_at`` is present but not in a recognized format.
    """
    # `or {}` / `or []` also covers keys present with an explicit None value.
    track = item_raw.get('track') or {}
    album = track.get('album') or {}
    artists = [artist for artist in (track.get('artists') or []) if artist]
    images = [image for image in (album.get('images') or []) if image]
    context = item_raw.get('context') or {}

    # Nameless artists are skipped because str.join rejects None.
    all_artists = ", ".join(
        artist.get('name') for artist in artists if artist.get('name')
    )

    played_at = item_raw.get('played_at')

    return {
        "played_at": played_at,
        "timestamp_ms": _played_at_to_epoch_ms(played_at),
        "track_id": track.get('id'),
        "track_name": track.get('name'),
        "track_uri": track.get('uri'),
        "track_duration_ms": track.get('duration_ms'),
        "track_popularity": track.get('popularity'),
        "track_explicit": track.get('explicit'),
        "album_id": album.get('id'),
        "album_name": album.get('name'),
        "album_release_date": album.get('release_date'),
        "album_image_url": images[0].get('url') if images else None,
        "artist_main_name": artists[0].get('name') if artists else None,
        "artist_all_names": all_artists,
        "context_type": context.get('type'),
        "context_uri": context.get('uri')
    }

def fetch_and_save_recently_played():
    """Fetch the recently played tracks and append the unseen ones to Delta.

    Uses the notebook globals ``sp`` (Spotify client), ``spark``, ``schema``
    and ``delta_history_path``. Requests up to 50 entries, flattens them with
    ``flatten_recent_tracks``, drops every entry whose
    (``track_id``, ``played_at``) pair is already stored, and appends the
    remainder with a ``processed_at`` timestamp column.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] A procurar m√∫sicas recentes...")

    # Ask for the last 50 played tracks (the limit passed to the client).
    recent_raw = sp.current_user_recently_played(limit=50)

    if not recent_raw or not recent_raw.get('items'):
        print("Nenhuma m√∫sica encontrada no hist√≥rico recente.")
        return

    # One flat dict per history entry.
    flat_list = [flatten_recent_tracks(item) for item in recent_raw['items']]

    df_new = spark.createDataFrame(flat_list, schema=schema)
    df_new = df_new.withColumn("processed_at", current_timestamp())

    # Deduplication: keep only rows that are not already in the Delta table.
    if os.path.exists(delta_history_path):
        # Only the key columns are needed from the stored history.
        existing_keys = (
            spark.read.format("delta")
            .load(delta_history_path)
            .select("track_id", "played_at")
        )

        # LEFT ANTI JOIN on (track_id, played_at). eqNullSafe makes
        # NULL match NULL: with a plain `==`, a row whose track_id is null
        # never matches its stored copy and is re-inserted on every run.
        df_to_insert = df_new.join(
            existing_keys,
            df_new.track_id.eqNullSafe(existing_keys.track_id)
            & df_new.played_at.eqNullSafe(existing_keys.played_at),
            "left_anti"
        )
    else:
        # First run: nothing stored yet, everything is new.
        df_to_insert = df_new

    # Write to Delta only when there is something to add.
    count = df_to_insert.count()
    if count > 0:
        df_to_insert.write.format("delta") \
            .mode("append") \
            .option("mergeSchema", "true") \
            .save(delta_history_path)
        print(f"‚úÖ Sucesso: {count} novas m√∫sicas guardadas em {delta_history_path}")
    else:
        print("‚ÑπÔ∏è Nenhuma m√∫sica nova para adicionar.")

# Single run: fetch the recent history once and append whatever is new.
fetch_and_save_recently_played()

In [0]:
# # APAGAR
 
# delta_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/delta/current_playing"
# # O par√¢metro recurse=True garante que ele apaga a pasta e todos os ficheiros dentro dela
# dbutils.fs.rm(delta_path, recurse=True)
# print(f"‚úÖ Todo o conte√∫do em {delta_path} foi removido com sucesso.")

In [0]:
# current_playback = sp.current_playback()
# print("Current Playback\n", current_playback)
# save_raw_data(current_playback, "current_playback")

In [0]:
# current_user_playing_track = sp.current_user_playing_track()
# print("Current User Playing Track\n", current_user_playing_track)
# save_raw_data(current_user_playing_track, "current_user_playing_track")

In [0]:
# currently_playing = sp.currently_playing()
# print("Currently Playing\n", currently_playing)
# save_raw_data(currently_playing, "currently_playing")

In [0]:
# Quick look at the recently played tracks: one "artist – title (id)" line per
# entry, using the first listed artist.
current_user_recently_played = sp.current_user_recently_played()
print("\nüéß M√∫sicas reproduzidas recentemente:")

for item in current_user_recently_played['items']:
    track = item['track']
    # Evaluated left to right, in the same order as three separate lookups.
    artist_name, track_name, track_id = (
        track['artists'][0]['name'],
        track['name'],
        track['id'],
    )

    print(f"{artist_name} ‚Äì {track_name} ({track_id})")

In [0]:
# Load the Delta table at `delta_path` and show it, most recently processed
# rows first. `delta_path` must already be defined by an earlier cell.
df_history = spark.read.format("delta").load(delta_path)
processed_at_newest_first = col("processed_at").desc()
display(df_history.orderBy(processed_at_newest_first))