In [0]:
import json
import spotipy
import os
from datetime import datetime
from spotipy.oauth2 import SpotifyOAuth
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType
import time
from dotenv import load_dotenv
load_dotenv()

# File used by SpotifyOAuth to cache the OAuth token.
cache_path = "/Volumes/workspace/default/spotify-data/.spotify_token_cache"

# Scopes requested from Spotify, kept both as a list and as the
# space-separated string handed to SpotifyOAuth.
scopes = (
    "user-library-read "
    "user-read-playback-state "
    "user-read-currently-playing "
    "user-follow-read "
    "user-read-recently-played "
    "user-top-read"
).split()
scope = " ".join(scopes)

auth_manager = SpotifyOAuth(scope=scope, open_browser=False, cache_path=cache_path)

# Try the cached token first; fall back to the manual copy/paste flow.
token_info = auth_manager.get_cached_token()

if token_info:
    print("Autenticado automaticamente via cache no Volume!")
else:
    # Manual flow: the user opens the URL, authorizes, and pastes back the redirect URL.
    auth_url = auth_manager.get_authorize_url()
    print(f"Acede a este link: {auth_url}")
    response_url = input("Cole a URL de redirecionamento: ")
    code = auth_manager.parse_response_code(response_url)
    token_info = auth_manager.get_access_token(code)
    print("Autentica√ß√£o manual conclu√≠da e token guardado no Volume!")

# Spotify client used by the rest of the notebook.
sp = spotipy.Spotify(auth_manager=auth_manager)

# Location of the Delta table that receives the captured records.
delta_path = "/Volumes/workspace/default/spotify-data/streaming_history/delta/current_playing"

def flatten_current_playing(raw_data):
    """
    Flatten Spotify's nested "currently playing" payload into a single-level dict.

    Args:
        raw_data: Dict as returned by the Spotify client for the current
            playback, or None / empty when there is nothing to report.

    Returns:
        A dict with the keys track_id, track_name, artist_name, album_name,
        duration_ms, progress_ms, is_playing, device_name and timestamp_ms.
        Values missing (or null) in the payload come back as None.
        Returns None when raw_data is falsy or has no 'item'.
    """
    if not raw_data or not raw_data.get('item'):
        return None

    item = raw_data['item']
    # dict.get(key, default) only uses the default when the key is absent;
    # a key that is present with a None value would otherwise make the
    # chained .get('name') raise AttributeError. `or` covers both cases.
    album = item.get('album') or {}
    artists = item.get('artists') or []
    device = raw_data.get('device') or {}
    first_artist = (artists[0] or {}) if artists else {}

    flat_data = {
        "track_id": item.get('id'),
        "track_name": item.get('name'),
        "artist_name": first_artist.get('name'),
        "album_name": album.get('name'),
        "duration_ms": item.get('duration_ms'),
        "progress_ms": raw_data.get('progress_ms'),
        "is_playing": raw_data.get('is_playing'),
        "device_name": device.get('name'),
        "timestamp_ms": raw_data.get('timestamp')
    }
    return flat_data
    
# Explicit Spark schema for one flattened playback record.
# Column names mirror the keys produced by flatten_current_playing;
# every column is declared nullable.
schema = (
    StructType()
    .add("track_id", StringType(), True)
    .add("track_name", StringType(), True)
    .add("artist_name", StringType(), True)
    .add("album_name", StringType(), True)
    .add("duration_ms", LongType(), True)
    .add("progress_ms", LongType(), True)
    .add("is_playing", BooleanType(), True)
    .add("device_name", StringType(), True)
    .add("timestamp_ms", LongType(), True)
)

print(f"Iniciando captura incremental em: {delta_path}")
# Not read by the loop below; kept so the notebook namespace is unchanged.
last_saved_timestamp = None
# Id of the last track that was successfully written; used to skip
# consecutive polls of the same track.
last_track_id = None

# Poll Spotify every 30 seconds and append a row each time the playing
# track changes. Runs until the cell is interrupted.
while True:
    try:
        current_track_raw = sp.current_user_playing_track()

        if current_track_raw:
            flat_dict = flatten_current_playing(current_track_raw)

            # flatten_current_playing returns None when the payload has no
            # 'item', so the check must come before any access to flat_dict.
            if flat_dict:
                current_track_id = flat_dict.get('track_id')

                # Same track as the last saved one -> nothing new to store.
                # Records without an id can never be matched, so they are
                # always written. The check no longer depends on the Delta
                # folder existing, which used to let the first track be
                # written twice.
                already_exists = (
                    current_track_id is not None
                    and current_track_id == last_track_id
                )

                if not already_exists:
                    df_new = spark.createDataFrame([flat_dict], schema=schema)
                    df_new = df_new.withColumn("processed_at", current_timestamp())

                    # 'append' mode creates the folder and the table when they do not exist yet
                    df_new.write.format("delta") \
                        .mode("append") \
                        .option("mergeSchema", "true") \
                        .save(delta_path)

                    # Remember the track only after the write succeeded, so a
                    # failed write is retried on the next poll instead of
                    # silently dropping the track.
                    last_track_id = current_track_id

                    print(f"[{datetime.now().strftime('%H:%M:%S')}] ‚úÖ M√∫sica guardada: {flat_dict['track_name']}")
            else:
                print("Dados inv√°lidos (an√∫ncio ou pausa).")
        else:
            print("Nenhuma m√∫sica a tocar.")

    except Exception as e:
        # Best effort: report the failure and keep polling.
        print(f"Erro na execu√ß√£o: {e}")

    time.sleep(30)

In [0]:
# raw_data_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/current/"

# def save_raw_data(data, filename_prefix):
#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
#     filename = f"{filename_prefix}_{timestamp}.json"
#     full_path = os.path.join(raw_data_path, filename)
#     os.makedirs(raw_data_path, exist_ok=True)
#     with open(full_path, "w") as f:
#         json.dump(data, f, indent=2)
#     print(f"‚úÖ Ficheiro salvo: {full_path}")

In [0]:
# current_playback = sp.current_playback()
# print("Current Playback\n", current_playback)
# save_raw_data(current_playback, "current_playback")

In [0]:
# current_user_playing_track = sp.current_user_playing_track()
# print("Current User Playing Track\n", current_user_playing_track)
# save_raw_data(current_user_playing_track, "current_user_playing_track")

In [0]:
# current_user_recently_played = sp.current_user_recently_played()
# print("\nüéß M√∫sicas reproduzidas recentemente:")

# for item in current_user_recently_played['items']:
#     track = item['track']
#     artist_name = track['artists'][0]['name']
#     track_name = track['name']
#     track_id = track['id']

#     print(f"{artist_name} ‚Äì {track_name} ({track_id})")

#     # Fetch audio analysis safely
#     try:
#         track['audio_analysis'] = sp.audio_analysis(track_id)
#     except spotipy.exceptions.SpotifyException as e:
#         print(f"Cannot fetch audio analysis for {track_name} ({track_id}): {e}")
#         track['audio_analysis'] = None

#     # Fetch audio features safely
#     try:
#         track['audio_features'] = sp.audio_features([track_id])[0]  # note the list
#     except spotipy.exceptions.SpotifyException as e:
#         print(f"Cannot fetch audio features for {track_name} ({track_id}): {e}")
#         track['audio_features'] = None

# save_raw_data(current_user_recently_played, "current_user_recently_played")

In [0]:
# currently_playing = sp.currently_playing()
# print("Currently Playing\n", currently_playing)
# save_raw_data(currently_playing, "currently_playing")

In [0]:
# DELETE: removes the whole Delta table directory. Destructive -- run this
# cell deliberately, not as part of "Run All".

delta_path = "/Volumes/workspace/default/spotify-data/streaming_history/delta/current_playing"
# recurse=True makes the call remove the directory and every file inside it
removed = dbutils.fs.rm(delta_path, recurse=True)
# Only report success when the call says something was actually removed.
# NOTE(review): relies on dbutils.fs.rm returning a boolean -- confirm for Volumes paths.
if removed:
    print(f"‚úÖ Todo o conte√∫do em {delta_path} foi removido com sucesso.")
else:
    print(f"Nada foi removido em {delta_path}.")

In [0]:
# Load everything captured so far from the Delta table ...
history_reader = spark.read.format("delta")
df_history = history_reader.load(delta_path)

# ... and show it ordered by processed_at, descending.
newest_first = col("processed_at").desc()
display(df_history.orderBy(newest_first))