In [2]:
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import desc, count, col, sum as Fsum, when
import json, pprint
from typing import Any, Dict, List, Optional
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, IntegerType, ArrayType
from requests.exceptions import ReadTimeout, ConnectionError
import pprint
import time
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import sys

In [3]:
# Build (or reuse) the Spark session under the application name below,
# then print the session object.
session_builder = SparkSession.builder.appName("SpotifyStreamingHistory")
spark = session_builder.getOrCreate()
print(spark)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/20 20:30:37 WARN Utils: Your hostname, MartaPC, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/20 20:30:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/20 20:30:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/12/20 20:30:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


<pyspark.sql.session.SparkSession object at 0x7e675f18f410>


In [4]:
# Treat the parent of the resolved current working directory as the project root
# and derive the raw / processed data folders from it.
project_root = Path.cwd().resolve().parents[0]
path_api_raw = project_root.joinpath("data", "raw")
path_api_processed = project_root.joinpath("data", "processed")

for label, location in (
    ("path_api_raw:", path_api_raw),
    ("path_api_processed:", path_api_processed),
):
    print(label, location)

path_api_raw: /home/marta/spotify-data-streaming-project/data/raw
path_api_processed: /home/marta/spotify-data-streaming-project/data/processed


## Análise dos **json** files das API calls

Quero analisar os dados dos ficheiros json, transformá-los em dataframes e fazer as transformações necessárias.

Available Markets

In [None]:
# Read available_markets.json and expose its "markets" list as a
# single-column Spark DataFrame.
path_markets = path_api_raw.joinpath("available_markets.json")

with path_markets.open(mode="r", encoding="utf-8") as f:
    data = json.load(f)

markets = data["markets"]

# One 1-tuple per entry so Spark builds exactly one column named "market".
market_rows = [(market_entry,) for market_entry in markets]
df_api_markets = spark.createDataFrame(market_rows, schema=["market"])

df_api_markets.show(n=10, truncate=False)
df_api_markets.printSchema()
print("N√∫mero total de markets:", df_api_markets.count())

Categories

In [None]:
# Read categories.json and load the objects under categories -> items into a
# Spark DataFrame (schema inferred by Spark).
path_categories = path_api_raw.joinpath("categories.json")

with path_categories.open(mode="r", encoding="utf-8") as f:
    data = json.load(f)

categories_payload = data["categories"]
items = categories_payload["items"]

df_api_categories = spark.createDataFrame(items)

df_api_categories.show(n=10, truncate=False)
df_api_categories.printSchema()
print("Total categories", df_api_categories.count())

Current Playback

In [None]:
# Pretty-print the raw current_playback.json payload to inspect its structure.
path_current_playback = path_api_raw / "current_playback.json"

if path_current_playback.exists() and path_current_playback.stat().st_size > 0:
    # Explicit UTF-8: without it open() falls back to the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(path_current_playback, "r", encoding="utf-8") as f:
        data = json.load(f)
    pprint.pprint(data, depth=10)
else:
    # A missing or zero-byte file is reported instead of raising, so the
    # notebook can still run top to bottom.
    print("Ficheiro current_playback.json inexistente ou vazio")

In [None]:
# Flatten current_playback.json (device, playback state, context and the
# playing item) into one flat dict per item and load the rows into a Spark
# DataFrame named df_api_playback.
path_current_playback = path_api_raw / "current_playback.json"

if path_current_playback.exists() and path_current_playback.stat().st_size > 0:
    with open(path_current_playback, "r", encoding="utf-8") as f:
        data = json.load(f)

    if data:
        # Root-level objects, repeated on every row.
        # `.get(key) or {}` is deliberate: `.get(key, {})` still returns None
        # when the key is present with a JSON null value, and the chained
        # lookups below would then raise AttributeError.
        device = data.get("device") or {}
        context = data.get("context") or {}
        context_external_urls = context.get("external_urls") or {}
        disallows = (data.get("actions") or {}).get("disallows") or {}

        all_flattened_items = []

        # "item" is normally a single object; a list is accepted too so a
        # hand-edited file with several items still works. Null and non-object
        # entries are dropped because every lookup below needs a dict.
        items_source = data.get("item")
        item_candidates = items_source if isinstance(items_source, list) else [items_source]
        items_to_process = [entry for entry in item_candidates if isinstance(entry, dict)]

        for item_track in items_to_process:

            album = item_track.get("album") or {}
            album_external_urls = album.get("external_urls") or {}
            artists = item_track.get("artists") or []

            # Nested lists are serialized to JSON strings so every column is a
            # plain scalar when Spark infers the schema.

            # 1. Album images
            album_images_data = [
                {"height": img.get("height"), "width": img.get("width"), "url": img.get("url")}
                for img in (album.get("images") or [])
            ]
            album_images_urls = json.dumps(album_images_data)

            # 2. Track artists (one JSON array per attribute)
            artists_external_urls = json.dumps([(a.get("external_urls") or {}).get("spotify") for a in artists])
            artists_hrefs = json.dumps([a.get("href") for a in artists])
            artists_ids = json.dumps([a.get("id") for a in artists])
            artists_names = json.dumps([a.get("name") for a in artists])
            artists_types = json.dumps([a.get("type") for a in artists])
            artists_uris = json.dumps([a.get("uri") for a in artists])

            flattened = {
                # Device fields
                "device_id": device.get("id"),
                "device_is_active": device.get("is_active"),
                "device_is_private_session": device.get("is_private_session"),
                "device_is_restricted": device.get("is_restricted"),
                "device_name": device.get("name"),
                "device_supports_volume": device.get("supports_volume"),
                "device_type": device.get("type"),
                "device_volume_percent": device.get("volume_percent"),

                # Playback state (root level)
                "shuffle_state": data.get("shuffle_state"),
                "smart_shuffle": data.get("smart_shuffle"),
                "repeat_state": data.get("repeat_state"),
                "is_playing": data.get("is_playing"),
                "timestamp": data.get("timestamp"),
                "context_external_urls": context_external_urls.get("spotify"),
                "context_href": context.get("href"),
                "context_type": context.get("type"),
                "context_uri": context.get("uri"),
                "progress_ms": data.get("progress_ms"),
                "currently_playing_type": data.get("currently_playing_type"),
                "actions_disallows_resuming": disallows.get("resuming"),

                # Item - album ("album_type" and "type" are distinct source keys,
                # hence the two differently named columns)
                "item_album_album_type": album.get("album_type"),
                "item_album_external_urls": album_external_urls.get("spotify"),
                "item_album_href": album.get("href"),
                "item_album_id": album.get("id"),
                "item_album_images_urls": album_images_urls,
                "item_album_name": album.get("name"),
                "item_album_release_date": album.get("release_date"),
                "item_album_release_date_precision": album.get("release_date_precision"),
                "item_album_total_tracks": album.get("total_tracks"),
                "item_album_type": album.get("type"),
                "item_album_uri": album.get("uri"),

                # Item - artists (JSON strings)
                "item_artists_external_urls": artists_external_urls,
                "item_artists_hrefs": artists_hrefs,
                "item_artists_ids": artists_ids,
                "item_artists_names": artists_names,
                "item_artists_types": artists_types,
                "item_artists_uris": artists_uris,

                # Item - track details
                "item_disc_number": item_track.get("disc_number"),
                "item_duration_ms": item_track.get("duration_ms"),
                "item_explicit": item_track.get("explicit"),
                "item_external_ids_isrc": (item_track.get("external_ids") or {}).get("isrc"),
                "item_external_urls_spotify": (item_track.get("external_urls") or {}).get("spotify"),
                "item_href": item_track.get("href"),
                "item_id": item_track.get("id"),
                "item_is_local": item_track.get("is_local"),
                "item_name": item_track.get("name"),
                "item_popularity": item_track.get("popularity"),
                # A missing or null preview URL becomes an empty string.
                "item_preview_url": item_track.get("preview_url") or "",
                "item_track_number": item_track.get("track_number"),
                "item_type": item_track.get("type"),
                "item_uri": item_track.get("uri"),
            }

            all_flattened_items.append(flattened)

        if not all_flattened_items:
            # Nothing to load: say so explicitly rather than letting
            # createDataFrame fail on an empty list.
            print("Nenhuma faixa a processar: 'item' nulo ou lista de items vazia.")
        else:
            try:
                df_api_playback = spark.createDataFrame(all_flattened_items)
                print("‚úÖ DataFrame criada com sucesso:")
                df_api_playback.show(truncate=50)
                df_api_playback.printSchema()
                print(f"Total rows: {df_api_playback.count()}")
                print(f"Total columns: {len(df_api_playback.columns)}")
            except NameError:
                # Raised when the `spark` session variable has not been created.
                print("‚ùå Erro: A vari√°vel 'spark' n√£o est√° definida. Certifique-se de que a sess√£o Spark est√° ativa.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark: {e}")

    else:
        print("‚ö†Ô∏è JSON vazio ou null")
else:
    print("‚ö†Ô∏è Ficheiro current_playback.json n√£o existe ou est√° vazio")

Current User Followed Artists

In [None]:
# Read current_user_followed_artists.json and load the objects under
# artists -> items into a Spark DataFrame (schema inferred by Spark).
path_api_followed_artists = path_api_raw.joinpath("current_user_followed_artists.json")

with path_api_followed_artists.open(mode="r", encoding="utf-8") as f:
    data = json.load(f)

artists_payload = data["artists"]
items = artists_payload["items"]

df_api_followed_artists = spark.createDataFrame(items)

df_api_followed_artists.show(n=10, truncate=False)
df_api_followed_artists.printSchema()
print("Total followed artists", df_api_followed_artists.count())

Current User Playing Track

In [None]:
# Pretty-print the raw current_user_playing_track.json payload to inspect its structure.
path_current_user_playing_track = path_api_raw / "current_user_playing_track.json"

if path_current_user_playing_track.exists() and path_current_user_playing_track.stat().st_size > 0:
    # Explicit UTF-8: without it open() falls back to the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(path_current_user_playing_track, "r", encoding="utf-8") as f:
        data = json.load(f)
    pprint.pprint(data, depth=10)
else:
    # A missing or zero-byte file is reported instead of raising, so the
    # notebook can still run top to bottom.
    print("Ficheiro current_user_playing_track.json inexistente ou vazio")

In [None]:
# Flatten current_user_playing_track.json (playback state, context, track,
# album and artists) into one flat dict per item and load the rows into a Spark
# DataFrame named df_api_playing_track.
path_current_user_playing_track = path_api_raw / "current_user_playing_track.json"

if path_current_user_playing_track.exists() and path_current_user_playing_track.stat().st_size > 0:
    with open(path_current_user_playing_track, "r", encoding="utf-8") as f:
        data = json.load(f)

    if data:
        # Root-level objects, repeated on every row.
        # `.get(key) or {}` is deliberate: `.get(key, {})` still returns None
        # when the key is present with a JSON null value, and the chained
        # lookups below would then raise AttributeError.
        context = data.get("context") or {}
        context_external_urls = context.get("external_urls") or {}
        disallows = (data.get("actions") or {}).get("disallows") or {}

        all_flattened_items = []

        # "item" is normally a single object; a list is accepted too. Null and
        # non-object entries are dropped because every lookup below needs a dict.
        items_source = data.get("item")
        item_candidates = items_source if isinstance(items_source, list) else [items_source]
        items_to_process = [entry for entry in item_candidates if isinstance(entry, dict)]

        if not items_to_process:
            print("‚ö†Ô∏è Item √© nulo ou a lista de items est√° vazia: Nenhuma faixa a processar.")
        else:
            for item_track in items_to_process:

                item_album = item_track.get("album") or {}

                # Nested lists are serialized to JSON strings so every column
                # is a plain scalar when Spark infers the schema.

                # 1. Album artists (one JSON array per attribute)
                item_album_artists = item_album.get("artists") or []
                item_album_artists_external_urls = json.dumps([(a.get("external_urls") or {}).get("spotify") for a in item_album_artists])
                item_album_artists_hrefs = json.dumps([a.get("href") for a in item_album_artists])
                item_album_artists_ids = json.dumps([a.get("id") for a in item_album_artists])
                item_album_artists_names = json.dumps([a.get("name") for a in item_album_artists])
                item_album_artists_types = json.dumps([a.get("type") for a in item_album_artists])
                item_album_artists_uris = json.dumps([a.get("uri") for a in item_album_artists])

                # 2. Album markets, images and scalar fields
                item_album_available_markets = json.dumps(item_album.get("available_markets") or [])
                item_album_external_urls = (item_album.get("external_urls") or {}).get("spotify")
                item_album_href = item_album.get("href")
                item_album_id = item_album.get("id")
                item_album_images_data = [
                    {"height": img.get("height"), "width": img.get("width"), "url": img.get("url")}
                    for img in (item_album.get("images") or [])
                ]
                item_album_images_urls = json.dumps(item_album_images_data)
                item_album_name = item_album.get("name")
                item_album_release_date = item_album.get("release_date")
                item_album_release_date_precision = item_album.get("release_date_precision")
                item_album_total_tracks = item_album.get("total_tracks")
                item_album_type = item_album.get("type")
                item_album_uri = item_album.get("uri")

                # 3. Track artists (one JSON array per attribute)
                item_artists = item_track.get("artists") or []
                item_artists_external_urls = json.dumps([(a.get("external_urls") or {}).get("spotify") for a in item_artists])
                item_artists_hrefs = json.dumps([a.get("href") for a in item_artists])
                item_artists_ids = json.dumps([a.get("id") for a in item_artists])
                item_artists_names = json.dumps([a.get("name") for a in item_artists])
                item_artists_types = json.dumps([a.get("type") for a in item_artists])
                item_artists_uris = json.dumps([a.get("uri") for a in item_artists])

                # 4. Track markets
                item_available_markets = json.dumps(item_track.get("available_markets") or [])

                flattened = {
                    # Playback state (root level), repeated for every track
                    "is_playing": data.get("is_playing"),
                    "timestamp": data.get("timestamp"),
                    "current_playing_type": data.get("currently_playing_type"),
                    "actions_resuming": disallows.get("resuming"),
                    "progress_ms": data.get("progress_ms"),

                    # Context, repeated for every track
                    "context_external_urls": context_external_urls.get("spotify"),
                    "context_href": context.get("href"),
                    "context_type": context.get("type"),
                    "context_uri": context.get("uri"),

                    # Track details
                    "disc_number": item_track.get("disc_number"),
                    "duration_ms": item_track.get("duration_ms"),
                    "explicit": item_track.get("explicit"),
                    "external_ids": (item_track.get("external_ids") or {}).get("isrc"),
                    "external_urls": (item_track.get("external_urls") or {}).get("spotify"),
                    "href": item_track.get("href"),
                    "id": item_track.get("id"),
                    "is_local": item_track.get("is_local"),
                    "name": item_track.get("name"),
                    "popularity": item_track.get("popularity"),
                    # A missing or null preview URL becomes an empty string.
                    "preview_url": item_track.get("preview_url") or "",
                    "track_number": item_track.get("track_number"),
                    "type": item_track.get("type"),
                    "uri": item_track.get("uri"),
                    "item_available_markets": item_available_markets,

                    # Album ("album_type" and "type" are distinct source keys,
                    # hence the two differently named columns)
                    "item_album_album_type": item_album.get("album_type"),
                    "item_album_external_urls": item_album_external_urls,
                    "item_album_href": item_album_href,
                    "item_album_id": item_album_id,
                    "item_album_images_urls": item_album_images_urls,
                    "item_album_name": item_album_name,
                    "item_album_release_date": item_album_release_date,
                    "item_album_release_date_precision": item_album_release_date_precision,
                    "item_album_total_tracks": item_album_total_tracks,
                    "item_album_type": item_album_type,
                    "item_album_uri": item_album_uri,
                    "item_album_available_markets": item_album_available_markets,

                    # Track artists (JSON strings)
                    "item_artists_external_urls": item_artists_external_urls,
                    "item_artists_hrefs": item_artists_hrefs,
                    "item_artists_ids": item_artists_ids,
                    "item_artists_names": item_artists_names,
                    "item_artists_types": item_artists_types,
                    "item_artists_uris": item_artists_uris,

                    # Album artists (JSON strings)
                    "item_album_artists_external_urls": item_album_artists_external_urls,
                    "item_album_artists_hrefs": item_album_artists_hrefs,
                    "item_album_artists_ids": item_album_artists_ids,
                    "item_album_artists_names": item_album_artists_names,
                    "item_album_artists_types": item_album_artists_types,
                    "item_album_artists_uris": item_album_artists_uris,
                }
                all_flattened_items.append(flattened)

            try:
                df_api_playing_track = spark.createDataFrame(all_flattened_items)
                print("‚úÖ DataFrame criada com sucesso:")
                df_api_playing_track.show(truncate=50)
                df_api_playing_track.printSchema()
                print(f"Total rows: {df_api_playing_track.count()}")
                print(f"Total columns: {len(df_api_playing_track.columns)}")
            except NameError:
                # Raised when the `spark` session variable has not been created.
                print("‚ùå Erro: A vari√°vel 'spark' n√£o est√° definida. Certifique-se de que a sess√£o Spark est√° ativa.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark: {e}")

    else:
        print("‚ö†Ô∏è JSON vazio ou null")
else:
    print("‚ö†Ô∏è Ficheiro current_user_playing_track.json n√£o existe ou est√° vazio")

Current User Playlists

In [None]:
# Pretty-print the raw current_user_playlists.json payload to inspect its structure.
path_current_user_playlists = path_api_raw / "current_user_playlists.json"

if path_current_user_playlists.exists() and path_current_user_playlists.stat().st_size > 0:
    # Explicit UTF-8: without it open() falls back to the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(path_current_user_playlists, "r", encoding="utf-8") as f:
        data = json.load(f)
    pprint.pprint(data, depth=10)
else:
    # A missing or zero-byte file is reported instead of raising, so the
    # notebook can still run top to bottom.
    print("Ficheiro current_user_playlists.json inexistente ou vazio")

In [None]:
# Flatten current_user_playlists.json into one flat dict per playlist, load the
# rows into a Spark DataFrame named df_api_playlists, and compare the row count
# with the root-level "total" field.

# Defined here as well so this cell does not depend on another cell having run.
path_current_user_playlists = path_api_raw / "current_user_playlists.json"

if path_current_user_playlists.exists() and path_current_user_playlists.stat().st_size > 0:
    with open(path_current_user_playlists, "r", encoding="utf-8") as f:
        data = json.load(f)

    if data:

        all_flattened_playlists = []
        # Null and non-object entries are dropped because every lookup below
        # needs a dict; `or []` also covers "items" being present but null.
        items = [entry for entry in (data.get("items") or []) if isinstance(entry, dict)]

        if not items:
            print("‚ö†Ô∏è A lista de playlists ('items') est√° vazia.")
        else:

            for item in items:
                # `.get(key) or {}` rather than `.get(key, {})`: the latter
                # still yields None when the key holds a JSON null.
                item_external_urls = item.get("external_urls") or {}
                owner = item.get("owner") or {}
                owner_external_urls = owner.get("external_urls") or {}
                tracks = item.get("tracks") or {}

                # Images are serialized to a JSON string so the column is a
                # plain scalar when Spark infers the schema.
                item_images_data = [
                    {"height": img.get("height"), "width": img.get("width"), "url": img.get("url")}
                    for img in (item.get("images") or [])
                ]
                item_images_urls = json.dumps(item_images_data)

                flattened = {
                    # Playlist fields
                    "item_collaborative": item.get("collaborative"),
                    "item_description": item.get("description"),
                    "item_external_urls_spotify": item_external_urls.get("spotify"),
                    "item_href": item.get("href"),
                    "item_id": item.get("id"),
                    "item_name": item.get("name"),
                    "item_public": item.get("public"),
                    "item_snapshot_id": item.get("snapshot_id"),
                    "item_type": item.get("type"),
                    "item_uri": item.get("uri"),
                    "item_images_urls": item_images_urls,

                    # Owner fields
                    "item_owner_display_name": owner.get("display_name"),
                    "item_owner_external_urls_spotify": owner_external_urls.get("spotify"),
                    "item_owner_href": owner.get("href"),
                    "item_owner_id": owner.get("id"),
                    "item_owner_type": owner.get("type"),
                    "item_owner_uri": owner.get("uri"),

                    # Tracks summary
                    "item_tracks_href": tracks.get("href"),
                    "item_tracks_total": tracks.get("total"),
                }
                all_flattened_playlists.append(flattened)

            try:
                df_api_playlists = spark.createDataFrame(all_flattened_playlists)

                print("‚úÖ DataFrame Spark criada com sucesso:")
                df_api_playlists.show(truncate=50)
                df_api_playlists.printSchema()

                # count() is a Spark action; run it once and reuse the result.
                df_count = df_api_playlists.count()
                print("Total rows:", df_count)
                print("Total columns:", len(df_api_playlists.columns))

                # Compare the loaded row count with the payload's "total".
                json_total = data.get("total")

                if json_total is not None and json_total == df_count:
                    print(f"‚úÖ O 'total' do JSON de raiz ({json_total}) √© igual ao n√∫mero de linhas do DataFrame ({df_count}).")
                elif json_total is not None and json_total > df_count:
                    print(f"‚ö†Ô∏è O 'total' do JSON de raiz ({json_total}) √© MAIOR do que o n√∫mero de linhas do DataFrame ({df_count}). Isto √© **esperado** em resultados paginados.")
                else:
                    # Reached when "total" is missing or smaller than the row count.
                    print(f"‚ùå Falha na verifica√ß√£o. JSON total: {json_total}, DF count: {df_count}.")

            except NameError:
                # Raised when the `spark` session variable has not been created.
                print("‚ùå Erro: A vari√°vel 'spark' n√£o est√° definida. Certifique-se de que a sess√£o Spark est√° ativa.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark: {e}")

    else:
        print("‚ö†Ô∏è JSON vazio ou null")
else:
    print("‚ö†Ô∏è Ficheiro current_user_playlists.json n√£o existe ou est√° vazio")

Current User Recently Played

In [None]:
# Flatten current_user_recently_played.json into one flat dict per play-history
# item (paging metadata, first album artist, first track artist, track and
# context fields) and load the rows into a Spark DataFrame named
# df_api_recently_played.
path_current_user_recently_played = path_api_raw / "current_user_recently_played.json"

if not path_current_user_recently_played.exists():
    print("‚ö†Ô∏è Ficheiro current_user_recently_played.json n√£o existe.")
else:
    try:
        data: Dict[str, Any] = {}

        with open(path_current_user_recently_played, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Anything other than a JSON object (null, list, scalar) is treated as empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data:
            print("‚ö†Ô∏è O ficheiro JSON continha 'null' ou est√° vazio ap√≥s a leitura.")
            items = []
            # Named next_url (not `next`) so the builtin next() is not shadowed
            # for the rest of the kernel session.
            next_url = href = limit = None
        else:
            # Root-level paging metadata, repeated on every row.
            items = data.get("items") or []
            next_url = data.get("next")
            href = data.get("href")
            limit = data.get("limit")

        # `or {}` so a null or missing "cursors" yields None values, never a dict.
        cursors: Dict[str, Any] = data.get("cursors") or {}

        if not items:
            print("‚ö†Ô∏è A lista de faixas reproduzidas recentemente ('items') est√° vazia.")
        else:
            all_flattened_tracks: List[Dict[str, Any]] = []

            for item in items:

                if not isinstance(item, dict):
                    continue

                # `.get(key) or {}` keeps these as dicts even when the key holds null.
                track: Dict[str, Any] = item.get("track") or {}
                album: Dict[str, Any] = track.get("album") or {}
                context: Dict[str, Any] = item.get("context") or {}

                # Only the first album artist and first track artist are kept.
                album_artists_list: List[Dict[str, Any]] = album.get("artists") or []
                first_album_artist: Dict[str, Any] = album_artists_list[0] if album_artists_list else {}

                track_artists_list: List[Dict[str, Any]] = track.get("artists") or []
                first_track_artist: Dict[str, Any] = track_artists_list[0] if track_artists_list else {}

                # Images are serialized to a JSON string so the column is a
                # plain scalar when Spark infers the schema.
                album_images_list = album.get("images") or []
                album_images_data = [
                    {"height": img.get("height"), "width": img.get("width"), "url": img.get("url")}
                    for img in album_images_list
                ]
                album_images = json.dumps(album_images_data)

                flattened = {
                    # The column keeps the name "next"; only the Python variable was renamed.
                    "next": next_url,
                    "href": href,
                    "limit": limit,
                    "cursors_after": cursors.get("after"),
                    "cursors_before": cursors.get("before"),

                    # album["album_type"] gets its own column. Previously it
                    # shared the key "album_type" with album["type"] below and
                    # was silently overwritten by it.
                    "album_album_type": album.get("album_type"),
                    "album_artists_external_urls": (first_album_artist.get("external_urls") or {}).get("spotify"),
                    "album_artists_href": first_album_artist.get("href"),
                    "album_artists_id": first_album_artist.get("id"),
                    "album_artists_name": first_album_artist.get("name"),
                    "album_artists_type": first_album_artist.get("type"),
                    "album_artists_uri": first_album_artist.get("uri"),

                    "album_available_markets": json.dumps(album.get("available_markets") or []),

                    "album_external_urls": (album.get("external_urls") or {}).get("spotify"),
                    "album_href": album.get("href"),
                    "album_id": album.get("id"),

                    "album_images": album_images,
                    "album_name": album.get("name"),
                    "album_release_date": album.get("release_date"),
                    "album_release_date_precision": album.get("release_date_precision"),
                    "album_total_tracks": album.get("total_tracks"),
                    # Unchanged: "album_type" still carries album["type"].
                    "album_type": album.get("type"),
                    "album_uri": album.get("uri"),

                    "artists_external_urls": (first_track_artist.get("external_urls") or {}).get("spotify"),
                    "artists_href": first_track_artist.get("href"),
                    "artists_id": first_track_artist.get("id"),
                    "artists_name": first_track_artist.get("name"),
                    "artists_type": first_track_artist.get("type"),
                    "artists_uri": first_track_artist.get("uri"),

                    "track_available_markets": json.dumps(track.get("available_markets") or []),
                    "track_disc_number": track.get("disc_number"),
                    "track_duration_ms": track.get("duration_ms"),
                    "track_explicit": track.get("explicit"),
                    # `(x or {}).get(...)` instead of `x and x.get(...)`: the
                    # latter returns the empty dict itself when x == {}.
                    "track_external_ids_isrc": (track.get("external_ids") or {}).get("isrc"),
                    "track_external_urls": (track.get("external_urls") or {}).get("spotify"),
                    "track_href": track.get("href"),
                    "track_id": track.get("id"),
                    "track_is_local": track.get("is_local"),
                    "track_name": track.get("name"),
                    "track_popularity": track.get("popularity"),
                    # A missing or null preview URL becomes an empty string.
                    "track_preview_url": track.get("preview_url") or "",
                    "track_number": track.get("track_number"),
                    "track_type": track.get("type"),
                    "track_uri": track.get("uri"),

                    "item_played_at": item.get("played_at"),
                    "item_context_external_urls": (context.get("external_urls") or {}).get("spotify"),
                    "item_context_href": context.get("href"),
                    "item_context_type": context.get("type"),
                    "item_context_uri": context.get("uri"),
                }

                all_flattened_tracks.append(flattened)

            try:
                df_api_recently_played = spark.createDataFrame(all_flattened_tracks)

                print("‚úÖ DataFrame Spark criada com sucesso:")
                df_api_recently_played.show(5, truncate=False)
                df_api_recently_played.printSchema()
                print(f"Total recently played tracks: {df_api_recently_played.count()}")

            except NameError:
                # Raised when the `spark` session variable has not been created.
                print("‚ùå ERRO: Vari√°vel 'spark' n√£o definida. Por favor, inicialize a sua SparkSession.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark (verifique o esquema): {e}")

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

Saved Albums

In [None]:
# Helper that normalizes a value to either None or a string.
def safe_string_cast(value: Any) -> Optional[str]:
    """Return ``value`` as a string, keeping ``None`` as ``None``.

    Args:
        value: Any value, typically taken from a decoded JSON payload.

    Returns:
        ``None`` when ``value`` is ``None``; the ``json.dumps`` serialization
        when ``value`` is a ``dict`` or ``list``; ``str(value)`` for every
        other type (for example ``42`` -> ``"42"``).
    """
    if value is None:
        return None
    if isinstance(value, (dict, list)):
        # Containers are serialized as JSON text rather than their Python repr.
        return json.dumps(value)
    # None was handled above, so no further None check is needed here.
    return str(value)


# 1. EXPLICIT DATAFRAME SCHEMA
# Declaring every column up front resolves the CANNOT_DETERMINE_TYPE error.
# Each entry is (column name, Spark type class); every column is nullable.
saved_albums_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        # Root-level fields
        ("href", StringType),
        ("limit", LongType),
        ("next", StringType),
        ("offset", LongType),
        ("total", LongType),

        # Item-level fields
        ("item_added_at", StringType),

        # Album fields
        ("album_id", StringType),
        ("album_name", StringType),
        ("album_type", StringType),
        ("album_total_tracks_album", LongType),
        ("album_release_date", StringType),
        ("album_release_date_precision", StringType),
        ("album_label", StringType),
        ("album_popularity", LongType),
        ("album_uri", StringType),
        ("album_external_urls", StringType),
        ("album_href", StringType),
        ("album_artist_id", StringType),
        ("album_artist_name", StringType),

        # Complex album fields (stored as JSON strings)
        ("album_artists_full_json", StringType),
        ("album_images_json", StringType),
        ("album_available_markets_json", StringType),
        ("album_genres_json", StringType),
        ("album_copyrights_json", StringType),
        ("album_external_ids_json", StringType),

        # Track fields
        ("track_id", StringType),
        ("track_name", StringType),
        ("track_uri", StringType),
        ("track_number", LongType),
        ("track_disc_number", LongType),
        ("track_duration_ms", LongType),
        ("track_explicit", BooleanType),   # boolean field
        ("track_is_local", BooleanType),   # boolean field
        ("track_preview_url", StringType),
        ("track_popularity", LongType),
        ("track_artist_id", StringType),
        ("track_artist_name", StringType),
        ("track_artist_uri", StringType),

        # Complex track fields (stored as JSON strings)
        ("track_external_urls", StringType),
        ("track_external_ids_isrc", StringType),
        ("track_artists_full_json", StringType),
        ("track_available_markets_json", StringType),
    )
])

In [None]:
# Flatten the saved-albums API response into one row per (album, track) pair
# and load the rows into `df_api_saved_albums` using `saved_albums_schema`.
path_current_user_saved_albums = path_api_raw / "current_user_saved_albums.json"

if not path_current_user_saved_albums.exists():
    print("‚ö†Ô∏è Ficheiro current_user_saved_albums.json n√£o existe.")
else:
    try:
        data: Dict[str, Any] = {}

        with open(path_current_user_saved_albums, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Anything that is not a JSON object (null, list, scalar) is treated as empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data:
            print("‚ö†Ô∏è O ficheiro JSON continha 'null' ou est√° vazio ap√≥s a leitura.")
            items = []
        else:
            # Pagination / root-level fields
            items: List[Dict[str, Any]] = data.get("items", [])

            href = data.get("href", None)
            limit = data.get("limit", None)
            next_url = data.get("next", None)
            offset = data.get("offset", None)
            total = data.get("total", None)

        if not items:
            print("‚ö†Ô∏è A lista de √°lbuns guardados ('items') est√° vazia.")
        else:
            all_flattened_album_tracks: List[Dict[str, Any]] = []

            # === MAIN LOOP: albums ===
            for item in items:

                if not isinstance(item, dict):
                    continue

                album: Dict[str, Any] = item.get("album") or {}
                added_at = item.get("added_at", None)

                # `or {}` / `or []` also covers keys that are present but null.
                album_external_urls: Dict[str, Any] = album.get("external_urls") or {}
                album_artists_list: List[Dict[str, Any]] = album.get("artists") or []
                first_album_artist: Dict[str, Any] = (
                    album_artists_list[0] if album_artists_list else None
                ) or {}

                # Album metadata block (repeated on every track row of the album)
                album_metadata = {
                    # Root-level fields
                    "href": safe_string_cast(href),
                    "limit": limit,
                    "next": safe_string_cast(next_url),
                    "offset": offset,
                    "total": total,
                    "item_added_at": safe_string_cast(added_at),

                    # Simple album fields
                    "album_id": safe_string_cast(album.get("id")),
                    "album_name": safe_string_cast(album.get("name")),
                    "album_type": safe_string_cast(album.get("album_type")),
                    "album_total_tracks_album": album.get("total_tracks", None),
                    "album_release_date": safe_string_cast(album.get("release_date")),
                    "album_release_date_precision": safe_string_cast(album.get("release_date_precision")),
                    "album_label": safe_string_cast(album.get("label")),
                    "album_popularity": album.get("popularity", None),
                    "album_uri": safe_string_cast(album.get("uri")),
                    "album_external_urls": safe_string_cast(album_external_urls.get("spotify")),
                    "album_href": safe_string_cast(album.get("href")),

                    # FIRST album artist
                    "album_artist_id": safe_string_cast(first_album_artist.get("id")),
                    "album_artist_name": safe_string_cast(first_album_artist.get("name")),

                    # Complex album structures, serialised to JSON strings
                    "album_artists_full_json": json.dumps(album_artists_list),
                    "album_images_json": json.dumps(album.get("images") or []),
                    "album_available_markets_json": json.dumps(album.get("available_markets") or []),
                    "album_genres_json": json.dumps(album.get("genres") or []),
                    "album_copyrights_json": json.dumps(album.get("copyrights") or []),
                    "album_external_ids_json": safe_string_cast(album.get("external_ids")),
                }

                # === TRACK EXTRACTION ===
                album_tracks_data: Dict[str, Any] = album.get("tracks") or {}
                album_tracks_items: List[Dict[str, Any]] = album_tracks_data.get("items") or []

                # === SECONDARY LOOP: tracks ===
                for track in album_tracks_items:

                    # One malformed track entry must not abort the whole file.
                    if not isinstance(track, dict):
                        continue

                    # Numeric / boolean values are passed through untouched:
                    # the schema declares LongType / BooleanType and accepts None.
                    track_external_ids_data = track.get("external_ids") or {}
                    track_external_urls = track.get("external_urls") or {}

                    # Safe "first artist" lookup: an empty or null artist list
                    # yields an empty dict instead of IndexError / TypeError.
                    track_artists_list = track.get("artists") or []
                    first_track_artist = (
                        track_artists_list[0] if track_artists_list else None
                    ) or {}

                    track_metadata = {
                        # Simple track fields
                        "track_id": safe_string_cast(track.get("id")),
                        "track_name": safe_string_cast(track.get("name")),
                        "track_uri": safe_string_cast(track.get("uri")),
                        "track_number": track.get("track_number", None),
                        "track_disc_number": track.get("disc_number", None),
                        "track_duration_ms": track.get("duration_ms", None),
                        "track_explicit": track.get("explicit", None),
                        "track_is_local": track.get("is_local", None),
                        "track_preview_url": safe_string_cast(track.get("preview_url")),
                        "track_popularity": track.get("popularity", None),

                        # FIRST track artist
                        "track_artist_id": safe_string_cast(first_track_artist.get("id")),
                        "track_artist_name": safe_string_cast(first_track_artist.get("name")),
                        # Declared in saved_albums_schema; previously never filled.
                        "track_artist_uri": safe_string_cast(first_track_artist.get("uri")),

                        # External track information
                        "track_external_urls": safe_string_cast(track_external_urls.get("spotify")),
                        "track_external_ids_isrc": safe_string_cast(track_external_ids_data.get("isrc")),

                        # Complex track structures, serialised to JSON strings
                        "track_artists_full_json": safe_string_cast(track.get("artists")),
                        "track_available_markets_json": safe_string_cast(track.get("available_markets")),
                    }

                    # Merge the (duplicated) album metadata with the track metadata.
                    flattened_record = {**album_metadata, **track_metadata}
                    all_flattened_album_tracks.append(flattened_record)

            try:
                # 2. BUILD THE DATAFRAME WITH THE EXPLICIT SCHEMA
                df_api_saved_albums = spark.createDataFrame(all_flattened_album_tracks, schema=saved_albums_schema)

                print("‚úÖ DataFrame Spark criada com sucesso para √°lbuns/faixas (Esquema For√ßado):")
                print(f"Total de registos (Faixas): {df_api_saved_albums.count()}")
                df_api_saved_albums.show(5, truncate=False)
                df_api_saved_albums.printSchema()

            except NameError:
                print("‚ùå ERRO: Vari√°vel 'spark' n√£o definida. Por favor, inicialize a sua SparkSession.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark (Erro Inesperado ap√≥s For√ßar Esquema): {e}")

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

Saved Tracks

In [None]:
# 1. EXPLICIT DATAFRAME SCHEMA
# Each entry is (column name, Spark type class); every column is nullable.
saved_tracks_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        # Root-level (pagination) fields
        ("href", StringType),
        ("limit", LongType),
        ("next", StringType),
        ("offset", LongType),
        ("total", LongType),

        # Item-level fields
        ("item_added_at", StringType),

        # Track fields (simple)
        ("track_id", StringType),
        ("track_name", StringType),
        ("track_type", StringType),
        ("track_uri", StringType),
        ("track_disc_number", LongType),
        ("track_duration_ms", LongType),
        ("track_explicit", BooleanType),
        ("track_is_local", BooleanType),
        ("track_popularity", LongType),
        ("track_preview_url", StringType),
        ("track_number", LongType),

        # Track fields (complex)
        ("track_external_urls", StringType),
        ("track_href", StringType),
        ("track_external_ids_json", StringType),       # whole external_ids dict, serialised
        ("track_artists_full_json", StringType),       # full list of track artists
        ("track_available_markets_json", StringType),  # list of markets

        # FIRST track artist (flattened)
        ("track_artist_id", StringType),
        ("track_artist_name", StringType),
        ("track_artist_uri", StringType),

        # Album fields (track.album, simple)
        ("album_id", StringType),
        ("album_name", StringType),
        ("album_type", StringType),
        ("album_release_date", StringType),
        ("album_release_date_precision", StringType),
        ("album_total_tracks", LongType),
        ("album_uri", StringType),

        # Album fields (track.album, complex)
        ("album_external_urls", StringType),
        ("album_href", StringType),
        ("album_images_json", StringType),
        ("album_artists_full_json", StringType),
        ("album_available_markets_json", StringType),
    )
])

In [None]:
# Flatten the saved-tracks API response into one row per saved track and load
# the rows into `df_api_saved_tracks` using `saved_tracks_schema`.
path_current_user_saved_tracks = path_api_raw / "current_user_saved_tracks.json"

if not path_current_user_saved_tracks.exists():
    print("‚ö†Ô∏è Ficheiro current_user_saved_tracks.json n√£o existe.")
else:
    try:
        data: Dict[str, Any] = {}

        with open(path_current_user_saved_tracks, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Anything that is not a JSON object (null, list, scalar) is treated as empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data:
            print("‚ö†Ô∏è O ficheiro JSON continha 'null' ou est√° vazio ap√≥s a leitura.")
            items = []
        else:
            # Pagination / root-level fields
            items: List[Dict[str, Any]] = data.get("items", [])

            href = data.get("href", None)
            limit = data.get("limit", None)
            next_url = data.get("next", None)
            offset = data.get("offset", None)
            total = data.get("total", None)

        if not items:
            print("‚ö†Ô∏è A lista de faixas guardadas ('items') est√° vazia.")
        else:
            all_flattened_tracks: List[Dict[str, Any]] = []

            # === MAIN LOOP: saved tracks ===
            for item in items:

                if not isinstance(item, dict):
                    continue

                # Top-level item fields
                added_at = item.get("added_at", None)
                track: Dict[str, Any] = item.get("track") or {}

                # Nested structures; `or {}` / `or []` also covers keys that are
                # present but explicitly null, which a `.get(key, default)` does not.
                album: Dict[str, Any] = track.get("album") or {}
                track_external_urls: Dict[str, Any] = track.get("external_urls") or {}
                album_external_urls: Dict[str, Any] = album.get("external_urls") or {}
                track_artists_list: List[Dict[str, Any]] = track.get("artists") or []

                # First track artist (for flattening); a null entry becomes {}.
                first_track_artist: Dict[str, Any] = (
                    track_artists_list[0] if track_artists_list else None
                ) or {}

                flattened_record = {
                    # Root-level fields
                    "href": safe_string_cast(href),
                    "limit": limit,
                    "next": safe_string_cast(next_url),
                    "offset": offset,
                    "total": total,

                    # Item-level fields
                    "item_added_at": safe_string_cast(added_at),

                    # Track fields
                    "track_id": safe_string_cast(track.get("id")),
                    "track_name": safe_string_cast(track.get("name")),
                    "track_type": safe_string_cast(track.get("type")),
                    "track_uri": safe_string_cast(track.get("uri")),
                    "track_disc_number": track.get("disc_number", None),
                    "track_duration_ms": track.get("duration_ms", None),
                    "track_explicit": track.get("explicit", None),
                    "track_is_local": track.get("is_local", None),
                    "track_popularity": track.get("popularity", None),
                    "track_preview_url": safe_string_cast(track.get("preview_url")),
                    "track_number": track.get("track_number", None),

                    # Track fields (complex / URLs), serialised to strings
                    "track_external_urls": safe_string_cast(track_external_urls.get("spotify")),
                    "track_href": safe_string_cast(track.get("href")),
                    "track_external_ids_json": safe_string_cast(track.get("external_ids")),
                    "track_artists_full_json": safe_string_cast(track_artists_list),
                    "track_available_markets_json": safe_string_cast(track.get("available_markets")),

                    # FIRST track artist (flattened)
                    "track_artist_id": safe_string_cast(first_track_artist.get("id")),
                    "track_artist_name": safe_string_cast(first_track_artist.get("name")),
                    "track_artist_uri": safe_string_cast(first_track_artist.get("uri")),

                    # Album fields (track.album, simple)
                    "album_id": safe_string_cast(album.get("id")),
                    "album_name": safe_string_cast(album.get("name")),
                    "album_type": safe_string_cast(album.get("album_type")),
                    "album_release_date": safe_string_cast(album.get("release_date")),
                    "album_release_date_precision": safe_string_cast(album.get("release_date_precision")),
                    "album_total_tracks": album.get("total_tracks", None),
                    "album_uri": safe_string_cast(album.get("uri")),

                    # Album fields (track.album, complex / URLs)
                    "album_external_urls": safe_string_cast(album_external_urls.get("spotify")),
                    "album_href": safe_string_cast(album.get("href")),
                    "album_images_json": safe_string_cast(album.get("images")),
                    "album_artists_full_json": safe_string_cast(album.get("artists")),
                    "album_available_markets_json": safe_string_cast(album.get("available_markets")),
                }

                all_flattened_tracks.append(flattened_record)

            try:
                # 2. BUILD THE DATAFRAME WITH THE EXPLICIT SCHEMA
                df_api_saved_tracks = spark.createDataFrame(all_flattened_tracks, schema=saved_tracks_schema)

                print("‚úÖ DataFrame Spark criada com sucesso para faixas guardadas (Esquema For√ßado):")
                print(f"Total de registos (Faixas): {df_api_saved_tracks.count()}")
                df_api_saved_tracks.show(5, truncate=False)
                df_api_saved_tracks.printSchema()

            except NameError:
                print("‚ùå ERRO: Vari√°vel 'spark' n√£o definida. Por favor, inicialize a sua SparkSession.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark (Erro Inesperado ap√≥s For√ßar Esquema): {e}")

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

Top Artists

In [None]:
# Load the raw "top artists" API response and build a DataFrame straight from
# the dictionaries under "items" (no explicit schema is passed here).
# NOTE(review): unlike the other API cells in this notebook, this one has no
# file-existence check or try/except, so a missing file or a missing "items"
# key raises instead of printing a warning — confirm that is intended.
path_current_user_top_artists = path_api_raw / "current_user_top_artists.json"

with path_current_user_top_artists.open(mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

items = data["items"]
df_api_top_artists = spark.createDataFrame(items)

df_api_top_artists.show(n=5, truncate=False)
df_api_top_artists.printSchema()
print("Total top artists", df_api_top_artists.count())

Top Tracks

In [None]:
# 1. EXPLICIT DATAFRAME SCHEMA
# Each entry is (column name, Spark type class); every column is nullable.
top_tracks_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        # Root-level (pagination) fields
        ("href", StringType),
        ("limit", LongType),
        ("next", StringType),
        ("offset", LongType),
        ("total", LongType),

        # Track fields (simple)
        ("track_id", StringType),
        ("track_name", StringType),
        ("track_type", StringType),
        ("track_uri", StringType),
        ("track_disc_number", LongType),
        ("track_duration_ms", LongType),
        ("track_explicit", BooleanType),
        ("track_is_local", BooleanType),
        ("track_popularity", LongType),
        ("track_preview_url", StringType),
        ("track_number", LongType),

        # Track fields (complex)
        ("track_external_urls", StringType),
        ("track_href", StringType),
        ("track_external_ids_json", StringType),       # whole external_ids dict, serialised
        ("track_artists_full_json", StringType),       # full list of track artists
        ("track_available_markets_json", StringType),  # list of markets

        # FIRST track artist (flattened)
        ("track_artist_id", StringType),
        ("track_artist_name", StringType),
        ("track_artist_uri", StringType),

        # Album fields (track.album, simple)
        ("album_id", StringType),
        ("album_name", StringType),
        ("album_type", StringType),
        ("album_release_date", StringType),
        ("album_release_date_precision", StringType),
        ("album_total_tracks", LongType),
        ("album_uri", StringType),

        # Album fields (track.album, complex)
        ("album_external_urls", StringType),
        ("album_href", StringType),
        ("album_images_json", StringType),
        ("album_artists_full_json", StringType),
        ("album_available_markets_json", StringType),
    )
])

In [None]:
# Flatten the top-tracks API response into one row per track and load the rows
# into `df_api_top_tracks` using `top_tracks_schema`.
path_current_user_top_tracks = path_api_raw / "current_user_top_tracks.json"

if not path_current_user_top_tracks.exists():
    print("‚ö†Ô∏è Ficheiro current_user_top_tracks.json n√£o existe.")
else:
    try:
        data: Dict[str, Any] = {}

        with open(path_current_user_top_tracks, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Anything that is not a JSON object (null, list, scalar) is treated as empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data:
            print("‚ö†Ô∏è O ficheiro JSON continha 'null' ou est√° vazio ap√≥s a leitura.")
            items = []
        else:
            # Pagination / root-level fields
            items: List[Dict[str, Any]] = data.get("items", [])

            href = data.get("href", None)
            limit = data.get("limit", None)
            next_url = data.get("next", None)
            offset = data.get("offset", None)
            total = data.get("total", None)

        if not items:
            print("‚ö†Ô∏è A lista de faixas de topo ('items') est√° vazia.")
        else:
            all_flattened_tracks: List[Dict[str, Any]] = []

            # === MAIN LOOP: top tracks (each item is the track itself) ===
            for track in items:

                if not isinstance(track, dict):
                    continue

                # Nested structures; `or {}` / `or []` also covers keys that are
                # present but explicitly null, which a `.get(key, default)` does not.
                album: Dict[str, Any] = track.get("album") or {}
                track_external_urls: Dict[str, Any] = track.get("external_urls") or {}
                album_external_urls: Dict[str, Any] = album.get("external_urls") or {}
                track_artists_list: List[Dict[str, Any]] = track.get("artists") or []

                # First track artist (for flattening); a null entry becomes {}.
                first_track_artist: Dict[str, Any] = (
                    track_artists_list[0] if track_artists_list else None
                ) or {}

                flattened_record = {
                    # Root-level fields
                    "href": safe_string_cast(href),
                    "limit": limit,
                    "next": safe_string_cast(next_url),
                    "offset": offset,
                    "total": total,

                    # Track fields
                    "track_id": safe_string_cast(track.get("id")),
                    "track_name": safe_string_cast(track.get("name")),
                    "track_type": safe_string_cast(track.get("type")),
                    "track_uri": safe_string_cast(track.get("uri")),
                    "track_disc_number": track.get("disc_number", None),
                    "track_duration_ms": track.get("duration_ms", None),
                    "track_explicit": track.get("explicit", None),
                    "track_is_local": track.get("is_local", None),
                    "track_popularity": track.get("popularity", None),
                    "track_preview_url": safe_string_cast(track.get("preview_url")),
                    "track_number": track.get("track_number", None),

                    # Track fields (complex / URLs), serialised to strings
                    "track_external_urls": safe_string_cast(track_external_urls.get("spotify")),
                    "track_href": safe_string_cast(track.get("href")),
                    "track_external_ids_json": safe_string_cast(track.get("external_ids")),
                    "track_artists_full_json": safe_string_cast(track_artists_list),
                    "track_available_markets_json": safe_string_cast(track.get("available_markets")),

                    # FIRST track artist (flattened)
                    "track_artist_id": safe_string_cast(first_track_artist.get("id")),
                    "track_artist_name": safe_string_cast(first_track_artist.get("name")),
                    "track_artist_uri": safe_string_cast(first_track_artist.get("uri")),

                    # Album fields (track.album, simple)
                    "album_id": safe_string_cast(album.get("id")),
                    "album_name": safe_string_cast(album.get("name")),
                    "album_type": safe_string_cast(album.get("album_type")),
                    "album_release_date": safe_string_cast(album.get("release_date")),
                    "album_release_date_precision": safe_string_cast(album.get("release_date_precision")),
                    "album_total_tracks": album.get("total_tracks", None),
                    "album_uri": safe_string_cast(album.get("uri")),

                    # Album fields (track.album, complex / URLs)
                    "album_external_urls": safe_string_cast(album_external_urls.get("spotify")),
                    "album_href": safe_string_cast(album.get("href")),
                    "album_images_json": safe_string_cast(album.get("images")),
                    "album_artists_full_json": safe_string_cast(album.get("artists")),
                    "album_available_markets_json": safe_string_cast(album.get("available_markets")),
                }

                all_flattened_tracks.append(flattened_record)

            try:
                # 2. BUILD THE DATAFRAME WITH THE EXPLICIT SCHEMA
                df_api_top_tracks = spark.createDataFrame(all_flattened_tracks, schema=top_tracks_schema)

                print("‚úÖ DataFrame Spark criada com sucesso para faixas de topo (Esquema For√ßado):")
                print(f"Total de registos (Faixas): {df_api_top_tracks.count()}")
                df_api_top_tracks.show(5, truncate=False)
                df_api_top_tracks.printSchema()

            except NameError:
                print("‚ùå ERRO: Vari√°vel 'spark' n√£o definida. Por favor, inicialize a sua SparkSession.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark (Erro Inesperado ap√≥s For√ßar Esquema): {e}")

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

Current User

In [None]:
# Flatten the current-user profile response into a single-row DataFrame
# (`df_api_current_user`).
path_current_user = path_api_raw / "current_user.json"

# Explicit schema, following the approach of the other API cells: with a single
# row, any None value or an empty "images" list leaves nothing to infer a type from.
# NOTE(review): columns are listed alphabetically, which is believed to match
# the order schema inference produced from a dict row — confirm if any
# downstream cell selects columns by position.
current_user_schema = StructType([
    StructField("display_name", StringType(), True),
    StructField("external_urls", StringType(), True),
    StructField("followers_total", LongType(), True),
    StructField("href", StringType(), True),
    StructField("images_json", StringType(), True),
    StructField("type", StringType(), True),
    StructField("uri", StringType(), True),
    StructField("user_id", StringType(), True),
])

if not path_current_user.exists():
    print("‚ö†Ô∏è Ficheiro current_user.json n√£o existe.")
else:
    try:
        data: Dict[str, Any] = {}

        with open(path_current_user, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Anything that is not a JSON object (null, list, scalar) is treated as empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data:
            print("‚ö†Ô∏è O ficheiro JSON continha 'null' ou est√° vazio ap√≥s a leitura.")
        else:
            # `or {}` covers both a missing key and an explicit null, either of
            # which previously raised AttributeError on the chained `.get`.
            external_urls = data.get("external_urls") or {}
            followers = data.get("followers") or {}

            # The dictionary that represents the single row.
            data = {
                "display_name": data.get("display_name", None),
                "external_urls": external_urls.get("spotify", None),
                "followers_total": followers.get("total", None),
                "href": data.get("href", None),
                "user_id": data.get("id", None),
                # Serialised to a JSON string, as the column name indicates.
                # NOTE(review): this column used to hold the raw list; check
                # downstream cells that may have expected an array column.
                "images_json": json.dumps(data.get("images") or []),
                "type": data.get("type", None),
                "uri": data.get("uri", None),
            }

            # Wrap the dictionary in a list so it becomes one DataFrame row.
            flattened = [data]

            try:
                df_api_current_user = spark.createDataFrame(flattened, schema=current_user_schema)

                print("‚úÖ DataFrame Spark criada com sucesso!")
                print(f"Total de registos: {df_api_current_user.count()}")
                df_api_current_user.show(5, truncate=False)
                df_api_current_user.printSchema()

            except NameError:
                print("‚ùå ERRO: Vari√°vel 'spark' n√£o definida. Por favor, inicialize a sua SparkSession.")
            except Exception as e:
                print(f"‚ùå Erro ao criar DataFrame Spark (Erro Inesperado ap√≥s Infer√™ncia): {e}")

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

Currently Playing

In [None]:
# Explicit schema for the flattened "Currently Playing" record.
# Each entry is (column name, Spark type); every column is declared nullable.
_currently_playing_columns = [
    # Root-level fields
    ("is_playing", BooleanType()),
    ("timestamp", LongType()),
    ("progress_ms", LongType()),
    ("currently_playing_type", StringType()),

    # Context
    ("context_external_urls", StringType()),
    ("context_href", StringType()),
    ("context_uri", StringType()),
    ("context_type", StringType()),

    # Actions
    ("actions_disallows_json", StringType()),

    # Track fields
    ("track_id", StringType()),
    ("track_name", StringType()),
    ("track_type", StringType()),
    ("track_uri", StringType()),
    ("track_disc_number", IntegerType()),
    ("track_duration_ms", LongType()),
    ("track_explicit", BooleanType()),
    ("track_is_local", BooleanType()),
    ("track_popularity", IntegerType()),
    ("track_preview_url", StringType()),
    ("track_number", IntegerType()),
    ("track_external_urls", StringType()),
    ("track_href", StringType()),
    ("track_external_ids_json", StringType()),
    ("track_artists_full_json", StringType()),
    ("track_available_markets_json", StringType()),

    # Track artist fields
    ("track_artist_id", StringType()),
    ("track_artist_name", StringType()),
    ("track_artist_uri", StringType()),

    # Album fields (track.album)
    ("album_id", StringType()),
    ("album_name", StringType()),
    ("album_type", StringType()),
    ("album_release_date", StringType()),
    ("album_release_date_precision", StringType()),
    ("album_total_tracks", IntegerType()),
    ("album_uri", StringType()),
    ("album_external_urls", StringType()),
    ("album_href", StringType()),
    ("album_images_json", StringType()),
    ("album_artists_full_json", StringType()),
    ("album_available_markets_json", StringType()),
]

currently_playing_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in _currently_playing_columns
    ]
)

In [None]:
# Flatten the saved "currently playing" payload into a single record and load it
# into Spark as `df_api_currently_playing` using `currently_playing_schema`.
path_currently_playing = path_api_raw / "currently_playing.json"

data: Dict[str, Any] = {}
all_flattened_tracks: List[Dict[str, Any]] = []

if not path_currently_playing.exists():
    print(f"‚ö†Ô∏è Ficheiro {path_currently_playing.name} n√£o existe no caminho: {path_currently_playing.parent}")

else:
    try:
        # 1. Read the JSON file
        with open(path_currently_playing, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Only a JSON object is usable; null / list / scalar payloads leave `data` empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data:
            print(f"‚ö†Ô∏è O ficheiro JSON '{path_currently_playing.name}' continha 'null' ou est√° vazio ap√≥s a leitura.")

        else:
            # === ROOT-LEVEL FIELDS ===
            # `Optional[...]` instead of `X | None`: module-level annotations are
            # evaluated at runtime and the union syntax needs Python 3.10+.
            is_playing: Optional[bool] = data.get("is_playing")
            timestamp: Optional[int] = data.get("timestamp")
            progress_ms: Optional[int] = data.get("progress_ms")
            currently_playing_type: Optional[str] = data.get("currently_playing_type")

            # Nested structures (`or {}` also covers an explicit JSON null)
            track: Dict[str, Any] = data.get("item") or {}
            context: Dict[str, Any] = data.get("context") or {}
            actions: Dict[str, Any] = data.get("actions") or {}

            if not track:
                print("‚ö†Ô∏è O campo 'item' (faixa) est√° vazio ou n√£o existe, n√£o h√° dados de m√∫sica para processar.")
            else:
                # === FLATTENING ===

                # Nested structures of the track
                album: Dict[str, Any] = track.get("album") or {}
                track_artists_list: List[Dict[str, Any]] = track.get("artists") or []

                # First track artist (flattened into its own columns)
                first_track_artist: Dict[str, Any] = track_artists_list[0] if track_artists_list else {}

                # `external_urls` can be present but null; `.get(key, {})` only covers
                # a missing key, so use `or {}` to avoid AttributeError on None.
                context_urls: Dict[str, Any] = context.get("external_urls") or {}
                track_urls: Dict[str, Any] = track.get("external_urls") or {}
                album_urls: Dict[str, Any] = album.get("external_urls") or {}

                # Context values, passed through `safe_string_cast`
                # (helper defined elsewhere in the notebook).
                context_external_urls = safe_string_cast(context_urls.get("spotify"))
                context_href = safe_string_cast(context.get("href"))
                context_uri = safe_string_cast(context.get("uri"))
                context_type = safe_string_cast(context.get("type"))

                # Actions
                actions_disallows_json = safe_string_cast(actions.get("disallows"))

                # Complex album structures
                album_images_json = safe_string_cast(album.get("images"))
                album_artists_json = safe_string_cast(album.get("artists"))
                album_markets_json = safe_string_cast(album.get("available_markets"))

                # Complex track structures
                track_external_ids_json = safe_string_cast(track.get("external_ids"))
                track_artists_json = safe_string_cast(track_artists_list)
                track_markets_json = safe_string_cast(track.get("available_markets"))

                # === FLATTENED RECORD ===
                flattened_record = {
                    # Root-level fields
                    "is_playing": is_playing,
                    "timestamp": timestamp,
                    "progress_ms": progress_ms,
                    "currently_playing_type": safe_string_cast(currently_playing_type),

                    # Context fields
                    "context_external_urls": context_external_urls,
                    "context_href": context_href,
                    "context_uri": context_uri,
                    "context_type": context_type,

                    # Actions fields
                    "actions_disallows_json": actions_disallows_json,

                    # Track fields
                    "track_id": safe_string_cast(track.get("id")),
                    "track_name": safe_string_cast(track.get("name")),
                    "track_type": safe_string_cast(track.get("type")),
                    "track_uri": safe_string_cast(track.get("uri")),
                    "track_disc_number": track.get("disc_number"),
                    "track_duration_ms": track.get("duration_ms"),
                    "track_explicit": track.get("explicit"),
                    "track_is_local": track.get("is_local"),
                    "track_popularity": track.get("popularity"),
                    "track_preview_url": safe_string_cast(track.get("preview_url")),
                    "track_number": track.get("track_number"),

                    # Track fields (complex values / URLs)
                    "track_external_urls": safe_string_cast(track_urls.get("spotify")),
                    "track_href": safe_string_cast(track.get("href")),
                    "track_external_ids_json": track_external_ids_json,
                    "track_artists_full_json": track_artists_json,
                    "track_available_markets_json": track_markets_json,

                    # FIRST track artist (flattened)
                    "track_artist_id": safe_string_cast(first_track_artist.get("id")),
                    "track_artist_name": safe_string_cast(first_track_artist.get("name")),
                    "track_artist_uri": safe_string_cast(first_track_artist.get("uri")),

                    # Album fields (track.album, simple values)
                    "album_id": safe_string_cast(album.get("id")),
                    "album_name": safe_string_cast(album.get("name")),
                    "album_type": safe_string_cast(album.get("album_type")),
                    "album_release_date": safe_string_cast(album.get("release_date")),
                    "album_release_date_precision": safe_string_cast(album.get("release_date_precision")),
                    "album_total_tracks": album.get("total_tracks"),
                    "album_uri": safe_string_cast(album.get("uri")),

                    # Album fields (track.album, complex values / URLs)
                    "album_external_urls": safe_string_cast(album_urls.get("spotify")),
                    "album_href": safe_string_cast(album.get("href")),
                    "album_images_json": album_images_json,
                    "album_artists_full_json": album_artists_json,
                    "album_available_markets_json": album_markets_json,
                }

                # The payload describes a single track, so the list gets one record.
                all_flattened_tracks.append(flattened_record)

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

# === SPARK DATAFRAME CREATION ===
if all_flattened_tracks:
    try:
        # The explicit `currently_playing_schema` is applied; nothing is inferred.
        df_api_currently_playing = spark.createDataFrame(
            all_flattened_tracks,
            schema=currently_playing_schema
        )

        print("‚úÖ DataFrame Spark criado com sucesso!")
        print(f"Total de registos: {df_api_currently_playing.count()}")
        df_api_currently_playing.show(5, truncate=False)
        df_api_currently_playing.printSchema()

    except NameError:
        print("\n‚ùå ERRO: Vari√°vel 'spark' n√£o definida. Por favor, inicialize a sua SparkSession.")
    except Exception as e:
        print(f"\n‚ùå Erro ao criar DataFrame Spark: {e}")

Devices

In [None]:
# Load `devices.json` and build `df_api_devices` from its "devices" list.
path_devices = path_api_raw / "devices.json"

# Guard against a missing file, like the other API cells do, instead of
# letting `open()` raise FileNotFoundError.
if not path_devices.exists():
    print("‚ö†Ô∏è Ficheiro devices.json n√£o existe.")
else:
    with open(path_devices, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print("‚ö†Ô∏è O ficheiro JSON est√° vazio!")
    else:
        # A payload that is not a JSON object has no "devices" key to read;
        # treat it the same as a missing list rather than crashing on `.get`.
        devices: Optional[List[Dict[str, Any]]] = (
            data.get("devices", []) if isinstance(data, dict) else None
        )

        if not isinstance(devices, list) or not devices:
            print("‚ö†Ô∏è A lista de dispositivos ('devices') est√° vazia ou n√£o existe.")
        else:
            # One row per device; column types are inferred by Spark.
            df_api_devices = spark.createDataFrame(devices)
            df_api_devices.show(5, truncate=False)
            df_api_devices.printSchema()
            print("Total devices", df_api_devices.count())

Me

In [None]:
# Load `me.json` and expose it as the single-row DataFrame `df_api_me`.
path_me = path_api_raw / "me.json"

# Guard against a missing file, like the other API cells do, instead of
# letting `open()` raise FileNotFoundError.
if not path_me.exists():
    print("‚ö†Ô∏è Ficheiro me.json n√£o existe.")
else:
    with open(path_me, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Empty or non-object payloads (null, {}, a list, a scalar) cannot form the row.
    if not data or not isinstance(data, dict):
        print("‚ö†Ô∏è O ficheiro JSON est√° vazio!")
    else:
        # The whole payload becomes one row; column types are inferred by Spark.
        df_api_me = spark.createDataFrame([data])
        df_api_me.show(5, truncate=False)
        df_api_me.printSchema()
        print("Total me", df_api_me.count())

New Releases

In [None]:
# Flatten `new_releases.json` (one row per album, with the pagination metadata
# repeated on every row) into the Spark DataFrame `df_api_new_releases`.
path_new_releases = path_api_raw / "new_releases.json"  # assumes path_api_raw is defined

data: Dict[str, Any] = {}
all_flattened_albums: List[Dict[str, Any]] = []

if not path_new_releases.exists():
    print("‚ö†Ô∏è Ficheiro new_releases.json n√£o existe.")
else:
    try:
        # 1. Read the JSON file
        with open(path_new_releases, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        # Only a JSON object is usable; null / list / scalar payloads leave `data` empty.
        if isinstance(raw_data, dict):
            data = raw_data

        if not data or "albums" not in data:
            print("‚ö†Ô∏è O ficheiro JSON continha 'null', est√° vazio, ou n√£o cont√©m o campo 'albums'.")
            items = []
            albums_metadata = {}
        else:
            # Pagination metadata lives on the 'albums' level.
            # `or {}` / `or []` also cover keys that are present but null, which
            # would otherwise fail on `.get` or on iteration.
            albums_metadata: Dict[str, Any] = data.get("albums") or {}

            # List of items (albums)
            items: List[Dict[str, Any]] = albums_metadata.get("items") or []

            # Pagination fields
            pagination_href = albums_metadata.get("href")
            pagination_limit = albums_metadata.get("limit")
            pagination_next = albums_metadata.get("next")
            pagination_offset = albums_metadata.get("offset")
            pagination_total = albums_metadata.get("total")

        if not items:
            print("‚ö†Ô∏è A lista de novos lan√ßamentos ('albums.items') est√° vazia.")
        else:
            for item in items:  # each item is an album

                # Skip null or malformed entries
                if not isinstance(item, dict):
                    continue

                # Make sure artists is a list
                album_artists_list: List[Dict[str, Any]] = item.get("artists") or []

                # First album artist (flattened into its own columns)
                first_album_artist: Dict[str, Any] = album_artists_list[0] if album_artists_list else {}

                # `external_urls` may be present but null
                album_urls: Dict[str, Any] = item.get("external_urls") or {}

                # Serialize complex structures to JSON strings
                images_json = json.dumps(item.get("images") or [], ensure_ascii=False)
                artists_full_json = json.dumps(album_artists_list, ensure_ascii=False)
                markets_json = json.dumps(item.get("available_markets") or [], ensure_ascii=False)

                flattened = {
                    # Pagination fields (from the 'albums' level)
                    "pagination_href": pagination_href,
                    "pagination_limit": pagination_limit,
                    "pagination_next": pagination_next,
                    "pagination_offset": pagination_offset,
                    "pagination_total": pagination_total,

                    # Album fields
                    "album_id": item.get("id"),
                    "album_name": item.get("name"),
                    "album_type": item.get("album_type"),
                    "album_total_tracks": item.get("total_tracks"),
                    "album_release_date": item.get("release_date"),
                    "album_release_date_precision": item.get("release_date_precision"),
                    "album_uri": item.get("uri"),
                    "album_href": item.get("href"),
                    "album_external_urls": album_urls.get("spotify"),

                    # FIRST album artist (flattened)
                    "artist_id": first_album_artist.get("id"),
                    "artist_name": first_album_artist.get("name"),
                    "artist_uri": first_album_artist.get("uri"),
                    "artist_href": first_album_artist.get("href"),

                    # Serialized fields (JSON strings)
                    "images_json": images_json,
                    "artists_full_json": artists_full_json,
                    "available_markets_json": markets_json,
                }

                all_flattened_albums.append(flattened)

            # `spark` is a notebook-level (global) name, so look it up in globals();
            # locals() only coincides with it when this code runs at module level.
            if 'spark' in globals():
                try:
                    # Build the Spark DataFrame (schema inferred from the rows)
                    df_api_new_releases = spark.createDataFrame(all_flattened_albums)

                    print("‚úÖ DataFrame Spark criada com sucesso (√Ålbuns Achatados).")
                    df_api_new_releases.show(5, truncate=False)
                    df_api_new_releases.printSchema()
                    print(f"Total de novos lan√ßamentos (√Ålbuns): {df_api_new_releases.count()}")

                except Exception as e:
                    print(f"‚ùå Erro ao criar DataFrame Spark (verifique o esquema): {e}")
            else:
                print("\n‚ö†Ô∏è SparkSession n√£o inicializada. O processamento Python foi conclu√≠do. Use `spark.createDataFrame(all_flattened_albums)` no seu ambiente PySpark.")

    except json.JSONDecodeError as e:
        print(f"‚ùå Erro de decodifica√ß√£o JSON: O ficheiro n√£o √© um JSON v√°lido: {e}")
    except Exception as e:
        print(f"‚ùå Erro inesperado ao processar o ficheiro: {e}")

Dataframes da API:
- df_api_markets
- df_api_categories
- df_api_playback
- df_api_followed_artists
- df_api_playing_track
- df_api_playlists
- df_api_recently_played
- df_api_saved_albums
- df_api_saved_tracks
- df_api_top_artists
- df_api_top_tracks
- df_api_current_user
- df_api_currently_playing
- df_api_devices
- df_api_me
- df_api_new_releases

In [None]:
# Preview the first five playback rows without truncating wide column values.
df_api_playback.show(n=5, truncate=False)

# Streaming History

In [5]:
# Lower the number of shuffle partitions to 4 (the default is 200, which is
# far too many for a single local machine).
spark.conf.set("spark.sql.shuffle.partitions", "4")

# Folder that holds the processed streaming-history JSON.
path_processed_streamingHistory = str(path_api_processed / "streaming_history")

# Read the JSON folder and mark the result as cached in one step, so that the
# analyses that follow are fast.
df_streamingHistory = spark.read.json(path_processed_streamingHistory).cache()

# Confirm the data arrived correctly: row count, schema, and a small sample.
print(f"‚úÖ Sucesso! O DataFrame cont√©m {df_streamingHistory.count()} linhas.")
df_streamingHistory.printSchema()
df_streamingHistory.show(n=5, truncate=True)

25/12/20 20:31:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

‚úÖ Sucesso! O DataFrame cont√©m 175776 linhas.
root
 |-- album_artists: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- album_copyrights: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- album_images: string (nullable = true)
 |-- album_label: string (nullable = true)
 |-- album_popularity: long (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- album_total_tracks: long (nullable = true)
 |-- album_tracks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- album_uri: string (nullable = true)
 |-- artist_followers_total: long (nullable = true)
 |-- artist_genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- artist_image: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_popularity: long (nullable = true)
 |-- artist_uris_list: array (nullable = true)
 |    |-- el

                                                                                

+-------------+----------------+------------+-----------+----------------+------------------+----------------------------+------------------+------------+---------+----------------------+-------------+------------+-----------+-----------------+----------------+------------+--------------------+--------------------+--------------+--------------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+--------------------+--------------------+------------+-------+-------+--------------------+-----------------+-----------------+-----------------+-----------------+----------+------------+----------------+--------------------+
|album_artists|album_copyrights|album_images|album_label|album_popularity|album_release_date|album_release_date_precision|album_total_tracks|album_tracks|album_uri|artist_followers_total|artist_genres|artist_image|artist_name|artist_popularity|artist_uris_list|conn_country|        episode_name|  

In [None]:
# Print summary statistics for the streaming-history DataFrame.
(
    df_streamingHistory
    .describe()
    .show()
)

[Stage 6:>                                                          (0 + 2) / 2]

In [None]:
# Show the first ten distinct `endTime` values, sorted by `endTime`, untruncated.
(
    df_streamingHistory
    .select("endTime")
    .distinct()
    .orderBy("endTime")
    .show(n=10, truncate=False)
)