In [1]:
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import desc, count, col, sum as Fsum, when
import json, pprint
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType, BooleanType, LongType
from requests.exceptions import ReadTimeout, ConnectionError
import pprint
import time
import math
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import sys

In [None]:
spark = SparkSession.builder.appName("SpotifyStreamingHistory").getOrCreate()
print(spark)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/21 11:49:28 WARN Utils: Your hostname, MartaPC, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/21 11:49:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


In [None]:
project_root = Path().resolve().parents[0]
input_path = str(project_root / "data" / "raw" / "streaming_history" / "Streaming_History_Audio_*.json")
df_streamingHistory = spark.read.option("multiline", "true").json(input_path)
print(f"Lidos ficheiros de: {input_path}")

25/12/21 11:40:03 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: /home/marta/spotify-data-streaming-project/data/raw/streaming_history/Streaming_History_Audio_*.json.
java.io.FileNotFoundException: File /home/marta/spotify-data-streaming-project/data/raw/streaming_history/Streaming_History_Audio_*.json does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource

Lidos ficheiros de: /home/marta/spotify-data-streaming-project/data/raw/streaming_history/Streaming_History_Audio_*.json


In [None]:
print("Total values: ", df_streamingHistory.count())
print("Distincted values:", df_streamingHistory.distinct().count())

                                                                                

Total values:  175948


[Stage 6:>                                                          (0 + 2) / 2]

Distincted values: 175776


                                                                                

In [None]:
df_streamingHistory.printSchema()

root
 |-- audiobook_chapter_title: string (nullable = true)
 |-- audiobook_chapter_uri: string (nullable = true)
 |-- audiobook_title: string (nullable = true)
 |-- audiobook_uri: string (nullable = true)
 |-- conn_country: string (nullable = true)
 |-- episode_name: string (nullable = true)
 |-- episode_show_name: string (nullable = true)
 |-- incognito_mode: boolean (nullable = true)
 |-- ip_addr: string (nullable = true)
 |-- master_metadata_album_album_name: string (nullable = true)
 |-- master_metadata_album_artist_name: string (nullable = true)
 |-- master_metadata_track_name: string (nullable = true)
 |-- ms_played: long (nullable = true)
 |-- offline: boolean (nullable = true)
 |-- offline_timestamp: long (nullable = true)
 |-- platform: string (nullable = true)
 |-- reason_end: string (nullable = true)
 |-- reason_start: string (nullable = true)
 |-- shuffle: boolean (nullable = true)
 |-- skipped: boolean (nullable = true)
 |-- spotify_episode_uri: string (nullable = true)
 |

Verificação de valores repetidos (reparei que há linhas com o mesmo 'ts' e isso não pode acontecer...)

In [None]:
windowSpec = Window.partitionBy("ts").orderBy("spotify_track_uri")
df_with_duplicatesNum = df_streamingHistory.withColumn("ts_count", count("*").over(windowSpec))
# este dataframe é apenas para visualizar e analisar os valores duplicados
df_with_duplicates = df_with_duplicatesNum.filter(col("ts_count") > 1)
duplicates_num = df_with_duplicates.count()
df_with_duplicates.orderBy("ts_count", "ts").show(truncate=False)

                                                                                

+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+---------------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+---------------------------------------------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+--------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr        |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|ms_played|offline|offline_timestamp|platform                                     |reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |ts_count|
+-----------------------+---------------------+---------------+-------------+-----------

Razões para isto poder ter acontecido:
- Dados Agrupados (Batching): O cliente Spotify (aplicação) enviou um lote de eventos para o servidor na mesma altura.
- Ações Rápidas/Simultâneas: O utilizador fez uma ação muito rápida (por exemplo, deu skip à música duas vezes seguidas no mesmo segundo).
- Dados "Duplicados" (Técnico): Não são duplicados no sentido estrito (as músicas são diferentes), mas são eventos simultâneos que partilham a chave ts.

Os valores duplicados têm de ser removidos apenas se possuirem o mesmo 'ts' e a música for idêntica (valores duplicados). Há casos em que o 'ts' é igual, mas são músicas diferentes, ter isso em consideração!

In [None]:
before_count = df_streamingHistory.count()
df_streamingHistory = df_streamingHistory.dropDuplicates()
after_count = df_streamingHistory.count()
print(f"'ts' duplicates count: {duplicates_num}")
print(f"Removed {before_count - after_count} duplicate values.")



'ts' duplicates count: 18162
Removed 172 duplicate values.


                                                                                

Verificar outra vez os valores duplicados não têm a mesma música duplicada

In [None]:
df_with_duplicatesNum = df_streamingHistory.withColumn("ts_count", count("*").over(windowSpec))
# este dataframe é apenas para visualizar e analisar os valores duplicados
df_with_duplicates = df_with_duplicatesNum.filter(col("ts_count") > 1)
df_with_duplicates.orderBy("ts_count", "ts").show(truncate=False)

[Stage 33:>                                                         (0 + 2) / 2]

+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+---------------+-----------------------------------------------------+---------------------------------+-------------------------------------------------------------------------+---------+-------+-----------------+---------------------------------------------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+--------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr        |master_metadata_album_album_name                     |master_metadata_album_artist_name|master_metadata_track_name                                               |ms_played|offline|offline_timestamp|platform                                     |reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri   

                                                                                

In [None]:
df_streamingHistory_final = df_streamingHistory.withColumn("ts", F.to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))
df_streamingHistory_final.show(truncate=False)



+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+--------------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+----------------------------------+----------+------------+-------+-------+-------------------+------------------------------------+-------------------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr       |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|ms_played|offline|offline_timestamp|platform                          |reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                 |
+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+-----------

                                                                                

In [None]:
df_streamingHistory.orderBy(desc("ts")).show(truncate=False)



+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+--------------------------------------+--------------------------------+---------------------------------+------------------------------------------+---------+-------+-----------------+--------+----------------------------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr                               |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name                |ms_played|offline|offline_timestamp|platform|reason_end                  |reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+-----------------------+---------------------+---------------+---

                                                                                

In [None]:
print("Describe: ")
df_streamingHistory.describe().show()

Describe: 


25/12/21 11:40:45 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------------+---------------------+---------------+-------------+------------+--------------------+-----------------+-------------+---------------------------------+---------------------------------+--------------------------+------------------+--------------------+--------------------+----------+------------+--------------------+--------------------+--------------------+
|summary|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|        episode_name|episode_show_name|      ip_addr| master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|         ms_played|   offline_timestamp|            platform|reason_end|reason_start| spotify_episode_uri|   spotify_track_uri|                  ts|
+-------+-----------------------+---------------------+---------------+-------------+------------+--------------------+-----------------+-------------+---------------------------------+-------------------------

                                                                                

Existem colunas que não possuem valores nenhuns. Tratar disso

In [None]:
# Conta valores não nulos por coluna
non_null_counts = df_streamingHistory.select([
    Fsum(when(col(c).isNotNull(), 1).otherwise(0)).alias(c)
    for c in df_streamingHistory.columns
]).collect()[0].asDict()

# Filtra colunas cujo count > 0
cols_to_keep = [c for c, v in non_null_counts.items() if v > 0]
cols_to_drop = [c for c, v in non_null_counts.items() if v == 0]

print("Colunas a remover (sem valores):", cols_to_drop)
print("Total:", len(cols_to_drop))

# Cria novo DataFrame sem as colunas vazias
df_streamingHistory = df_streamingHistory.select(cols_to_keep)

                                                                                

Colunas a remover (sem valores): ['audiobook_chapter_title', 'audiobook_chapter_uri', 'audiobook_title', 'audiobook_uri']
Total: 4


### Analisar os valores das colunas que existem do dataframe do histórico

Country (conn_country)

In [None]:
conn_country_df = df_streamingHistory.select("conn_country").distinct()
count_num = conn_country_df.count()
print(f"\nDistinct values for conn_country ({count_num} total):")
conn_country_df.show(count_num, truncate=False)

                                                                                


Distinct values for conn_country (8 total):
+------------+
|conn_country|
+------------+
|NL          |
|PT          |
|ES          |
|JP          |
|ZZ          |
|GB          |
|DE          |
|AT          |
+------------+



Supostamente não estive a ouvir música nestes países todos. Investigar

In [None]:
country_counts = (
    df_streamingHistory.groupBy("conn_country")
      .agg(count("*").alias("num_streams"))
      .orderBy(col("num_streams").desc())
)
country_counts.show(truncate=False)



+------------+-----------+
|conn_country|num_streams|
+------------+-----------+
|PT          |175484     |
|ES          |138        |
|ZZ          |98         |
|DE          |27         |
|NL          |20         |
|GB          |5          |
|AT          |3          |
|JP          |1          |
+------------+-----------+



                                                                                

In [None]:
df_streamingHistory.filter(col("conn_country") == "NL") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

df_streamingHistory.filter(col("conn_country") == "GB") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

df_streamingHistory.filter(col("conn_country") == "AT") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

# Verificação das linhas que têm a localização no Japão (nunca lá estive)
df_streamingHistory.filter(col("conn_country") == "JP") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

                                                                                

+------------+------------+-----------------+--------------+--------------+--------------------------------+---------------------------------+------------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr       |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name          |ms_played|offline|offline_timestamp|platform|reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+--------------+--------------------------------+---------------------------------+------------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------------------------

                                                                                

+------------+------------+-----------------+--------------+-----------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+-------------------------------------------+----------------------------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr    |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|ms_played|offline|offline_timestamp|platform                                   |reason_end                  |reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+-----------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+--------------------------------

                                                                                

+------------+------------+-----------------+--------------+------------+--------------------------------+---------------------------------+----------------------------------------+---------+-------+-----------------+--------+----------------------------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr     |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name              |ms_played|offline|offline_timestamp|platform|reason_end                  |reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+------------+--------------------------------+---------------------------------+----------------------------------------+---------+-------+-----------------+--------+----------------------------+------------+------



+------------+------------+-----------------+--------------+-------------+--------------------------------------------+---------------------------------+-------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr      |master_metadata_album_album_name            |master_metadata_album_artist_name|master_metadata_track_name     |ms_played|offline|offline_timestamp|platform|reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+-------------+--------------------------------------------+---------------------------------+-------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------

                                                                                

O que pode ter acontecido é o Spotify ter registado o *conn_country* como JP (Japão) porque o IP público no momento da sincronização ou da stream estava associado a um servidor da M247 localizado em Tóquio — mesmo estando em Portugal

Incognito Mode e Offline

In [None]:
reason_end_df = df_streamingHistory.select("incognito_mode").distinct()
count_num = reason_end_df.count()
print(f"\nDistinct values for reason_end ({count_num} total):")
reason_end_df.show(count_num, truncate=False)


Distinct values for reason_end (2 total):
+--------------+
|incognito_mode|
+--------------+
|true          |
|false         |
+--------------+



                                                                                

Offline

In [None]:
df_reason_end = df_streamingHistory.select("offline").distinct()
count_num = df_reason_end.count()
print(f"\nDistinct values for incognito_mode ({count_num} total):")
df_reason_end.show(count_num, truncate=False)


Distinct values for incognito_mode (2 total):




+-------+
|offline|
+-------+
|true   |
|false  |
+-------+



                                                                                

Reason End

In [None]:
df_reason_end = df_streamingHistory.select("reason_end").distinct()
count_num = df_reason_end.count()
print(f"\nDistinct values for reason_end ({count_num} total):")
df_reason_end.show(count_num, truncate=False)


Distinct values for reason_end (12 total):
+----------------------------+
|reason_end                  |
+----------------------------+
|backbtn                     |
|logout                      |
|trackdone                   |
|unknown                     |
|fwdbtn                      |
|endplay                     |
|trackerror                  |
|unexpected-exit             |
|unexpected-exit-while-paused|
|remote                      |
|appload                     |
|                            |
+----------------------------+



Reason Start

In [None]:
reason_start_df = df_streamingHistory.select("reason_start").distinct()
count_num = reason_start_df.count()
print(f"\nDistinct values for reason_start ({count_num} total):")
reason_start_df.show(count_num, truncate=False)


Distinct values for reason_start (11 total):
+-----------------+
|reason_start     |
+-----------------+
|appload          |
|backbtn          |
|trackdone        |
|playbtn          |
|unknown          |
|fwdbtn           |
|trackerror       |
|clickrow         |
|remote           |
|                 |
|switched-to-audio|
+-----------------+



## Agregar dados do histórico com extra info

Agora quero juntar os dados do histórico completo mais informação adicional sobre músicas, álbuns e artistas, para complementar informação.

Adicionar mais informação relativamente às tracks através de API calls do Spotipy ao dataframe "principal" - *df_streamingHistory*

In [None]:
load_dotenv()

scopes = [
    "user-library-read",
    "user-read-playback-state",
    "user-read-currently-playing",
    "user-follow-read",
    "user-read-recently-played",
    "user-top-read"
]
scope = " ".join(scopes)

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope, open_browser=True))

In [None]:
path_api_raw = Path(project_root) / "data" / "raw"
path_api_processed = Path(project_root) / "data" / "processed"
print("path_api_raw:", path_api_raw)
print("path_api_processed:", path_api_processed)

path_api_raw: /home/marta/spotify-data-streaming-project/data/raw
path_api_processed: /home/marta/spotify-data-streaming-project/data/processed


É importante obter os URIs únicos para usar como parâmetro nas API calls e obter, desta forma, a informação extra acerca das tracks, albums e artists

Lista de URIs das tracks (streaming tracks)

In [None]:
df_tracks_uris = df_streamingHistory.filter(col("spotify_track_uri").isNotNull())
tracks_uris_list = [row.spotify_track_uri for row in df_tracks_uris.select("spotify_track_uri").distinct().collect()]

print("✅ Processo Concluído.")
print(tracks_uris_list[:5])
print(f"Número de URIs ÚNICOS encontrados: {len(tracks_uris_list)}")

✅ Processo Concluído.
['spotify:track:0Lhy1sDDdtUFY2pyq2ZVi9', 'spotify:track:79xNgIQLEnhdcz7LuSLM8T', 'spotify:track:4Oz9NAlujs00kYGL4WTUNw', 'spotify:track:2XkDm5m2vPowecEAAR5gmb', 'spotify:track:27sBcXtgTBSJRdUxei1a7J']
Número de URIs ÚNICOS encontrados: 26828


Função para obter informação para, mais tarde, extrair as URIs dos albums e dos artists correspondentes às tracks

In [23]:
def get_data(uris_list, type, successful_sleep=0.2):
    """Divide a lista de URIs em grupos, chama a API e monitora o progresso."""
    results = []
    API_BATCH_SIZE = 20
    total_uris = len(uris_list)
    
    if type == "tracks":
        sp_function, key = sp.tracks, 'tracks'
    elif type == "albums":
        sp_function, key = sp.albums, 'albums'
    elif type == "artists":
        sp_function, key = sp.artists, 'artists' 
    else:
        print(f"Tipo desconhecido: {type}")
        return results

    milestones = {10: False, 20: False, 30: False, 40: False, 50: False, 60: False, 70: False, 80: False, 90: False}

    for i in range(0, total_uris, API_BATCH_SIZE):
        batch = uris_list[i:i + API_BATCH_SIZE]
        uris_processados = i + len(batch)
        
        percent_done = (uris_processados / total_uris) * 100
        for milestone, reached in milestones.items():
            if not reached and percent_done >= milestone:
                print(f"[{type.upper()}] ✅ Progresso: {milestone}% concluído ({uris_processados}/{total_uris} URIs)")
                milestones[milestone] = True

        try:
            response = sp_function(batch)

            if key in response:
                results.extend([item for item in response[key] if item is not None])
            
            if successful_sleep > 0:
                time.sleep(successful_sleep)
                
        except (spotipy.SpotifyException, ReadTimeout, ConnectionError) as e:
            error_msg = str(e)
            print(f"⚠️ Erro na chamada da API ({type} Batch {i//API_BATCH_SIZE}, IDs: {batch[0]}...): {error_msg}")
            if '429' in error_msg or 'timed out' in error_msg or 'RemoteDisconnected' in error_msg:
                print("Dormindo por 10 segundos devido a erro de rede/servidor...")
                time.sleep(10)
            continue
            
    print(f"[{type.upper()}] 🏁 100% concluído.")
    return results

In [24]:
JSON_BATCH_SIZE = 1000

def process_extra_info(uris_list, entity_type, folder_path, json_batch_size=JSON_BATCH_SIZE):
    folder_path.mkdir(parents=True, exist_ok=True)
    existing_parts = list(folder_path.glob(f"{entity_type}_part_*.json"))
    
    # 1. Tentar Carregar do Cache
    if existing_parts:
        data_full = []
        for part_path in existing_parts:
            with open(part_path, 'r', encoding='utf-8') as f:
                content = json.load(f)
                # Normaliza: extrai a lista se estiver dentro de um dicionário {'artists': [...]}
                if isinstance(content, dict) and entity_type in content:
                    data_full.extend(content[entity_type])
                else:
                    data_full.extend(content)
        print(f"✨ [{entity_type.upper()}] Dados carregados do disco: {len(data_full)} itens.")
        return data_full

    # 2. Se não houver cache, extrair via API
    print(f"🚀 [{entity_type.upper()}] Iniciando extração de {len(uris_list)} URIs...")
    raw_data = get_data(uris_list, type=entity_type)
    
    # Normalizar dados da API (Spotify às vezes devolve {'tracks': [...]})
    items_list = raw_data.get(entity_type, []) if isinstance(raw_data, dict) else raw_data

    if items_list:
        num_parts = math.ceil(len(items_list) / json_batch_size)
        for i in range(num_parts):
            batch = items_list[i * json_batch_size : (i + 1) * json_batch_size]
            part_filename = folder_path / f"{entity_type}_part_{i+1}.json"
            
            # Decidimos aqui: faixas salvamos como lista pura, álbuns/artistas com chave raiz
            save_content = {entity_type: batch} if entity_type in ['albums', 'artists'] else batch
            
            with open(part_filename, 'w', encoding='utf-8') as f:
                json.dump(save_content, f, indent=2, ensure_ascii=False)
        
        print(f"✅ [{entity_type.upper()}] Sucesso! {len(items_list)} itens guardados em {num_parts} ficheiros.")
        return items_list
    
    print(f"⚠️ [{entity_type.upper()}] A API não retornou resultados.")
    return []

In [25]:
base_extra_path = path_api_raw / "streaming_history/extra_info"

In [26]:
if 'tracks_uris_list' in locals() or 'tracks_uris_list' in globals():
    all_tracks_metadata = process_extra_info(
        tracks_uris_list, "tracks", base_extra_path / "tracks", JSON_BATCH_SIZE
    )

✨ [TRACKS] Dados carregados do disco: 26828 itens.


É preciso a lista de URIs dos albums e artists para a chamada das API calls *sp.albums()* e *sp.artists()*

In [27]:
album_uri_map = {} 
artists_uris_map = {}

for track_data in all_tracks_metadata:
    if track_data:
        track_uri = track_data['uri']
        
        album_uri = track_data.get("album", {}).get("uri", None)        
        if album_uri:
            album_uri_map[track_uri] = album_uri
            
        artists_list = track_data.get("artists", [])
        artist_uris = [artist.get("uri") for artist in artists_list if "uri" in artist]        
        if artist_uris:
            artists_uris_map[track_uri] = artist_uris

            
all_unique_artist_uris = []

for artist_list in artists_uris_map.values():
    all_unique_artist_uris.extend(artist_list)

unique_artist_count = len(set(all_unique_artist_uris))

print(f"Total de Faixas Mapeadas para Álbuns ({len(album_uri_map)}) e Artists ({len(artists_uris_map)})")
print(f"Total de URIs de Álbum Únicos ({len(set(album_uri_map.values()))}) e Artists ({unique_artist_count})")

if album_uri_map and artists_uris_map:
    albums_amostra = dict(list(album_uri_map.items())[:5])
    pprint.pprint(albums_amostra)
    artists_amostra = dict(list(artists_uris_map.items())[:5])
    pprint.pprint(artists_amostra)

Total de Faixas Mapeadas para Álbuns (26828) e Artists (26828)
Total de URIs de Álbum Únicos (19261) e Artists (9640)
{'spotify:track:2TktkzfozZifbQhXjT6I33': 'spotify:album:5z7TD11Qh81Gbf52hd5zAv',
 'spotify:track:4hWAPHrfyQDsIXzRmn7YM7': 'spotify:album:1p7kGQaYJPtlwhRGVWOGuA',
 'spotify:track:4sx6NRwL6Ol3V6m9exwGlQ': 'spotify:album:3z53jSP5i9bCPVOu3PARM8',
 'spotify:track:5KfUqRRbm6ACSzLxUS57R2': 'spotify:album:4bPrS4muAMw2G5M6ARaZ01',
 'spotify:track:6xniBg7UJ03jcyvOgfWVJk': 'spotify:album:3hzt5mEdhd9YF81TGH1pOx'}
{'spotify:track:2TktkzfozZifbQhXjT6I33': ['spotify:artist:5FxD8fkQZ6KcsSYupDVoSO'],
 'spotify:track:4hWAPHrfyQDsIXzRmn7YM7': ['spotify:artist:0QcblRyHbgYTLOKlP5BE66',
                                          'spotify:artist:6kY7DKDwm2bt996rMF4CLK'],
 'spotify:track:4sx6NRwL6Ol3V6m9exwGlQ': ['spotify:artist:2tIP7SsRs7vjIcLrU85W8J'],
 'spotify:track:5KfUqRRbm6ACSzLxUS57R2': ['spotify:artist:5KpmWCJ5NqsY9meqhjwbxR'],
 'spotify:track:6xniBg7UJ03jcyvOgfWVJk': ['spotify:artist:

In [28]:
track_metadata_records = []

for track_uri, album_uri in album_uri_map.items():
    
    artists_list_for_track = artists_uris_map.get(track_uri, [])
    
    record = {
        "spotify_track_uri": track_uri,
        "album_uri": album_uri,
        "artist_uris_list": artists_list_for_track # Lista de URIs de Artista
    }
    track_metadata_records.append(record)

print(f"✅ Conversão para {len(track_metadata_records)} registos Python concluída.")

✅ Conversão para 26828 registos Python concluída.


In [29]:
track_schema = StructType([
    StructField("spotify_track_uri", StringType(), False),
    StructField("album_uri", StringType(), True),
    StructField("artist_uris_list", ArrayType(StringType()), True) # Um-para-Muitos
])

df_tracks_metadata = spark.createDataFrame(track_metadata_records, schema=track_schema)

print(f"\n✅ Criação do DataFrame PySpark 'df_tracks_metadata' concluída.")
df_tracks_metadata.printSchema()
print("Total rows: ", df_tracks_metadata.count())
df_tracks_metadata.show(5, truncate=False)


✅ Criação do DataFrame PySpark 'df_tracks_metadata' concluída.
root
 |-- spotify_track_uri: string (nullable = false)
 |-- album_uri: string (nullable = true)
 |-- artist_uris_list: array (nullable = true)
 |    |-- element: string (containsNull = true)



25/12/21 11:41:42 WARN TaskSetManager: Stage 118 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.
[Stage 118:>                                                        (0 + 2) / 2]

Total rows:  26828
+------------------------------------+------------------------------------+------------------------------------------------------------------------------+
|spotify_track_uri                   |album_uri                           |artist_uris_list                                                              |
+------------------------------------+------------------------------------+------------------------------------------------------------------------------+
|spotify:track:5KfUqRRbm6ACSzLxUS57R2|spotify:album:4bPrS4muAMw2G5M6ARaZ01|[spotify:artist:5KpmWCJ5NqsY9meqhjwbxR]                                       |
|spotify:track:6xniBg7UJ03jcyvOgfWVJk|spotify:album:3hzt5mEdhd9YF81TGH1pOx|[spotify:artist:34qyuX5yO72yzL8Z4JclBc]                                       |
|spotify:track:4hWAPHrfyQDsIXzRmn7YM7|spotify:album:1p7kGQaYJPtlwhRGVWOGuA|[spotify:artist:0QcblRyHbgYTLOKlP5BE66, spotify:artist:6kY7DKDwm2bt996rMF4CLK]|
|spotify:track:2TktkzfozZifbQhXjT6I33|spotify:album

25/12/21 11:41:47 WARN TaskSetManager: Stage 121 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.


Adicionar agora informação extra de acordo com os URIs das tracks, albums e artists

In [30]:
album_uris_list = (
    df_tracks_metadata.select("album_uri").distinct()
    .filter(col("album_uri").isNotNull())
    .rdd.map(lambda row: row[0])
    .collect()
)

artists_uris_list = (
    df_tracks_metadata.select(col("artist_uris_list")[0].alias("artist_uri"))
    .filter(col("artist_uri").isNotNull())
    .distinct()
    .rdd.map(lambda row: row[0])
    .collect()
)

print(f"URIs Únicos para buscar: Tracks={len(tracks_uris_list)}, Albums={len(album_uris_list)}, Artists={len(artists_uris_list)}")

25/12/21 11:41:47 WARN TaskSetManager: Stage 122 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.
25/12/21 11:41:48 WARN TaskSetManager: Stage 125 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.


URIs Únicos para buscar: Tracks=26828, Albums=19261, Artists=6857


In [31]:
all_tracks_metadata_full = all_tracks_metadata.copy()

In [32]:
if 'album_uris_list' in locals() or 'album_uris_list' in globals():
    all_albums_metadata_full = process_extra_info(
        album_uris_list, "albums", base_extra_path / "albums", JSON_BATCH_SIZE
    )

✨ [ALBUMS] Dados carregados do disco: 19261 itens.


In [33]:
if 'artists_uris_list' in locals() or 'artists_uris_list' in globals():
    all_artists_metadata_full = process_extra_info(
        artists_uris_list, "artists", base_extra_path / "artists", JSON_BATCH_SIZE
    )

✨ [ARTISTS] Dados carregados do disco: 6857 itens.


In [34]:
schema_tracks = StructType([
    StructField("track_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("track_popularity", IntegerType(), True),
    StructField("track_duration_ms", LongType(), True),
    StructField("track_is_explicit", BooleanType(), True),
    StructField("track_number", IntegerType(), True),
    StructField("track_disc_number", IntegerType(), True),
    StructField("track_is_playable", BooleanType(), True)
])

tracks_flat = [
    {
        "track_uri": t.get("uri"),
        "track_name": t.get("name"),
        "track_popularity": t.get("popularity"),
        "track_duration_ms": t.get("duration_ms"),
        "track_is_explicit": t.get("explicit"),
        "track_number": t.get("track_number"),
        "track_disc_number": t.get("disc_number"),
        "track_is_playable": t.get("is_playable")
    } 
    for t in all_tracks_metadata_full if t is not None
]

df_extraInfo_tracks = spark.createDataFrame(tracks_flat, schema=schema_tracks)
df_extraInfo_tracks.show(5, truncate=False)

+------------------------------------+----------------------------------------+----------------+-----------------+-----------------+------------+-----------------+-----------------+
|track_uri                           |track_name                              |track_popularity|track_duration_ms|track_is_explicit|track_number|track_disc_number|track_is_playable|
+------------------------------------+----------------------------------------+----------------+-----------------+-----------------+------------+-----------------+-----------------+
|spotify:track:5KfUqRRbm6ACSzLxUS57R2|Sugar                                   |0               |114416           |true             |1           |1                |NULL             |
|spotify:track:6xniBg7UJ03jcyvOgfWVJk|FELT                                    |36              |105045           |false            |2           |1                |NULL             |
|spotify:track:4hWAPHrfyQDsIXzRmn7YM7|breakfast at a funeral                  |13         

In [35]:
schema_albums = StructType([
    StructField("album_type", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("album_total_tracks", IntegerType(), True),
    StructField("album_images", StringType(), True),
    StructField("album_release_date", StringType(), True),
    StructField("album_release_date_precision", StringType(), True),
    StructField("album_artists", ArrayType(StringType()), True),
    StructField("album_tracks", ArrayType(StringType()), True),
    StructField("album_copyrights", ArrayType(StringType()), True),
    StructField("album_label", StringType(), True),
    StructField("album_popularity", IntegerType(), True)
])

albums_source = all_albums_metadata_full.get("albums", []) if isinstance(all_albums_metadata_full, dict) else all_albums_metadata_full

albums_flat = []
for a in albums_source:
    if a is None: continue
    
    artist_names = [art.get("name") for art in a.get("artists", [])]
    track_names  = [t.get("name") for t in a.get("tracks", {}).get("items", [])]
    copyrights   = [c.get("text") for c in a.get("copyrights", [])]
    
    first_image = a.get("images")[0].get("url") if a.get("images") else None

    albums_flat.append({
        "album_uri": a.get("uri"),
        "album_total_tracks": a.get("total_tracks"),
        "album_images": first_image,
        "album_release_date": a.get("release_date"),
        "album_release_date_precision": a.get("release_date_precision"),
        "album_artists": artist_names,
        "album_tracks": track_names,
        "album_copyrights": copyrights,
        "album_label": a.get("label"),
        "album_popularity": a.get("popularity")
    })

df_extraInfo_albums = spark.createDataFrame(albums_flat, schema=schema_albums)
df_extraInfo_albums.show(5, truncate=True)

25/12/21 11:42:38 WARN TaskSetManager: Stage 129 contains a task of very large size (3650 KiB). The maximum recommended task size is 1000 KiB.


+----------+--------------------+------------------+--------------------+------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+----------------+
|album_type|           album_uri|album_total_tracks|        album_images|album_release_date|album_release_date_precision|       album_artists|        album_tracks|    album_copyrights|         album_label|album_popularity|
+----------+--------------------+------------------+--------------------+------------------+----------------------------+--------------------+--------------------+--------------------+--------------------+----------------+
|      NULL|spotify:album:6FT...|                11|https://i.scdn.co...|        2017-10-31|                         day|            [WILLOW]|[Boy, An Awkward ...|[© 2017 MSFTSMusi...|Universal Music G...|               0|
|      NULL|spotify:album:0pJ...|                 1|https://i.scdn.co...|        2017-03-27|                

                                                                                

In [36]:
schema_artists = StructType([
    StructField("artist_uri", StringType(), True),
    StructField("artist_name", StringType(), True),
    StructField("artist_followers_total", LongType(), True),
    StructField("artist_genres", ArrayType(StringType()), True),
    StructField("artist_image", StringType(), True),
    StructField("artist_popularity", IntegerType(), True)
])

artists_source = all_artists_metadata_full.get("artists", []) if isinstance(all_artists_metadata_full, dict) else all_artists_metadata_full

artists_flat = []
for art in artists_source:
    if art is None: continue
    
    # Extrair a URL da primeira imagem disponível
    images = art.get("images", [])
    first_image = images[0].get("url") if images else None
    
    # Extrair o total de seguidores (campo aninhado)
    followers_dict = art.get("followers", {})
    total_followers = followers_dict.get("total") if isinstance(followers_dict, dict) else 0

    artists_flat.append({
        "artist_uri": art.get("uri"),
        "artist_name": art.get("name"),
        "artist_followers_total": total_followers,
        "artist_genres": art.get("genres", []),
        "artist_image": first_image,
        "artist_popularity": art.get("popularity")
    })

df_extraInfo_artists = spark.createDataFrame(artists_flat, schema=schema_artists)
df_extraInfo_artists.show(5, truncate=True)

+--------------------+--------------------+----------------------+--------------------+--------------------+-----------------+
|          artist_uri|         artist_name|artist_followers_total|       artist_genres|        artist_image|artist_popularity|
+--------------------+--------------------+----------------------+--------------------+--------------------+-----------------+
|spotify:artist:4Y...|      Kevian Kraemer|                 72240|                  []|https://i.scdn.co...|               47|
|spotify:artist:7m...|        Jimmy Fallon|                 83349|         [christmas]|https://i.scdn.co...|               56|
|spotify:artist:6s...|         Zach Herron|                134592|                  []|https://i.scdn.co...|               26|
|spotify:artist:2Q...|Bob Marley & The ...|              13494959|[reggae, roots re...|https://i.scdn.co...|               81|
|spotify:artist:48...|         Asher Angel|                197606|                  []|https://i.scdn.co...|   

                                                                                

In [37]:
# Criar a coluna de ligação para os artistas
df_extraInfo = df_tracks_metadata.withColumn(
    "main_artist_uri", 
    col("artist_uris_list")[0]
)

In [38]:
# JOIN com Faixas (Nomes diferentes: spotify_track_uri vs track_uri)
df_extraInfo = df_extraInfo.join(
    df_extraInfo_tracks, 
    df_extraInfo["spotify_track_uri"] == df_extraInfo_tracks["track_uri"], 
    how="left"
).drop(df_extraInfo_tracks["track_uri"]) # Removemos a duplicada após o join

In [39]:
# JOIN com Álbuns (Nomes diferentes: album_uri vs album_uri - se os nomes forem iguais o "on" funciona)
# Vou assumir que no df_extraInfo a coluna se chama 'album_uri'
df_extraInfo = df_extraInfo.join(
    df_extraInfo_albums, 
    on="album_uri", 
    how="left"
)

In [40]:
# JOIN com Artistas
df_extraInfo_final = df_extraInfo.join(
    df_extraInfo_artists,
    df_extraInfo["main_artist_uri"] == df_extraInfo_artists["artist_uri"],
    how="left"
).drop(df_extraInfo_artists["artist_uri"]).drop("main_artist_uri")

In [41]:
# 1. Definir o caminho (convertendo Path para string, que o Spark exige)
path_processed_extraInfo = str(path_api_processed / "extra_info")

# 2. Guardar usando o Spark
# .coalesce(1) garante que os dados sejam unidos num único ficheiro dentro da pasta
df_extraInfo_final.coalesce(1).write.mode("overwrite") \
    .option("encoding", "UTF-8") \
    .json(path_processed_extraInfo)

print(f"\n✅ DataFrame 'df_extraInfo' guardado em: {path_processed_extraInfo}")

25/12/21 11:42:58 WARN TaskSetManager: Stage 131 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.
25/12/21 11:43:03 WARN TaskSetManager: Stage 132 contains a task of very large size (1002 KiB). The maximum recommended task size is 1000 KiB.
25/12/21 11:43:08 WARN TaskSetManager: Stage 134 contains a task of very large size (3650 KiB). The maximum recommended task size is 1000 KiB.
                                                                                


✅ DataFrame 'df_extraInfo' guardado em: /home/marta/spotify-data-streaming-project/data/processed/extra_info


In [42]:
df_extraInfo_final.cache()

DataFrame[album_uri: string, spotify_track_uri: string, artist_uris_list: array<string>, track_name: string, track_popularity: int, track_duration_ms: bigint, track_is_explicit: boolean, track_number: int, track_disc_number: int, track_is_playable: boolean, album_type: string, album_total_tracks: int, album_images: string, album_release_date: string, album_release_date_precision: string, album_artists: array<string>, album_tracks: array<string>, album_copyrights: array<string>, album_label: string, album_popularity: int, artist_name: string, artist_followers_total: bigint, artist_genres: array<string>, artist_image: string, artist_popularity: int]

In [None]:
print("✅ Join concluído com sucesso! Total rows: ", df_extraInfo_final.count())

25/12/20 20:59:40 WARN TaskSetManager: Stage 145 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.
25/12/20 20:59:40 WARN TaskSetManager: Stage 146 contains a task of very large size (1002 KiB). The maximum recommended task size is 1000 KiB.
25/12/20 20:59:41 WARN TaskSetManager: Stage 148 contains a task of very large size (3650 KiB). The maximum recommended task size is 1000 KiB.
[Stage 158:>                                                      (0 + 2) / 200]

In [43]:
df_extraInfo_final.printSchema()

root
 |-- album_uri: string (nullable = true)
 |-- spotify_track_uri: string (nullable = false)
 |-- artist_uris_list: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- track_name: string (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- track_duration_ms: long (nullable = true)
 |-- track_is_explicit: boolean (nullable = true)
 |-- track_number: integer (nullable = true)
 |-- track_disc_number: integer (nullable = true)
 |-- track_is_playable: boolean (nullable = true)
 |-- album_type: string (nullable = true)
 |-- album_total_tracks: integer (nullable = true)
 |-- album_images: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- album_release_date_precision: string (nullable = true)
 |-- album_artists: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- album_tracks: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- album_copyrights: array (nullable = true)

In [None]:
df_extraInfo_final.show(10, truncate=False)

25/12/21 11:44:17 WARN TaskSetManager: Stage 145 contains a task of very large size (3650 KiB). The maximum recommended task size is 1000 KiB.
[Stage 145:>  (0 + 2) / 2][Stage 146:>  (0 + 0) / 2][Stage 147:>  (0 + 0) / 2]2]

In [None]:
# # TEMPORARIOS - APAGAR MAIS TARDE

# # 1. Definir o caminho da pasta
# path_api_processed = Path(project_root) / "data" / "processed"
# path_to_read = str(path_api_processed / "extra_info")

# # 2. Ler a pasta para o DataFrame
# df_extraInfo_final = spark.read.json(path_to_read)

# # 3. Verificar o resultado
# df_extraInfo_final.show(5)
# print(f"Total de registos: {df_extraInfo_final.count()}")

In [None]:
# Juntar o DataFrame de Histórico (df_streamingHistory) com os metadados
df_streamingHistory = df_streamingHistory.join(
    df_extraInfo_final, 
    on="spotify_track_uri", 
    how="left"
)

print(f"\n✅ DataFrame 'df_streamingHistory' atualizado com metadados adicionais.")

In [None]:
# Reduz o número de partições de shuffle (o padrão é 200, o que é demais para o teu PC)
spark.conf.set("spark.sql.shuffle.partitions", "4")

# 1. Definir o caminho
path_processed_streamingHistory = str(path_api_processed / "streaming_history")

# 2. Guardar sem forçar o coalesce(1) imediatamente (evita mover tudo para um só núcleo de uma vez)
# Guardar primeiro em Parquet é MUITO mais leve para a ligação WSL
temp_path = str(path_api_processed / "temp_streaming_history")

df_streamingHistory.write.mode("overwrite").parquet(temp_path)

# 3. Agora que está em disco de forma estável, converte para JSON
spark.read.parquet(temp_path).coalesce(1).write.mode("overwrite").json(path_processed_streamingHistory)

print(f"\n✅ Concluído com sucesso!")

# Fim