In [1]:
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import desc, count, col, sum as Fsum, when
import json, pprint
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from requests.exceptions import ReadTimeout, ConnectionError
import pprint
import time
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import sys

In [2]:
spark = SparkSession.builder.appName("SpotifyStreamingHistory").getOrCreate()
print(spark)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/18 10:17:38 WARN Utils: Your hostname, MartaPC, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/18 10:17:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/18 10:17:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<pyspark.sql.session.SparkSession object at 0x74075b1cef90>


In [3]:
project_root = Path().resolve().parents[0]
input_path = str(project_root / "data" / "raw" / "streaming_history")
df_streamingHistory = spark.read.option("multiline", "true").json(input_path)

                                                                                

In [4]:
print("Total values: ", df_streamingHistory.count())
print("Distincted values:", df_streamingHistory.distinct().count())

                                                                                

Total values:  176243


[Stage 6:>                                                          (0 + 2) / 2]

Distincted values: 176071


                                                                                

In [5]:
df_streamingHistory.printSchema()

root
 |-- audiobook_chapter_title: string (nullable = true)
 |-- audiobook_chapter_uri: string (nullable = true)
 |-- audiobook_title: string (nullable = true)
 |-- audiobook_uri: string (nullable = true)
 |-- conn_country: string (nullable = true)
 |-- episode_name: string (nullable = true)
 |-- episode_show_name: string (nullable = true)
 |-- incognito_mode: boolean (nullable = true)
 |-- ip_addr: string (nullable = true)
 |-- master_metadata_album_album_name: string (nullable = true)
 |-- master_metadata_album_artist_name: string (nullable = true)
 |-- master_metadata_track_name: string (nullable = true)
 |-- ms_played: long (nullable = true)
 |-- offline: boolean (nullable = true)
 |-- offline_timestamp: long (nullable = true)
 |-- platform: string (nullable = true)
 |-- reason_end: string (nullable = true)
 |-- reason_start: string (nullable = true)
 |-- shuffle: boolean (nullable = true)
 |-- skipped: boolean (nullable = true)
 |-- spotify_episode_uri: string (nullable = true)
 |

Verifica√ß√£o de valores repetidos (reparei que h√° linhas com o mesmo 'ts' e isso n√£o pode acontecer...)

In [6]:
windowSpec = Window.partitionBy("ts").orderBy("spotify_track_uri")
df_with_duplicatesNum = df_streamingHistory.withColumn("ts_count", count("*").over(windowSpec))
# este dataframe √© apenas para visualizar e analisar os valores duplicados
df_with_duplicates = df_with_duplicatesNum.filter(col("ts_count") > 1)
duplicates_num = df_with_duplicates.count()
df_with_duplicates.orderBy("ts_count", "ts").show(truncate=False)

                                                                                

+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+---------------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+---------------------------------------------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+--------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr        |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|ms_played|offline|offline_timestamp|platform                                     |reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |ts_count|
+-----------------------+---------------------+---------------+-------------+-----------

Raz√µes para isto poder ter acontecido:
- Dados Agrupados (Batching): O cliente Spotify (aplica√ß√£o) enviou um lote de eventos para o servidor na mesma altura.
- A√ß√µes R√°pidas/Simult√¢neas: O utilizador fez uma a√ß√£o muito r√°pida (por exemplo, deu skip √† m√∫sica duas vezes seguidas no mesmo segundo).
- Dados "Duplicados" (T√©cnico): N√£o s√£o duplicados no sentido estrito (as m√∫sicas s√£o diferentes), mas s√£o eventos simult√¢neos que partilham a chave ts.

Os valores duplicados t√™m de ser removidos apenas se possuirem o mesmo 'ts' e a m√∫sica for id√™ntica (valores duplicados). H√° casos em que o 'ts' √© igual, mas s√£o m√∫sicas diferentes, ter isso em considera√ß√£o!

In [7]:
before_count = df_streamingHistory.count()
df_streamingHistory = df_streamingHistory.dropDuplicates()
after_count = df_streamingHistory.count()
print(f"'ts' duplicates count: {duplicates_num}")
print(f"Removed {before_count - after_count} duplicate values.")

[Stage 22:>                                                         (0 + 2) / 2]

'ts' duplicates count: 18209
Removed 172 duplicate values.


                                                                                

Verificar outra vez os valores duplicados n√£o t√™m a mesma m√∫sica duplicada

In [8]:
df_with_duplicatesNum = df_streamingHistory.withColumn("ts_count", count("*").over(windowSpec))
# este dataframe √© apenas para visualizar e analisar os valores duplicados
df_with_duplicates = df_with_duplicatesNum.filter(col("ts_count") > 1)
df_with_duplicates.orderBy("ts_count", "ts").show(truncate=False)

[Stage 30:>                                                         (0 + 2) / 2]

+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+---------------+-----------------------------------------------------+---------------------------------+-------------------------------------------------------------------------+---------+-------+-----------------+---------------------------------------------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+--------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr        |master_metadata_album_album_name                     |master_metadata_album_artist_name|master_metadata_track_name                                               |ms_played|offline|offline_timestamp|platform                                     |reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri   

                                                                                

In [9]:
df_streamingHistory_final = df_streamingHistory.withColumn("ts", F.to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))
df_streamingHistory_final.show(truncate=False)

                                                                                

+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+--------------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+----------------------------------+----------+------------+-------+-------+-------------------+------------------------------------+-------------------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr       |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|ms_played|offline|offline_timestamp|platform                          |reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                 |
+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+-----------

In [10]:
df_streamingHistory.orderBy(desc("ts")).show(truncate=False)



+-----------------------+---------------------+---------------+-------------+------------+------------+-----------------+--------------+--------------------------------------+--------------------------------+---------------------------------+------------------------------------------+---------+-------+-----------------+--------+----------------------------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr                               |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name                |ms_played|offline|offline_timestamp|platform|reason_end                  |reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+-----------------------+---------------------+---------------+---

                                                                                

In [11]:
print("Describe: ")
df_streamingHistory.describe().show()

Describe: 


25/12/18 10:19:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------------+---------------------+---------------+-------------+------------+--------------------+-----------------+-------------+---------------------------------+---------------------------------+--------------------------+------------------+--------------------+--------------------+----------+------------+--------------------+--------------------+--------------------+
|summary|audiobook_chapter_title|audiobook_chapter_uri|audiobook_title|audiobook_uri|conn_country|        episode_name|episode_show_name|      ip_addr| master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|         ms_played|   offline_timestamp|            platform|reason_end|reason_start| spotify_episode_uri|   spotify_track_uri|                  ts|
+-------+-----------------------+---------------------+---------------+-------------+------------+--------------------+-----------------+-------------+---------------------------------+-------------------------

                                                                                

Existem colunas que n√£o possuem valores nenhuns. Tratar disso

In [12]:
# Conta valores n√£o nulos por coluna
non_null_counts = df_streamingHistory.select([
    Fsum(when(col(c).isNotNull(), 1).otherwise(0)).alias(c)
    for c in df_streamingHistory.columns
]).collect()[0].asDict()

# Filtra colunas cujo count > 0
cols_to_keep = [c for c, v in non_null_counts.items() if v > 0]
cols_to_drop = [c for c, v in non_null_counts.items() if v == 0]

print("Colunas a remover (sem valores):", cols_to_drop)
print("Total:", len(cols_to_drop))

# Cria novo DataFrame sem as colunas vazias
df_streamingHistory = df_streamingHistory.select(cols_to_keep)

[Stage 48:>                                                         (0 + 2) / 2]

Colunas a remover (sem valores): ['audiobook_chapter_title', 'audiobook_chapter_uri', 'audiobook_title', 'audiobook_uri']
Total: 4


                                                                                

### Analisar os valores das colunas que existem do dataframe do hist√≥rico

Country (conn_country)

In [13]:
conn_country_df = df_streamingHistory.select("conn_country").distinct()
count_num = conn_country_df.count()
print(f"\nDistinct values for conn_country ({count_num} total):")
conn_country_df.show(count_num, truncate=False)

[Stage 52:>                                                         (0 + 2) / 2]


Distinct values for conn_country (8 total):


                                                                                

+------------+
|conn_country|
+------------+
|NL          |
|ZZ          |
|PT          |
|ES          |
|JP          |
|AT          |
|GB          |
|DE          |
+------------+



Supostamente n√£o estive a ouvir m√∫sica nestes pa√≠ses todos. Investigar

In [14]:
country_counts = (
    df_streamingHistory.groupBy("conn_country")
      .agg(count("*").alias("num_streams"))
      .orderBy(col("num_streams").desc())
)
country_counts.show(truncate=False)

[Stage 63:>                                                         (0 + 2) / 2]

+------------+-----------+
|conn_country|num_streams|
+------------+-----------+
|PT          |175779     |
|ES          |138        |
|ZZ          |98         |
|DE          |27         |
|NL          |20         |
|GB          |5          |
|AT          |3          |
|JP          |1          |
+------------+-----------+



                                                                                

In [15]:
df_streamingHistory.filter(col("conn_country") == "NL") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

df_streamingHistory.filter(col("conn_country") == "GB") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

df_streamingHistory.filter(col("conn_country") == "AT") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

# Verifica√ß√£o das linhas que t√™m a localiza√ß√£o no Jap√£o (nunca l√° estive)
df_streamingHistory.filter(col("conn_country") == "JP") \
  .select("*") \
  .orderBy("ts") \
  .show(10, truncate=False)

                                                                                

+------------+------------+-----------------+--------------+--------------+--------------------------------+---------------------------------+------------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr       |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name          |ms_played|offline|offline_timestamp|platform|reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+--------------+--------------------------------+---------------------------------+------------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------------------------

                                                                                

+------------+------------+-----------------+--------------+-----------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+-------------------------------------------+----------------------------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr    |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name|ms_played|offline|offline_timestamp|platform                                   |reason_end                  |reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+-----------+--------------------------------+---------------------------------+--------------------------+---------+-------+-----------------+--------------------------------

                                                                                

+------------+------------+-----------------+--------------+------------+--------------------------------+---------------------------------+----------------------------------------+---------+-------+-----------------+--------+----------------------------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr     |master_metadata_album_album_name|master_metadata_album_artist_name|master_metadata_track_name              |ms_played|offline|offline_timestamp|platform|reason_end                  |reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+------------+--------------------------------+---------------------------------+----------------------------------------+---------+-------+-----------------+--------+----------------------------+------------+------



+------------+------------+-----------------+--------------+-------------+--------------------------------------------+---------------------------------+-------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------------------------------+--------------------+
|conn_country|episode_name|episode_show_name|incognito_mode|ip_addr      |master_metadata_album_album_name            |master_metadata_album_artist_name|master_metadata_track_name     |ms_played|offline|offline_timestamp|platform|reason_end|reason_start|shuffle|skipped|spotify_episode_uri|spotify_track_uri                   |ts                  |
+------------+------------+-----------------+--------------+-------------+--------------------------------------------+---------------------------------+-------------------------------+---------+-------+-----------------+--------+----------+------------+-------+-------+-------------------+------------

                                                                                

O que pode ter acontecido √© o Spotify ter registado o *conn_country* como JP (Jap√£o) porque o IP p√∫blico no momento da sincroniza√ß√£o ou da stream estava associado a um servidor da M247 localizado em T√≥quio ‚Äî mesmo estando em Portugal

Incognito Mode e Offline

In [16]:
reason_end_df = df_streamingHistory.select("incognito_mode").distinct()
count_num = reason_end_df.count()
print(f"\nDistinct values for reason_end ({count_num} total):")
reason_end_df.show(count_num, truncate=False)

                                                                                


Distinct values for reason_end (2 total):


[Stage 85:>                                                         (0 + 2) / 2]

+--------------+
|incognito_mode|
+--------------+
|true          |
|false         |
+--------------+



                                                                                

Offline

In [17]:
df_reason_end = df_streamingHistory.select("offline").distinct()
count_num = df_reason_end.count()
print(f"\nDistinct values for incognito_mode ({count_num} total):")
df_reason_end.show(count_num, truncate=False)


Distinct values for incognito_mode (3 total):
+-------+
|offline|
+-------+
|true   |
|false  |
|NULL   |
+-------+



Reason End

In [18]:
df_reason_end = df_streamingHistory.select("reason_end").distinct()
count_num = df_reason_end.count()
print(f"\nDistinct values for reason_end ({count_num} total):")
df_reason_end.show(count_num, truncate=False)

                                                                                


Distinct values for reason_end (12 total):
+----------------------------+
|reason_end                  |
+----------------------------+
|backbtn                     |
|logout                      |
|trackdone                   |
|unknown                     |
|fwdbtn                      |
|endplay                     |
|trackerror                  |
|unexpected-exit             |
|unexpected-exit-while-paused|
|remote                      |
|appload                     |
|                            |
+----------------------------+



Reason Start

In [19]:
reason_start_df = df_streamingHistory.select("reason_start").distinct()
count_num = reason_start_df.count()
print(f"\nDistinct values for reason_start ({count_num} total):")
reason_start_df.show(count_num, truncate=False)

                                                                                


Distinct values for reason_start (12 total):
+-----------------+
|reason_start     |
+-----------------+
|appload          |
|backbtn          |
|trackdone        |
|playbtn          |
|unknown          |
|fwdbtn           |
|trackerror       |
|clickrow         |
|remote           |
|switched-to-video|
|switched-to-audio|
|                 |
+-----------------+



                                                                                

## Agregar dados do hist√≥rico com extra info

Agora quero juntar os dados do hist√≥rico completo mais informa√ß√£o adicional sobre m√∫sicas, √°lbuns e artistas, para complementar informa√ß√£o.

Adicionar mais informa√ß√£o relativamente √†s tracks atrav√©s de API calls do Spotipy ao dataframe "principal" - *df_streamingHistory*

In [20]:
load_dotenv()

scopes = [
    "user-library-read",
    "user-read-playback-state",
    "user-read-currently-playing",
    "user-follow-read",
    "user-read-recently-played",
    "user-top-read"
]
scope = " ".join(scopes)

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope, open_browser=True))

In [21]:
path_api_raw = Path(project_root) / "data" / "raw"
path_api_processed = Path(project_root) / "data" / "processed"
print("path_api_raw:", path_api_raw)
print("path_api_processed:", path_api_processed)

path_api_raw: /home/marta/spotify-data-streaming-project/data/raw
path_api_processed: /home/marta/spotify-data-streaming-project/data/processed


Este c√≥digo √© apenas um teste √†s API calls com o uso dos URIs de tracks, albums e artists.E tamb√©m serve para ver a estrutura do json e me ajudar a extrair apenas a informa√ß√£o necess√°ria e relevante

In [22]:
# tracks_info_path = path_api_raw / "streaming_history/extra_info/tracks_info_data.json"
# artists_info_path = path_api_raw / "streaming_history/extra_info/artists_info_data.json"
# albums_info_path = path_api_raw / "streaming_history/extra_info/albums_info_data.json"

# tracks_uris = ['spotify:track:0Lhy1sDDdtUFY2pyq2ZVi9']
# artists_uris = ["spotify:artist:4vxaQs6vK54nK89J1VtLex"]
# albums_uris = ["spotify:album:2sB7ZYdnxqjVOh3ETtx19Z"]

# try:
#     tracks_data = sp.tracks(tracks_uris)
#     albums_data = sp.albums(albums_uris)
#     artists_data = sp.artists(artists_uris)

#     with open(tracks_info_path, 'w', encoding='utf-8') as f:
#         json.dump(tracks_data, f, indent=2, ensure_ascii=False)
#     print(f"‚úÖ Faixas salvas em: {tracks_info_path}")

#     with open(albums_info_path, 'w', encoding='utf-8') as f:
#         json.dump(albums_data, f, indent=2, ensure_ascii=False)
#     print(f"‚úÖ √Ålbuns salvos em: {albums_info_path}")

#     with open(artists_info_path, 'w', encoding='utf-8') as f:
#         json.dump(artists_data, f, indent=2, ensure_ascii=False)
#     print(f"‚úÖ Artistas salvos em: {artists_info_path}")

# except spotipy.SpotifyException as e:
#     print(f"\nErro ao buscar dados da API: {e}")
# except NameError as e:
#     print(f"\nERRO: Certifique-se de que as vari√°veis 'sp' e 'path_api_raw' est√£o definidas. Detalhe: {e}")

√â importante obter os URIs √∫nicos para usar como par√¢metro nas API calls e obter, desta forma, a informa√ß√£o extra acerca das tracks, albums e artists

Lista de URIs das tracks (streaming tracks)

In [23]:
df_tracks_uris = df_streamingHistory.filter(col("spotify_track_uri").isNotNull())
tracks_uris_list = [row.spotify_track_uri for row in df_tracks_uris.select("spotify_track_uri").distinct().collect()]

print("‚úÖ Processo Conclu√≠do.")
print(tracks_uris_list[:5])
print(f"N√∫mero de URIs √öNICOS encontrados: {len(tracks_uris_list)}")

                                                                                

‚úÖ Processo Conclu√≠do.
['spotify:track:0Lhy1sDDdtUFY2pyq2ZVi9', 'spotify:track:79xNgIQLEnhdcz7LuSLM8T', 'spotify:track:4Oz9NAlujs00kYGL4WTUNw', 'spotify:track:2XkDm5m2vPowecEAAR5gmb', 'spotify:track:27sBcXtgTBSJRdUxei1a7J']
N√∫mero de URIs √öNICOS encontrados: 26887


Fun√ß√£o para obter informa√ß√£o para, mais tarde, extrair as URIs dos albums e dos artists correspondentes √†s tracks

In [24]:
def get_data(uris_list, type, successful_sleep=0.2):
    """Divide a lista de URIs em grupos, chama a API e monitora o progresso."""
    results = []
    BATCH_SIZE = 20
    total_uris = len(uris_list)
    
    if type == "tracks":
        sp_function, key = sp.tracks, 'tracks'
    elif type == "albums":
        sp_function, key = sp.albums, 'albums'
    elif type == "artists":
        sp_function, key = sp.artists, 'artists' 
    else:
        print(f"Tipo desconhecido: {type}")
        return results

    milestones = {10: False, 20: False, 30: False, 40: False, 50: False, 60: False, 70: False, 80: False, 90: False}

    for i in range(0, total_uris, BATCH_SIZE):
        batch = uris_list[i:i + BATCH_SIZE]
        uris_processados = i + len(batch)
        
        percent_done = (uris_processados / total_uris) * 100
        for milestone, reached in milestones.items():
            if not reached and percent_done >= milestone:
                print(f"[{type.upper()}] ‚úÖ Progresso: {milestone}% conclu√≠do ({uris_processados}/{total_uris} URIs)")
                milestones[milestone] = True

        try:
            response = sp_function(batch)

            if key in response:
                results.extend([item for item in response[key] if item is not None])
            
            if successful_sleep > 0:
                time.sleep(successful_sleep)
                
        except (spotipy.SpotifyException, ReadTimeout, ConnectionError) as e:
            error_msg = str(e)
            print(f"‚ö†Ô∏è Erro na chamada da API ({type} Batch {i//BATCH_SIZE}, IDs: {batch[0]}...): {error_msg}")
            if '429' in error_msg or 'timed out' in error_msg or 'RemoteDisconnected' in error_msg:
                print("Dormindo por 10 segundos devido a erro de rede/servidor...")
                time.sleep(10)
            continue
            
    print(f"[{type.upper()}] üèÅ 100% conclu√≠do.")
    return results

In [None]:
all_tracks_metadata_path = path_api_raw / "streaming_history/extra_info/all_tracks_metadata.json"

# 1. Verificar se a lista de entrada existe
if 'tracks_uris_list' in locals() or 'tracks_uris_list' in globals():

    # 2. Se o ficheiro N√ÉO existe, faz a chamada √† API
    if not all_tracks_metadata_path.exists():
        print(f"üöÄ Ficheiro n√£o encontrado. Total de URIs √∫nicos para procurar: {len(tracks_uris_list)}")
        all_tracks_metadata = get_data(tracks_uris_list, type="tracks")
        
        if all_tracks_metadata:
            print(f"Total de faixas recolhidas: {len(all_tracks_metadata)}")
            try:
                # Garante que a pasta existe antes de gravar
                all_tracks_metadata_path.parent.mkdir(parents=True, exist_ok=True)
                
                with open(all_tracks_metadata_path, 'w', encoding='utf-8') as f:
                    json.dump(all_tracks_metadata, f, indent=2, ensure_ascii=False)
                print(f"‚úÖ Dados salvos com sucesso em: {all_tracks_metadata_path}")
                    
            except Exception as e:
                print(f"\n‚ùå ERRO ao tentar salvar o ficheiro JSON: {e}")
        else:
            print("\n‚ö†Ô∏è AVISO: Nenhum dado foi retornado pela API.")

    # 3. Se o ficheiro J√Å existe, carrega os dados para a mem√≥ria
    else:
        try:
            with open(all_tracks_metadata_path, 'r', encoding='utf-8') as f:
                all_tracks_metadata = json.load(f)
            print(f"‚ú® Dados carregados do disco: {len(all_tracks_metadata)} faixas. Nenhuma chamada √† API foi necess√°ria.")
        except Exception as e:
            print(f"‚ùå ERRO ao ler o ficheiro existente: {e}")

else:
    print("‚ùå ERRO: A vari√°vel 'tracks_uris_list' n√£o foi definida.")

Total de URIs √∫nicos na entrada: 26887
[TRACKS] ‚úÖ Progresso: 10% conclu√≠do (2700/26887 URIs)
[TRACKS] ‚úÖ Progresso: 20% conclu√≠do (5380/26887 URIs)
[TRACKS] ‚úÖ Progresso: 30% conclu√≠do (8080/26887 URIs)
[TRACKS] ‚úÖ Progresso: 40% conclu√≠do (10760/26887 URIs)
[TRACKS] ‚úÖ Progresso: 50% conclu√≠do (13460/26887 URIs)
[TRACKS] ‚úÖ Progresso: 60% conclu√≠do (16140/26887 URIs)
[TRACKS] ‚úÖ Progresso: 70% conclu√≠do (18840/26887 URIs)
[TRACKS] ‚úÖ Progresso: 80% conclu√≠do (21520/26887 URIs)
[TRACKS] ‚úÖ Progresso: 90% conclu√≠do (24200/26887 URIs)
[TRACKS] üèÅ 100% conclu√≠do.
Total de faixas com metadados: 26887
‚úÖ Dados de 26887 faixas salvos com sucesso em: /home/marta/spotify-data-streaming-project/data/raw/streaming_history/extra_info/all_track_metadata.json


√â preciso a lista de URIs dos albums e artists para a chamada das API calls *sp.albums()* e *sp.artists()*

In [None]:
album_uri_map = {} 
artists_uris_map = {}

for track_data in all_tracks_metadata:
    if track_data:
        track_uri = track_data['uri']
        
        album_uri = track_data.get("album", {}).get("uri", None)        
        if album_uri:
            album_uri_map[track_uri] = album_uri
            
        artists_list = track_data.get("artists", [])
        artist_uris = [artist.get("uri") for artist in artists_list if "uri" in artist]        
        if artist_uris:
            artists_uris_map[track_uri] = artist_uris

            
all_unique_artist_uris = []

for artist_list in artists_uris_map.values():
    all_unique_artist_uris.extend(artist_list)

unique_artist_count = len(set(all_unique_artist_uris))

print(f"Total de Faixas Mapeadas para √Ålbuns ({len(album_uri_map)}) e Artists ({len(artists_uris_map)})")
print(f"Total de URIs de √Ålbum √önicos ({len(set(album_uri_map.values()))}) e Artists ({unique_artist_count})")

if album_uri_map and artists_uris_map:
    albums_amostra = dict(list(album_uri_map.items())[:5])
    pprint.pprint(albums_amostra)
    artists_amostra = dict(list(artists_uris_map.items())[:5])
    pprint.pprint(artists_amostra)

Total de Faixas Mapeadas para √Ålbuns (26887) e Artists (26887)
Total de URIs de √Ålbum √önicos (19312) e Artists (9643)
{'spotify:track:0Lhy1sDDdtUFY2pyq2ZVi9': 'spotify:album:0sWQFBl9e6A9tL4CqFT9X8',
 'spotify:track:27sBcXtgTBSJRdUxei1a7J': 'spotify:album:7tJ8Wtej161vR0uCbGDiDR',
 'spotify:track:2XkDm5m2vPowecEAAR5gmb': 'spotify:album:2yI4m5Yu2tl8v0It5P9WVz',
 'spotify:track:4Oz9NAlujs00kYGL4WTUNw': 'spotify:album:1WqEP2K5Q0Vei7AVXb8Z1l',
 'spotify:track:79xNgIQLEnhdcz7LuSLM8T': 'spotify:album:3fnXTtLgkjHv2HmqOODSW8'}
{'spotify:track:0Lhy1sDDdtUFY2pyq2ZVi9': ['spotify:artist:5FxD8fkQZ6KcsSYupDVoSO',
                                          'spotify:artist:6JGszm6z8oHmeLbxPHjMao',
                                          'spotify:artist:1rHOtdmGNr5vcYNw5v7QGC'],
 'spotify:track:27sBcXtgTBSJRdUxei1a7J': ['spotify:artist:0NB5HROxc8dDBXpkIi1v3d'],
 'spotify:track:2XkDm5m2vPowecEAAR5gmb': ['spotify:artist:7pbDxGE6nQSZVfiFdq9lOL'],
 'spotify:track:4Oz9NAlujs00kYGL4WTUNw': ['spotify:artis

In [27]:
track_metadata_records = []

for track_uri, album_uri in album_uri_map.items():
    
    artists_list_for_track = artists_uris_map.get(track_uri, [])
    
    record = {
        "spotify_track_uri": track_uri,
        "album_uri": album_uri,
        "artist_uris_list": artists_list_for_track # Lista de URIs de Artista
    }
    track_metadata_records.append(record)

print(f"‚úÖ Convers√£o para {len(track_metadata_records)} registos Python conclu√≠da.")

‚úÖ Convers√£o para 26887 registos Python conclu√≠da.


In [None]:
track_schema = StructType([
    StructField("spotify_track_uri", StringType(), False),
    StructField("album_uri", StringType(), True),
    StructField("artist_uris_list", ArrayType(StringType()), True) # Um-para-Muitos
])

df_tracks_metadata = spark.createDataFrame(track_metadata_records, schema=track_schema)

print(f"\n‚úÖ Cria√ß√£o do DataFrame PySpark 'df_tracks_metadata' conclu√≠da.")
df_tracks_metadata.printSchema()
print("Total rows: ", df_tracks_metadata.count())
df_tracks_metadata.show(5, truncate=False)


‚úÖ Cria√ß√£o do DataFrame PySpark 'df_track_metadata' conclu√≠da.
root
 |-- spotify_track_uri: string (nullable = false)
 |-- album_uri: string (nullable = true)
 |-- artist_uris_list: array (nullable = true)
 |    |-- element: string (containsNull = true)



25/12/18 10:50:10 WARN TaskSetManager: Stage 118 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Total rows:  26887


25/12/18 10:50:14 WARN TaskSetManager: Stage 121 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.


+------------------------------------+------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|spotify_track_uri                   |album_uri                           |artist_uris_list                                                                                                     |
+------------------------------------+------------------------------------+---------------------------------------------------------------------------------------------------------------------+
|spotify:track:0Lhy1sDDdtUFY2pyq2ZVi9|spotify:album:0sWQFBl9e6A9tL4CqFT9X8|[spotify:artist:5FxD8fkQZ6KcsSYupDVoSO, spotify:artist:6JGszm6z8oHmeLbxPHjMao, spotify:artist:1rHOtdmGNr5vcYNw5v7QGC]|
|spotify:track:79xNgIQLEnhdcz7LuSLM8T|spotify:album:3fnXTtLgkjHv2HmqOODSW8|[spotify:artist:0nnYdIpahs41QiZ9MWp5Wx]                                                                              |
|spotify:track:4Oz9NAlujs00kYG

Adicionar agora informa√ß√£o extra de acordo com os URIs das tracks, albums e artists

In [None]:
album_uris_list = (
    df_tracks_metadata.select("album_uri").distinct()
    .filter(col("album_uri").isNotNull())
    .rdd.map(lambda row: row[0])
    .collect()
)

artists_uris_list = (
    df_tracks_metadata.select(col("artist_uris_list")[0].alias("artist_uri"))
    .filter(col("artist_uri").isNotNull())
    .distinct()
    .rdd.map(lambda row: row[0])
    .collect()
)

print(f"URIs √önicos para buscar: Tracks={len(tracks_uris_list)}, Albums={len(album_uris_list)}, Artists={len(artists_uris_list)}")

25/12/18 10:50:20 WARN TaskSetManager: Stage 122 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.
25/12/18 10:50:22 WARN TaskSetManager: Stage 125 contains a task of very large size (1788 KiB). The maximum recommended task size is 1000 KiB.


URIs √önicos para buscar: Tracks=26887, Albums=19312, Artists=6858


In [None]:
all_tracks_metadata_full = all_tracks_metadata.copy()

In [None]:
all_albums_metadata_path = path_api_raw / "streaming_history/extra_info/all_albums_metadata.json"

if all_albums_metadata_path.exists():
    try:
        with open(all_albums_metadata_path, 'r', encoding='utf-8') as f:
            all_albums_metadata_full = json.load(f)
        print(f"‚ú® Dados de √°lbuns carregados do disco: {len(all_albums_metadata_full)} √°lbuns. Nenhuma chamada √† API foi necess√°ria.")
    except Exception as e:
        print(f"‚ùå ERRO ao ler o ficheiro existente de √°lbuns: {e}")

else: 
    # Print informativo antes de come√ßar a extra√ß√£o pesada
    print(f"üöÄ Ficheiro de √°lbuns n√£o encontrado em: {all_albums_metadata_path}")
    print(f"üîç Iniciando extra√ß√£o via API para {len(album_uris_list)} URIs de √°lbuns...")
    
    all_albums_metadata_full = get_data(album_uris_list, type="albums")
    
    if all_albums_metadata_full:
        # Garante que a pasta existe (extra_info/)
        all_albums_metadata_path.parent.mkdir(parents=True, exist_ok=True)
        
        with open(all_albums_metadata_path, 'w', encoding='utf-8') as f:
            json.dump(all_albums_metadata_full, f, indent=2, ensure_ascii=False)
        
        # Print de sucesso ap√≥s gravar o ficheiro
        print(f"‚úÖ Extra√ß√£o conclu√≠da com sucesso! {len(all_albums_metadata_full)} √°lbuns guardados localmente.")
    else:
        print("‚ö†Ô∏è AVISO: A chamada de API para √°lbuns n√£o retornou resultados.")

‚ö†Ô∏è Erro na chamada da API (albums Batch 0, IDs: spotify:album:3in1TaZy4KApWHvxrf3NA7...): HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)
Dormindo por 10 segundos devido a erro de rede/servidor...
[ALBUMS] ‚úÖ Progresso: 10% conclu√≠do (1940/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 20% conclu√≠do (3880/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 30% conclu√≠do (5800/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 40% conclu√≠do (7740/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 50% conclu√≠do (9660/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 60% conclu√≠do (11600/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 70% conclu√≠do (13520/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 80% conclu√≠do (15460/19312 URIs)
[ALBUMS] ‚úÖ Progresso: 90% conclu√≠do (17400/19312 URIs)
[ALBUMS] üèÅ 100% conclu√≠do.


In [33]:
all_artists_metadata_path = path_api_raw / "streaming_history/extra_info/all_artists_metadata.json"

if all_artists_metadata_path.exists():
    try:
        with open(all_artists_metadata_path, 'r', encoding='utf-8') as f:
            all_artists_metadata_full = json.load(f)
        print(f"‚ú® Dados de artistas carregados do disco: {len(all_artists_metadata_full)} artistas. Nenhuma chamada √† API foi necess√°ria.")
    except Exception as e:
        print(f"‚ùå ERRO ao ler o ficheiro existente de artistas: {e}")

else:
    print(f"üöÄ Ficheiro de artistas n√£o encontrado em: {all_artists_metadata_path}")
    print(f"üîç Iniciando extra√ß√£o via API para {len(artists_uris_list)} URIs de artistas...")
    
    all_artists_metadata_full = get_data(artists_uris_list, type="artists")
    
    if all_artists_metadata_full:
        try:
            # Garante que a pasta 'extra_info' existe
            all_artists_metadata_path.parent.mkdir(parents=True, exist_ok=True)
            
            with open(all_artists_metadata_path, 'w', encoding='utf-8') as f:
                json.dump(all_artists_metadata_full, f, indent=2, ensure_ascii=False)
            
            # Print de sucesso final
            print(f"‚úÖ Extra√ß√£o conclu√≠da! {len(all_artists_metadata_full)} artistas guardados com sucesso.")
        except Exception as e:
            print(f"‚ùå ERRO ao salvar o ficheiro de artistas: {e}")
    else:
        print("‚ö†Ô∏è AVISO: A API n√£o retornou metadados para os artistas solicitados.")

üöÄ Ficheiro de artistas n√£o encontrado em: /home/marta/spotify-data-streaming-project/data/raw/streaming_history/extra_info/all_artists_metadata.json
üîç Iniciando extra√ß√£o via API para 6858 URIs de artistas...
‚ö†Ô∏è Erro na chamada da API (artists Batch 0, IDs: spotify:artist:4N874uPqBka1QiCvnCVOtr...): HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)
Dormindo por 10 segundos devido a erro de rede/servidor...
[ARTISTS] ‚úÖ Progresso: 10% conclu√≠do (700/6858 URIs)
[ARTISTS] ‚úÖ Progresso: 20% conclu√≠do (1380/6858 URIs)
[ARTISTS] ‚úÖ Progresso: 30% conclu√≠do (2060/6858 URIs)
‚ö†Ô∏è Erro na chamada da API (artists Batch 115, IDs: spotify:artist:6UqUYZ8pMGiYhuyIcsH61y...): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Dormindo por 10 segundos devido a erro de rede/servidor...
‚ö†Ô∏è Erro na chamada da API (artists Batch 116, IDs: spotify:artist:3KwmxIhSe9UTSEF37kwngR...): HTTPSConnectionPool(ho

In [None]:
df_track_lookup = spark.createDataFrame(all_tracks_metadata_full).select(
    col("uri").alias("spotify_track_uri"),
    col("track_number").alias("track_number"),
    
    col("popularity").alias("track_popularity_api"),
    col("explicit").alias("track_explicit_api")
)

In [None]:
df_album_lookup = spark.createDataFrame(all_albums_metadata_full).select(
    col("uri").alias("album_uri"), 
    col("release_date").alias("album_release_date"),
    col("album_type").alias("album_type_api"),
    col("label").alias("album_label")
)

In [None]:
df_artist_lookup = spark.createDataFrame(all_artists_metadata_full).select(
    col("uri").alias("artist_uri"), 
    col("name").alias("main_artist_name"),
    col("followers.total").alias("main_artist_followers"),
    col("genres").alias("main_artist_genres")
)

In [None]:
df_tracks_metadata_enriched = df_tracks_metadata.withColumn(
    "main_artist_uri", 
    col("artist_uris_list")[0] # Usando a sintaxe corrigida
)

# 2.2. JOIN com Faixas e √Ålbuns
df_tracks_metadata_enriched = df_tracks_metadata_enriched.join(df_track_lookup, on="spotify_track_uri", how="left")
df_tracks_metadata_enriched = df_tracks_metadata_enriched.join(df_album_lookup, on="album_uri", how="left")

# 2.3. JOIN com Artistas (Chave: main_artist_uri == artist_uri)
df_tracks_metadata_final = df_tracks_metadata_enriched.join(
    df_artist_lookup,
    df_tracks_metadata_enriched["main_artist_uri"] == df_artist_lookup["artist_uri"],
    how="left"
).drop(df_artist_lookup["artist_uri"]).drop("main_artist_uri")

# Atualizar o DataFrame de metadados
df_tracks_metadata = df_tracks_metadata_final

print("\n‚úÖ df_tracks_metadata totalmente enriquecido via JOINs PySpark.")
print("Total rows: ", df_tracks_metadata.count())
df_tracks_metadata.printSchema()

In [None]:
df_tracks_metadata = spark.createDataFrame(enriched_rows)

print("Total rows: ", df_tracks_metadata.count())
df_tracks_metadata.printSchema()
df_tracks_metadata.show(5, truncate=False)

In [None]:
# Juntar o DataFrame de Hist√≥rico (df_streamingHistory) com os metadados
df_streamingHistory = df_streamingHistory.join(
    df_tracks_metadata, 
    on="spotify_track_uri", 
    how="left"
)

df_streamingHistory.printSchema()
df_streamingHistory.show(5, truncate=False)

Ver o dataframe principal para adicionar nova info

In [None]:
df_streamingHistory.orderBy("ts").show(10, truncate=False)
df_streamingHistory.printSchema()
df_streamingHistory.describe().show()