In [0]:
from pyspark.sql import SparkSession
from pathlib import Path
from pyspark.sql.window import Window
from datetime import datetime
import pytz
import pyspark.sql.functions as F
from pyspark.sql.functions import desc, count, col, sum as Fsum, when, lit, current_timestamp
import json, pprint
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType, BooleanType, LongType
from requests.exceptions import ReadTimeout, ConnectionError
import pprint
import os
import time
import math
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import sys

In [0]:
# Obtain (or create) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("SpotifyStreamingHistory")
    .getOrCreate()
)
print(spark)

In [0]:
# List every volume inside the 'default' schema of the 'workspace' catalog.
display(spark.sql("SHOW VOLUMES IN workspace.default"))

In [0]:
# Glob that selects every "*Audio*.json" file of the raw streaming-history export.
folder_path = "/Volumes/workspace/default/spotify-data/streaming_history/raw/streaming_history/*Audio*.json"

# The files are read with the "multiline" option enabled (a JSON document may span several lines).
df_streamingHistory = (
    spark.read
    .option("multiline", "true")
    .json(folder_path)
)
print("Total values across all files: ", df_streamingHistory.count())
display(df_streamingHistory.limit(10))

In [0]:
# Show the schema Spark inferred from the JSON files.
df_streamingHistory.printSchema()

Verificação de valores repetidos (reparei que há linhas com o mesmo 'ts' e isso não pode acontecer...)

In [0]:
# Count how many rows share each 'ts' value.
# The window is intentionally NOT ordered: once a window has an ORDER BY, Spark
# uses a growing default frame (unbounded preceding -> current row), which turns
# count("*") into a running count. The first row of every shared-'ts' group would
# then get ts_count == 1 and be dropped by the "> 1" filter below.
windowSpec = Window.partitionBy("ts")
df_with_duplicatesNum = df_streamingHistory.withColumn("ts_count", count("*").over(windowSpec))
# This DataFrame exists only to inspect the rows whose 'ts' is shared with another row.
df_with_duplicates = df_with_duplicatesNum.filter(col("ts_count") > 1)
duplicates_num = df_with_duplicates.count()
# Sorting by track URI inside each 'ts' keeps identical plays next to each other.
df_with_duplicates.orderBy("ts_count", "ts", "spotify_track_uri").show(truncate=False)

Razões para isto poder ter acontecido:
- Dados Agrupados (Batching): O cliente Spotify (aplicação) enviou um lote de eventos para o servidor na mesma altura.
- Ações Rápidas/Simultâneas: O utilizador fez uma ação muito rápida (por exemplo, deu skip à música duas vezes seguidas no mesmo segundo).
- Dados "Duplicados" (Técnico): Não são duplicados no sentido estrito (as músicas são diferentes), mas são eventos simultâneos que partilham a chave ts.

Os valores duplicados têm de ser removidos apenas se possuírem o mesmo 'ts' e a música for idêntica (valores duplicados). Há casos em que o 'ts' é igual, mas as músicas são diferentes; ter isso em consideração!

In [0]:
# Row count before de-duplication, used for the report below.
before_count = df_streamingHistory.count()

# Keep one copy of each fully identical row (every column is compared).
df_streamingHistory = df_streamingHistory.distinct()

after_count = df_streamingHistory.count()
print(f"'ts' duplicates count: {duplicates_num}")
print(f"Removed {before_count - after_count} duplicate values.")

Verificar outra vez que os valores duplicados não têm a mesma música duplicada

In [0]:
# Re-run the shared-'ts' inspection on the de-duplicated DataFrame.
df_with_duplicatesNum = df_streamingHistory.withColumn(
    "ts_count",
    count("*").over(windowSpec),
)
# This DataFrame is only used to look at the rows that still share a 'ts'.
df_with_duplicates = df_with_duplicatesNum.where(col("ts_count") > 1)
df_with_duplicates.orderBy("ts_count", "ts").show(truncate=False)

In [0]:
# Parse the 'ts' strings into a timestamp column. The result is stored under a
# separate name; df_streamingHistory itself keeps 'ts' as loaded.
df_streamingHistory_final = df_streamingHistory.withColumn(
    "ts",
    F.to_timestamp(F.col("ts"), "yyyy-MM-dd'T'HH:mm:ss'Z'"),
)
display(df_streamingHistory_final)

In [0]:
# Show the rows sorted by 'ts' in descending order.
df_streamingHistory.orderBy(col("ts").desc()).show(truncate=False)

In [0]:
# Summary statistics for the history DataFrame.
print("Describe: ")
df_streamingHistory.describe().show()

Existem colunas que não possuem valores nenhuns. Tratar disso

In [0]:
# Count the non-null values of every column in a single aggregation.
non_null_counts = (
    df_streamingHistory
    .agg(*[
        F.sum(F.when(F.col(column_name).isNotNull(), 1).otherwise(0)).alias(column_name)
        for column_name in df_streamingHistory.columns
    ])
    .first()
    .asDict()
)

# Split the columns into those with at least one value and those that are entirely null.
cols_to_keep = [name for name in non_null_counts if non_null_counts[name] > 0]
cols_to_drop = [name for name in non_null_counts if non_null_counts[name] == 0]

print("Colunas a remover (sem valores):", cols_to_drop)
print("Total:", len(cols_to_drop))

# Rebuild the DataFrame without the empty columns.
df_streamingHistory = df_streamingHistory.select(*cols_to_keep)

### Analisar os valores das colunas que existem no dataframe do histórico

Country (conn_country)

In [0]:
# Distinct connection countries present in the history.
df_conn_country = df_streamingHistory.select("conn_country").dropDuplicates()
count_num = df_conn_country.count()
print(f"\nDistinct values for conn_country ({count_num} total):")
display(df_conn_country)

Supostamente não estive a ouvir música nestes países todos. Investigar

In [0]:
# Number of streams per connection country, most frequent first.
country_counts = (
    df_streamingHistory
    .groupBy("conn_country")
    .agg(F.count("*").alias("num_streams"))
    .orderBy(F.desc("num_streams"))
)
display(country_counts)

In [0]:
# Show the 10 earliest rows for each country under investigation.
# JP is included to check the rows located in Japan (the author notes never having been there).
for country_code in ("NL", "GB", "AT", "JP"):
    (
        df_streamingHistory
        .where(col("conn_country") == country_code)
        .orderBy("ts")
        .show(10, truncate=False)
    )

O que pode ter acontecido é o Spotify ter registado o *conn_country* como JP (Japão) porque o IP público no momento da sincronização ou da stream estava associado a um servidor da M247 localizado em Tóquio — mesmo estando em Portugal

Incognito Mode e Offline

In [0]:
# Distinct values of 'incognito_mode' (the variable name is shared with the cells below).
df_reason_end = df_streamingHistory.select("incognito_mode").dropDuplicates()
count_num = df_reason_end.count()
print(f"\nDistinct values for incognito_mode ({count_num} total):")
display(df_reason_end)

Offline

In [0]:
# Distinct values of 'offline' (the variable name is shared with the neighbouring cells).
df_reason_end = df_streamingHistory.select("offline").dropDuplicates()
count_num = df_reason_end.count()
print(f"\nDistinct values for offline ({count_num} total):")
display(df_reason_end)

Reason End

In [0]:
# Distinct values of 'reason_end'.
df_reason_end = df_streamingHistory.select("reason_end").dropDuplicates()
count_num = df_reason_end.count()
print(f"\nDistinct values for reason_end ({count_num} total):")
display(df_reason_end)

Reason Start

In [0]:
# Distinct values of 'reason_start'.
df_reason_start = df_streamingHistory.select("reason_start").dropDuplicates()
count_num = df_reason_start.count()
print(f"\nDistinct values for reason_start ({count_num} total):")
display(df_reason_start)

## Agregar dados do histórico com extra info

Agora quero juntar aos dados do histórico completo mais informação adicional sobre músicas, álbuns e artistas, para complementar a informação.

Adicionar mais informação relativamente às tracks através de API calls do Spotipy ao dataframe "principal" - *df_streamingHistory*

In [0]:
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Token cache file handed to SpotifyOAuth.
cache_path = "/Volumes/workspace/default/spotify-data/.spotify_token_cache"

# OAuth scopes requested for the token, sent as one space-separated string.
scopes = [
    "user-library-read",
    "user-read-playback-state",
    "user-read-currently-playing",
    "user-follow-read",
    "user-read-recently-played",
    "user-top-read"
]
scope = " ".join(scopes)

# Credentials come from the environment; nothing is hardcoded in the notebook.
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")
redirect_uri = os.getenv("SPOTIPY_REDIRECT_URI")
refresh_token = os.getenv("SPOTIFY_REFRESH_TOKEN")

auth_manager = SpotifyOAuth(
    scope=scope,
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    cache_path=cache_path
)

# Fully automated authentication: when no cache file exists yet, a new access
# token is requested from the stored refresh token instead of an interactive login.
if not os.path.exists(cache_path):
    if not refresh_token:
        # Fail early with a clear message instead of sending None to the token endpoint.
        raise RuntimeError(
            "SPOTIFY_REFRESH_TOKEN is not set and no token cache exists at " + cache_path
        )
    print("Ficheiro de cache n√£o encontrado. A gerar novo via Refresh Token...")
    new_token_info = auth_manager.refresh_access_token(refresh_token)
    print("‚úÖ Novo token gerado e cache criado com sucesso!")
else:
    print("‚úÖ Autenticado via cache existente.")

# Spotify client used by the API helper functions below.
sp = spotipy.Spotify(auth_manager=auth_manager)

In [0]:
# Base folders for the raw and the processed streaming-history data.
path_api_raw = "/Volumes/workspace/default/spotify-data/streaming_history/raw/"
path_api_processed = "/Volumes/workspace/default/spotify-data/streaming_history/processed/"

for _label, _path in (
    ("path_api_raw:", path_api_raw),
    ("path_api_processed:", path_api_processed),
):
    print(_label, _path)

É importante obter os URIs únicos para usar como parâmetro nas API calls e obter, desta forma, a informação extra acerca das tracks, albums e artists

Lista de URIs das tracks (streaming tracks)

In [0]:
# Rows that actually reference a track (non-null track URI).
df_tracks_uris = df_streamingHistory.where(col("spotify_track_uri").isNotNull())

# Distinct track URIs collected to the driver as a plain Python list.
tracks_uris_list = [
    row["spotify_track_uri"]
    for row in df_tracks_uris.select("spotify_track_uri").distinct().collect()
]

print("‚úÖ Processo Conclu√≠do.")
print(tracks_uris_list[:5])
print(f"N√∫mero de URIs √öNICOS encontrados: {len(tracks_uris_list)}")

Função para obter informação para, mais tarde, extrair os URIs dos albums e dos artists correspondentes às tracks

In [0]:
def get_data(uris_list, type, successful_sleep=0.2, max_retries=3, retry_sleep=10):
    """Fetch Spotify metadata for a list of URIs in batches, printing progress.

    Args:
        uris_list: URIs to look up.
        type: "tracks", "albums" or "artists". Selects the client method
            (sp.tracks / sp.albums / sp.artists) and the response key.
            The name shadows the builtin but is kept because callers pass it
            by keyword.
        successful_sleep: Seconds to pause after every successful batch.
        max_retries: Additional attempts for a batch that failed with a
            transient error (rate limit, server error, timeout, dropped
            connection). 0 reproduces the old "sleep and move on" behaviour.
        retry_sleep: Seconds to wait after a transient error.

    Returns:
        A list with the non-None items of every successful response, in
        request order. Batches that keep failing are skipped (and reported);
        an unknown ``type`` yields an empty list.
    """
    results = []
    API_BATCH_SIZE = 20
    total_uris = len(uris_list)

    if type not in ("tracks", "albums", "artists"):
        print(f"Tipo desconhecido: {type}")
        return results
    # The client method and the response key share the entity name.
    sp_function, key = getattr(sp, type), type

    # Progress milestones (in percent) that have not been announced yet.
    pending_milestones = list(range(10, 100, 10))
    attempts_per_batch = max(0, int(max_retries)) + 1
    failed_batches = 0

    for i in range(0, total_uris, API_BATCH_SIZE):
        batch = uris_list[i:i + API_BATCH_SIZE]
        uris_processados = i + len(batch)

        percent_done = (uris_processados / total_uris) * 100
        while pending_milestones and percent_done >= pending_milestones[0]:
            milestone = pending_milestones.pop(0)
            print(f"[{type.upper()}] ‚úÖ Progresso: {milestone}% conclu√≠do ({uris_processados}/{total_uris} URIs)")

        for _attempt in range(attempts_per_batch):
            try:
                response = sp_function(batch)
            except (spotipy.SpotifyException, ReadTimeout, ConnectionError) as e:
                error_msg = str(e)
                print(f"‚ö†Ô∏è Erro na chamada da API ({type} Batch {i//API_BATCH_SIZE}, IDs: {batch[0]}...): {error_msg}")

                # Prefer the HTTP status over substring matching: '429' can
                # also appear inside an ID that is echoed in the message.
                status = getattr(e, "http_status", None)
                transient = (
                    (isinstance(status, int) and (status == 429 or status >= 500))
                    or isinstance(e, (ReadTimeout, ConnectionError))
                    or 'timed out' in error_msg
                    or 'RemoteDisconnected' in error_msg
                )
                if not transient:
                    # Retrying a permanent error cannot help; skip this batch.
                    failed_batches += 1
                    break

                print(f"Dormindo por {retry_sleep} segundos devido a erro de rede/servidor...")
                time.sleep(retry_sleep)
            else:
                if key in response:
                    results.extend(item for item in response[key] if item is not None)

                if successful_sleep > 0:
                    time.sleep(successful_sleep)
                break
        else:
            # Every attempt hit a transient error: give up on this batch.
            failed_batches += 1

    if failed_batches:
        print(f"[{type.upper()}] Lotes ignorados por erro: {failed_batches}")
    print(f"[{type.upper()}] üèÅ 100% conclu√≠do.")
    return results

In [0]:
# Maximum number of items written to each JSON part file.
JSON_BATCH_SIZE = 1000


def _part_sort_key(part_path):
    """Order part files by their numeric `_part_<n>` suffix; other names go last."""
    suffix = part_path.stem.rsplit("_", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), part_path.name)
    return (1, 0, part_path.name)


def process_extra_info(uris_list, entity_type, folder_path, json_batch_size=JSON_BATCH_SIZE):
    """Return metadata for ``uris_list``, using JSON part files as a disk cache.

    If ``folder_path`` already holds ``<entity_type>_part_*.json`` files they
    are loaded and returned without calling the API. Note that the cache is
    not compared against ``uris_list``; delete the files to force a new fetch.
    Otherwise the data is fetched with ``get_data`` and written to part files
    of at most ``json_batch_size`` items each.

    Args:
        uris_list: URIs to fetch when no cache exists.
        entity_type: "tracks", "albums" or "artists".
        folder_path: Cache directory (``pathlib.Path`` or string); created if missing.
        json_batch_size: Maximum number of items per part file (must be >= 1).

    Returns:
        List of metadata items (empty when the API returned nothing).

    Raises:
        ValueError: If a fetch is needed and ``json_batch_size`` is below 1.
    """
    folder_path = Path(folder_path)
    folder_path.mkdir(parents=True, exist_ok=True)
    # glob() order is arbitrary; sort numerically so part_2 comes before part_10.
    existing_parts = sorted(
        folder_path.glob(f"{entity_type}_part_*.json"),
        key=_part_sort_key,
    )

    # 1. Try to load from the cache.
    if existing_parts:
        data_full = []
        for part_path in existing_parts:
            with open(part_path, 'r', encoding='utf-8') as f:
                content = json.load(f)
            # Part files hold either a bare list or {"<entity_type>": [...]}.
            if isinstance(content, dict):
                if entity_type not in content:
                    # Extending with a dict would add its key strings to the data.
                    print(f"Ficheiro ignorado (formato inesperado): {part_path.name}")
                    continue
                content = content[entity_type]
            data_full.extend(content)
        print(f"‚ú® [{entity_type.upper()}] Dados carregados do disco: {len(data_full)} itens.")
        return data_full

    if json_batch_size < 1:
        raise ValueError("json_batch_size must be >= 1")

    # 2. No cache: fetch through the API.
    print(f"üöÄ [{entity_type.upper()}] Iniciando extra√ß√£o de {len(uris_list)} URIs...")
    raw_data = get_data(uris_list, type=entity_type)

    # Accept both a bare list and a {"<entity_type>": [...]} wrapper.
    items_list = raw_data.get(entity_type, []) if isinstance(raw_data, dict) else raw_data

    if items_list:
        num_parts = math.ceil(len(items_list) / json_batch_size)
        for part_number, start in enumerate(range(0, len(items_list), json_batch_size), start=1):
            batch = items_list[start:start + json_batch_size]
            part_filename = folder_path / f"{entity_type}_part_{part_number}.json"

            # Tracks are stored as a bare list; albums/artists are wrapped in a root key.
            save_content = {entity_type: batch} if entity_type in ['albums', 'artists'] else batch

            with open(part_filename, 'w', encoding='utf-8') as f:
                json.dump(save_content, f, indent=2, ensure_ascii=False)

        print(f"‚úÖ [{entity_type.upper()}] Sucesso! {len(items_list)} itens guardados em {num_parts} ficheiros.")
        return items_list

    print(f"‚ö†Ô∏è [{entity_type.upper()}] A API n√£o retornou resultados.")
    return []

In [0]:
# Folder under the raw path that holds the cached API responses (one sub-folder per entity).
base_extra_path = Path(path_api_raw).joinpath("streaming_history/extra_info")

In [0]:
# Fetch (or load from cache) the track metadata, provided the URI list was built earlier.
if any('tracks_uris_list' in namespace for namespace in (locals(), globals())):
    all_tracks_metadata = process_extra_info(
        tracks_uris_list,
        "tracks",
        base_extra_path / "tracks",
        JSON_BATCH_SIZE,
    )

É precisa a lista de URIs dos albums e artists para a chamada das API calls *sp.albums()* e *sp.artists()*

In [0]:
# track URI -> album URI, and track URI -> list of that track's artist URIs.
album_uri_map = {}
artists_uris_map = {}

for track_data in all_tracks_metadata:
    if not track_data:
        continue

    track_uri = track_data['uri']

    album_uri = track_data.get("album", {}).get("uri", None)
    if album_uri:
        album_uri_map[track_uri] = album_uri

    artists_list = track_data.get("artists", [])
    artist_uris = [artist.get("uri") for artist in artists_list if "uri" in artist]
    if artist_uris:
        artists_uris_map[track_uri] = artist_uris

# Flatten every per-track artist list into one list (may contain repeats).
all_unique_artist_uris = [
    uri
    for uris_of_track in artists_uris_map.values()
    for uri in uris_of_track
]
unique_artist_count = len(set(all_unique_artist_uris))

print(f"Total de Faixas Mapeadas para √Ålbuns ({len(album_uri_map)}) e Artists ({len(artists_uris_map)})")
print(f"Total de URIs de √Ålbum √önicos ({len(set(album_uri_map.values()))}) e Artists ({unique_artist_count})")

# Print a five-entry sample of each mapping when both are non-empty.
if album_uri_map and artists_uris_map:
    albums_amostra = {key: album_uri_map[key] for key in list(album_uri_map)[:5]}
    pprint.pprint(albums_amostra)
    artists_amostra = {key: artists_uris_map[key] for key in list(artists_uris_map)[:5]}
    pprint.pprint(artists_amostra)

In [0]:
# One record per track that has an album; tracks without artists get an empty list.
track_metadata_records = [
    {
        "spotify_track_uri": mapped_track_uri,
        "album_uri": mapped_album_uri,
        "album_artists_uris": artists_uris_map.get(mapped_track_uri, []),  # list of artist URIs
    }
    for mapped_track_uri, mapped_album_uri in album_uri_map.items()
]

print(f"‚úÖ Convers√£o para {len(track_metadata_records)} registos Python conclu√≠da.")

In [0]:
# Explicit schema for the track -> album / artists mapping.
track_schema = (
    StructType()
    .add("spotify_track_uri", StringType(), False)
    .add("album_uri", StringType(), True)
    .add("album_artists_uris", ArrayType(StringType()), True)  # one track -> many artists
)

df_tracks_metadata = spark.createDataFrame(track_metadata_records, schema=track_schema)

print(f"\n‚úÖ Cria√ß√£o do DataFrame PySpark 'df_tracks_metadata' conclu√≠da.")
df_tracks_metadata.printSchema()
print("Total rows: ", df_tracks_metadata.count())
display(df_tracks_metadata)

Adicionar agora informação extra de acordo com os URIs das tracks, albums e artists

In [0]:
# For Albums
album_uris_list = [
    row[0] for row in df_tracks_metadata
    .select("album_uri")
    .distinct()
    .filter(col("album_uri").isNotNull())
    .collect()
]

# For Artists
artists_uris_list = [
    row[0] for row in df_tracks_metadata
    .select(col("album_artists_uris")[0].alias("artist_uri"))
    .filter(col("artist_uri").isNotNull())
    .distinct()
    .collect()
]

print(f"URIs √önicos para buscar: Tracks={len(tracks_uris_list)}, Albums={len(album_uris_list)}, Artists={len(artists_uris_list)}")

In [0]:
# Shallow copy of the track metadata list (the items themselves are shared).
all_tracks_metadata_full = list(all_tracks_metadata)

In [0]:
# Fetch (or load from cache) the album metadata, provided the URI list was built earlier.
if any('album_uris_list' in namespace for namespace in (locals(), globals())):
    all_albums_metadata_full = process_extra_info(
        album_uris_list,
        "albums",
        base_extra_path / "albums",
        JSON_BATCH_SIZE,
    )

In [0]:

# Fetch (or load from cache) the artist metadata, provided the URI list was built earlier.
if any('artists_uris_list' in namespace for namespace in (locals(), globals())):
    all_artists_metadata_full = process_extra_info(
        artists_uris_list,
        "artists",
        base_extra_path / "artists",
        JSON_BATCH_SIZE,
    )

In [0]:
# Flat schema for the per-track metadata extracted from the API payloads.
schema_tracks = StructType([
    StructField("track_id", StringType(), True),
    StructField("track_uri", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("track_popularity", IntegerType(), True),
    StructField("track_duration_ms", LongType(), True),
    StructField("track_is_explicit", BooleanType(), True),
    StructField("track_number", IntegerType(), True),
    StructField("track_disc_number", IntegerType(), True),
    StructField("track_is_playable", BooleanType(), True),
    StructField("track_artists_uris", ArrayType(StringType()), True),
    StructField("track_artists_names", ArrayType(StringType()), True),
    StructField("track_artists_ids", ArrayType(StringType()), True)
])

tracks_flat = []
for track in all_tracks_metadata_full:
    if track is None:
        continue

    # A missing or null "artists" entry becomes an empty list instead of a TypeError.
    track_artists = track.get("artists") or []

    tracks_flat.append({
        "track_id": track.get("id"),
        "track_uri": track.get("uri"),
        "track_name": track.get("name"),
        "track_popularity": track.get("popularity"),
        "track_duration_ms": track.get("duration_ms"),
        "track_is_explicit": track.get("explicit"),
        "track_number": track.get("track_number"),
        "track_disc_number": track.get("disc_number"),
        "track_is_playable": track.get("is_playable"),
        # A distinct loop name avoids shadowing the track variable.
        "track_artists_uris": [artist.get("uri") for artist in track_artists],
        "track_artists_names": [artist.get("name") for artist in track_artists],
        "track_artists_ids": [artist.get("id") for artist in track_artists]
    })

df_extraInfo_tracks = spark.createDataFrame(tracks_flat, schema=schema_tracks)
display(df_extraInfo_tracks)

In [0]:
# Flat schema for the per-album metadata extracted from the API payloads.
schema_albums = StructType([
    StructField("album_id", StringType(), True),
    StructField("album_type", StringType(), True),
    StructField("album_uri", StringType(), True),
    StructField("album_total_tracks", IntegerType(), True),
    StructField("album_image", StringType(), True),
    StructField("album_release_date", StringType(), True),
    StructField("album_release_date_precision", StringType(), True),
    StructField("album_artists_names", ArrayType(StringType()), True),
    StructField("album_tracks_names", ArrayType(StringType()), True),
    StructField("album_copyrights", ArrayType(StringType()), True),
    StructField("album_label", StringType(), True),
    StructField("album_popularity", IntegerType(), True)
])

# Accept both a bare list and an {"albums": [...]} wrapper.
albums_source = all_albums_metadata_full.get("albums", []) if isinstance(all_albums_metadata_full, dict) else all_albums_metadata_full
albums_flat = []

for a in albums_source:
    if a is None:
        continue

    # `or` fallbacks turn explicit nulls in the payload into empty containers.
    artist_names = [art.get("name") for art in (a.get("artists") or [])]

    tracks_items = (a.get("tracks") or {}).get("items") or []
    track_names = [t.get("name") for t in tracks_items]

    copyrights = [c.get("text") for c in (a.get("copyrights") or [])]

    # Only the first image URL is kept.
    images = a.get("images") or []
    first_image = images[0].get("url") if images else None

    albums_flat.append({
        "album_id": a.get("id"),
        "album_type": a.get("album_type"),
        "album_uri": a.get("uri"),
        "album_total_tracks": a.get("total_tracks"),
        "album_image": first_image,
        "album_release_date": a.get("release_date"),
        "album_release_date_precision": a.get("release_date_precision"),
        "album_artists_names": artist_names,
        "album_tracks_names": track_names,
        "album_copyrights": copyrights,
        "album_label": a.get("label"),
        "album_popularity": a.get("popularity")
    })

df_extraInfo_albums = spark.createDataFrame(albums_flat, schema=schema_albums)
display(df_extraInfo_albums)

In [0]:
# Flat schema for the per-artist metadata extracted from the API payloads.
schema_artists = (
    StructType()
    .add("artist_uri", StringType(), True)
    .add("artist_id", StringType(), True)
    .add("artist_followers_total", LongType(), True)
    .add("artist_genres", ArrayType(StringType()), True)
    .add("artist_image", StringType(), True)
    .add("artist_popularity", IntegerType(), True)
)

# Accept both a bare list and an {"artists": [...]} wrapper.
if isinstance(all_artists_metadata_full, dict):
    artists_source = all_artists_metadata_full.get("artists", [])
else:
    artists_source = all_artists_metadata_full

artists_flat = []
for art in artists_source:
    if art is None:
        continue

    # URL of the first available image, if any.
    images = art.get("images", [])
    if images:
        first_image = images[0].get("url")
    else:
        first_image = None

    # Follower total lives in a nested dict; anything else counts as 0.
    followers_dict = art.get("followers", {})
    if isinstance(followers_dict, dict):
        total_followers = followers_dict.get("total")
    else:
        total_followers = 0

    artists_flat.append({
        "artist_uri": art.get("uri"),
        "artist_id": art.get("id"),
        "artist_followers_total": total_followers,
        "artist_genres": art.get("genres", []),
        "artist_image": first_image,
        "artist_popularity": art.get("popularity"),
    })

df_extraInfo_artists = spark.createDataFrame(artists_flat, schema=schema_artists)
display(df_extraInfo_artists)

In [0]:
# Criar a coluna de liga√ß√£o para os artistas
df_extraInfo = df_tracks_metadata.withColumn(
    "main_artist_uri", 
    col("album_artists_uris")[0]
)

In [0]:
# JOIN com Faixas (Nomes diferentes: spotify_track_uri vs track_uri)
df_extraInfo = df_extraInfo.join(
    df_extraInfo_tracks, 
    df_extraInfo["spotify_track_uri"] == df_extraInfo_tracks["track_uri"], 
    how="left"
).drop(df_extraInfo_tracks["track_uri"]) # Removemos a duplicada ap√≥s o join

In [0]:
# JOIN com √Ålbuns (Nomes diferentes: album_uri vs album_uri - se os nomes forem iguais o "on" funciona)
# Vou assumir que no df_extraInfo a coluna se chama 'album_uri'
df_extraInfo = df_extraInfo.join(
    df_extraInfo_albums, 
    on="album_uri", 
    how="left"
)

In [0]:
# JOIN com Artistas
df_extraInfo_final = df_extraInfo.join(
    df_extraInfo_artists,
    df_extraInfo["main_artist_uri"] == df_extraInfo_artists["artist_uri"],
    how="left"
)

In [0]:
# 1. Definir o caminho (convertendo Path para string, que o Spark exige)
path_processed_extraInfo = str(Path(path_api_processed) / "extra_info")

# 2. Guardar usando o Spark
# .coalesce(1) garante que os dados sejam unidos num √∫nico ficheiro dentro da pasta
df_extraInfo_final.coalesce(1).write.mode("overwrite") \
    .option("encoding", "UTF-8") \
    .json(path_processed_extraInfo)

print(f"\n‚úÖ DataFrame 'df_extraInfo' guardado em: {path_processed_extraInfo}")

In [0]:
# Report the row count of the joined metadata DataFrame.
print("‚úÖ Join conclu√≠do com sucesso! Total rows: ", df_extraInfo_final.count())

In [0]:
# Schema of the joined metadata DataFrame.
df_extraInfo_final.printSchema()

In [0]:
# Render the joined metadata DataFrame.
display(df_extraInfo_final)

In [0]:
# Juntar o DataFrame de Hist√≥rico (df_streamingHistory) com os metadados
df_streamingHistory = df_streamingHistory.join(
    df_extraInfo_final, 
    on="spotify_track_uri", 
    how="left"
)

print(f"\n‚úÖ DataFrame 'df_streamingHistory' atualizado com metadados adicionais.")

In [0]:
# Remove the episode columns and the API-side name/playability columns.
df_streamingHistory = df_streamingHistory.drop(
    "episode_name",
    "episode_show_name",
    "track_name",
    "spotify_episode_uri",
    "track_is_playable",
    "artist_name",
)

In [0]:
# Give the history columns shorter, uniform names.
df_streamingHistory = (
    df_streamingHistory
    .withColumnRenamed("spotify_track_uri", "track_uri")
    .withColumnRenamed("master_metadata_album_album_name", "album_name")
    .withColumnRenamed("master_metadata_album_artist_name", "artist_name")
    .withColumnRenamed("master_metadata_track_name", "track_name")
    .withColumnRenamed("total_tracks", "album_total_tracks")
    .withColumnRenamed("artist_followers_total", "artist_total_followers")
)

In [0]:
# Replace missing start/end reasons with the literal "unknown".
df_streamingHistory = df_streamingHistory.fillna({
    "reason_start": "unknown",
    "reason_end": "unknown",
})

In [0]:
# Add the processing timestamp plus placeholder columns (null context fields
# and an "unknown" track type).
df_streamingHistory = df_streamingHistory.withColumns(
    dict(
        processed_at=F.current_timestamp(),
        context_type=F.lit(None).cast("string"),
        context_uri=F.lit(None).cast("string"),
        track_type=F.lit("unknown"),
    )
)

In [0]:
# Ordena a lista de nomes de colunas e faz o select
colunas_alfabetica = sorted(df_streamingHistory.columns)
df_streamingHistory = df_streamingHistory.select(*colunas_alfabetica)

In [0]:
# Schema of the enriched history after the renames and the column reordering.
df_streamingHistory.printSchema()

In [0]:
# List the final column names.
display(df_streamingHistory.columns)

In [0]:
# Guardar o DataFrame como uma tabela Delta no teu cat√°logo (Unity Catalog ou Hive Metastore)
df_streamingHistory.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("spotify_history")

tz = pytz.timezone('Europe/Lisbon')
current_time = datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')

print("‚úÖ Tabela 'spotify_history' criada com sucesso no cat√°logo! {current_time}")