In [None]:
import requests
import csv
import time
from datetime import datetime, timezone

# ========================
# CONFIGURAÇÕES
# ========================
import os

# Credentials are read from the environment so they never live in the notebook.
API_KEY = os.getenv("LASTFM_API_KEY")
USER = os.getenv("LASTFM_USER")

# Reject unset *and* empty values: an empty key or user would pass an
# `is None` check and only fail later, inside the first API request.
if not API_KEY or not USER:
    raise ValueError(
        "Defina as variáveis de ambiente LASTFM_API_KEY e LASTFM_USER"
    )

# Start of the extraction window, expressed in UTC.
FROM_DATE = datetime(2023, 1, 1, tzinfo=timezone.utc)
# Same instant as a Unix timestamp; sent as the "from" request parameter.
FROM_TS = int(FROM_DATE.timestamp())

# Sent as the "limit" request parameter (tracks requested per page).
LIMIT = 200
BASE_URL = "https://ws.audioscrobbler.com/2.0/"

# CSV file the extraction step writes its rows to.
OUTPUT_FILE = "scrobbles_2023_onwards.csv"

# ========================
# FUNÇÃO DE EXTRAÇÃO
# ========================
def _scrobble_row(track):
    """Flatten one track object from the API response into a CSV row dict."""
    played_at = datetime.fromtimestamp(int(track["date"]["uts"]), tz=timezone.utc)
    return {
        "played_at_utc": played_at.isoformat(),
        "artist": track["artist"]["#text"],
        "track": track["name"],
        "album": track["album"]["#text"],
        "artist_mbid": track["artist"].get("mbid", ""),
        "track_mbid": track.get("mbid", ""),
    }


def fetch_scrobbles(timeout=30):
    """Download the configured user's scrobbles, one page at a time.

    Calls the ``user.getrecenttracks`` method at ``BASE_URL`` for ``USER``,
    starting at ``FROM_TS`` and asking for ``LIMIT`` tracks per page. Paging
    stops at the last page reported by the API or at the first empty page.

    Args:
        timeout: Seconds passed to ``requests`` as the per-request timeout,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[dict]: One dict per scrobble with the keys ``played_at_utc``
        (ISO 8601, UTC), ``artist``, ``track``, ``album``, ``artist_mbid``
        and ``track_mbid``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
        RuntimeError: If the response body carries an API-level error.
    """
    page = 1
    total_pages = None
    rows = []

    while True:
        params = {
            "method": "user.getrecenttracks",
            "user": USER,
            "api_key": API_KEY,
            "format": "json",
            "limit": LIMIT,
            "page": page,
            "from": FROM_TS,
        }

        response = requests.get(BASE_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # An API-level failure is reported inside the JSON body; surface it
        # explicitly instead of failing below with a bare KeyError on "@attr".
        if "error" in data:
            raise RuntimeError(
                f"Erro da API do Last.fm (código {data['error']}): "
                f"{data.get('message', '')}"
            )

        recenttracks = data.get("recenttracks", {})
        tracks = recenttracks.get("track", [])

        # Defensive: if a one-track page arrives as a bare object instead of a
        # list, wrap it so the loop below iterates tracks rather than dict keys.
        if isinstance(tracks, dict):
            tracks = [tracks]

        if total_pages is None:
            total_pages = int(recenttracks["@attr"]["totalPages"])
            print(f"Total de páginas: {total_pages}")

        if not tracks:
            break

        for track in tracks:
            # Skip the track that is playing right now: it has no "date" entry.
            if "date" not in track:
                continue
            rows.append(_scrobble_row(track))

        print(f"Página {page}/{total_pages} processada - Total registros: {len(rows)}")

        if page >= total_pages:
            break

        page += 1
        time.sleep(0.25)  # pause between pages to respect the API rate limit

    return rows


# ========================
# EXECUÇÃO
# ========================
if __name__ == "__main__":
    # CSV column order; the names match the keys of the dicts returned by
    # fetch_scrobbles().
    csv_columns = [
        "played_at_utc",
        "artist",
        "track",
        "album",
        "artist_mbid",
        "track_mbid",
    ]

    scrobbles = fetch_scrobbles()

    # newline="" lets the csv module control line endings itself.
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
        writer.writeheader()
        for row in scrobbles:
            writer.writerow(row)

    print(f"\nArquivo salvo: {OUTPUT_FILE}")
    print(f"Total final de scrobbles: {len(scrobbles)}")
    

In [None]:
import re
import unicodedata

import pandas as pd

# Input: the raw scrobble export produced by the extraction step.
INPUT_FILE = "scrobbles_2023_onwards.csv"
# Outputs: the de-duplicated (artist, track) table and the per-scrobble table
# with the added raw/clean columns.
OUTPUT_UNIQUE = "unique_tracks.csv"
OUTPUT_SCROBBLES = "scrobbles_normalized.csv"

# Trailing " - <tag>" version markers that clean_text() strips from a name.
_VERSION_SUFFIXES = (
    " - ao vivo",
    " - live",
    " - remastered",
    " - remix",
    " - radio edit",
)


def clean_text(text):
    """Normalize an artist or track name into a comparison key.

    Steps, in order: Unicode NFKC normalization and lower-casing; removal of
    anything inside ``()`` or ``[]``; removal of ``feat``/``ft`` and everything
    after it; removal of trailing version suffixes (live, remastered, ...);
    removal of punctuation; whitespace collapsing.

    Letters and digits of any script are kept, so accented and non-Latin names
    stay distinguishable instead of being truncated or collapsing to "".

    Args:
        text: The raw name. Missing values (``None``/NaN) are accepted and
            non-string values are converted with ``str()``.

    Returns:
        str: The normalized key; ``""`` for missing input or for names made
        only of punctuation.
    """
    if pd.isna(text):
        return ""

    # NFKC makes composed and decomposed spellings of the same letter identical
    # before any comparison; str() tolerates non-string cells (e.g. numbers).
    text = unicodedata.normalize("NFKC", str(text)).lower()

    # Drop bracketed annotations such as "(2011 remaster)" or "[live]".
    text = re.sub(r"\(.*?\)", "", text)
    text = re.sub(r"\[.*?\]", "", text)

    # Drop featuring credits: "feat"/"ft" and the rest of the string.
    text = re.sub(r"\bfeat\.?\b.*", "", text)
    text = re.sub(r"\bft\.?\b.*", "", text)

    # The removals above can leave trailing spaces that would hide a suffix.
    text = text.rstrip()

    # Strip only the trailing occurrence (never matches inside the title) and
    # repeat until stable so stacked suffixes do not depend on tuple order.
    stripped = True
    while stripped:
        stripped = False
        for suffix in _VERSION_SUFFIXES:
            if text.endswith(suffix):
                text = text[: -len(suffix)].rstrip()
                stripped = True

    # Remove punctuation. \w is Unicode-aware for str patterns, so letters of
    # every script survive; "_" is removed explicitly because \w includes it.
    text = re.sub(r"[^\w\s]|_", "", text)

    # Collapse runs of whitespace and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()

    return text


def main():
    """Build the normalized scrobble table and the unique-track table.

    Reads ``INPUT_FILE``, keeps the original artist/track values in
    ``artist_raw``/``track_raw``, adds ``artist_clean``/``track_clean`` produced
    by ``clean_text``, and writes the result to ``OUTPUT_SCROBBLES``. It then
    counts rows per (artist_clean, track_clean) pair, sorts by count in
    descending order and writes that table to ``OUTPUT_UNIQUE``.
    """
    # Read every column as text and disable NA inference: with the defaults,
    # names that are literally "None", "NULL", "NaN" or "N/A" are parsed as
    # missing values and would be cleaned to an empty string.
    df = pd.read_csv(INPUT_FILE, dtype=str, keep_default_na=False)

    df["artist_raw"] = df["artist"]
    df["track_raw"] = df["track"]

    df["artist_clean"] = df["artist"].apply(clean_text)
    df["track_clean"] = df["track"].apply(clean_text)

    # Save the normalized scrobbles.
    df.to_csv(OUTPUT_SCROBBLES, index=False, encoding="utf-8")

    # Build the table of unique tracks with their occurrence counts.
    unique_tracks = (
        df.groupby(["artist_clean", "track_clean"])
          .size()
          .reset_index(name="occurrences")
          .sort_values("occurrences", ascending=False)
    )

    unique_tracks.to_csv(OUTPUT_UNIQUE, index=False, encoding="utf-8")

    print(f"Arquivo gerado: {OUTPUT_SCROBBLES}")
    print(f"Arquivo gerado: {OUTPUT_UNIQUE}")
    print(f"Total de músicas únicas: {len(unique_tracks)}")


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd

# ========================
# CONFIG
# ========================
INPUT_FILE = "scrobbles_normalized.csv"

# ========================
# LOAD
# ========================
# Read as text with NA inference disabled: with the defaults, an empty
# artist_clean (e.g. a name made only of punctuation) or one equal to "nan" /
# "null" becomes NaN, and groupby() then silently drops those scrobbles.
df = pd.read_csv(INPUT_FILE, dtype=str, keep_default_na=False)

# ========================
# COUNTS
# ========================

# Unique artists (raw)
unique_artists_raw = df["artist_raw"].nunique()

# Unique artists (clean)
unique_artists_clean = df["artist_clean"].nunique()

print(f"Artistas únicos (raw):   {unique_artists_raw}")
print(f"Artistas únicos (clean): {unique_artists_clean}")

# ========================
# ARTIST RANKING
# ========================
# One row per cleaned artist name with its scrobble count, most played first.
artist_counts = (
    df.groupby("artist_clean")
      .size()
      .reset_index(name="scrobbles")
      .sort_values("scrobbles", ascending=False)
)

print("\nTop 20 artistas mais escutados:")
print(artist_counts.head(20))

# ========================
# SAVE (optional, but handy)
# ========================
artist_counts.to_csv("artist_frequency.csv", index=False, encoding="utf-8")

print("\nArquivo gerado: artist_frequency.csv")


In [None]:
import pandas as pd

# ========================
# CONFIG
# ========================
INPUT_FILE = "artist_frequency.csv"
TOP_N = 1000  # how many of the most-played artists to include in the coverage

# ========================
# LOAD
# ========================
df = pd.read_csv(INPUT_FILE)

total_scrobbles = df["scrobbles"].sum()

# Select the TOP_N rows by scrobble count explicitly rather than trusting the
# row order of the CSV, so the result is correct even if the file was re-sorted.
top_n = df.nlargest(TOP_N, "scrobbles")
top_n_scrobbles = top_n["scrobbles"].sum()

# Share of all scrobbles that belongs to the top artists; an empty file yields
# 0% instead of a 0/0 division.
if total_scrobbles:
    coverage_pct = (top_n_scrobbles / total_scrobbles) * 100
else:
    coverage_pct = 0.0

print(f"Total de scrobbles: {total_scrobbles}")
print(f"Scrobbles no Top {TOP_N}: {top_n_scrobbles}")
print(f"Cobertura do Top {TOP_N}: {coverage_pct:.2f}%")


In [None]:
# Use the explicit %pip magic so the packages are installed into the running
# kernel's environment; requests is included because the extraction cell imports it.
%pip install pandas openai tqdm requests
