In [1]:
#imports

In [None]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json
import os
from tqdm import tqdm
import time
import shutil
from datetime import datetime

In [None]:
# Title: Initialize Spotify API Client with Credentials

# Description:
# Authenticates and initializes a Spotipy client using Spotify Client Credentials flow, 
# allowing access to public Spotify Web API endpoints such as artist, track, and genre data.

In [None]:

# Build the Spotify Web API client using the Client Credentials flow.
# The client id/secret are taken from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET
# environment variables when they are set, so real secrets never have to be
# saved inside the notebook. The 'INSERT' placeholders remain as the fallback
# used when the variables are not defined.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ.get('SPOTIPY_CLIENT_ID', 'INSERT'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET', 'INSERT')
))

In [None]:
# Title: Fetch and Rank Top Genres per Year with Live Spotify API Lookup

# Description:
# Creates the save directory and loads the cached artist-to-genre mapping,
# defines the Wrapped cutoff date for each year from 2017 to 2024
# (October 31 through 2019, November 15 from 2020 onward),
# selects the 200 most-played (track, artist) pairs between January 1 and that cutoff,
# looks up genres for artists missing from the cache via the Spotify search API (with error handling),
# counts genre occurrences across up to 100 artists that have known genres,
# prints the top 5 genres per year, and
# saves the updated genre map to a local JSON cache for future use.

In [None]:

# Create the directory to save data
SAVE_DIR = 'wrapped_data'
os.makedirs(SAVE_DIR, exist_ok=True)

# Tunable limits for the analysis
TOP_TRACKS_PER_YEAR = 200    # most-played (track, artist) pairs considered per year
MAX_ARTISTS_PER_YEAR = 100   # stop once this many artists with known genres are collected
TOP_GENRES_PER_YEAR = 5      # number of genres reported per year
API_PAUSE_SECONDS = 0.2      # pause after each successful Spotify search

# Load the artist -> genres cache written by earlier runs, if there is one
CACHE_PATH = os.path.join(SAVE_DIR, 'top_genres.json')
artist_genre_map = {}

if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, 'r', encoding='utf-8') as f:
        artist_genre_map = json.load(f)

# Artists whose lookup raised an exception during this run. They are still
# recorded as ['Unknown'] in artist_genre_map (so they are not retried for
# later years), but they are left out of the file written to disk so that a
# temporary failure does not become a permanent 'Unknown' in the cache.
failed_lookups = set()


def _search_artist_genres(artist):
    """Return the genre list of the first Spotify search hit for `artist`.

    Returns an empty list when the search has no hits or the hit carries no
    genres. Exceptions raised by the API call propagate to the caller.
    """
    results = sp.search(q=f"artist:{artist}", type='artist', limit=1)
    items = ((results or {}).get('artists') or {}).get('items') or []
    # An empty `items` list means "no match"; treat it as "no genres" instead
    # of indexing into it and raising IndexError.
    first_match = items[0] if items else None
    return (first_match or {}).get('genres') or []


# Wrapped End Dates per Year: October 31 through 2019, November 15 afterwards
wrapped_end_dates = {year: f"{year}-{'10-31' if year <= 2019 else '11-15'}T23:59:59Z" for year in range(2017, 2025)}

# Prepare Streaming Data
# NOTE(review): `streaming_data` is not defined in this cell; it is expected to
# come from an earlier cell and to provide the columns 'ts',
# 'master_metadata_track_name' and 'master_metadata_album_artist_name'.
streaming_data['ts'] = pd.to_datetime(streaming_data['ts'])

# Main Analysis
# Maps year -> pandas Series of genre counts, or [] when the year has no data
# or no artist with known genres.
top_genres_by_year = {}

try:
    for year, end_str in wrapped_end_dates.items():
        print(f"\nWrapped {year} – Finding Top Genres")

        # Plays between January 1 of the year and that year's Wrapped cutoff
        year_data = streaming_data.query(
            "@start <= ts <= @end",
            local_dict={'start': f"{year}-01-01T00:00:00Z", 'end': end_str},
        )

        if year_data.empty:
            print("No data for this year.")
            top_genres_by_year[year] = []
            continue

        # Most-played (track, artist) pairs of the year
        top_candidates = (
            year_data.groupby(['master_metadata_track_name', 'master_metadata_album_artist_name'])
            .agg(play_count=('ts', 'count'))
            .reset_index()
            .sort_values('play_count', ascending=False)
            .head(TOP_TRACKS_PER_YEAR)
        )

        valid_artists, genres = [], []

        for artist in tqdm(top_candidates['master_metadata_album_artist_name'].dropna().unique(), desc=f"Fetching genres for {year}"):
            artist_genres = artist_genre_map.get(artist)
            if not artist_genres:
                # Not cached (or cached as empty): ask Spotify
                try:
                    artist_genres = _search_artist_genres(artist)
                    artist_genre_map[artist] = artist_genres if artist_genres else ['Unknown']
                    time.sleep(API_PAUSE_SECONDS)
                except Exception as e:
                    print(f"Error fetching {artist}: {e}")
                    artist_genre_map[artist] = ['Unknown']
                    failed_lookups.add(artist)
                artist_genres = artist_genre_map[artist]

            if artist_genres and 'Unknown' not in artist_genres:
                valid_artists.append(artist)
                genres += artist_genres

            if len(valid_artists) >= MAX_ARTISTS_PER_YEAR:
                break

        if not genres:
            print(f"No valid genres found for {year}.")
            top_genres_by_year[year] = []
            continue

        top_genres_by_year[year] = pd.Series(genres).value_counts().head(TOP_GENRES_PER_YEAR)

        print(f"Top Genres for {year}:")
        print(top_genres_by_year[year])
finally:
    # Save Updated Cache
    # Done in `finally` so genres fetched so far survive an interruption or an
    # unexpected error part-way through the years.
    cache_to_save = {
        name: cached_genres
        for name, cached_genres in artist_genre_map.items()
        if name not in failed_lookups
    }
    with open(CACHE_PATH, 'w', encoding='utf-8') as f:
        json.dump(cache_to_save, f, indent=2)

    print(f"Artist genres saved to {CACHE_PATH}.")


In [None]:
# Title: Filter and Backup Cleaned Artist Genre Map

# Description:
# Loads a cached artist-to-genre dictionary, 
# removes entries where genre is missing or marked as 'Unknown', 
# creates a timestamped backup of the existing clean genre file if present, 
# saves the cleaned dictionary to a new JSON file, 
# and prints a summary report of how many artists were retained or dropped.

In [None]:

# Locations of the raw genre cache and of its cleaned copy
SAVE_DIR = 'wrapped_data'
os.makedirs(SAVE_DIR, exist_ok=True)

CACHE_PATH = os.path.join(SAVE_DIR, 'top_genres.json')
CLEAN_CACHE_PATH = os.path.join(SAVE_DIR, 'top_genres_clean.json')

# Read the full artist -> genres map from disk
with open(CACHE_PATH, 'r') as f:
    artist_genre_map = json.loads(f.read())

# Keep only artists whose value is a non-empty list that does not contain 'Unknown'
clean_artist_genre_map = {}

for artist, genres in artist_genre_map.items():
    if not genres or not isinstance(genres, list) or 'Unknown' in genres:
        continue
    clean_artist_genre_map[artist] = genres

# Numbers for the summary printed at the end
original_count = len(artist_genre_map)
clean_count = len(clean_artist_genre_map)
dropped_count = original_count - clean_count

# If a cleaned file is already present, copy it to a timestamped .bak first
if os.path.exists(CLEAN_CACHE_PATH):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_path = f"{CLEAN_CACHE_PATH}.{timestamp}.bak"
    shutil.copy(CLEAN_CACHE_PATH, backup_path)
    print(f"Backup created: {backup_path}")

# Write the cleaned map (overwrites any previous cleaned file)
with open(CLEAN_CACHE_PATH, 'w') as f:
    f.write(json.dumps(clean_artist_genre_map, indent=2))

# Summary of what was kept and what was dropped
print(f"""
Clean Report:
- Original artist entries: {original_count}
- Artists with real genres kept: {clean_count}
- Artists dropped (only 'Unknown'): {dropped_count}

Clean genre map saved as: {CLEAN_CACHE_PATH}
""")
