# Pull Spotify History

> Consolidate history JSON files and gather metadata from the Spotify API

In [None]:
# | default_exp core

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export
import pandas as pd
import re
import time
import requests
import json
import spotipy
import pickle


from pathlib import Path
from typing import List, Dict, Optional
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

In [None]:
# Raise pandas' display limits (rows, columns, total width) for notebook output
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

In [None]:
def get_spotipy_obj():
    """
    Build a `spotipy.Spotify` client that authenticates with client credentials.

    `load_dotenv()` is called first so that values from a local `.env` file are
    available in the environment before the credentials manager is created
    (presumably the Spotify client id/secret -- confirm against your `.env`).
    """
    load_dotenv()
    credentials = SpotifyClientCredentials()
    client = spotipy.Spotify(auth_manager=credentials)
    return client

## Extract Streaming History

> Converting History JSON Files into Pandas DF

Spotify provides a user's history in a series of JSON Files. Some years have multiple files. I wrote a simple function to consolidate this history into a Dict with each year as the key

In [None]:
# | export
# |hide
def extract_streaming_history(
    data_folder: Path,  # Path to the folder containing the streaming history files
) -> Dict[str, pd.DataFrame]:  # Dictionary containing DataFrames for each year
    """
    Load every `.json` file in `data_folder` and group the rows by year.

    The year is the first run of four digits in the file name. Files that share
    a year are concatenated into one DataFrame with a fresh index. Files whose
    name contains no four-digit run are skipped.
    """

    def get_json_files(data_folder: Path) -> List[Path]:
        """
        Get all the json files in the streaming_history folder.

        The result is sorted because `iterdir()` yields entries in arbitrary
        order; sorting keeps the row order of multi-file years reproducible.
        """
        return sorted(
            file for file in data_folder.iterdir() if file.suffix == ".json"
        )

    def extract_year_from_filename(filename: str) -> Optional[str]:
        """
        Extract the year from a filename. The year should be a single year,
        not a range of years. For example, 2021-2022 should be 2021.

        Returns None when the filename contains no four-digit run.
        """
        match = re.search(r"\d{4}", filename)
        return match.group() if match else None

    # Collect the frames per year first and concatenate once per year,
    # instead of re-copying the accumulated frame for every additional file.
    frames_by_year: Dict[str, List[pd.DataFrame]] = {}

    for path in get_json_files(data_folder):
        year = extract_year_from_filename(path.name)
        if year is None:
            # Not a history export (no year in the name): ignore it
            continue
        frames_by_year.setdefault(year, []).append(pd.read_json(path))

    return {
        year: frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
        for year, frames in frames_by_year.items()
    }

In [None]:
# Load the raw export from the local "streaming_history" folder, keyed by year
streaming_history = extract_streaming_history(Path("streaming_history"))

For this exercise I'm only going to include music from my history that I consider 'played'. To do so I'll filter the data here instead of adding a column in my database that differentiates between played and unplayed tracks.

In a real-life scenario I'd be more hesitant to throw away information, but I didn't intend to look at unplayed tracks, so keeping them would have been wasted space in my db :)

In [None]:
# | export
# |hide
def clean_streaming_history(
    streaming_history,  # Dictionary containing DataFrames for each year
    # Minimum percentage of the song that must be played to be included in the analysis
    min_percent_played: float = 0.9,
) -> pd.DataFrame:  # Streaming History DataFrame
    """
    Clean the raw streaming history data, standardize column names,
    remove podcast data, remove songs that were not played to completion

    Steps, in order:
    1. Stack all yearly frames, parse `ts` as UTC and sort chronologically.
    2. Add `month` / `year` columns and rename the track columns to
       `song`, `artist`, `album` and `URI`.
    3. Drop rows without a track URI and derive `track_id` from the URI.
    4. Approximate each track's duration as the most common `ms_played` among
       plays that ended with `reason_end == "trackdone"`; tracks that were
       never played to the end have no duration and are dropped.
    5. Keep plays whose `ms_played / duration` is at least `min_percent_played`.

    An empty `streaming_history` returns an empty DataFrame.
    """
    frames = list(streaming_history.values())
    if not frames:
        return pd.DataFrame()

    # One concat over all years instead of growing a frame inside a loop
    history = pd.concat(frames, ignore_index=True)

    history["ts"] = pd.to_datetime(history["ts"], utc=True)
    history = history.sort_values("ts").reset_index(drop=True)

    # Adding Data Fields for ease of use
    history["month"] = history["ts"].dt.month
    history["year"] = history["ts"].dt.year

    history = history.rename(
        columns={
            "master_metadata_track_name": "song",
            "master_metadata_album_artist_name": "artist",
            "master_metadata_album_album_name": "album",
            "spotify_track_uri": "URI",
        }
    )

    # Remove anything that's not a song. The explicit copy makes the column
    # assignments below write to an independent frame rather than to a slice.
    history = history[history["URI"].notna()].copy()

    # Extract the track_id
    history["track_id"] = [
        uri.replace("spotify:track:", "") for uri in history["URI"]
    ]

    # Approximate the song duration, add to the dataframe
    completed_plays = history.loc[
        history["reason_end"] == "trackdone", ["track_id", "ms_played"]
    ]
    approximate_durations = (
        completed_plays.groupby("track_id")["ms_played"]
        .agg(lambda x: x.mode()[0])  # ties resolve to the smallest value
        .reset_index()
        .rename(columns={"ms_played": "duration"})
    )
    history = history.merge(approximate_durations, on="track_id", how="left")
    history = history[history["duration"].notna()].reset_index(drop=True)

    # Adding percent was played and filtering by the given value.
    # Vectorized division; a zero duration yields 0 instead of inf/NaN.
    ratio = history["ms_played"] / history["duration"]
    history["percent_played"] = ratio.where(history["duration"] != 0, 0.0)

    return history[history["percent_played"] >= min_percent_played].reset_index(
        drop=True
    )

Gonna use a cutoff of 70% played

In [None]:
# Keep plays that reached at least 70% of the approximated track duration
clean_history = clean_streaming_history(streaming_history, 0.7)
clean_history.head(2)

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,song,artist,album,URI,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode,month,year,track_id,duration,percent_played
0,2014-09-25 10:26:51+00:00,1241589622,"iOS 8.0 (iPad2,5)",278386,US,98.85.37.41,unknown,Ambitionz Az A Ridah,2Pac,All Eyez On Me,spotify:track:3ssX20QT5c3nA9wk78V1LQ,,,,clickrow,trackdone,False,False,False,0.0,False,9,2014,3ssX20QT5c3nA9wk78V1LQ,278386.0,1.0
1,2014-09-25 20:40:44+00:00,1241589622,"iOS 8.0 (iPad2,5)",278386,US,98.85.37.41,unknown,Ambitionz Az A Ridah,2Pac,All Eyez On Me,spotify:track:3ssX20QT5c3nA9wk78V1LQ,,,,clickrow,trackdone,False,False,False,0.0,False,9,2014,3ssX20QT5c3nA9wk78V1LQ,278386.0,1.0


## Exploring Spotify API Data

> Finding metadata that will enrich Spotify History

In [None]:
# |hide
# In-memory stores filled in by the cells below.
# raw_* dicts hold API responses as returned; the others hold the trimmed records.
raw_track_metadata = {}
raw_artist_metadata = {}
raw_audio_features = {}
track_metadata = {}
artist_metadata = {}
album_metadata = {}

In [None]:
sp = get_spotipy_obj()

To explore the Spotify API I will be looking at metadata from 2 of my All-Time favorite songs:

- Devil in A New Dress by Kanye West (1UGD3lW3tDmgZfAVDh6w7r)
- 1 Train by A$AP Rocky (7AijU6oTPGmG64uWf63Qvc)

In [None]:
# Order matters below: index 0 is "1 Train", index 1 is "Devil in A New Dress"
track_ids = ["7AijU6oTPGmG64uWf63Qvc", "1UGD3lW3tDmgZfAVDh6w7r"]
tracks = sp.tracks(track_ids)["tracks"]

In [None]:
# |hide
# Pick the first sample track and keep its raw API response, keyed by track id
current_track = 0
train_1 = tracks[current_track]
if train_1["id"] not in raw_track_metadata:
    raw_track_metadata[train_1["id"]] = train_1

In [None]:
# |echo: false
list(raw_track_metadata[train_1["id"]].keys())

['album',
 'artists',
 'available_markets',
 'disc_number',
 'duration_ms',
 'explicit',
 'external_ids',
 'external_urls',
 'href',
 'id',
 'is_local',
 'name',
 'popularity',
 'preview_url',
 'track_number',
 'type',
 'uri']

#### Genres

> Standardizing Genres

Spotify provides genres at the Artist level and these genres can be all over the place. Let's take a look at some examples

> Jay-Z

In [None]:
jay = sp.artist("3nFkdlSjzX9mRTtwJOzDYB")

In [None]:
jay["genres"]

['east coast hip hop', 'gangster rap', 'hip hop', 'pop rap', 'rap']

Jay-Z is associated with 5 genres! When I think about Jay-Z I think East Coast Hip Hop, but should that be his main genre?

I have two things to consider here:

- How to bucket artists into broad groups (rappers vs. rock stars)
- And how to bucket them into the sub-genres that I associate with them (east coast hip hop vs. west coast hip hop)

Let's take a look at 2 more examples:

> Pink Floyd

In [None]:
pink = sp.artist("0k17h0D3J5VfsdmQ1iZtE9")

In [None]:
pink["genres"]

['album rock',
 'art rock',
 'classic rock',
 'progressive rock',
 'psychedelic rock',
 'rock',
 'symphonic rock']

> Khruangbin

In [None]:
khruangbin = sp.artist("2mVVjNmdjXZZDvhgQWiakk")

In [None]:
khruangbin["genres"]

['indie soul', 'neo-psychedelic']

In the case of Pink Floyd, the genres that I want to associate with them are in the Spotify response (Rock and Classic Rock). However, Khruangbin is a completely different story; I think they should be Indie.

What I noticed looking through the data is that certain genres--like rap--came through well in the Spotify data while others--like indie--did not. My goal is to paint a picture of my Spotify history as I see it. If certain genres aren't mapped the way I envision them, it's going to mess with the visualizations.

In an ideal world I would have created an AI model or something similar to map the genres programmatically. However, I encountered this issue early in the development process. Perfection is the enemy of progress. My goal is to make a website that displays my data, not to build an AI model :)

In [None]:
# Hand-curated lookup table; the code below reads its `genres`, `main_genre`
# and `secondary_genre` columns
genre_mapping = pd.read_csv("genre_matching.csv")

In [None]:
# |hide
# Lower-case and trim the three text columns so later lookups compare
# against a normalised spelling
genre_mapping["genres"] = genre_mapping["genres"].str.lower().str.strip()
genre_mapping["main_genre"] = genre_mapping["main_genre"].str.lower().str.strip()
genre_mapping["secondary_genre"] = (
    genre_mapping["secondary_genre"].str.lower().str.strip()
)

Since I didn't want to go through all the genres (there are ≈ 1800), I went through the top 300 or so by hand and wrote a regex to match the others based on certain keywords

In [None]:
# |code-fold: true
def consolidate_main_genre(genre: str):
    """
    Map a genre name onto one of the broad buckets by whole-word keyword.

    Buckets are tried in priority order (rap, rock, soul, pop, country, focus),
    so "pop rap" resolves to "rap" and "pop rock" to "rock". Returns None when
    no keyword matches.
    """
    bucket_rules = (
        ("rap", (r"\brap\b", r"\bhip hop\b")),
        ("rock", (r"\brock\b",)),
        ("soul", (r"\bsoul\b",)),
        ("pop", (r"\bpop\b",)),
        ("country", (r"\bcountry\b",)),
        (
            "focus",
            (
                r"\bjazz\b|\binstrumental\b|\bblues\b|\bclassical\b|\blo-fi\b|\blofi\b|\bambient\b",
            ),
        ),
    )
    for bucket, patterns in bucket_rules:
        if any(re.search(pattern, genre) for pattern in patterns):
            return bucket
    return None

In [None]:
possible_genres = genre_mapping[genre_mapping.genres.isin(jay["genres"])]

In [None]:
possible_genres

Unnamed: 0,genres,main_genre,secondary_genre,hours_played
0,rap,rap,,4187.657624
1,hip hop,rap,,3636.079708
3,pop rap,rap,,1747.090661
8,gangster rap,rap,,732.280244
13,east coast hip hop,rap,east coast hip hop,617.484491


<p>Putting it all together</p>

In [None]:
# |code-fold: true
def consolidate_genres(genres: List[str]) -> Dict:
    """
    Reduce an artist's genre list to one main and one secondary genre.

    Only genres present in the module-level `genre_mapping` table are
    considered, in the table's row order:

    - main genre: the first non-null `main_genre` among the matching rows;
      if every match lacks one, the first matching genre name for which
      `consolidate_main_genre` finds a keyword bucket.
    - secondary genre: the first non-null `secondary_genre` among the matches.

    Either value is "" when nothing could be determined (never None).
    The "genres" entry is the input list joined with ";;".
    """
    possible_genres = genre_mapping[genre_mapping.genres.isin(genres)]

    curated_main = possible_genres["main_genre"].dropna()
    if not curated_main.empty:
        main_genre = curated_main.iloc[0]
    else:
        # Keyword fallback: try each matched genre in turn and stop at the
        # first hit, so an unmatched first row does not leak None to callers.
        keyword_guesses = (
            consolidate_main_genre(name)
            for name in possible_genres["genres"].dropna()
        )
        main_genre = next((guess for guess in keyword_guesses if guess), "")

    curated_secondary = possible_genres["secondary_genre"].dropna()
    secondary_genre = "" if curated_secondary.empty else curated_secondary.iloc[0]

    return {
        "main_genre": main_genre,
        "secondary_genre": secondary_genre,
        "genres": ";;".join(genres),
    }

In [None]:
consolidate_genres(jay["genres"])

{'main_genre': 'rap',
 'secondary_genre': 'east coast hip hop',
 'genres': 'east coast hip hop;;gangster rap;;hip hop;;pop rap;;rap'}

While this is a good start, there's still one scenario that isn't being accounted for:

- What if the artist API request doesn't return ANY genres?

Let's take a look at Santa Esmeralda

In [None]:
santa = sp.artist("0iGmfKLgK5eSMgHp8YgLnS")
santa["genres"]

[]

Santa doesn't have any genres!!! 
<br><br>
The solution here is simple. While Santa doesn't have any genres himself, Spotify provides an endpoint for related artists. I can look up the genres of his related artists and use them to approximate Santa's genre

In [None]:
# Santa has no genres of his own, so count how often each genre appears
# across his related artists
if not santa.get("genres"):
    related_artists = sp.artist_related_artists("0iGmfKLgK5eSMgHp8YgLnS")
    related_genres = {}
    for art in related_artists["artists"]:
        if art.get("genres"):
            for genre in art["genres"]:
                if genre not in related_genres:
                    related_genres[genre] = 0
                related_genres[genre] += 1

In [None]:
# |echo: false
related_genres

{'disco': 13,
 'hi-nrg': 5,
 'italo disco': 2,
 'australian dance': 1,
 'classic uk pop': 1,
 'minneapolis sound': 1,
 'synthpop': 1,
 'diva house': 1,
 'motown': 2,
 'post-disco': 2,
 'quiet storm': 1,
 'deep disco': 1,
 'vintage french electronic': 1}

Consolidating the genres for Santa

In [None]:
related_genres_list = list(related_genres.items())

In [None]:
related_genres_list.sort(key=lambda x: x[1], reverse=True)

In [None]:
related_genres_list = [x[0] for x in related_genres_list]

In [None]:
consolidate_genres(related_genres_list[:5])

{'main_genre': 'soul',
 'secondary_genre': 'dance',
 'genres': 'disco;;hi-nrg;;italo disco;;motown;;post-disco'}

> Proceduralizing

In [None]:
# |code-fold: true
def get_artist_genres(artist) -> Dict[str, str]:
    """
    Return the consolidated genres for a Spotify artist object.

    The result has the keys "genres" (";;"-joined list), "main_genre" and
    "secondary_genre".

    - If the artist has genres, they are consolidated via `consolidate_genres`.
      When no main genre could be determined, the LAST genre in the list is
      used; when no secondary genre could be determined, the FIRST one is.
    - If the artist has no genres, the five most frequent genres among the
      artist's related artists are consolidated instead and returned as-is.
    """
    if artist.get("genres"):
        consolidated_genres = consolidate_genres(artist["genres"])
        # "genres" is a ";;"-joined string; split it before indexing so the
        # fallbacks pick a whole genre name rather than a single character.
        genre_names = consolidated_genres["genres"].split(";;")
        return {
            "genres": consolidated_genres["genres"],
            "main_genre": consolidated_genres["main_genre"] or genre_names[-1],
            "secondary_genre": (
                consolidated_genres["secondary_genre"] or genre_names[0]
            ),
        }

    sp = get_spotipy_obj()
    related_artists = sp.artist_related_artists(artist["id"])

    # Tally how many related artists carry each genre
    related_genres: Dict[str, int] = {}
    for art in related_artists["artists"]:
        for genre in art.get("genres") or []:
            related_genres[genre] = related_genres.get(genre, 0) + 1

    # Stable sort: genres with equal counts keep first-seen order
    ranked_genres = sorted(related_genres, key=related_genres.get, reverse=True)
    return consolidate_genres(ranked_genres[:5])

In [None]:
get_artist_genres(jay)

{'genres': 'east coast hip hop;;gangster rap;;hip hop;;pop rap;;rap',
 'main_genre': 'rap',
 'secondary_genre': 'east coast hip hop'}

#### Artist

In [None]:
# |echo: false
list(jay.keys())

['external_urls',
 'followers',
 'genres',
 'href',
 'id',
 'images',
 'name',
 'popularity',
 'type',
 'uri']

Getting metadata for main artist


In [None]:
# |code-fold: true
def get_artist_data(artist_id: str, raw_artist_metadata: Optional[Dict] = None) -> Dict:
    """
    Fetch one artist from the Spotify API and return a flattened record.

    Parameters:
        artist_id: Spotify artist id.
        raw_artist_metadata: optional dict, mutated in place; the raw API
            response is stored under the artist id if not already present.
            When omitted (or None) the raw response is not retained. The
            previous `{}` default was one dict shared by every call.

    Returns a dict with id, name, external_url, followers, genres
    (";;"-joined), href, images (JSON string), popularity, type, uri,
    main_genre and secondary_genre.
    """
    sp = get_spotipy_obj()
    artist = sp.artist(artist_id)
    artist_genres = get_artist_genres(artist)

    if raw_artist_metadata is not None:
        # Keep the first raw response seen for this artist
        raw_artist_metadata.setdefault(artist["id"], artist)

    return {
        "id": artist["id"],
        "name": artist["name"],
        "external_url": artist["external_urls"].get("spotify"),
        "followers": artist["followers"]["total"],
        "genres": artist_genres["genres"],
        "href": artist["href"],
        # Serialised so the record stays flat
        "images": json.dumps(artist["images"]),
        "popularity": artist["popularity"],
        "type": artist["type"],
        "uri": artist["uri"],
        "main_genre": artist_genres["main_genre"],
        "secondary_genre": artist_genres["secondary_genre"],
    }

Getting all artists associated with a track

In [None]:
# |code-fold: true
def get_track_artists(
    track,
    artist_metadata: Optional[Dict] = None,
    raw_artist_metadata: Optional[Dict] = None,
) -> Dict:
    """
    Resolve every artist credited on a track and summarise them.

    Parameters:
        track: Spotify track object; only `track["artists"][i]["id"]` is read.
        artist_metadata: cache of flattened artist records keyed by artist id.
            Mutated in place: artists fetched here are added to it.
        raw_artist_metadata: cache of raw artist responses, passed through to
            `get_artist_data`.
        Both caches default to a fresh dict per call.

    Returns the first-listed ("main") artist's id, name, genres, image and
    links, plus ";;"-joined names and ids of all credited artists in order.

    Raises:
        ValueError: if the track lists no artists.
    """
    if artist_metadata is None:
        artist_metadata = {}
    if raw_artist_metadata is None:
        raw_artist_metadata = {}

    credits = track["artists"]
    if not credits:
        raise ValueError(f"Track {track.get('id')!r} has no artists")

    artists = []
    for credit in credits:
        artist_id = credit["id"]
        artist = artist_metadata.get(artist_id)
        if not artist:
            # Not cached yet: fetch it and remember it for later tracks
            artist = get_artist_data(artist_id, raw_artist_metadata)
            artist_metadata[artist_id] = artist
        artists.append(artist)

    # Spotify lists the primary artist first
    main_artist = artists[0]
    return {
        "main_artist_id": main_artist["id"],
        "main_artist_name": main_artist["name"],
        "genres": main_artist["genres"],
        "main_genre": main_artist["main_genre"],
        "secondary_genre": main_artist["secondary_genre"],
        "main_artist_image": main_artist["images"],
        "main_artist_url": main_artist["external_url"],
        "main_artist_uri": main_artist["uri"],
        "artist_names": ";;".join(artist["name"] for artist in artists),
        "artist_ids": ";;".join(artist["id"] for artist in artists),
    }

#### Track Audio Features

> Enriching Tracks with Audio Features

In [None]:
audio_features = sp.audio_features(track_ids)

In [None]:
# Keys copied from each audio-features response into the track record.
# The order here is the order of the resulting dictionary's keys.
TRACK_FEATURE_COLUMNS = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "analysis_url",
    "time_signature",
]

In [None]:
def get_track_features(
    track_features: Dict, raw_audio_features: Optional[Dict] = None
) -> Dict:
    """
    Trim one audio-features response down to `TRACK_FEATURE_COLUMNS`.

    Parameters:
        track_features: one element of the audio-features response.
        raw_audio_features: optional dict, mutated in place; the untrimmed
            response is stored under its "id" if not already present. When
            omitted (or None) nothing is retained. The previous `{}` default
            was one dict shared by every call.

    Raises:
        KeyError: if `track_features` lacks one of `TRACK_FEATURE_COLUMNS`.
    """
    if raw_audio_features is not None:
        raw_audio_features.setdefault(track_features["id"], track_features)
    return {k: track_features[k] for k in TRACK_FEATURE_COLUMNS}

In [None]:
train_1_features = get_track_features(audio_features[current_track], raw_audio_features)

In [None]:
# |echo: false
train_1_features

{'danceability': 0.622,
 'energy': 0.872,
 'key': 2,
 'loudness': -3.403,
 'mode': 1,
 'speechiness': 0.332,
 'acousticness': 0.349,
 'instrumentalness': 0,
 'liveness': 0.695,
 'valence': 0.768,
 'tempo': 83.568,
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7AijU6oTPGmG64uWf63Qvc',
 'time_signature': 4}

#### Album Metadata

> Key Data Points: Label, Popularity, Album Tracks, Release Date


In [None]:
# |echo: false
list(raw_track_metadata[train_1["id"]]["album"].keys())

['album_type',
 'artists',
 'available_markets',
 'external_urls',
 'href',
 'id',
 'images',
 'name',
 'release_date',
 'release_date_precision',
 'total_tracks',
 'type',
 'uri']

Since I don't want all the metadata associated with the album, I'm going to extract only the metadata I want

In [None]:
# |code-fold: true
def get_album_data(track, album_metadata: Dict[str, Dict]):
    """
    Return the trimmed album record for the album a track belongs to.

    `album_metadata` acts as a cache keyed by album id: a hit is returned
    unchanged, a miss is built from `track["album"]`, stored in the cache and
    returned. Artist names and ids are ";;"-joined; images are kept as a JSON
    string.
    """
    source = track["album"]
    album_id = source["id"]

    if album_id in album_metadata:
        return album_metadata[album_id]

    credited = source["artists"]
    artist_names = [credit["name"] for credit in credited]
    artist_ids = [credit["id"] for credit in credited]

    album = {
        "name": source["name"],
        "id": album_id,
        "artist": ";;".join(artist_names),
        "artist_id": ";;".join(artist_ids),
        "external_url": source["external_urls"].get("spotify"),
        "href": source["href"],
        "images": json.dumps(source["images"]),
    }
    # The remaining fields are copied through under the same key
    for field in (
        "release_date",
        "release_date_precision",
        "total_tracks",
        "type",
        "uri",
    ):
        album[field] = source[field]

    album_metadata[album_id] = album
    return album

In [None]:
train_1_album = get_album_data(train_1, album_metadata)

In [None]:
train_1_album

{'name': 'LONG.LIVE.A$AP (Deluxe Version)',
 'id': '1E1eyI5uGllppJZCxNoF9w',
 'artist': 'A$AP Rocky',
 'artist_id': '13ubrt8QOOCPljQ2FL1Kca',
 'external_url': 'https://open.spotify.com/album/1E1eyI5uGllppJZCxNoF9w',
 'href': 'https://api.spotify.com/v1/albums/1E1eyI5uGllppJZCxNoF9w',
 'images': '[{"height": 640, "url": "https://i.scdn.co/image/ab67616d0000b2733265ed162fa2dd5ec6434ee4", "width": 640}, {"height": 300, "url": "https://i.scdn.co/image/ab67616d00001e023265ed162fa2dd5ec6434ee4", "width": 300}, {"height": 64, "url": "https://i.scdn.co/image/ab67616d000048513265ed162fa2dd5ec6434ee4", "width": 64}]',
 'release_date': '2013-01-11',
 'release_date_precision': 'day',
 'total_tracks': 16,
 'type': 'album',
 'uri': 'spotify:album:1E1eyI5uGllppJZCxNoF9w'}

#### Track

> Putting it all together

Writing a function to get track data

In [None]:
# |code-fold: true
def get_track_data(
    track,
    artist_metadata: Optional[Dict] = None,
    raw_artist_metadata: Optional[Dict] = None,
    audio_features: Optional[Dict] = None,
    album_metadata: Optional[Dict] = None,
) -> Dict:
    """
    Flatten a Spotify track object into one record with album, artist and
    audio-feature fields.

    Parameters:
        track: Spotify track object.
        artist_metadata / raw_artist_metadata / album_metadata: optional cache
            dicts, mutated in place by the helpers. Each defaults to a fresh
            dict, so the function is safe to call without them.
        audio_features: the audio-features dict for this track; when missing
            or empty, every feature field in the result is None.

    Returns a dict with track id/name, album_* fields, artist fields and one
    key per entry of `TRACK_FEATURE_COLUMNS`.
    """
    # The helpers call dict methods on the caches, so None must be replaced
    # before it is passed down.
    if artist_metadata is None:
        artist_metadata = {}
    if raw_artist_metadata is None:
        raw_artist_metadata = {}
    if album_metadata is None:
        album_metadata = {}

    album = get_album_data(track, album_metadata)
    artists = get_track_artists(
        track, artist_metadata=artist_metadata, raw_artist_metadata=raw_artist_metadata
    )

    # Missing features (or a missing individual key) become None
    available_features = audio_features or {}
    features = {k: available_features.get(k) for k in TRACK_FEATURE_COLUMNS}

    return {
        "id": track["id"],
        "name": track["name"],
        "artist": artists["main_artist_name"],
        "album": album["name"],
        "album_id": album["id"],
        "album_release_date": album["release_date"],
        "album_release_date_precision": album["release_date_precision"],
        "album_type": album["type"],
        "album_uri": album["uri"],
        "album_external_url": album["external_url"],
        "album_href": album["href"],
        "album_images": album["images"],
        "artist_id": artists["main_artist_id"],
        "artist_names": artists["artist_names"],
        "artist_ids": artists["artist_ids"],
        "artist_genres": artists["genres"],
        "artist_main_genre": artists["main_genre"],
        "artist_secondary_genre": artists["secondary_genre"],
        "artist_image": artists["main_artist_image"],
        "main_artist_url": artists["main_artist_url"],
        "main_artist_uri": artists["main_artist_uri"],
        # danceability ... time_signature, in TRACK_FEATURE_COLUMNS order
        **features,
    }

## Writing Batch Funcs

In [None]:
sample_ids = clean_history.track_id.unique()[:200]

Writing custom function to get multiple tracks at once <br>
<br>
The spotipy lib seems to re-call the Spotify API immediately after getting a 'retry-after' message, which Spotify doesn't seem to appreciate. Instead I wait an extra 5 minutes on top of the requested delay :)

In [None]:
# |code-fold: true
def get_multiple_tracks(track_ids: List, max_retries: int = 3) -> Dict:
    """
    Fetch several tracks with one request to `https://api.spotify.com/v1/tracks`.

    Parameters:
        track_ids: track ids, sent comma-joined in the `ids` query parameter.
        max_retries: how many times to wait and retry while the response
            carries a `retry-after` header.

    Returns the decoded JSON body of the final response.

    Raises:
        requests.HTTPError: if the final response has an error status
            (including a rate limit that outlasted `max_retries`).
        requests.Timeout: if a request gets no response within 30 seconds.
    """

    def wait_for_rate_limit(response: requests.Response) -> None:
        """Sleep for the advertised retry-after delay plus five minutes."""
        retry_after = response.headers.get("retry-after")
        print()
        print("-----------------")
        print("Waiting for", retry_after, "seconds")
        try:
            delay = int(retry_after)
        except (TypeError, ValueError):
            # Header missing or not a plain integer: fall back to the padding only
            delay = 0
        time.sleep(delay + 300)

    # Same environment setup as get_spotipy_obj, so this works standalone
    load_dotenv()
    auth_manager = SpotifyClientCredentials()
    ids = ",".join(track_ids)
    url = f"https://api.spotify.com/v1/tracks?ids={ids}"

    for attempt in range(max_retries + 1):
        # Ask for the token on every attempt; a long wait may outlive the old one
        headers = {
            "Authorization": f"Bearer {auth_manager.get_access_token(as_dict=False)}"
        }
        response = requests.get(url, headers=headers, timeout=30)
        if not response.headers.get("retry-after") or attempt == max_retries:
            break
        wait_for_rate_limit(response)

    # Surface HTTP errors instead of returning an error payload as track data
    response.raise_for_status()
    return response.json()

Putting everything together <br>
<br>
The following function will take a list of track ids and return a dictionary holding all the metadata for the tracks. To avoid rate limiting, it will pause for 200 seconds every 1000 ids and respect retry-after messages #IJUSTNEEDSOMESPACE


In [None]:
# |code-fold: true
def enrich_spotify_data(
    track_ids: List[str],
    track_metadata: Optional[Dict] = None,
    artist_metadata: Optional[Dict] = None,
    album_metadata: Optional[Dict] = None,
    audio_features: Optional[Dict] = None,
    raw_track_metadata: Optional[Dict] = None,
    raw_artist_metadata: Optional[Dict] = None,
    pause_every: int = 1000,
    pause_seconds: float = 200,
) -> Dict:
    """
    Fetch and flatten Spotify metadata for a list of track ids.

    Ids already present in `track_metadata` are skipped. The remaining ids are
    requested in batches of 50 (tracks and audio features), and every returned
    track is flattened with `get_track_data`.

    Parameters:
        track_ids: track ids to enrich (any sized iterable, e.g. a list or a
            numpy array slice).
        track_metadata, artist_metadata, album_metadata, audio_features,
        raw_track_metadata, raw_artist_metadata: cache dicts, mutated in place
            and returned. Each defaults to a fresh dict per call.
        pause_every: after this many newly requested ids, sleep for
            `pause_seconds`. Pass 50 to pause after every batch.
        pause_seconds: length of that pause in seconds.

    Returns a dict holding the six caches under their parameter names.
    """
    track_metadata = {} if track_metadata is None else track_metadata
    artist_metadata = {} if artist_metadata is None else artist_metadata
    album_metadata = {} if album_metadata is None else album_metadata
    audio_features = {} if audio_features is None else audio_features
    raw_track_metadata = {} if raw_track_metadata is None else raw_track_metadata
    raw_artist_metadata = {} if raw_artist_metadata is None else raw_artist_metadata

    sp = get_spotipy_obj()
    BATCH_SIZE = 50

    # Decide up front what still needs fetching (deduplicated, order kept).
    # Batching this list means a trailing partial batch is always processed,
    # even when the last ids of `track_ids` are already cached.
    pending = list(
        dict.fromkeys(
            track_id for track_id in track_ids if track_id not in track_metadata
        )
    )

    requested_since_pause = 0
    for start in range(0, len(pending), BATCH_SIZE):
        batch = pending[start : start + BATCH_SIZE]
        tracks = sp.tracks(batch)["tracks"]
        audio_features_batch = sp.audio_features(batch)

        for features in audio_features_batch or []:
            # Entries can be empty for tracks without audio features
            if features:
                audio_features.setdefault(features["id"], features)

        for track in tracks:
            if not track:
                # Empty entry in the response: nothing to flatten
                continue
            raw_track_metadata.setdefault(track["id"], track)
            track_metadata[track["id"]] = get_track_data(
                track,
                artist_metadata=artist_metadata,
                raw_artist_metadata=raw_artist_metadata,
                audio_features=(
                    audio_features.get(track["id"])
                    if audio_features.get(track["id"])
                    else {k: None for k in TRACK_FEATURE_COLUMNS}
                ),
                album_metadata=album_metadata,
            )

        requested_since_pause += len(batch)
        if requested_since_pause >= pause_every:
            print("Track", start + len(batch), "complete")
            time.sleep(pause_seconds)
            requested_since_pause = 0

    return {
        "track_metadata": track_metadata,
        "artist_metadata": artist_metadata,
        "album_metadata": album_metadata,
        "audio_features": audio_features,
        "raw_track_metadata": raw_track_metadata,
        "raw_artist_metadata": raw_artist_metadata,
    }

In [None]:
# Enriching Data

In [None]:
track_ids = clean_history.track_id.unique()

In [None]:
# |hide
def get_pickle_data(path: Path) -> Dict:
    """
    Load a pickled object from `path`, or return an empty dict if the file
    does not exist.

    NOTE: unpickling can execute arbitrary code; only point this at files
    written by this notebook.
    """
    if not path.exists():
        return {}
    with open(path, "rb") as handle:
        stored = pickle.load(handle)
    return stored

In [None]:
# |hide
# Resume from earlier runs: each cache starts from its pickle under data/,
# or from an empty dict when that file does not exist yet
track_metadata = get_pickle_data(Path("data/track_metadata.pkl"))
artist_metadata = get_pickle_data(Path("data/artist_metadata.pkl"))
album_metadata = get_pickle_data(Path("data/album_metadata.pkl"))
audio_features = get_pickle_data(Path("data/audio_features.pkl"))
raw_track_metadata = get_pickle_data(Path("data/raw_track_metadata.pkl"))
raw_artist_metadata = get_pickle_data(Path("data/raw_artist_metadata.pkl"))

In [None]:
# |output: false
# Enrich the history in chunks of 1000 track ids, sharing the caches across chunks.
# NOTE(review): nothing is written to disk inside this loop; the caches are only
# pickled by a later cell, so a failure part-way through loses the in-memory progress.
for i in range(0, len(track_ids), 1000):
    # The printed upper bound is i + 1000 even for the final, shorter chunk;
    # the slice below uses the clamped `end`
    print("Processing tracks", i, "to", i + 1000)
    end = min(i + 1000, len(track_ids))
    data = enrich_spotify_data(
        track_ids[i:end],
        track_metadata=track_metadata,
        artist_metadata=artist_metadata,
        album_metadata=album_metadata,
        audio_features=audio_features,
        raw_track_metadata=raw_track_metadata,
        raw_artist_metadata=raw_artist_metadata,
    )
    # enrich_spotify_data returns the same dict objects it was given,
    # so these re-bindings keep the names pointing at the same caches
    track_metadata = data["track_metadata"]
    artist_metadata = data["artist_metadata"]
    album_metadata = data["album_metadata"]
    audio_features = data["audio_features"]
    raw_track_metadata = data["raw_track_metadata"]
    raw_artist_metadata = data["raw_artist_metadata"]

Processing tracks 0 to 1000
Processing tracks 1000 to 2000
Processing tracks 2000 to 3000
Processing tracks 3000 to 4000
Processing tracks 4000 to 5000
Processing tracks 5000 to 6000
Processing tracks 6000 to 7000
Processing tracks 7000 to 8000
Processing tracks 8000 to 9000
Processing tracks 9000 to 10000
Processing tracks 10000 to 11000
Processing tracks 11000 to 12000
Processing tracks 12000 to 13000
Processing tracks 13000 to 14000
Processing tracks 14000 to 15000
Processing tracks 15000 to 16000
Processing tracks 16000 to 17000
Processing tracks 17000 to 18000
Processing tracks 18000 to 19000
Processing tracks 19000 to 20000
Processing tracks 20000 to 21000
Processing tracks 21000 to 22000
Processing tracks 22000 to 23000
Processing tracks 23000 to 24000
Processing tracks 24000 to 25000
Processing tracks 25000 to 26000


Saving the data

In [None]:
# | hide
def save_pickle_data(data: Dict, path: Path) -> None:
    """
    Pickle `data` to `path`, creating missing parent directories first.

    Creating the directory here means a missing `data/` folder cannot cause
    the save to fail after the enrichment run has already finished.
    `path` may be a `Path` or a plain string.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(data, f)

In [None]:
# | hide
# Persist every cache under data/ so the next run can resume from it
save_pickle_data(track_metadata, Path("data/track_metadata.pkl"))
save_pickle_data(artist_metadata, Path("data/artist_metadata.pkl"))
save_pickle_data(album_metadata, Path("data/album_metadata.pkl"))
save_pickle_data(audio_features, Path("data/audio_features.pkl"))
save_pickle_data(raw_track_metadata, Path("data/raw_track_metadata.pkl"))
save_pickle_data(raw_artist_metadata, Path("data/raw_artist_metadata.pkl"))

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()