# Pull Spotify History

> Consolidate history JSON files and gather metadata from the Spotify API

In [None]:
# | default_exp core

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export
import pandas as pd
import re
import time
import requests
import json
import spotipy


from pathlib import Path
from typing import List, Dict, Optional
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

In [None]:
# Raise pandas' display limits so wide/long frames are not truncated in cell output
for _option, _value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_option, _value)

In [None]:
def get_spotipy_obj():
    """
    Build a `spotipy.Spotify` client that authenticates via `SpotifyClientCredentials`.

    `load_dotenv()` is called first; the credentials object is created without
    explicit arguments (presumably it picks the client id/secret up from the
    environment populated by the `.env` file -- confirm against the spotipy docs).
    """
    load_dotenv()
    credentials = SpotifyClientCredentials()
    return spotipy.Spotify(auth_manager=credentials)

## Extract Streaming History

> Converting History JSON Files into Pandas DF

Spotify provides a user's history in a series of JSON Files. Some years have multiple files.


In [None]:
# | export
#|hide
def extract_streaming_history(
    data_folder: Path,  # Path to the folder containing the streaming history files
) -> Dict[str, pd.DataFrame]:  # Dictionary containing DataFrames for each year
    """
    Load every `.json` file directly inside `data_folder` and group the rows by year.

    The year is the first four-digit run in the filename, so a range such as
    `2021-2022` is filed under `2021`. Files whose name contains no four-digit
    run are skipped. When a year has several files their rows are concatenated
    with a fresh 0..n-1 index; a year with a single file keeps the frame exactly
    as `pd.read_json` returned it.

    Files are processed in sorted path order so the result is deterministic.
    """
    year_pattern = re.compile(r"\d{4}")
    frames_by_year: Dict[str, List[pd.DataFrame]] = {}

    # iterdir() yields entries in arbitrary order; sort for reproducible output
    for path in sorted(data_folder.iterdir()):
        if path.suffix != ".json" or not path.is_file():
            continue

        # search() returns None when there is no year; check before calling .group()
        match = year_pattern.search(path.name)
        if match is None:
            continue

        frames_by_year.setdefault(match.group(), []).append(pd.read_json(path))

    # Concatenate once per year instead of re-copying the accumulated frame per file
    return {
        year: frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
        for year, frames in frames_by_year.items()
    }

In [None]:
# Load the JSON exports from the relative "streaming_history" folder into a dict of DataFrames keyed by year
streaming_history = extract_streaming_history(Path("streaming_history"))

In [None]:
# Preview the first two rows of the frame stored under the "2020" key
streaming_history["2020"].head(2)

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2020-04-14T15:58:10Z,1241589622,Partner SCEI sony_tv;ps4;;,223867,US,69.244.253.115,unknown,What a Fool Believes,The Doobie Brothers,Minute by Minute,spotify:track:2yBVeksU2EtrPJbTu4ZslK,,,,clickrow,trackdone,False,0.0,False,0,False
1,2020-04-14T15:58:17Z,1241589622,Partner SCEI sony_tv;ps4;;,7342,US,69.244.253.115,unknown,Take The Money And Run,Steve Miller Band,Fly Like An Eagle,spotify:track:1ZhrREyOOeFV6TxDOyiPwu,,,,trackdone,fwdbtn,False,0.0,False,0,False


For this exercise I'm only going to include music from my history that I consider 'played'. To do so, I will filter the data here instead of adding a column in my database that differentiates between played & unplayed.
<br>
<br>
In a real-life scenario I'd be more hesitant to throw away information, but I knew I didn't have any intention of looking at 'unplayed' tracks and figured I could save the space on my db :)

In [None]:
# | export
def clean_streaming_history(
    streaming_history: Dict[str, pd.DataFrame],  # Dictionary containing DataFrames for each year
    # Minimum fraction of the song that must be played to be included in the analysis
    min_percent_played: float = 0.9,
) -> pd.DataFrame:  # Streaming History DataFrame
    """
    Clean the raw streaming history data.

    Steps, in order:
    1. Stack every yearly frame, parse `ts` as UTC datetimes and sort by it.
    2. Add `month` and `year` columns derived from `ts`.
    3. Rename the track-name/artist/album/URI columns to `song`, `artist`, `album`, `URI`.
    4. Drop rows without a track URI (anything that's not a song) and derive `track_id`.
    5. Approximate each track's `duration` as the most common `ms_played` among
       plays that ended with `reason_end == "trackdone"`; tracks with no such
       play are dropped.
    6. Add `percent_played` (`ms_played / duration`, 0 when duration is 0) and
       keep only rows where it is at least `min_percent_played`.

    Raises `ValueError` if `streaming_history` contains no frames.
    """
    frames = list(streaming_history.values())
    if not frames:
        raise ValueError("streaming_history contains no DataFrames to clean")

    # One concat for all years avoids re-copying the accumulated frame per year
    history = pd.concat(frames, ignore_index=True)

    history["ts"] = pd.to_datetime(history["ts"], utc=True)
    history = history.sort_values("ts").reset_index(drop=True)

    # Adding Data Fields for ease of use
    history["month"] = history["ts"].dt.month
    history["year"] = history["ts"].dt.year

    history = history.rename(
        columns={
            "master_metadata_track_name": "song",
            "master_metadata_album_artist_name": "artist",
            "master_metadata_album_album_name": "album",
            "spotify_track_uri": "URI",
        }
    )

    # Remove anything that's not a song; .copy() so the column assignment below
    # acts on an independent frame rather than on a filtered slice
    history = history.loc[history["URI"].notna()].copy()

    # Extract the track_id
    history["track_id"] = [
        uri.replace("spotify:track:", "") for uri in history["URI"]
    ]

    # Approximate the song duration: the smallest most-frequent ms_played among
    # plays that ran to the end of the track (mode() returns sorted values)
    completed_plays = history.loc[history["reason_end"] == "trackdone"]
    approximate_durations = (
        completed_plays.groupby("track_id")["ms_played"]
        .agg(lambda plays: plays.mode()[0])
        .rename("duration")
        .reset_index()
    )
    history = history.merge(approximate_durations, on="track_id", how="left")

    # Tracks that never finished have no duration estimate and are dropped
    history = history.loc[history["duration"].notna()].reset_index(drop=True)

    # Adding percent played (vectorized); a zero duration yields 0 rather than inf/NaN
    duration = history["duration"]
    history["percent_played"] = (history["ms_played"] / duration).where(
        duration != 0, 0
    )

    return history.loc[
        history["percent_played"] >= min_percent_played
    ].reset_index(drop=True)

I will be using a cutoff of 70% to differentiate between 'played' vs. 'unplayed' 
<br><br>
There wasn't a scientific approach to this; I just listen to a lot of music and trusted my intuition :)

In [None]:
# Clean the history with a 0.7 played-fraction cutoff (overriding the function's 0.9 default),
# then preview the first two rows
clean_history = clean_streaming_history(streaming_history, 0.7)
clean_history.head(2)

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,song,artist,album,URI,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode,month,year,track_id,duration,percent_played
0,2014-09-25 10:26:51+00:00,1241589622,"iOS 8.0 (iPad2,5)",278386,US,98.85.37.41,unknown,Ambitionz Az A Ridah,2Pac,All Eyez On Me,spotify:track:3ssX20QT5c3nA9wk78V1LQ,,,,clickrow,trackdone,False,False,False,0.0,False,9,2014,3ssX20QT5c3nA9wk78V1LQ,278386.0,1.0
1,2014-09-25 20:40:44+00:00,1241589622,"iOS 8.0 (iPad2,5)",278386,US,98.85.37.41,unknown,Ambitionz Az A Ridah,2Pac,All Eyez On Me,spotify:track:3ssX20QT5c3nA9wk78V1LQ,,,,clickrow,trackdone,False,False,False,0.0,False,9,2014,3ssX20QT5c3nA9wk78V1LQ,278386.0,1.0


## Exploring Spotify API Data

> Finding metadata that will enrich Spotify History

In [None]:
# | hide
import nbdev

# Run nbdev's export step (presumably writes the `# | export` cells to the module named by `default_exp` -- confirm in nbdev docs)
nbdev.nbdev_export()