# BetterWrapped

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pycountry
import seaborn as sns

## Importing Spotify data

### Reading data

In [2]:
df = pd.DataFrame()
for file in [x for x in os.listdir("Spotify Extended Streaming History") if x.startswith("Streaming_History_Audio")]:
    df = pd.concat([df, pd.read_json("Spotify Extended Streaming History/"+file)])
df.head(3)

Unnamed: 0,ts,platform,ms_played,conn_country,ip_addr,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,...,audiobook_uri,audiobook_chapter_uri,audiobook_chapter_title,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2018-11-12T19:09:12Z,"Android OS 8.0.0 API 26 (LGE, LG-H870)",0,CH,84.74.88.79,Let You Love Me,Rita Ora,Let You Love Me,spotify:track:6xtcFXSo8H9BZN637BMVKS,,...,,,,clickrow,remote,False,False,False,,False
1,2018-11-12T19:09:31Z,Partner sonos_ppc Sonos;Play5;;;,12500,CH,84.74.88.79,Let You Love Me,Rita Ora,Let You Love Me,spotify:track:6xtcFXSo8H9BZN637BMVKS,,...,,,,clickrow,endplay,False,False,False,,False
2,2018-11-12T19:11:02Z,Partner sonos_ppc Sonos;Play5;;;,38500,CH,84.74.88.79,Happier,Marshmello,Happier,spotify:track:2dpaYNEQHiRxtZbfNsse99,,...,,,,clickrow,logout,False,False,False,,False


### Formatting and cleaning data

In [3]:
df.rename(columns={"conn_country":"country_code", "ip_addr":"ip", "master_metadata_track_name":"track_name", "master_metadata_album_artist_name":"artist_name", "master_metadata_album_album_name":"album_name"}, inplace=True)
df['ts'] = pd.to_datetime(df['ts'])
df = df.dropna(subset=["artist_name"])
df["s_played"] = df["ms_played"]/1000
df["min_played"] = df["s_played"]/60
df["h_played"] = df["min_played"]/60
df["d_played"] = df["min_played"]/24
df["country_name"] = df["country_code"].map(lambda x : pycountry.countries.get(alpha_2=x).name if pycountry.countries.get(alpha_2=x) else "Unbekannt")
df.head(3)

Unnamed: 0,ts,platform,ms_played,country_code,ip,track_name,artist_name,album_name,spotify_track_uri,episode_name,...,shuffle,skipped,offline,offline_timestamp,incognito_mode,s_played,min_played,h_played,d_played,country_name
0,2018-11-12 19:09:12+00:00,"Android OS 8.0.0 API 26 (LGE, LG-H870)",0,CH,84.74.88.79,Let You Love Me,Rita Ora,Let You Love Me,spotify:track:6xtcFXSo8H9BZN637BMVKS,,...,False,False,False,,False,0.0,0.0,0.0,0.0,Switzerland
1,2018-11-12 19:09:31+00:00,Partner sonos_ppc Sonos;Play5;;;,12500,CH,84.74.88.79,Let You Love Me,Rita Ora,Let You Love Me,spotify:track:6xtcFXSo8H9BZN637BMVKS,,...,False,False,False,,False,12.5,0.208333,0.003472,0.008681,Switzerland
2,2018-11-12 19:11:02+00:00,Partner sonos_ppc Sonos;Play5;;;,38500,CH,84.74.88.79,Happier,Marshmello,Happier,spotify:track:2dpaYNEQHiRxtZbfNsse99,,...,False,False,False,,False,38.5,0.641667,0.010694,0.026736,Switzerland


## Fetching genres

In [4]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
from spotipy.client import SpotifyException
import requests

In [5]:
CLIENT_ID = "0c5021071fbd4a9fabc9b31b278aaae7"
CLIENT_SECRET = "d2cce0d0a87d433e96f0a8d4d5bf3b51"

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [6]:
def fetch_artist_genre(sp, artist, max_retries=5, initial_wait=5):
    tries = 0
    wait = initial_wait
    while tries < max_retries:
        try:
            genre = sp.search(q=artist, type="artist")["artists"]["items"][0]["genres"]
            return genre
        except (requests.exceptions.ReadTimeout, SpotifyException) as e:
            time.sleep(wait)
            wait *= 2
            tries += 1
    return []

In [7]:
if not "artist_genre.csv" in os.listdir():
    artists = df["artist_name"].unique()
    list_artist_genre = [[artist, fetch_artist_genre(sp, artist)] for artist in artists]
    list_artist_genre = [[artist, ", ".join(genres)] for artist, genres in list_artist_genre]
    df_artist_genre = pd.DataFrame(list_artist_genre, columns=['artist_name', 'genre'])
    df_artist_genre.to_csv("artist_genre.csv", index=False)
else:
    df_artist_genre = pd.read_csv("artist_genre.csv")

In [8]:
df = df.merge(df_artist_genre, on='artist_name', how='left')

## Explaining data

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85534 entries, 0 to 85533
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   ts                       85534 non-null  datetime64[ns, UTC]
 1   platform                 85534 non-null  object             
 2   ms_played                85534 non-null  int64              
 3   country_code             85534 non-null  object             
 4   ip                       85534 non-null  object             
 5   track_name               85534 non-null  object             
 6   artist_name              85534 non-null  object             
 7   album_name               85534 non-null  object             
 8   spotify_track_uri        85534 non-null  object             
 9   episode_name             0 non-null      object             
 10  episode_show_name        0 non-null      object             
 11  spotify_episode_uri      0 n

| Attribute | Description |
|-----------|-------------|
| ts | This field is a timestamp indicating when the track stopped playing in UTC (Coordinated Universal Time). The order is year, month and day followed by a timestamp in military time |
| platform | This field is the platform used when streaming the track (e.g. Android OS, Google Chromecast). |
| ms_played | This field is the number of milliseconds the stream was played. |
| country_code | This field is the country code of the country where the stream was played (e.g. SE - Sweden). |
| ip | This field contains the IP address logged when streaming the track. |
| track_name | This field is the name of the track. |
| artist_name | This field is the name of the artist, band or podcast. |
| album_name | This field is the name of the album of the track. |
| spotify_track_uri  | A Spotify URI, uniquely identifying the track in the form of “spotify:track:<base-62 string>” <br><br> A Spotify URI is a resource identifier that you can enter, for example, in the Spotify Desktop client’s search box to locate an artist, album, or track. |
| episode_name | This field contains the name of the episode of the podcast. |
| episode_show_name | This field contains the name of the show of the podcast. |
| spotify_episode_uri | A Spotify Episode URI, uniquely identifying the podcast episode in the form of “spotify:episode:<base-62 string>” <br><br> A Spotify Episode URI is a resource identifier that you can enter, for example, in the Spotify Desktop client’s search box to locate an episode of a podcast. |
| audiobook_title | This field is the title of the audiobook |
| audiobook_uri | This field contains the audiobook uri |
| audiobook_chapter_uri | This field contains the audiobook chapter uri |
| audiobook_chapter_title | This field is the audiobook chapter title |
| reason_start | This field is a value telling why the track started (e.g. “trackdone”) |
| reason_end | This field is a value telling why the track ended (e.g. “endplay”). |
| shuffle | This field has the value True or False depending on if shuffle mode was used when playing the track. |
| skipped | This field indicates if the user skipped to the next song |
| offline | This field indicates whether the track was played in offline mode (“True”) or not (“False”). |
| offline_timestamp | This field is a timestamp of when offline mode was used, if used. |
| incognito_mode | This field indicates whether the track was played during a private session (“True”) or not (“False”). |
| s_played | This field is the number of seconds the stream was played.|
| min_played | This field is the number of minutes the stream was played.|
| h_played | This field is the number of hours the stream was played.|
| d_played | This field is the number of days the stream was played.|
| country_name | This field is the country name of the country where the stream was played (e.g. Sweden - SE).|
| genre | This field is the genre of the artist |

## Exploring static data

### Your Spotify usage

In [10]:
print("You started using Spotify in", df["ts"].min().date())
print("You listened to ", df.shape[0], " tracks, over the course of ", (df["ts"].max() - df["ts"].min()).days, " days (", round((df["ts"].max() - df["ts"].min()).days/365, 2), " years)", sep='')
print("You listened to Spotify for ", df["ms_played"].sum()//3600000, " hours (", df["ms_played"].sum()//3600000//24, " days)", sep='')

You started using Spotify in 2018-11-12
You listened to 85534 tracks, over the course of 2542 days (6.96 years)
You listened to Spotify for 2567 hours (106 days)


### Your top data

In [None]:
def get_top_data_raw(attribute, sorting_value, top_count):
    if sorting_value == "track_count":
        if attribute == "genre":
            return df[attribute].str.split(', ').explode().value_counts()[:top_count].reset_index()
        else:
            return df[attribute].value_counts()[:top_count].reset_index()
    elif sorting_value == "listening_time":
        if attribute == "genre":
            return df[[attribute, "h_played"]].assign(genre=df[attribute].str.split(', ')).explode(attribute).groupby(attribute)['h_played'].sum().sort_values(ascending=False)[:top_count].reset_index().round(1)
        else:
            return df[[attribute, "h_played"]].groupby(attribute).sum().sort_values("h_played", ascending=False)[:top_count].reset_index().round(1)

In [20]:
attribute = "album_name" #artist_name, track_name, genre, album_name, country_name or platform
top_count = 10
sorting_value = "listening_time" #track_count or listening_time
get_top_data_raw(attribute, sorting_value, top_count)

Unnamed: 0,album_name,h_played
0,Die bunte Seite der Macht,13.4
1,Platte,9.7
2,Currents,9.5
3,Hard To Imagine The Neighbourhood Ever Changing,8.8
4,I Love You.,7.2
5,THE ANXIETY,7.1
6,Walking On A Dream,7.0
7,Notion,6.7
8,Nothing Happens,6.5
9,Hunting High and Low,6.4


## Exploring dynamic data