# Personal Informatics - Assignment | Spotify Data

Interesting:
- plot listening based on time (time in day, in year, in month, in week)
- get music style and plot it based on time
- what do I listen to the most? Which artist? Which title?
- Frequency of listening for most listened tracks

Understanding Spotify data : https://support.spotify.com/us/article/understanding-my-data/

In [421]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import collections
import urllib.parse
import json

## Load and pre-process data

In [2]:
# load spotify history
df = pd.read_json('data/spotify/StreamingHistory0.json')
df = pd.concat([df, pd.read_json('data/spotify/StreamingHistory1.json')])
df = pd.concat([df, pd.read_json('data/spotify/StreamingHistory2.json')])
df.head(3)

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2020-09-05 15:44,Allie X,Super Duper Party People,144170
1,2021-02-03 12:48,Infected Mushroom,Flamingo,244194
2,2021-02-03 23:07,Guns N' Roses,Welcome To The Jungle,53665


In [3]:
# convert the 'endTime' column to datetime format
df['endTime'] = pd.to_datetime(df['endTime'])

In [4]:
# convert the 'msPlayed' column to timedelta format
df['timePlayed'] = pd.to_timedelta(df['msPlayed'], unit='ms')

In [5]:
# add time spent in different units
df['minPlayed'] = df['msPlayed']//(1000*60)
df['hPlayed'] = df['msPlayed']/(1000*60*60)

In [6]:
# add 'startTime' column
df['startTime_exact'] = df.endTime - df.timePlayed
# add 'startTime_round' to round on minute
df['startTime'] = df['startTime_exact'].round('min')

In [7]:
# add date time column for filtering
df['year'] = pd.DatetimeIndex(df['startTime']).year
df['month'] = pd.DatetimeIndex(df['startTime']).month
df['weekday'] = pd.DatetimeIndex(df['startTime']).weekday
df['hour'] = pd.DatetimeIndex(df['startTime']).hour
df['day'] = pd.DatetimeIndex(df['startTime']).floor('D')

I clean the data:
- just one line in 2020 => drop it
- be sure I only have one month per year of data (avoid doubles data across two years)

In [8]:
# drop 2020 data
df = df.drop(df[df.year == 2020].index,axis=0)

In [9]:
# conclusion = we keep 2021 data for February
df[df.month == 2].year.value_counts()

2021    1670
2022     454
Name: year, dtype: int64

In [10]:
df = df.drop(df[(df.year == 2022) & (df.month == 2)].index, axis=0)

In [11]:
for m in range(1,13):
    assert len(df[df.month == 2].year.unique())==1

In [12]:
# sort by startTime
df = df.sort_values('startTime_exact')

In [13]:
startDate = df.startTime.values[0]
endDate = df.startTime.values[-1]
print('Start = {}'.format(np.datetime_as_string(startDate, unit='D')))
print('End = {}'.format(np.datetime_as_string(endDate, unit='D')))

Start = 2021-02-03
End = 2022-01-31


**>>> About time: from February 2021 to January 2022 (all included)**

## Basic visualisations

In [422]:
# recall of columns name
df.columns

Index(['endTime', 'artistName', 'trackName', 'msPlayed', 'timePlayed',
       'minPlayed', 'hPlayed', 'startTime_exact', 'startTime', 'year', 'month',
       'weekday', 'hour', 'day'],
      dtype='object')

In [437]:
# how many tracks and hours spent on the last passed year
nb_tracks = len(df)
nb_hours = np.sum(df.timePlayed.values).astype('timedelta64[h]')
print(f"In a year (Feb 2021 to Jan 2022), I listened to {nb_tracks} tracks on Spotify.")
print(f"It represents {nb_hours}.")

# how many different tracks
nb_unique_tracks = len(df[['artistName','trackName']].drop_duplicates())
print(f"More precisely, I listened to {nb_unique_tracks} unique tracks e.g. {round(100*nb_unique_tracks/nb_tracks)}% of all listened tracks.")

In a year (Feb 2021 to Jan 2022), I listened to 23943 tracks on Spotify.
It represents 1235 hours.
More precisely, I listened to 7787 unique tracks e.g. 33% of all listened tracks.


# Connect to Spotify API

Based on https://towardsdatascience.com/visualizing-spotify-data-with-python-tableau-687f2f528cdd and https://medium.com/swlh/build-spotify-playlist-using-machine-learning-45352975d2ee

Spotify Dev Dashboard: https://developer.spotify.com/dashboard

Request Python doc : https://docs.python-requests.org/en/latest/user/quickstart/#make-a-request

In [81]:
# save your IDs from new project in Spotify Developer Dashboard
CLIENT_ID = 
CLIENT_SECRET = 

In [82]:
# generate access token

import requests

# authentication URL
AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST
auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
})

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']

In [83]:
# used for authenticating all API calls
headers = {'Authorization': 'Bearer {token}'.format(token=access_token)}

# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

### Fetch data from API

In [17]:
df.head(2)

Unnamed: 0,endTime,artistName,trackName,msPlayed,timePlayed,minPlayed,hPlayed,startTime_exact,startTime,year,month,weekday,hour,day
1,2021-02-03 12:48:00,Infected Mushroom,Flamingo,244194,0 days 00:04:04.194000,4,0.067832,2021-02-03 12:43:55.806,2021-02-03 12:44:00,2021,2,2,12,2021-02-03
2,2021-02-03 23:07:00,Guns N' Roses,Welcome To The Jungle,53665,0 days 00:00:53.665000,0,0.014907,2021-02-03 23:06:06.335,2021-02-03 23:06:00,2021,2,2,23,2021-02-03


#### 1. Get the genre of artists

Spotify associates artists with given genres, not tracks.

In [29]:
# fetch all artists
all_artists = df.artistName.unique()

In [34]:
len(all_artists)

3134

In [None]:
artists_fail = []
artists_genres = {}
artists_ids = {}
i=0

In [35]:
for anArtist in all_artists:
    if not (anArtist in list(artists_genres.keys())+artists_fail):
        i+=1
        if (i%100==0):
            print(i)
        r = requests.get(BASE_URL + 'search', headers=headers, params={"type":"artist", "q":anArtist}).json()
        if (len(r['artists']['items'])>0):
            found=False
            for found_item in r['artists']['items']:
                if (found_item['name'] == anArtist):
                    artists_genres[anArtist]=found_item['genres']
                    artists_ids[anArtist]=found_item['id']
                    found=True
                    break
            if (not found):
                artists_fail.append(anArtist)
                print('NOT MATCH:', found_item['name'], 'with', anArtist)
        else:
            artists_fail.append(anArtist)
            print('NOT FOUND:', anArtist)

2500
2600
NOT FOUND: A Little Kiss and Tell: The Podcast
NOT FOUND: La Fabrique de l'espace
2700
2800
2900
3000
NOT FOUND: Sébastien Patoche
3100


In [405]:
# create a artists dataframe
df_artists = pd.DataFrame(list(artists_genres.keys()), columns=['artistName'])
df_artists['genres'] = ""
df_artists['artistId'] = ""

for anArtist, associated_genres in artists_genres.items():
    index = df_artists[df_artists.artistName == anArtist].index
    df_artists.loc[index, 'artistId'] = artists_ids[anArtist]
    df_artists.loc[index, 'genres'] = str(associated_genres)

In [406]:
# add recognized podcasts as podcast genre
all_podcasts = ['Un podcast à soi', 'Les Couilles sur la table', 'Les actus du jour - Hugo Décrypte', 'FT News Briefing', 'The Economist Podcasts', 'Global News Podcast', 'TED Talks Daily', '2 Heures De Perdues', 'Mythes et Légendes', 'Une vie plus saine & sereine', 'Choses à Savoir', 'Choses à Savoir TECH', 'La Matinée Est Tienne', 'Virgo Today', 'Weekly Motivation by Ben Lionel Scott', 'Today in Focus', 'La Série sur le Gâteau ', 'Maintenant Vous Savez - Culture', "L'ARNAQUE", 'Making Gay History | LGBTQ Oral Histories from the Archive', "GAMERS, l'histoire secrète des jeux vidéo", 'A Voix Haute', 'Philosophy is Sexy', 'L’Heure du Monde', '1001 Classic Short Stories & Tales', 'Listen To Sleep - Quiet Bedtime Stories & Sleep Meditations', 'Just Sleep - Bedtime Stories for Adults', 'CONTES SCP - Au-delà des archives', 'Adoprixtoxis', 'Clyde Vanilla (série audio)', 'A suivre', "Au cœur de l'histoire ", 'A Little Kiss and Tell: The Podcast', "La Fabrique de l'espace"]

df_podcasts = pd.DataFrame(all_podcasts, columns=['artistName'])
df_podcasts['genres'] = [str(['podcast']) for _ in range (len(df_podcasts))]
df_podcasts['artistId'] = ""

In [407]:
# fusion
df_artists = df_artists.append(df_podcasts, ignore_index=True)

In [408]:
df_artists

Unnamed: 0,artistName,genres,artistId
0,Infected Mushroom,['psychedelic trance'],6S2tas4z6DyIklBajDqJxI
1,Guns N' Roses,"['glam metal', 'hard rock', 'rock']",3qm84nBOXUEQ2vnTfUTTFC
2,Astrix,"['full on', 'nitzhonot', 'psychedelic trance']",3dUltShd2gJQc98Kc7Syit
3,Fatal Bazooka,"['chanson paillarde', 'comic', 'francoton', 'f...",5oDxGfxYaJkfp6xZlXQnr8
4,Max Boublil,"['chanson paillarde', 'francoton', 'french pop']",1Vf8pNhIprjjtrLAiVPzfh
...,...,...,...
3122,Clyde Vanilla (série audio),['podcast'],
3123,A suivre,['podcast'],
3124,Au cœur de l'histoire,['podcast'],
3125,A Little Kiss and Tell: The Podcast,['podcast'],


In [409]:
len(df_artists.artistName.values)+len(artists_fail)

3134

In [410]:
len(all_artists)

3134

In [411]:
# add number of played artist
df_artists['nbPlayed'] = [len(df[df.artistName == artist]) for artist in df_artists.artistName]

In [412]:
# add time played per artist
df_artists['timePlayed'] = [np.sum(df[df.artistName == artist].timePlayed) for artist in df_artists.artistName]

**Which genre do we have to focus on?**

In [413]:
all_genres = list(set([item for sublist in artists_genres.values() for item in sublist]))

In [414]:
# get an idea of most listened genres
print('Based on timePlayed:')
print(df_artists.sort_values('timePlayed',ascending=False).genres.values[:20])

print('\n\nBased on artists genre occurences:')
genres_counter = collections.Counter([item for sublist in artists_genres.values() for item in sublist])
genres_counter = dict(sorted(genres_counter.items(), key=lambda x:x[1], reverse=True))
limit = 100
print(list(genres_counter.values())[limit], list(genres_counter.keys())[:limit])

Based on timePlayed:
["['acidcore']"
 "['deep minimal techno', 'high-tech minimal', 'minimal techno', 'new french touch', 'tropical house']"
 "['dark disco', 'electro-pop francais', 'french indie pop', 'french indietronica', 'new french touch']"
 '[]' "['chillsynth', 'spacewave', 'synthwave']" '[]' "['podcast']"
 "['psychedelic trance']" "['modern rock', 'permanent wave', 'rock']"
 "['electronica', 'minimal melodic techno', 'minimal techno', 'new french touch', 'tropical house']"
 "['indie folk', 'new americana', 'progressive bluegrass', 'stomp and flutter', 'stomp and holler']"
 "['german dance', 'hamburg electronic', 'livetronica']" "['rap calme']"
 "['french hip hop', 'old school rap francais', 'rap conscient']"
 "['chamber pop']"
 "['alternative rock', 'blues rock', 'garage rock', 'modern blues rock', 'modern rock', 'punk blues', 'rock', 'roots rock']"
 "['art pop', 'dance pop', 'pop']" '[]' "['french folk pop']" "['comic']"]


Based on artists genre occurences:
18 ['rock', 'modern

In [415]:
# manual genre replacement
df_artists.loc[df_artists[df_artists.artistName=='Maxime Verdoni'].index, 'genres'] = str(['neo-classical'])
df_artists.loc[df_artists[df_artists.artistName=='Alexandre Astier'].index, 'genres'] = str(['soundtrack','orchestral sountrack'])
df_artists.loc[df_artists[df_artists.artistName=='Peder B. Helland'].index, 'genres'] = str(['compositional ambient'])
df_artists.loc[df_artists[df_artists.artistName=='Abr.'].index, 'genres'] = str(['techno','house'])
df_artists.loc[df_artists[df_artists.artistName=='Predacid'].index, 'genres'] = str(['techno','acidcore','trance','house'])
df_artists.loc[df_artists[df_artists.artistName=='Jon Bovi'].index, 'genres'] = str(['electro house','synthwave'])
df_artists.loc[df_artists[df_artists.artistName=='Winstonw'].index, 'genres'] = str(['synthwave'])
df_artists.loc[df_artists[df_artists.artistName=='Florent Dorin'].index, 'genres'] = str(['french'])
df_artists.loc[df_artists[df_artists.artistName=='CO&XIST'].index, 'genres'] = str(['techno','acidcore'])
df_artists.loc[df_artists[df_artists.artistName=='Compilation Générique TV'].index, 'genres'] = str(['soundtrack'])
df_artists.loc[df_artists[df_artists.artistName=='John Eyzen'].index, 'genres'] = str(['french'])

In [416]:
df_artists[df_artists.genres=='[]'].sort_values('timePlayed',ascending=False).head()

Unnamed: 0,artistName,genres,artistId,nbPlayed,timePlayed
2071,Kenneth Pattengale & Joey Ryan,[],7m8wSQRZkjgoPYXNPMRgVS,32,0 days 02:22:10.849000
920,Unknown Artist,[],74Ch11L4833kZ9VfyziR3K,68,0 days 01:57:08.648000
2374,Babasmas,[],0vngS6bj2tYZMff6ASijx2,37,0 days 01:53:13.131000
2322,Tip Stevens,[],0dDbdyjsq9HEPNKPyNElgd,21,0 days 01:40:09.797000
2377,Timothé Douart,[],0pqsdoAtaeIn6QzTVGV8jR,28,0 days 01:40:06.203000


https://tahaashtiani.com/spotify-taste-and-predicting-song-genres

In [417]:
# interesting genres
focus_genres = {
    'rock':[],
    'techno':['acidcore'],
    'edm':[],
    'french':['français'],
    'pop':['electropop'],
    'rap':[],
    'minimal':[],
    'house':[],
    'dance':[],
    'synthwave':['spacewave'],
    'electro':['electronica','electropop','indietronica'],
    'indie':[],
    'neo-classical':[],
    'soundtrack':[],
    'podcast':[]
}

In [449]:
print(list(focus_genres.keys()))

['rock', 'techno', 'edm', 'french', 'pop', 'rap', 'minimal', 'house', 'dance', 'synthwave', 'electro', 'indie', 'neo-classical', 'soundtrack', 'podcast']


In [441]:
# add a binary column for each artist

def match_genres(aRow, aListOfKeywords, column_name):
    if len(aRow.genres)>0:
        found_genres = ast.literal_eval(aRow.genres)
        found_genres = ' '.join(found_genres).split(' ')
        if list(set(aListOfKeywords).intersection(found_genres)) != []:
            #print(aRow.artistName, found_genres, aListOfKeywords)
            return 1
    return 0

for aGenre, theKeywords in focus_genres.items():
    all_keywords = [aGenre]+theKeywords
    column_name = 'is_'+aGenre
    df_artists[column_name] = df_artists.apply((lambda x: match_genres(x, all_keywords, column_name)),axis=1)

In [442]:
df_wGenre = df.merge(df_artists, how='left', right_on='artistName', left_on='artistName')
print(len(df),len(df_wGenre))

23943 23943


In [443]:
# EXPORT with genres
df_wGenre.to_csv('data/spotify/my_spotify_history.csv')

In [447]:
df_wGenre.columns

Index(['endTime', 'artistName', 'trackName', 'msPlayed', 'timePlayed_x',
       'minPlayed', 'hPlayed', 'startTime_exact', 'startTime', 'year', 'month',
       'weekday', 'hour', 'day', 'genres', 'artistId', 'nbPlayed',
       'timePlayed_y', 'is_rock', 'is_techno', 'is_edm', 'is_french', 'is_pop',
       'is_rap', 'is_minimal', 'is_house', 'is_dance', 'is_synthwave',
       'is_electro', 'is_indie', 'is_neo-classical', 'is_soundtrack',
       'is_podcast'],
      dtype='object')

#### Add Spotify Track ID [ABORT]

First, add the known ids from my library (`YourLibrary.json`)

In [None]:
with open('data/spotify/YourLibrary.json', encoding='utf-8') as f:
    myLibrary = json.load(f)

In [None]:
# load lib in dataframe
df_lib = pd.DataFrame(myLibrary['tracks'])
df_lib.head()

In [None]:
# create id column from uri

df_lib['id'] = [None for _ in range (len(df_lib))]

def get_id(aRow):
    aRow.id = aRow.uri.split('track:')[1]
    
_ = df_lib.apply(get_id, axis=1)

df_lib.head()

In [None]:
# create df_uniques
df_uniques = df[['artistName','trackName']].drop_duplicates()
df_uniques['id'] = [None for _ in range (len(df_uniques))]

In [None]:
# merge df_uniques and df_lib for common ids
df_uniques = df_uniques.merge(df_lib[['artist','track','id']],left_on = ['artistName', 'trackName'], right_on = ['artist','track'],how='left').drop(['artist','track','id_x'], axis=1)
df_uniques.columns = ['artistName','trackName','id']
df_uniques.head()

In [None]:
#344
df_uniques.id.isna().sum()

In [None]:
not_found = []

def search_id(aRow):
    if (str(aRow.id) == 'nan'):
        anArtist, aTrack = aRow.artistName, aRow.trackName
        q = aTrack+' '+anArtist
        # q = urllib.parse.quote(q)
        r = requests.get(BASE_URL + 'search', headers=headers, params={"type":"track", "q":q}).json()
        if len(r['tracks']['items'])>0:
            trackId = r['tracks']['items'][0]['id']
            aRow.id = trackId
        else:
            print('NOT FOUND: ', anArtist, '>>>', aTrack)
            not_found.append([anArtist, aTrack])

_ = df_uniques.apply(search_id, axis=1)

In [None]:
# search for "not found" tracks
not_found

In [None]:
# to csv
df_uniques.to_csv('data/spotify/my_unique_history.csv')

In [None]:
r = requests.get(BASE_URL + 'search', headers=headers, params={"type":"track", "q":"Fatal Bazooka"})
r = r.json()
[r['tracks']['items'][i]['name'] for i in range (len(r['tracks']['items']))]

In [None]:
aTrackID = "6YTjVxF1fksayDXOo53N4x"
r = requests.get(BASE_URL + 'tracks/'+aTrackID, headers=headers)
r = r.json()
print(r['name'])
print(r['artists'][0]['name'])

In [None]:
r = requests.get(BASE_URL + 'audio-features', headers=headers, params={"ids":aTrackID})
r = r.json()
r