# Spotify data

Description of the exported data: https://support.spotify.com/us/article/understanding-my-data/

In [131]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Folder holding the Spotify account-data export; the input JSON files are read relative to it.
data_folder = Path('./data/Spotify Account Data')

## Reproducciones

In [3]:
rep_data = pd.read_json(data_folder / 'StreamingHistory_music_0.json')

# Some plays appear "duplicated": same artist, same track and same endTime.
# For each such group we discard the entry with the shortest play time.
dup_keys = ['endTime', 'artistName', 'trackName']
duplicadas = rep_data[rep_data.duplicated(subset=dup_keys, keep=False)]
display(duplicadas.head(10))

# idxmin() returns, per group, the *index label* of the row with the smallest
# msPlayed, so the `msPlayed` column of `min_duracion` holds row labels,
# not durations.
min_duracion = duplicadas.groupby(['artistName', 'trackName', 'endTime']).idxmin()
display(min_duracion.head(10))
display(rep_data.loc[min_duracion.msPlayed].sort_index())

# Drop exactly those rows by index label. Comparing the labels against the
# msPlayed *values* would remove unrelated plays whose duration happens to
# equal a row label and keep some of the duplicates that should be removed.
rep_data = rep_data.drop(index=min_duracion.msPlayed)
rep_data

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-04-01 07:34,El Drogas,Cordones de Mimbre,0
1,2023-04-01 07:34,El Drogas,Cordones de Mimbre,9243
25,2023-04-04 20:57,The Beatles,While My Guitar Gently Weeps,110991
26,2023-04-04 20:57,The Beatles,While My Guitar Gently Weeps,0
46,2023-04-06 08:40,Eric Clapton,Lies,468
47,2023-04-06 08:40,Eric Clapton,Lies,53538
122,2023-04-11 20:32,Radiohead,Street Spirit (Fade Out),14850
123,2023-04-11 20:32,Radiohead,Street Spirit (Fade Out),0
256,2023-04-15 21:11,Chris Stapleton,Starting Over,56932
257,2023-04-15 21:11,Chris Stapleton,Starting Over,0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,msPlayed
artistName,trackName,endTime,Unnamed: 3_level_1
Alstad,Sakura,2024-02-12 06:46,5662
Arde Bogotá,El Dorado,2023-09-09 16:30,3446
Barricada,En Blanco Y Negro,2024-02-25 09:51,6129
Billy Joel,She's Always a Woman,2024-02-13 22:17,5705
Billy Joel,She's Always a Woman,2024-03-04 14:16,6394
Carmen Boza,OCTUBRE,2023-09-03 10:00,3347
Chris Stapleton,Starting Over,2023-04-15 21:11,257
Ciudad Jara,Trocito de Marte,2023-12-17 15:12,4684
Coldplay,See You Soon,2023-08-22 21:38,3115
Coque Malla,No puedo vivir sin ti (feat. Anni B Sweet),2023-08-23 10:28,3163


Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-04-01 07:34,El Drogas,Cordones de Mimbre,0
26,2023-04-04 20:57,The Beatles,While My Guitar Gently Weeps,0
46,2023-04-06 08:40,Eric Clapton,Lies,468
123,2023-04-11 20:32,Radiohead,Street Spirit (Fade Out),0
257,2023-04-15 21:11,Chris Stapleton,Starting Over,0
...,...,...,...,...
6312,2024-02-29 21:14,Sinkope,El carro de la vida - feat. Kutxi Romero,1043
6350,2024-03-01 08:15,Jimi Hendrix,Little Wing,2333
6394,2024-03-04 14:16,Billy Joel,She's Always a Woman,480
6473,2024-03-15 09:18,Flowklorikos,Donde Duele Inspira,0


Unnamed: 0,endTime,artistName,trackName,msPlayed
1,2023-04-01 07:34,El Drogas,Cordones de Mimbre,9243
2,2023-04-01 07:37,Heroes Del Silencio,La carta,178109
3,2023-04-01 07:41,Keane,This Is The Last Time,208787
4,2023-04-01 07:46,Joaquín Sabina,Peces de Ciudad,305175
5,2023-04-01 07:50,Gustavo Cerati,Puente,273758
...,...,...,...,...
6802,2024-04-01 18:34,Inconscientes,El Último Hombre Libre,256187
6803,2024-04-01 18:37,Alela Diane,The Pirate's Gospel,175248
6804,2024-04-01 18:42,Travis Birds,La Chica del Tren,261792
6805,2024-04-01 18:47,The Gathering,You Learn About It,308804


In [4]:
# Ten (artist, track) pairs with the most history entries; count() tallies
# the non-null values of each remaining column per group.
(
    rep_data
    .groupby(['artistName', 'trackName'])
    .count()
    .nlargest(10, 'msPlayed')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,endTime,msPlayed
artistName,trackName,Unnamed: 2_level_1,Unnamed: 3_level_1
Kings of Leon,Pyro,24,24
Bruce Springsteen,Streets of Philadelphia - Single Edit,22,22
U2,One,22,22
Coldplay,See You Soon,21,21
Robe,Nada que perder,21,21
Robe,Viajando por el interior,21,21
Peter Gabriel,Solsbury Hill,20,20
Coldplay,The Scientist,19,19
Creedence Clearwater Revival,Have You Ever Seen The Rain,19,19
Dire Straits,Romeo And Juliet,18,18


In [None]:
# Load the library export and preview its first rows.
# NOTE(review): this cell has no execution count or output in the saved
# notebook - confirm that the file parses with orient='records'.
library_file = data_folder / 'YourLibrary.json'
lib_data = pd.read_json(library_file, orient='records')
lib_data.head()

## API

In [132]:
import requests
from urllib.parse import urlencode

# Spotify API credentials are read from the environment so that secrets never
# live in the notebook. Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): a client id/secret pair used to be hard-coded in this cell;
# treat it as compromised and rotate it in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')
if not client_id or not client_secret:
    raise RuntimeError(
        'Spotify credentials not found: set the SPOTIFY_CLIENT_ID and '
        'SPOTIFY_CLIENT_SECRET environment variables.'
    )

### Get access token

In [133]:
# Request an app-level access token (client-credentials grant).
token_headers = {'Content-Type': 'application/x-www-form-urlencoded'}
response = requests.post(url='https://accounts.spotify.com/api/token',
                         headers=token_headers,
                         data=dict(grant_type='client_credentials',
                                   client_id=client_id,
                                   client_secret=client_secret))
# Fail here with the HTTP status instead of later with an opaque TypeError
# raised by concatenating 'Bearer ' with a missing token.
response.raise_for_status()

access_token = response.json().get('access_token')
if not access_token:
    raise RuntimeError('Spotify token response did not include an access_token')

# Headers used by the Web API request helpers in this notebook.
header = {'Content-Type': 'application/json',
          'Authorization': 'Bearer ' + access_token}

### Extraction code

In [128]:
# Search: https://developer.spotify.com/documentation/web-api/reference/search


def search_track(artista: str, titulo: str) -> dict:
    """Search the Spotify catalogue for a track by artist name and title.

    Args:
        artista: Artist name, used in the ``artist:`` filter of the query.
        titulo: Track title, used in the ``track:`` filter of the query.

    Returns:
        The track object (a dict) of the chosen candidate: the first result
        that lists an artist whose name equals ``artista`` exactly, otherwise
        the first result. An empty dict when the response holds no tracks.
    """
    # The search can return several candidates (up to `limit`).
    params = dict(q=f'artist:{artista} track:{titulo}',
                  type=['track'],
                  market='ES',
                  limit=30)
    response = requests.get(url='https://api.spotify.com/v1/search',
                            headers=header,
                            params=params)

    # A track search is keyed by 'tracks' (indexing 'artists' raised KeyError);
    # payloads without that key fall back to "no results".
    items = response.json().get('tracks', {}).get('items', [])
    items = [item for item in items if item]

    # Prefer a candidate credited to exactly the requested artist.
    for item in items:
        if any(art.get('name') == artista for art in item.get('artists', [])):
            return item

    return items[0] if items else {}


def extract_track(track_id: str):
    """Fetch a single track by id (market ``ES``) and return the decoded JSON body."""
    endpoint = f'https://api.spotify.com/v1/tracks/{track_id}'
    reply = requests.get(url=endpoint, headers=header, params={'market': 'ES'})
    return reply.json()

def extract_artist(artista_id: str):
    """Fetch a single artist by id and return the decoded JSON body."""
    endpoint = f'https://api.spotify.com/v1/artists/{artista_id}'
    reply = requests.get(url=endpoint, headers=header)
    return reply.json()

def extract_artist_albums(artista_id: str):
    """Fetch up to 20 albums of an artist (group ``album``, market ``ES``).

    Returns the decoded JSON body of the albums listing.
    """
    endpoint = f'https://api.spotify.com/v1/artists/{artista_id}/albums'
    query = {'include_groups': 'album', 'limit': 20, 'market': 'ES'}
    reply = requests.get(url=endpoint, headers=header, params=query)
    return reply.json()

def extract_album(album_id: str):
    """Fetch a single album by id (market ``ES``) and return the decoded JSON body."""
    endpoint = f'https://api.spotify.com/v1/albums/{album_id}'
    reply = requests.get(url=endpoint, headers=header, params={'market': 'ES'})
    return reply.json()

### Extraction

In [126]:
display(rep_data.head())

# Keep only the first history entry of every (artist, track) pair.
is_repeat = rep_data.duplicated(subset=['artistName', 'trackName'])
tracks = rep_data[~is_repeat]
display(tracks.head())

Unnamed: 0,endTime,artistName,trackName,msPlayed
1,2023-04-01 07:34,El Drogas,Cordones de Mimbre,9243
2,2023-04-01 07:37,Heroes Del Silencio,La carta,178109
3,2023-04-01 07:41,Keane,This Is The Last Time,208787
4,2023-04-01 07:46,Joaquín Sabina,Peces de Ciudad,305175
5,2023-04-01 07:50,Gustavo Cerati,Puente,273758


Unnamed: 0,endTime,artistName,trackName,msPlayed
1,2023-04-01 07:34,El Drogas,Cordones de Mimbre,9243
2,2023-04-01 07:37,Heroes Del Silencio,La carta,178109
3,2023-04-01 07:41,Keane,This Is The Last Time,208787
4,2023-04-01 07:46,Joaquín Sabina,Peces de Ciudad,305175
5,2023-04-01 07:50,Gustavo Cerati,Puente,273758


In [129]:
# Look up every distinct track in the Spotify catalogue (one search per track).
results = []
total = len(tracks)
# enumerate() supplies the running position; the DataFrame index label is the
# row label from the listening history and can exceed the number of tracks.
for pos, row in enumerate(tracks.itertuples(index=False), start=1):
    # Blank the line first so a shorter title does not leave residue on screen.
    print(' '*150, end='\r')
    print(f'({pos:>3}/{total:>3}) {row.artistName:<25} - {row.trackName}', end='\r')
    result = search_track(row.artistName, row.trackName)

    # search_track returns an empty dict when nothing was found.
    if result:
        results.append(result)

# Write the raw search results, creating the output folder if needed.
tracks_file = Path('./data/raw_data/tracks/tracks.json')
tracks_file.parent.mkdir(parents=True, exist_ok=True)
with open(tracks_file, 'w', encoding='utf8') as file:
    json.dump(results, file)
# To reload the saved results instead of querying the API again:
# with open('./data/raw_data/tracks/tracks.json', 'r', encoding='utf8') as file:
#     results = json.load(file)

(6732/2728) Scala & Kolacny Brothers  - Creep                                                       h Deck, Raekwon, Ol' Dirty Bastard, Ghostface Killah & Masta Killa)ack Pearl"/Score

In [111]:
# Normalise the raw search results into three id-keyed tables.
tracks_data = {}
artists_data = {}
albums_data = {}

for result in results:
    album_info = result['album']
    # Use the album's own artist list; fall back to the track's artists if
    # the album object carries none.
    album_artists = album_info.get('artists') or result['artists']

    track = dict(
        id=result['id'],
        name=result['name'],
        popularity=result['popularity'],
        number=result['track_number'],
        duration=result.get('duration_ms'),
        id_album=album_info['id'],
        id_artists=[art['id'] for art in result['artists']]
    )
    tracks_data[track['id']] = track

    # Register track and album artists so every id referenced by a track or
    # an album has an entry in artists_data.
    for art in list(result['artists']) + list(album_artists):
        if art['id'] not in artists_data:
            artists_data[art['id']] = dict(
                id=art['id'],
                name=art['name'],
            )

    # One entry per album. Previously id_artists was taken from the track's
    # artists and overwritten by every later track of the same album.
    if album_info['id'] not in albums_data:
        albums_data[album_info['id']] = dict(
            id=album_info['id'],
            name=album_info['name'],
            # NOTE(review): 'type' is the object-type field; 'album_type'
            # may be what was intended here - confirm.
            type=album_info['type'],
            release_date=album_info['release_date'],
            total_tracks=album_info['total_tracks'],
            id_artists=[art['id'] for art in album_artists]
        )

df_tracks  = pd.DataFrame(list(tracks_data.values()))

In [112]:
# Preview the first rows of the track table.
df_tracks.head()

Unnamed: 0,id,name,popularity,number,duration,id_album,id_artists
0,3aI5UV6m77rF4D2gYipiSL,Cordones de Mimbre,25,2,177346,5xFHWjntXM8PQ1iRUGQZHB,[5jJWnRLQeU7fe9VaDjpIWK]
1,2RzZSZbfu3jir4udOhyd75,La carta,61,3,187520,7DAoA2gOvycforLAKyFD8Y,[3qAPxVwIQRBuz5ImPUxpZT]
2,6IVlHRrjTKep5SS9kbcpTa,This Is The Last Time,66,9,208786,0MlTOiC5ZYKFGeZ8h3D4rd,[53A0W3U0s8diEn9RhXQhVz]
3,5xq9JrBQQ1r1bKx3P1QhB2,Peces de Ciudad,54,6,305186,2V1g8Z3d12rVQl5UIACCEI,[4aeIWo5CMF1uRmqgJdwkZW]
4,6gwaa6ElIixNTvu6RwkMyo,Puente,70,4,273800,2rIdWbXPjcq8K7BCccBhhC,[1QOmebWGB6FdFtW7Bo3F0W]


In [118]:
# Enrich every artist with follower count, popularity and genres
# (one extract_artist call per artist); entries are updated in place.
n_artists = len(artists_data)
for idx, artist in enumerate(artists_data.values(), start=1):
    # Double-quoted f-string: reusing the outer quote character inside the
    # braces is a SyntaxError before Python 3.12.
    print(f"({idx:>3}/{n_artists:>3}) {artist['name']:<50}", end='\r')

    art = extract_artist(artist['id'])
    artist['followers'] = art['followers']['total']
    artist['popularity'] = art['popularity']
    # Genres arrive as a list; store them as one comma-separated string.
    artist['genres'] = ','.join(art['genres'])
df_artists = pd.DataFrame(list(artists_data.values())).drop_duplicates(subset=['id'])

( 31/ 31) Amanda Shires                                     

In [120]:
# Enrich every album with genres, label and popularity
# (one extract_album call per album); entries are updated in place.
n_albums = len(albums_data)
for idx, album in enumerate(albums_data.values(), start=1):
    # Double-quoted f-string: reusing the outer quote character inside the
    # braces is a SyntaxError before Python 3.12.
    print(f"({idx:>3}/{n_albums:>3}) {album['name']:<50}", end='\r')

    alb = extract_album(album['id'])
    # Genres arrive as a list; store them as one comma-separated string.
    album['genres'] = ','.join(alb['genres'])
    album['label'] = alb['label']
    album['popularity'] = alb['popularity']
df_albums  = pd.DataFrame(list(albums_data.values())).drop_duplicates(subset=['id'])

In [124]:
# Preview the first rows of the track table.
df_tracks.head()

Unnamed: 0,id,name,popularity,number,duration,id_album,id_artists
0,3aI5UV6m77rF4D2gYipiSL,Cordones de Mimbre,25,2,177346,5xFHWjntXM8PQ1iRUGQZHB,[5jJWnRLQeU7fe9VaDjpIWK]
1,2RzZSZbfu3jir4udOhyd75,La carta,61,3,187520,7DAoA2gOvycforLAKyFD8Y,[3qAPxVwIQRBuz5ImPUxpZT]
2,6IVlHRrjTKep5SS9kbcpTa,This Is The Last Time,66,9,208786,0MlTOiC5ZYKFGeZ8h3D4rd,[53A0W3U0s8diEn9RhXQhVz]
3,5xq9JrBQQ1r1bKx3P1QhB2,Peces de Ciudad,54,6,305186,2V1g8Z3d12rVQl5UIACCEI,[4aeIWo5CMF1uRmqgJdwkZW]
4,6gwaa6ElIixNTvu6RwkMyo,Puente,70,4,273800,2rIdWbXPjcq8K7BCccBhhC,[1QOmebWGB6FdFtW7Bo3F0W]


In [121]:
# Preview the first rows of the artist table (with followers, popularity and genres).
df_artists.head()

Unnamed: 0,id,name,followers,popularity,genres
0,5jJWnRLQeU7fe9VaDjpIWK,El Drogas,37060,44,"punk urbano,spanish punk"
1,3qAPxVwIQRBuz5ImPUxpZT,Heroes Del Silencio,2539220,67,"latin alternative,latin rock,musica aragonesa,..."
2,53A0W3U0s8diEn9RhXQhVz,Keane,3299088,74,"neo mellow,piano rock,pop rock"
3,4aeIWo5CMF1uRmqgJdwkZW,Joaquín Sabina,2989607,67,"cantautor,spanish pop,trova"
4,1QOmebWGB6FdFtW7Bo3F0W,Gustavo Cerati,4230975,72,"argentine rock,latin alternative,latin rock,ro..."


In [122]:
# Preview the first rows of the album table (with genres, label and popularity).
df_albums.head()

Unnamed: 0,id,name,type,release_date,total_tracks,id_artists,genres,label,popularity
0,5xFHWjntXM8PQ1iRUGQZHB,Demasiado Tonto en la Corteza,album,2013-11-05,24,[5jJWnRLQeU7fe9VaDjpIWK],,"Maldito Records, S.L.",29
1,7DAoA2gOvycforLAKyFD8Y,Senderos De Traición - Edición Especial,album,1990,25,[3qAPxVwIQRBuz5ImPUxpZT],,Parlophone Spain,71
2,0MlTOiC5ZYKFGeZ8h3D4rd,Hopes And Fears,album,2004-05-10,12,[53A0W3U0s8diEn9RhXQhVz],,Universal-Island Records Ltd.,82
3,2V1g8Z3d12rVQl5UIACCEI,Dímelo En La Calle,album,2002-10-05,14,[4aeIWo5CMF1uRmqgJdwkZW],,Ariola,56
4,2rIdWbXPjcq8K7BCccBhhC,Bocanada,album,1999-06-01,15,[1QOmebWGB6FdFtW7Bo3F0W],,Ariola,71


### Tests

In [135]:
# Smoke test: search one track by artist and title and pretty-print the result.
print(json.dumps(search_track('Extremoduro', 'Buscando una luna'), indent=2))

{
  "album": {
    "album_type": "album",
    "artists": [
      {
        "external_urls": {
          "spotify": "https://open.spotify.com/artist/3bgsNtcf5d5h9jbQbohfBK"
        },
        "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK",
        "id": "3bgsNtcf5d5h9jbQbohfBK",
        "name": "Extremoduro",
        "type": "artist",
        "uri": "spotify:artist:3bgsNtcf5d5h9jbQbohfBK"
      }
    ],
    "external_urls": {
      "spotify": "https://open.spotify.com/album/3msSCqpQLoY0nl4RTSf1Is"
    },
    "href": "https://api.spotify.com/v1/albums/3msSCqpQLoY0nl4RTSf1Is",
    "id": "3msSCqpQLoY0nl4RTSf1Is",
    "images": [
      {
        "height": 640,
        "url": "https://i.scdn.co/image/ab67616d0000b273489635998e8b3a9dec1ae455",
        "width": 640
      },
      {
        "height": 300,
        "url": "https://i.scdn.co/image/ab67616d00001e02489635998e8b3a9dec1ae455",
        "width": 300
      },
      {
        "height": 64,
        "url": "https://i.sc

In [None]:
# Smoke test: fetch one track by id and pretty-print the raw JSON response.
print(json.dumps(extract_track('3aI5UV6m77rF4D2gYipiSL'), indent=2))

In [136]:
# Smoke test: fetch one artist by id and pretty-print the raw JSON response.
print(json.dumps(extract_artist('5jJWnRLQeU7fe9VaDjpIWK'), indent=2))

{
  "external_urls": {
    "spotify": "https://open.spotify.com/artist/5jJWnRLQeU7fe9VaDjpIWK"
  },
  "followers": {
    "href": null,
    "total": 37060
  },
  "genres": [
    "punk urbano",
    "spanish punk"
  ],
  "href": "https://api.spotify.com/v1/artists/5jJWnRLQeU7fe9VaDjpIWK",
  "id": "5jJWnRLQeU7fe9VaDjpIWK",
  "images": [
    {
      "height": 640,
      "url": "https://i.scdn.co/image/ab6761610000e5ebe5028bd31932a168db265365",
      "width": 640
    },
    {
      "height": 320,
      "url": "https://i.scdn.co/image/ab67616100005174e5028bd31932a168db265365",
      "width": 320
    },
    {
      "height": 160,
      "url": "https://i.scdn.co/image/ab6761610000f178e5028bd31932a168db265365",
      "width": 160
    }
  ],
  "name": "El Drogas",
  "popularity": 44,
  "type": "artist",
  "uri": "spotify:artist:5jJWnRLQeU7fe9VaDjpIWK"
}


In [None]:
# Smoke test: list the albums of one artist and pretty-print the raw JSON response.
print(json.dumps(extract_artist_albums('5jJWnRLQeU7fe9VaDjpIWK'), indent=2))

In [None]:
# Smoke test: fetch one album by id and pretty-print the raw JSON response.
print(json.dumps(extract_album('5xFHWjntXM8PQ1iRUGQZHB'), indent=2))