# Spotify data

Description: https://support.spotify.com/us/article/understanding-my-data/ 

In [2]:
import json
import os
from pathlib import Path

import pandas as pd

# Make sure the input and output directories exist before anything reads or
# writes them. `exist_ok=True` already makes mkdir a no-op for an existing
# directory, so no separate `exists()` check is needed.
raw_data_path = Path('./data/raw_data')
extracted_data_path = Path('./data/extracted_data')
for directory in (raw_data_path, extracted_data_path):
    directory.mkdir(parents=True, exist_ok=True)

## Reproducciones

In [None]:
rep_data = pd.read_json('./data/raw_data/Spotify Account Data/StreamingHistory_music_0.json')

# Remove duplicated plays: the same song listed more than once with the same
# end time but different play durations.
dup_keys = ['endTime', 'artistName', 'trackName']
# Every row that belongs to a duplicate group (all members, not just the extras)
duplicated_plays = rep_data[rep_data.duplicated(subset=dup_keys, keep=False)]
display(duplicated_plays.head(10))
if not duplicated_plays.empty:
    # For each group, find the index label of the longest play; every other row
    # of the group is discarded. Rows are selected by index label, never by
    # their msPlayed value. dropna=False keeps groups whose key contains NaN.
    longest_idx = duplicated_plays.groupby(dup_keys, dropna=False)['msPlayed'].idxmax()
    rep_data = rep_data.drop(index=duplicated_plays.index.difference(longest_idx))
rep_data = rep_data.reset_index(drop=True)

# Add a column with the playback date
rep_data['playDate'] = pd.to_datetime(rep_data.endTime, format='ISO8601').values
# Add a column with the playback time in whole minutes (floor division)
rep_data['minPlayed'] = rep_data.msPlayed // 60000

## API

In [2]:
import requests

# Spotify application credentials are read from the environment so that they
# never live in the notebook (or its version history). Set SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before starting the kernel.
# NOTE(review): the credentials that used to be hardcoded here should be
# considered leaked and rotated in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')
if not (client_id and client_secret):
    raise RuntimeError(
        'Set the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET environment variables'
    )

### Get access token

In [3]:
# Request an access token with the client-credentials grant.
response = requests.post(url='https://accounts.spotify.com/api/token',
                         headers={'Content-Type': 'application/x-www-form-urlencoded'},
                         data=dict(grant_type='client_credentials',
                                   client_id=client_id,
                                   client_secret=client_secret),
                         timeout=10)
# Fail here with the HTTP status (e.g. bad credentials) rather than later with
# a TypeError when concatenating 'Bearer ' and None.
response.raise_for_status()
access_token = response.json().get('access_token')
if not access_token:
    raise RuntimeError('The token endpoint response did not contain an access_token')

# Authorization header sent with every Web API request below.
header = {'Authorization': 'Bearer ' + access_token}

### Extraction code

In [4]:
# Authorization header shared by all the request helpers defined in this cell.
header = dict(Authorization='Bearer ' + access_token)

def search_track(artist_name: str, track_name: str) -> dict:
    """Look up a track on Spotify from its title and its artist's name.

    The search endpoint returns a list of relevant candidates, so the results
    are scanned and the first one whose name is exactly ``track_name`` is
    selected.

    Args:
        artist_name: Name of the artist.
        track_name: Name of the track.

    Returns:
        The matching track object from the JSON response, or an empty dict
        when no candidate has exactly that name.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Search: https://developer.spotify.com/documentation/web-api/reference/search
    # NOTE(review): names containing a double quote are interpolated verbatim
    # into the quoted query filters; confirm how the search endpoint treats them.
    params = dict(q=f'artist:"{artist_name}" track:"{track_name}"',
                  type=['track'],
                  market='ES',
                  limit=30)
    response = requests.get(url='https://api.spotify.com/v1/search',
                            headers=header,
                            params=params,
                            timeout=10)
    # An error body has no 'tracks' key; report the HTTP failure itself instead
    # of an unrelated KeyError.
    response.raise_for_status()

    for track in response.json()['tracks']['items']:
        if track['name'] == track_name:
            return track
    return {}


def extract_track(track_id: str) -> dict:
    """Fetch a single track from the Spotify Web API.

    Args:
        track_id: Spotify ID of the track.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Tracks: https://developer.spotify.com/documentation/web-api/reference/tracks
    response = requests.get(url=f'https://api.spotify.com/v1/tracks/{track_id}',
                            headers=header,
                            params=dict(market='ES'),
                            timeout=10)
    # Surface HTTP failures here instead of returning the error body as data.
    response.raise_for_status()
    return response.json()

def extract_artist(artist_id: str) -> dict:
    """Fetch a single artist from the Spotify Web API.

    Args:
        artist_id: Spotify ID of the artist.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    response = requests.get(url=f'https://api.spotify.com/v1/artists/{artist_id}',
                            headers=header,
                            timeout=10)
    # Surface HTTP failures here instead of returning the error body as data.
    response.raise_for_status()
    return response.json()

def extract_artist_albums(artist_id: str) -> dict:
    """Fetch the first page (up to 20 entries) of an artist's albums.

    Only the 'album' group is requested, for the 'ES' market.

    Args:
        artist_id: Spotify ID of the artist.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    params = dict(include_groups='album',
                  limit=20,
                  market='ES')
    response = requests.get(url=f'https://api.spotify.com/v1/artists/{artist_id}/albums',
                            headers=header,
                            params=params,
                            timeout=10)
    # Surface HTTP failures here instead of returning the error body as data.
    response.raise_for_status()
    return response.json()

def extract_album(album_id: str) -> dict:
    """Fetch a single album from the Spotify Web API.

    Args:
        album_id: Spotify ID of the album.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    response = requests.get(url=f'https://api.spotify.com/v1/albums/{album_id}',
                            headers=header,
                            params=dict(market='ES'),
                            timeout=10)
    # Surface HTTP failures here instead of returning the error body as data.
    response.raise_for_status()
    return response.json()

### Extraction

In [5]:
# Folder holding the Spotify export; defined here so the cell runs on a fresh kernel.
data_folder = raw_data_path / 'Spotify Account Data'
rep_data = pd.read_json(data_folder / 'StreamingHistory_music_0.json')

# One row per distinct (artist, track) pair: each one is looked up once in the API.
tracks = rep_data.drop_duplicates(subset=['artistName', 'trackName']).reset_index(drop=True)
tracks.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-04-01 07:34,El Drogas,Cordones de Mimbre,0
1,2023-04-01 07:37,Heroes Del Silencio,La carta,178109
2,2023-04-01 07:41,Keane,This Is The Last Time,208787
3,2023-04-01 07:46,Joaquín Sabina,Peces de Ciudad,305175
4,2023-04-01 07:50,Gustavo Cerati,Puente,273758


In [6]:
# Look up every distinct (artist, track) pair and keep the non-empty hits.
results = []
for idx, artist, title in tracks[['artistName', 'trackName']].itertuples(name=None):
    # Single-line progress indicator, overwritten in place via '\r'
    print(f'({idx+1:>3}/{len(tracks):>3}) {artist + " - " + title:<200}', end='\r')
    hit = search_track(artist, title)
    if not hit:
        continue
    results.append(hit)

# Save the raw search hits
with open('./data/extracted_data/tracks.json', 'w+', encoding='utf8') as file:
    json.dump(results, file)
# To reload a previous extraction instead of querying the API again:
# with open('./data/extracted_data/tracks.json', 'r', encoding='utf8') as file:
#     results = json.load(file)

In [8]:
# Split the search hits into three tables keyed by Spotify ID.
tracks_data = {}
artists_data = {}
albums_data = {}

for result in results:
    album_info = result['album']
    track_artists = result['artists']
    album_artists = album_info['artists']

    tracks_data[result['id']] = dict(
        id=result['id'],
        name=result['name'],
        popularity=result['popularity'],
        number=result['track_number'],
        duration=result.get('duration_ms'),
        id_album=album_info['id'],
        id_artists=[art['id'] for art in track_artists],
    )

    # Register every artist referenced by the track or by its album, so that
    # each ID stored in an `id_artists` column has a row in the artists table.
    for art in [*track_artists, *album_artists]:
        if art['id'] not in artists_data:
            artists_data[art['id']] = dict(id=art['id'], name=art['name'])

    if album_info['id'] not in albums_data:
        albums_data[album_info['id']] = dict(
            id=album_info['id'],
            name=album_info['name'],
            # NOTE(review): 'type' is read here while the payload also carries
            # 'album_type'; confirm which one is intended.
            type=album_info['type'],
            release_date=album_info['release_date'],
            total_tracks=album_info['total_tracks'],
            # The album's own artists, not those of whichever track was seen first.
            id_artists=[art['id'] for art in album_artists],
        )

df_tracks = pd.DataFrame(list(tracks_data.values()))

In [21]:
# Complete each artist record with follower count, popularity and genres.
for idx, artist in enumerate(artists_data.values()):
    # Double quotes inside the f-string keep this valid on Python versions before 3.12.
    print(f'({idx+1:>3}/{len(artists_data):>3}) {artist["name"]:<200}', end='\r')

    art = extract_artist(artist['id'])
    artist['followers'] = art['followers']['total']
    artist['popularity'] = art['popularity']
    # Genres are flattened into one comma-separated string
    artist['genres'] = ','.join(art['genres'])
df_artists = pd.DataFrame(list(artists_data.values()))

(1137/1137) Scala & Kolacny Brothers                          erer

In [25]:
# Complete each album record with genres, label and popularity.
for idx, album in enumerate(albums_data.values()):
    # Double quotes inside the f-string keep this valid on Python versions before 3.12.
    print(f'({idx+1:>3}/{len(albums_data):>3}) {album["name"]:<200}', end='\r')
    alb = extract_album(album['id'])
    # Genres are flattened into one comma-separated string
    album['genres'] = ','.join(alb['genres'])
    album['label'] = alb['label']
    album['popularity'] = alb['popularity']
df_albums = pd.DataFrame(list(albums_data.values()))

(1705/1705) On The Rocks                                                                                                                                                                                            

In [26]:
# Preview the first rows of the tracks table.
df_tracks.head()

Unnamed: 0,id,name,popularity,number,duration,id_album,id_artists
0,3aI5UV6m77rF4D2gYipiSL,Cordones de Mimbre,25,2,177346,5xFHWjntXM8PQ1iRUGQZHB,[5jJWnRLQeU7fe9VaDjpIWK]
1,2RzZSZbfu3jir4udOhyd75,La carta,61,3,187520,7DAoA2gOvycforLAKyFD8Y,[3qAPxVwIQRBuz5ImPUxpZT]
2,6IVlHRrjTKep5SS9kbcpTa,This Is The Last Time,67,9,208786,0MlTOiC5ZYKFGeZ8h3D4rd,[53A0W3U0s8diEn9RhXQhVz]
3,5xq9JrBQQ1r1bKx3P1QhB2,Peces de Ciudad,54,6,305186,2V1g8Z3d12rVQl5UIACCEI,[4aeIWo5CMF1uRmqgJdwkZW]
4,6gwaa6ElIixNTvu6RwkMyo,Puente,70,4,273800,2rIdWbXPjcq8K7BCccBhhC,[1QOmebWGB6FdFtW7Bo3F0W]


In [27]:
# Preview the first rows of the artists table.
df_artists.head()

Unnamed: 0,id,name,followers,popularity,genres
0,5jJWnRLQeU7fe9VaDjpIWK,El Drogas,37139,44,"punk urbano,spanish punk"
1,3qAPxVwIQRBuz5ImPUxpZT,Heroes Del Silencio,2542226,67,"latin alternative,latin rock,musica aragonesa,..."
2,53A0W3U0s8diEn9RhXQhVz,Keane,3303317,74,"neo mellow,piano rock,pop rock"
3,4aeIWo5CMF1uRmqgJdwkZW,Joaquín Sabina,2991403,67,"cantautor,spanish pop,trova"
4,1QOmebWGB6FdFtW7Bo3F0W,Gustavo Cerati,4238788,72,"argentine rock,latin alternative,latin rock,ro..."


In [28]:
# Preview the first rows of the albums table.
df_albums.head()

Unnamed: 0,id,name,type,release_date,total_tracks,id_artists,genres,label,popularity
0,5xFHWjntXM8PQ1iRUGQZHB,Demasiado Tonto en la Corteza,album,2013-11-05,24,[5jJWnRLQeU7fe9VaDjpIWK],,"Maldito Records, S.L.",29
1,7DAoA2gOvycforLAKyFD8Y,Senderos De Traición - Edición Especial,album,1990,25,[3qAPxVwIQRBuz5ImPUxpZT],,Parlophone Spain,71
2,0MlTOiC5ZYKFGeZ8h3D4rd,Hopes And Fears,album,2004-05-10,12,[53A0W3U0s8diEn9RhXQhVz],,Universal-Island Records Ltd.,82
3,2V1g8Z3d12rVQl5UIACCEI,Dímelo En La Calle,album,2002-10-05,14,[4aeIWo5CMF1uRmqgJdwkZW],,Ariola,56
4,2rIdWbXPjcq8K7BCccBhhC,Bocanada,album,1999-06-01,15,[1QOmebWGB6FdFtW7Bo3F0W],,Ariola,71


In [30]:
# Export the three tables as CSV files with a header row and no index column.
csv_targets = (
    (df_tracks, './data/extracted_data/tracks.csv'),
    (df_artists, './data/extracted_data/artists.csv'),
    (df_albums, './data/extracted_data/albums.csv'),
)
for frame, destination in csv_targets:
    frame.to_csv(destination, index=False, header=True)

### Tests

In [135]:
# Smoke test: search for a track by artist name and title, then pretty-print the result.
print(json.dumps(search_track('Extremoduro', 'Buscando una luna'), indent=2))

{
  "album": {
    "album_type": "album",
    "artists": [
      {
        "external_urls": {
          "spotify": "https://open.spotify.com/artist/3bgsNtcf5d5h9jbQbohfBK"
        },
        "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK",
        "id": "3bgsNtcf5d5h9jbQbohfBK",
        "name": "Extremoduro",
        "type": "artist",
        "uri": "spotify:artist:3bgsNtcf5d5h9jbQbohfBK"
      }
    ],
    "external_urls": {
      "spotify": "https://open.spotify.com/album/3msSCqpQLoY0nl4RTSf1Is"
    },
    "href": "https://api.spotify.com/v1/albums/3msSCqpQLoY0nl4RTSf1Is",
    "id": "3msSCqpQLoY0nl4RTSf1Is",
    "images": [
      {
        "height": 640,
        "url": "https://i.scdn.co/image/ab67616d0000b273489635998e8b3a9dec1ae455",
        "width": 640
      },
      {
        "height": 300,
        "url": "https://i.scdn.co/image/ab67616d00001e02489635998e8b3a9dec1ae455",
        "width": 300
      },
      {
        "height": 64,
        "url": "https://i.sc

In [49]:
# Smoke test: fetch one track by ID and pretty-print the JSON response.
print(json.dumps(extract_track('5PPwm655zF0qaG28ybUfDI'), indent=2))

{
  "album": {
    "album_type": "album",
    "artists": [
      {
        "external_urls": {
          "spotify": "https://open.spotify.com/artist/3bgsNtcf5d5h9jbQbohfBK"
        },
        "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK",
        "id": "3bgsNtcf5d5h9jbQbohfBK",
        "name": "Extremoduro",
        "type": "artist",
        "uri": "spotify:artist:3bgsNtcf5d5h9jbQbohfBK"
      }
    ],
    "external_urls": {
      "spotify": "https://open.spotify.com/album/3msSCqpQLoY0nl4RTSf1Is"
    },
    "href": "https://api.spotify.com/v1/albums/3msSCqpQLoY0nl4RTSf1Is",
    "id": "3msSCqpQLoY0nl4RTSf1Is",
    "images": [
      {
        "height": 640,
        "url": "https://i.scdn.co/image/ab67616d0000b273489635998e8b3a9dec1ae455",
        "width": 640
      },
      {
        "height": 300,
        "url": "https://i.scdn.co/image/ab67616d00001e02489635998e8b3a9dec1ae455",
        "width": 300
      },
      {
        "height": 64,
        "url": "https://i.sc

In [50]:
# Smoke test: fetch one artist by ID and pretty-print the JSON response.
print(json.dumps(extract_artist('3bgsNtcf5d5h9jbQbohfBK'), indent=2))

{
  "external_urls": {
    "spotify": "https://open.spotify.com/artist/3bgsNtcf5d5h9jbQbohfBK"
  },
  "followers": {
    "href": null,
    "total": 1265540
  },
  "genres": [
    "indie extremena",
    "spanish rock"
  ],
  "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK",
  "id": "3bgsNtcf5d5h9jbQbohfBK",
  "images": [
    {
      "height": 640,
      "url": "https://i.scdn.co/image/ab6761610000e5eb12434039f9760438626cf90b",
      "width": 640
    },
    {
      "height": 320,
      "url": "https://i.scdn.co/image/ab6761610000517412434039f9760438626cf90b",
      "width": 320
    },
    {
      "height": 160,
      "url": "https://i.scdn.co/image/ab6761610000f17812434039f9760438626cf90b",
      "width": 160
    }
  ],
  "name": "Extremoduro",
  "popularity": 63,
  "type": "artist",
  "uri": "spotify:artist:3bgsNtcf5d5h9jbQbohfBK"
}


In [51]:
# Smoke test: fetch the albums of one artist by artist ID and pretty-print the JSON response.
print(json.dumps(extract_artist_albums('3bgsNtcf5d5h9jbQbohfBK'), indent=2))

{
  "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK/albums?include_groups=album&offset=0&limit=20&market=ES",
  "items": [
    {
      "album_group": "album",
      "album_type": "album",
      "artists": [
        {
          "external_urls": {
            "spotify": "https://open.spotify.com/artist/3bgsNtcf5d5h9jbQbohfBK"
          },
          "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK",
          "id": "3bgsNtcf5d5h9jbQbohfBK",
          "name": "Extremoduro",
          "type": "artist",
          "uri": "spotify:artist:3bgsNtcf5d5h9jbQbohfBK"
        }
      ],
      "external_urls": {
        "spotify": "https://open.spotify.com/album/33kfD97XqCE416YXIUMdKa"
      },
      "href": "https://api.spotify.com/v1/albums/33kfD97XqCE416YXIUMdKa",
      "id": "33kfD97XqCE416YXIUMdKa",
      "images": [
        {
          "height": 640,
          "url": "https://i.scdn.co/image/ab67616d0000b2730c181ad7973614d699da8d5c",
          "width": 640
  

In [52]:
# Smoke test: fetch one album by ID and pretty-print the JSON response.
print(json.dumps(extract_album('3msSCqpQLoY0nl4RTSf1Is'), indent=2))

{
  "album_type": "album",
  "artists": [
    {
      "external_urls": {
        "spotify": "https://open.spotify.com/artist/3bgsNtcf5d5h9jbQbohfBK"
      },
      "href": "https://api.spotify.com/v1/artists/3bgsNtcf5d5h9jbQbohfBK",
      "id": "3bgsNtcf5d5h9jbQbohfBK",
      "name": "Extremoduro",
      "type": "artist",
      "uri": "spotify:artist:3bgsNtcf5d5h9jbQbohfBK"
    }
  ],
  "copyrights": [
    {
      "text": "\u00a9 2010 Warner Music Spain, S.L.",
      "type": "C"
    },
    {
      "text": "\u2117 1996 DRO EAST WEST S.A.",
      "type": "P"
    }
  ],
  "external_ids": {
    "upc": "825646758890"
  },
  "external_urls": {
    "spotify": "https://open.spotify.com/album/3msSCqpQLoY0nl4RTSf1Is"
  },
  "genres": [],
  "href": "https://api.spotify.com/v1/albums/3msSCqpQLoY0nl4RTSf1Is",
  "id": "3msSCqpQLoY0nl4RTSf1Is",
  "images": [
    {
      "height": 640,
      "url": "https://i.scdn.co/image/ab67616d0000b273489635998e8b3a9dec1ae455",
      "width": 640
    },
    {
    