#### Loading credentials from another config file

In [11]:
import config

#### Starting with Spotify API called Spotipy

In [12]:
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials
import random
import pprint
import pandas as pd

# Initialize SpotiPy with user credentias
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= config.client_id,
                                                           client_secret= config.client_secret, requests_timeout=20))

In [13]:
# The "sp" variable has two useful funtions:
# The first usefull function is:
# .search(q='',limit=n)
# .search(q="track:"+song_name+" artist:"+artist_name,limit=5) to restrict to a song name and artist.
# Where the "q" keyword is the query you want to perform on spotify: song_name, artist,...
# while The "limit" keyword will limit the number of returned results.
#
# The second usefull function is:
# .audio_features([URL|URI|ID])
# which returns some 'features of the song', that after cleanup, we can use in order to characterize a song.

##results = sp.search(q="Lose yourself",limit=3,market="GB")
##results
#json_results = json.dumps(results, ensure_ascii=True)
#json_results
#results['tracks']['items'][0]['external_urls']["spotify"]

#### Extracting the songs of playlists on Spotify

Pagination using "next"
When you collect songs from a playlist using sp.playlist_tracks, you're limited by the limit parameter, which has a maximum (and default) value of 100. When the playlist has more than 100 songs, you have to collect them by navigating through the "pages" of the results.

The parameter offset allows you to retrieve resuls starting at a certain position: if you start at position 101, you'd get the next "page" of results. An offset of 201 would give you the third page, and so on.

The function sp.next() does the same, but in a simpler way: it can be used on the results from any request to directly retrieve the results for the next page.

We can check whether there's a next page or not by accessing the key next on the results from any request.

#### Code runs below for getting unique 10.000 songs (1.000 per 10 music categories)

In [4]:
#!pip install pandarallel

In [5]:
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
genres = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]

# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=10)
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
    for playlist in sorted_playlists:
        playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_id = track["track"]["id"]
            if track_id not in used_track_ids:
                track_ids[genre].append(track_id)
                used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=20)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
df = pd.DataFrame(audio_features_list)
df = df[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

df



INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


KeyboardInterrupt: 

In [None]:
##Workspace
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True, nb_workers=4)

# Define a list of popular genres
TO_DO = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]
genres = ["pop"]
# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=20)
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
    for playlist in sorted_playlists:
        followers = playlist.get("followers")
        if followers is not None:
            followers_total = followers.get("total")
            sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: followers_total, reverse=True)


# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_dict = track.get("track")
            if track_dict is not None:
                track_id = track_dict.get("id")
                if track_id is not None and track_id not in used_track_ids:
                    track_ids[genre].append(track_id)
                    used_track_ids.add(track_id)


# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=20)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
df1 = pd.DataFrame(audio_features_list)
df1 = df1[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

df1



In [None]:
##Workspace
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
TO_DO = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]
genres = ["hip-hop"]
# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=20)
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
    for playlist in sorted_playlists:
        playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_id = track["track"]["id"]
            if track_id not in used_track_ids:
                track_ids[genre].append(track_id)
                used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
df2 = pd.DataFrame(audio_features_list)
df2 = df2[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

df2


In [14]:
len(flat_track_ids)

NameError: name 'flat_track_ids' is not defined

In [15]:
#df.to_csv("10-000Songs.csv",index=False)

In [16]:
import pprint
pprint.pprint(playlists)

{'playlists': {'href': 'https://api.spotify.com/v1/search?query=metal&type=playlist&offset=0&limit=10',
               'items': [{'collaborative': False,
                          'description': '<a '
                                         'href=spotify:playlist:37i9dQZF1EIYKiGTCZfJap>Rammstein</a>, '
                                         '<a '
                                         'href=spotify:playlist:37i9dQZF1EIWWw7AS5OscY>System '
                                         'Of A Down</a>, <a '
                                         'href=spotify:playlist:37i9dQZF1EIWnUbsWCrSf6>Korn</a> '
                                         'and more',
                          'external_urls': {'spotify': 'https://open.spotify.com/playlist/37i9dQZF1EQpgT26jgbgRI'},
                          'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1EQpgT26jgbgRI',
                          'id': '37i9dQZF1EQpgT26jgbgRI',
                          'images': [{'height': None,
              

In [None]:
pprint.pprint(playlists)

#### Doing the same but this time selecting the least popular playlists

In [None]:
# Time taken: 7.13 seconds for 1 track
# Total Time taken: 1533.60 seconds = 25.56min
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
genres = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]

# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=10)
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=False)
    for playlist in sorted_playlists:
        playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_id = track["track"]["id"]
            if track_id not in used_track_ids:
                track_ids[genre].append(track_id)
                used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
df_3 = pd.DataFrame(audio_features_list)
df_3 = df_3[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

df_3



In [None]:
##TESTINGG lOOP
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
TODO = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]
genres = ["pop"]

# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=15)
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
    for playlist in sorted_playlists:
        playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_id = track["track"]["id"]
            if track_id not in used_track_ids:
                track_ids[genre].append(track_id)
                used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
pop = pd.DataFrame(audio_features_list)
pop = pop[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

pop


In [None]:
#df_2.to_csv("least10-000Songs.csv",index=False)

In [None]:
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
genres = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]

# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=10)
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
    for playlist in sorted_playlists:
        playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_id = track["track"]["id"]
            if track_id not in used_track_ids:
                track_ids[genre].append(track_id)
                used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
df = pd.DataFrame(audio_features_list)
df = df[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

df



In [None]:
import pandas as pd
import numpy as np
d1 = pd.read_csv("10-000Songs.csv")
d2 = pd.read_csv("least10-000Songs.csv")
d1


In [None]:
test = d1.drop_duplicates()
test

In [None]:
d1_and_2 = pd.concat([d1,d2], axis=0)

In [None]:
d1_and_2

In [None]:
DF = d1_and_2.drop_duplicates()

In [None]:
DF

In [None]:
##NOW TRYING 10.000 songs per category!
#ERROR: "NoneType object is not subscriptable" 
#usually means that you are trying to access an element of an object that is None
#fixed in code by adding 
import time
import pandas as pd
from pandarallel import pandarallel

start_time = time.time()

# Initialize pandarallel for parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
To_Do = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]
genres = ["pop"]
# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=20)
    if playlists is not None and "playlists" in playlists and "items" in playlists["playlists"]:
        sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
        for playlist in sorted_playlists:
            playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=100)
        if playlist_tracks is not None and "items" in playlist_tracks:
            for track in playlist_tracks["items"]:
                if len(track_ids[genre]) >= num_tracks:
                    break
                track_id = track["track"]["id"]
                if track_id not in used_track_ids:
                    track_ids[genre].append(track_id)
                    used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Define a function to retrieve audio features for a track
def get_audio_features(track_id):
    import config
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
    client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    audio_features = sp.audio_features(track_id)[0]
    return audio_features

# Retrieve audio features for each track using parallel processing
audio_features_list = pd.Series(flat_track_ids).parallel_apply(get_audio_features).tolist()

# Convert audio features list to pandas dataframe
pop = pd.DataFrame(audio_features_list)
pop = pop[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo"]]

# Calculate time taken
time_taken = time.time() - start_time
print(f"Time taken: {time_taken:.2f} seconds")

pop



In [None]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

# Initialize the parallel processing
pandarallel.initialize(progress_bar=True)

# Define a list of popular genres
genres = ["pop", "hip-hop", "rock", "edm", "country", "rnb", "jazz", "classical", "blues", "metal"]

# Define the number of tracks to retrieve for each genre
num_tracks = 1000

# Loop through the genres and get the playlist IDs for each genre
playlist_ids = []
for genre in genres:
    playlists = sp.search(q=genre, type="playlist", limit=10, market="US")
    sorted_playlists = sorted(playlists["playlists"]["items"], key=lambda p: p.get("followers", {}).get("total", 0), reverse=True)
    for playlist in sorted_playlists:
        playlist_ids.append(playlist["id"])

# Get the track IDs for each playlist until we have 1000 tracks for each genre
track_ids = {}
for genre in genres:
    track_ids[genre] = []
    used_track_ids = set()  # keep track of the track IDs that have already been used
    for playlist_id in playlist_ids:
        if len(track_ids[genre]) >= num_tracks:
            break
        playlist_tracks = sp.playlist_tracks(playlist_id, fields="items(track(id))", limit=num_tracks)
        for track in playlist_tracks["items"]:
            if len(track_ids[genre]) >= num_tracks:
                break
            track_id = track["track"]["id"]
            if track_id not in used_track_ids:
                track_ids[genre].append(track_id)
                used_track_ids.add(track_id)

# Print the number of tracks for each genre
for genre in genres:
    print(f"{genre}: {len(track_ids[genre])}")

# Flatten the track IDs into a single list
flat_track_ids = [track_id for genre_track_ids in track_ids.values() for track_id in genre_track_ids]

# Process the audio features in parallel
def process_track(track_id):
    audio_features = sp.audio_features(track_id)[0]
    audio_features['track_id'] = track_id
    return audio_features

df = pd.DataFrame()
chunk_size = 100
for i in tqdm(range(0, len(flat_track_ids), chunk_size)):
    chunk_ids = flat_track_ids[i:i+chunk_size]
    chunk_df = pd.DataFrame(chunk_ids, columns=['id'])
    chunk_df = chunk_df.parallel_apply(lambda row: process_track(row['id']), axis=1)
    df = pd.concat([df, chunk_df])

# Reorder the columns
df = df[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Reset the index
df = df.reset_index(drop=True)

# Print the dataframe
print(df.head())


## Dummy: extracting the songs ids from 1 playlist

In [7]:
def get_playlist_tracks(username, playlist_id):
    results = sp.user_playlist_tracks(username,playlist_id,market="GB")
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

In [None]:
# playlist "all genres, all decades" with 2600 tracks
tracks=get_playlist_tracks("spotify", "6VaNNtZuGdbQ3GMNnhPl9e")
print(len(tracks)-1)
# Python is zero-based that's why len -1

In [9]:
# playlist "all genres, all decades"
tracks=get_playlist_tracks("spotify", "6VaNNtZuGdbQ3GMNnhPl9e")

list_of_audio_features=[]
for item in range(0,1100):
    list_of_audio_features.append(sp.audio_features(tracks[item]["track"]["id"])[0])


In [14]:
list_of_audio_features

[{'danceability': 0.611,
  'energy': 0.319,
  'key': 5,
  'loudness': -10.743,
  'mode': 1,
  'speechiness': 0.0357,
  'acousticness': 0.815,
  'instrumentalness': 0.0151,
  'liveness': 0.108,
  'valence': 0.176,
  'tempo': 124.92,
  'type': 'audio_features',
  'id': '6dtWKqqdveI3YvdYJQKWWn',
  'uri': 'spotify:track:6dtWKqqdveI3YvdYJQKWWn',
  'track_href': 'https://api.spotify.com/v1/tracks/6dtWKqqdveI3YvdYJQKWWn',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/6dtWKqqdveI3YvdYJQKWWn',
  'duration_ms': 226653,
  'time_signature': 4},
 {'danceability': 0.583,
  'energy': 0.61,
  'key': 5,
  'loudness': -5.639,
  'mode': 0,
  'speechiness': 0.0382,
  'acousticness': 0.00425,
  'instrumentalness': 0.000105,
  'liveness': 0.14,
  'valence': 0.336,
  'tempo': 114.06,
  'type': 'audio_features',
  'id': '3uwnnTQcHM1rDqSfA4gQNz',
  'uri': 'spotify:track:3uwnnTQcHM1rDqSfA4gQNz',
  'track_href': 'https://api.spotify.com/v1/tracks/3uwnnTQcHM1rDqSfA4gQNz',
  'analysis_url': 'https:/

In [10]:
print(tracks[item]["track"]["id"])

0pQskrTITgmCMyr85tb9qq


In [10]:
df=pd.DataFrame(list_of_audio_features)    
df=df[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

df

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms
0,0.611,0.319,-10.743,0.0357,0.81500,0.015100,0.108,0.1760,124.920,6dtWKqqdveI3YvdYJQKWWn,226653
1,0.583,0.610,-5.639,0.0382,0.00425,0.000105,0.140,0.3360,114.060,3uwnnTQcHM1rDqSfA4gQNz,214740
2,0.700,0.709,-5.006,0.0838,0.00480,0.000000,0.029,0.6240,122.019,1gv4xPanImH17bKZ9rOveR,202960
3,0.547,0.895,-4.693,0.0408,0.00226,0.000692,0.184,0.4970,90.000,6xBQEW4ge9OlbYUt5IVyVz,198973
4,0.628,0.692,-5.640,0.2020,0.00244,0.012900,0.668,0.0963,124.887,1ZHYJ2Wwgxes4m8Ba88PeK,378893
...,...,...,...,...,...,...,...,...,...,...,...
1095,0.741,0.580,-9.050,0.0304,0.11700,0.000033,0.212,0.9440,127.402,0vOkmmJEtjuFZDzrQSFzEE,171267
1096,0.613,0.487,-8.781,0.0935,0.41200,0.000010,0.286,0.6990,143.332,5haXbSJqjjM0TCJ5XkfEaC,165800
1097,0.740,0.415,-11.429,0.0594,0.28800,0.534000,0.103,0.1530,89.997,3lML37ujKr8GdxQqCYkCoz,168000
1098,0.631,0.697,-7.625,0.0454,0.36400,0.002850,0.115,0.2770,149.954,7cXVAtKlhVQdNQNzXUFLFv,207787


In [12]:
# Retrieve tracks from playlist
tracks = get_playlist_tracks("spotify", "6VaNNtZuGdbQ3GMNnhPl9e")

# Retrieve audio features for each track
list_of_audio_features = []
for item in range(0, len(tracks)):
    list_of_audio_features.append(sp.audio_features(tracks[item]["track"]["id"])[0])

# Create DataFrame with audio features
df = pd.DataFrame(list_of_audio_features)

# Select only the columns you need
df = df[["danceability","energy","loudness","speechiness","acousticness",
         "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

# Append additional rows to the DataFrame
for item in range(len(tracks), 2599):
    list_of_audio_features.append(sp.audio_features(tracks[item]["track"]["id"])[0])
    df = df.append(list_of_audio_features, ignore_index=True)
df

AttributeError: 'NoneType' object has no attribute 'keys'

In [None]:
import pprint
pprint.pprint(results)

In [None]:
results["tracks"]["items"][0]["album"]["artists"][0]["id"]
# [0] because i access first item of list [{...
# ["id"] is key because i access dictionary {...

In [None]:
results["tracks"]["items"][0]["external_urls"]["spotify"]

In [None]:
track_id=track_id=results["tracks"]["items"][0]["id"]
track_id

## Embeded track player

{'spotify': https://open.spotify.com/track/4O2N861eOnF9q8EtpH8IJu

In [None]:
from IPython.display import IFrame

#track_id = "1rfORa9iYmocEsnnZGMVC4"
#track_id= 'spotify:track:3hgl7EQwTutSm6PESsB7gZ'
IFrame(src="https://open.spotify.com/embed/track/"+track_id,
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media",
      )

In [None]:
def play_song(track_id):
    return IFrame(src="https://open.spotify.com/embed/track/"+track_id,
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media",
      )

<b> navigating through the dictionary..

In [None]:
results.keys()

In [None]:
results['tracks']["items"][0].keys()

<b> more readable version

In [None]:
import pprint

pprint.pprint(results)

In [None]:
results['tracks']['items'][0]

In [None]:
results['tracks']['items'][0].keys()

<b> getting the track id

In [None]:
results['tracks']['items'][0]["id"]

In [None]:
for item in results['tracks']['items']:
    print("The name of song is: '{}' and the id is: {}".format(item['name'],item["id"]))

In [None]:
import pandas as pd

song = sp.search(q="Bad Guy", limit=50,market="GB") 
song

In [None]:
song["tracks"]["items"][2]

In [None]:
#pprint.pprint(song['tracks']['items'][0]['uri'])
song["tracks"]["items"][0]["uri"]

# Understanding the json

Understanding the hierachy of a JSON can make you mad. Therefore you can cosider using some online pages where you can paste your JSON file and see the "tree" structure of the file.

https://codebeautify.org/jsonviewer

So, copy the json output from the previous query and paste it on the website's left panel. On the right panel you will be able to see the hierachy of the json file.

Let's get used to the json at hand.

In [None]:
print("The json file has the following keys: ",list(results.keys())) # We can see that we only have tracks
print("The 'tracks' key has the following child keys: ",list(results["tracks"].keys())) # Let's check the values
print("The query we made is: ",results["tracks"]["href"]) # Query we have searched 
print("The song's info is contained in: ",results["tracks"]["items"]) #items (actual tracks)
print("The limit of the query we've made is: ",results["tracks"]["limit"]) #Limit we have chosen
print("The next page if any: ",results["tracks"]["next"]) #link to the next page (next 50 tracks)
print("The starting webpage: ",results["tracks"]["offset"]) # Actual offset (starting point)
print("Starting webpage: ",results["tracks"]["previous"]) #Previous search
print("Total number of results: ",results["tracks"]["total"]) # Number of matches

## Checking albums

In [None]:
print(results["tracks"]["items"][0]["album"]) # we have more info about the album
print("****************\n")
print(list(results["tracks"]["items"][0]["album"].keys())) # Will check artists, id, name, release date, total tracks 
print("****************\n")
print(results["tracks"]["items"][0]["album"]["artists"]) # List with artists and information
print("****************\n")
print("The album ID is: ",results["tracks"]["items"][0]["album"]["id"]) # Album ID 
print("****************\n")
print(results["tracks"]["items"][0]["album"]["name"]) # Album name (if its a single u'll get the name of the song)

## Other Info

In [None]:
results["tracks"]["items"][0]["artists"] # Track artists
results["tracks"]["items"][0]["id"] # Track ID
results["tracks"]["items"][0]["name"] # Track name
results["tracks"]["items"][0]["popularity"] # Popularity index
results["tracks"]["items"][0]["uri"] # Basically ID

# Getting the Audio feature of a song

In [None]:
results["tracks"]["items"][0]["id"]

In [None]:
sp.audio_features(results["tracks"]["items"][0]["id"] )

In [None]:
## example of bethoven song
sp.audio_features("1Y25uib0Cu5kYTtNuRqyRU")

## building Data frame of audio features

In [None]:
sp.audio_features(song["tracks"]["items"][0]["uri"])

In [None]:
list_of_songs

In [None]:
#my_dict = sp.audio_features(song["tracks"]["items"][0]["uri"])[0] # you can provide a list of uri's

list_of_songs=[]
for index in range(0,len(song["tracks"]["items"])):
    list_of_songs.append(sp.audio_features(song["tracks"]["items"][index]["uri"])[0])
df=pd.DataFrame(list_of_songs)    
df=df[["danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","id","duration_ms"]]

df

## Searching a playlist

In [None]:
playlist = sp.user_playlist_tracks("spotify", "7beGd4yYY1qpsBv6K3clFZ",market="GB")

In [None]:
playlist["items"][0]

## extracting a song from playlist

In [None]:
playlist["items"][0]["track"]["id"]

In [None]:
play_song(playlist["items"][0]["track"]["id"])

In [None]:
print(list(playlist.keys())) # Let's look at items and total:
print("Total number of songs in the playlist: ",playlist["total"]) #  Let's check items:
len(playlist["items"]) # It is limited to 100 tracks, we will have to fix it:

## Optional(Extra)

## Getting the artists of the playlist 

In [None]:
def get_artists_from_track(track):
    return [artist["name"] for artist in track["artists"]]

In [None]:
def get_artists_from_playlist(playlist_id):
    tracks_from_playlist = get_playlist_tracks("spotify", playlist_id)
    return list(set(artist for subset in [get_artists_from_track(track["track"]) for track in tracks_from_playlist] for artist in subset))

In [None]:
get_artists_from_playlist("4rnleEAOdmFAbRcNCgZMpY")

# Getting albums 

In this section we will work with albums to extract information. We will start by extracting all the albums of an artist.

In [None]:
def get_albums_from_artist(artist_id):
    results = sp.artist_albums(artist_id, limit = 50,country="GB")
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return tracks

# Same for albums ids
def get_album_ids_from_artist(artist_id):
    results = sp.artist_albums(artist_id, limit = 50)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    return [track["id"] for track in tracks]

Example: Coldplay

In [None]:
coldplay_id = "4gzpq5DPGxSnKTe4SA8HAU"
coldplay_albums = get_albums_from_artist(coldplay_id)
coldplay_album_ids = get_album_ids_from_artist(coldplay_id)

# Check artists that played with coldplay
set([artist["name"] for track in coldplay_albums for artist in track["artists"]])

## Getting the songs of a given album

In [None]:
def get_track_ids_from_albums(album_ids):
    return list(set([i["id"] for j in album_ids for i in sp.album(j)["tracks"]["items"]]))

In [None]:
coldplay_songs = get_track_ids_from_albums(coldplay_album_ids)

len(coldplay_songs)