In [1]:
'''
Installing the packages
'''
#!pip install opendatasets
#!pip install spotipy

'\nInstalling the packages\n'

In [2]:
#import the necessary libraries
import pandas as pd
import numpy as np
import time
import opendatasets as od

#importing Spotipy library - A lightweight Python library for Spotify Web API
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
def setup_connection():
    
    '''
    setup_connection():
    
    1. Read the Spotify client id and client secret from the environment
       variables SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
    2. Create the Spotify client-credentials manager and the spotipy client
    3. Return the spotipy object
    
    Raises:
        RuntimeError: if either environment variable is missing or empty.
    
    '''
    # Local import keeps this cell self-contained; `os` is standard library.
    import os
    
    # Credentials must never be stored in the notebook itself: anything that was
    # previously committed here should be considered leaked and be rotated in the
    # Spotify developer dashboard.
    cid = os.environ.get('SPOTIPY_CLIENT_ID')
    secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
    
    if not cid or not secret:
        raise RuntimeError(
            "Spotify credentials not found: set the SPOTIPY_CLIENT_ID and "
            "SPOTIPY_CLIENT_SECRET environment variables before running this notebook."
        )
    
    #Creating spotify client objects and initiating connection
    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)
    
    return sp

In [4]:
def get_playlist_info(url, sp):
    
    '''
    get_playlist_info():
    
    1. Extract the playlist id from a playlist link. Accepted forms are
       share URLs (with or without a "?si=..." query and/or a trailing slash)
       and "spotify:playlist:<id>" URIs.
    2. Ask the API for the total number of tracks in the playlist
    3. Return the values as individual variables
    
    Parameters:
        url: playlist URL or URI as a string.
        sp:  client object exposing playlist_tracks(playlist_id, ...).
    
    Returns:
        (playlist_URI, count_of_tracks)
    
    '''
    # Drop the query string first, then any trailing slash, so that
    # ".../playlist/<id>/?si=..." does not yield an empty id.
    playlist_path = url.strip().split("?")[0].rstrip("/")
    
    # Last path segment for URLs; last colon-separated part for spotify: URIs.
    playlist_URI = playlist_path.rsplit("/", 1)[-1].rsplit(":", 1)[-1]
    
    # Only the counter is needed here, so restrict the response to that field
    # instead of downloading a full page of track objects.
    count_of_tracks = sp.playlist_tracks(playlist_URI, fields="total")['total']
    
    return playlist_URI, count_of_tracks

In [5]:
def get_track_info(track):
    
    '''
    get_track_info():
    
    1. Read the nested "track" object of a playlist item
    2. Extract unique track identifier, track name, first artist's name, album name,
       track popularity, and build the track URL from the identifier
    3. Return the values as a dictionary
    
    If the item does not have the expected structure (for example the "track"
    entry is None or a key is missing), every field is returned as NaN so the
    caller still receives a record with the full set of keys.
    
    '''
    field_names = ('track_id', 'track_name', 'artist_name', 'album_name',
                   'track_popularity', 'track_url')
    
    try:
        details = track["track"]
        track_id = details["uri"].split(":")[-1]
        
        return {
            'track_id': track_id,
            'track_name': details["name"],
            'artist_name': details["artists"][0]["name"],
            'album_name': details["album"]["name"],
            'track_popularity': details["popularity"],
            'track_url': "https://open.spotify.com/track/" + track_id,
        }
    
    # Only malformed-data errors are swallowed; a bare `except` would also hide
    # KeyboardInterrupt/SystemExit and make a long scrape impossible to stop.
    except (KeyError, IndexError, TypeError, AttributeError):
        return dict.fromkeys(field_names, np.nan)

In [6]:
def get_audio_features(audio_features):
    
    '''
    get_audio_features():
    
    1. Select the audio features of interest: danceability, energy, key, loudness, mode,
       speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms,
       time_signature
    2. Copy each of them from the given mapping into a new dictionary
    3. Use NaN for any feature that is absent or None, and for every feature when the
       whole input is None
    4. Return the dictionary of audio components
    
    '''
    #Selecting columns to be extracted from the audio features
    audio_features_to_extract = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                                 'acousticness','instrumentalness', 'liveness', 'valence', 'tempo',
                                 'duration_ms', 'time_signature']
    
    # A None input is treated like an empty mapping so it yields all-NaN output.
    available = audio_features if audio_features is not None else {}
    
    track_audio_features = {}
    
    for feature in audio_features_to_extract:
        value = available.get(feature)
        # Comparing against np.nan with `!=` is always True, so missing values have to be
        # detected explicitly. `is None` (not truthiness) keeps legitimate zeros such as
        # key == 0 or mode == 0.
        track_audio_features[feature] = np.nan if value is None else value
     
    return track_audio_features

In [7]:
def create_track_record(sp, track):
    
    '''
    create_track_record():
    
    1. Get basic track information as a dictionary via get_track_info(track)
    2. Look up the audio features of the track with sp.audio_features and
       reduce them with get_audio_features
    3. Merge both dictionaries into one track record
    4. Return the record as a single-row dataframe
    
    The audio-feature lookup is best effort: if the track id is missing or the
    lookup fails, all audio feature columns are filled with NaN instead of
    raising, so one bad track does not abort the whole run.
    
    '''
    audio_feature_names = ('danceability', 'energy', 'key', 'loudness', 'mode',
                           'speechiness', 'acousticness', 'instrumentalness',
                           'liveness', 'valence', 'tempo', 'duration_ms',
                           'time_signature')
    
    #Get Track info
    track_record = dict(get_track_info(track))
    track_id = track_record.get('track_id')
    
    #Extract audio features
    if isinstance(track_id, str):
        try:
            audio_feature_list = sp.audio_features(track_id)[0]
            audio_components = get_audio_features(audio_feature_list)
        
        # `Exception` (not a bare except) keeps the best-effort behaviour for API and
        # data errors while still letting Ctrl-C / kernel interrupt stop the run.
        except Exception:
            audio_components = dict.fromkeys(audio_feature_names, np.nan)
    else:
        # get_track_info could not parse the item (id is NaN): nothing to look up.
        audio_components = dict.fromkeys(audio_feature_names, np.nan)
    
    track_record.update(audio_components)
    track_record_df = pd.DataFrame([track_record])
    
    return track_record_df
    

In [8]:
def main(playlist_url, max_tracks=800):
    
    '''
    main():
    
    1. Setup the Spotify API connection using setup_connection()
    2. Get playlist_id and count of tracks in the playlist using get_playlist_info(playlist_url, spo)
    3. Page through the playlist 100 items at a time, never past the end of the playlist
       and never past max_tracks
    4. For every item call create_track_record(spotipy_object, track_item)
    5. After each page, combine all records collected so far into one dataframe and save it
       to spotify_playlist_api_data_<offset>.csv as a checkpoint
    6. Return the dataframe
    
    Parameters:
        playlist_url: link of the playlist to scrape.
        max_tracks:   upper bound on the number of tracks to fetch. The default of 800
                      keeps the cap this notebook has always used; pass None to fetch
                      the whole playlist.
    
    '''
    # Same page size the notebook has always requested from playlist_tracks.
    page_size = 100
    
    #setup the Spotify API connection
    spo = setup_connection()
    print("Playlist URL: ", playlist_url)
    
    #Get playlist_id and count of tracks in the playlist
    playlist_id, total_tracks = get_playlist_info(playlist_url, spo)
    print("Playlist id: ", playlist_id)
    print("Total Number of Tracks in the Playlist: ", total_tracks)
    
    # Bounding by total_tracks avoids requesting (and sleeping 30 s for) pages that
    # lie beyond the end of a short playlist.
    if max_tracks is None:
        tracks_to_fetch = total_tracks
    else:
        tracks_to_fetch = min(max_tracks, total_tracks)
    
    all_data = pd.DataFrame()
    track_frames = []  # one single-row frame per track; concatenated once per page
    
    for i in range(0, tracks_to_fetch, page_size):
        
        print(i)
        
        # The last page may be partial when max_tracks is not a multiple of page_size.
        page_limit = min(page_size, tracks_to_fetch - i)
        page_items = spo.playlist_tracks(playlist_id, limit = page_limit, offset = i)["items"]
        frames_before_page = len(track_frames)
        
        try:
            for track in page_items:
                time.sleep(3)
                track_frames.append(create_track_record(spo, track))
                time.sleep(3)
        
        finally:
            # Runs on normal completion and on interruption, so tracks already fetched
            # for this page are still checkpointed. Skipped when the page added nothing.
            if len(track_frames) > frames_before_page:
                all_data = pd.concat(track_frames, ignore_index=True)
                all_data.to_csv(f"spotify_playlist_api_data_{i}.csv", index = False)
        
        time.sleep(30)
        

    return all_data

In [None]:
# Share link of the playlist to scrape; get_playlist_info() keeps only the id
# (the last path segment, without the "?si=..." query part).
url = 'https://open.spotify.com/playlist/1G8IpkZKobrIlXcVPoSIuf?si=aee64286897448f3'
# Long-running call: main() sleeps between tracks and between pages and writes
# CSV checkpoints to the working directory while it runs.
dataset = main(url)
# Bare last expression so the notebook renders the returned DataFrame.
dataset

Playlist URL:  https://open.spotify.com/playlist/1G8IpkZKobrIlXcVPoSIuf?si=aee64286897448f3
Playlist id:  1G8IpkZKobrIlXcVPoSIuf
Total Number of Tracks in the Playlist:  10000
0


Max Retries reached
Max Retries reached
Max Retries reached
Max Retries reached


References:
1. https://medium.com/@maxtingle/getting-started-with-spotifys-api-spotipy-197c3dc6353b
2. https://developer.spotify.com/documentation/web-api
3. https://spotipy.readthedocs.io/en/latest/#
4. https://towardsdatascience.com/extracting-song-data-from-the-spotify-api-using-python-b1e79388d50
5. https://www.kaggle.com/code/laurabarreda/extracting-song-data-from-the-spotify-api
6. https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset/data