# Lab | Extending the internal databases with audio features

At this point, you have the **hot_songs** and the **not_hot_songs** databases. However, you don't have any acoustic information about the songs. 
The purpose of this lab is to use Spotify's API to extend both databases with this information to use it later.

## Instructions

* Create a function to search a given **single** song in the Spotify API: **search_song(title, artist, limit)**. 

First importing the necessary libraries.

In [1]:
import pandas as pd
import numpy as np

import sys

# getting the spotify credentials out of this file
from config import *
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=Client_ID,
                                                           client_secret=Client_Secret))

In [3]:
# importing the previous created data frames with top 100 hot songs and top 2000 2023 "not hot" songs:

hot_songs = pd.read_csv('billboard_top100.csv')
not_hot_songs = pd.read_csv('not_hot_songs.csv')

In [4]:
hot_songs.drop(columns='Unnamed: 0', inplace=True)
not_hot_songs.drop(columns='Unnamed: 0', inplace=True)

In [5]:
hot_songs.head(100)

Unnamed: 0,artist,title
0,Jack Harlow,Lovin On Me
1,Taylor Swift,Cruel Summer
2,Tate McRae,Greedy
3,Doja Cat,Paint The Town Red
4,Zach Bryan Featuring Kacey Musgraves,I Remember Everything
...,...,...
95,Zach Bryan,Tourniquet
96,Junior H,Y Lloro
97,Sophie Ellis-Bextor,Murder On The Dancefloor
98,Karol G,Amargura


In [6]:
#artists = hot_songs['artist'].to_list()
#titles = hot_songs['title'].to_list()

In [7]:
hot_songs['title'][0]

'Lovin On Me'

In [8]:
"tracks:"+hot_songs['title'][0]+" artist:"+hot_songs['artist'][0]

'tracks:Lovin On Me artist:Jack Harlow'

In [9]:
results = sp.search(q="tracks:"+hot_songs['title'][0]+" artist:"+hot_songs['artist'][0],limit=1)
results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=tracks%3ALovin+On+Me+artist%3AJack+Harlow&type=track&offset=0&limit=1',
  'items': [{'album': {'album_type': 'compilation',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of'},
       'href': 'https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of',
       'id': '0LyfQWJT6nXafLPZqxe9Of',
       'name': 'Various Artists',
       'type': 'artist',
       'uri': 'spotify:artist:0LyfQWJT6nXafLPZqxe9Of'}],
     'available_markets': ['AR',
      'AU',
      'AT',
      'BE',
      'BO',
      'BR',
      'BG',
      'CA',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DK',
      'DO',
      'DE',
      'EC',
      'EE',
      'SV',
      'FI',
      'FR',
      'GR',
      'GT',
      'HN',
      'HK',
      'HU',
      'IS',
      'IE',
      'IT',
      'LV',
      'LT',
      'LU',
      'MY',
      'MT',
      'MX',
      'NZ',
      'NI',
      'NO',
 

In [10]:
json_results = json.dumps(results, ensure_ascii=True)
json_results

'{"tracks": {"href": "https://api.spotify.com/v1/search?query=tracks%3ALovin+On+Me+artist%3AJack+Harlow&type=track&offset=0&limit=1", "items": [{"album": {"album_type": "compilation", "artists": [{"external_urls": {"spotify": "https://open.spotify.com/artist/0LyfQWJT6nXafLPZqxe9Of"}, "href": "https://api.spotify.com/v1/artists/0LyfQWJT6nXafLPZqxe9Of", "id": "0LyfQWJT6nXafLPZqxe9Of", "name": "Various Artists", "type": "artist", "uri": "spotify:artist:0LyfQWJT6nXafLPZqxe9Of"}], "available_markets": ["AR", "AU", "AT", "BE", "BO", "BR", "BG", "CA", "CL", "CO", "CR", "CY", "CZ", "DK", "DO", "DE", "EC", "EE", "SV", "FI", "FR", "GR", "GT", "HN", "HK", "HU", "IS", "IE", "IT", "LV", "LT", "LU", "MY", "MT", "MX", "NZ", "NI", "NO", "PA", "PY", "PE", "PH", "PL", "PT", "SG", "SK", "ES", "SE", "CH", "TW", "TR", "UY", "US", "GB", "AD", "LI", "MC", "ID", "JP", "TH", "VN", "RO", "IL", "SA", "AE", "BH", "QA", "OM", "KW", "LB", "JO", "PS", "IN", "KZ", "MD", "UA", "AL", "BA", "HR", "ME", "MK", "RS", "SI",

In [11]:
results['tracks']['items'][0]['id']

'5RLLhNTZllxd0siTTcaTTM'

In [12]:
def get_song_ids(df: pd.DataFrame):
    """
    Get the ID of the songs
    """
    import time
    
    list_of_ids = []
    
    # First, we are creating chunks:
    chunk_size = 50
    
    for start in range(0, len(df), chunk_size):
        chunk = df[start:start+chunk_size]
        
        for index, row in chunk.iterrows():
            try:
                #search_song = sp.search(q="tracks:"+df['title'][s]+" artist:"+df['artist'][s],limit=1)
                search_song = sp.search(q=row['title'], limit=1)
                song_id = search_song['tracks']['items'][0]['id']
                list_of_ids.append(song_id)
            
            except:
                print("Song not found!")
                list_of_ids.append("")
                
        print("Sleeping a bit before getting the next ids")
        time.sleep(20)
        
    return list_of_ids

In [13]:
hot_song_id_list = get_song_ids(hot_songs)

Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids


In [14]:
hot_song_id_list

['4xhsWYTOGcal8zt0J161CU',
 '1BxfuPKGuaTgP7aM0Bbdwr',
 '3rUGC1vUpkDG9CZFHMur1t',
 '2IGMVunIBsBLtEQyoI1Mu7',
 '4KULAymBBJcPRpk1yO4dOG',
 '4iZ4pt7kvcaH6Yo8UoZ4s2',
 '5aIVCx5tnk0ntmdiinnYvw',
 '59uQI0PADDKeE6UZDTJEe8',
 '1Lo0QY9cvc8sUB2vnIOxDT',
 '7dJYggqjKo71KI9sLzqCs8',
 '0PAcdVzhPO4gq1Iym9ESnK',
 '6usohdchdzW9oML7VC4Uhk',
 '1Iq8oo9XkmmvCQiGOfORiz',
 '0cVyQfDyRnMJ0V3rjjdlU3',
 '7DSAEUvxU8FajXtRloy8M0',
 '1yeB8MUNeLo9Ek1UEpsyz6',
 '1a73gcEg6h6Re6hHXoVltJ',
 '7MSWxMumjz6lHj7oRApNbg',
 '0mflMxspEfB0VbI1kyLiAv',
 '6wf7Yu7cxBSPrRlWeSeK0Q',
 '1kuGVB7EU95pJObxwvfwKS',
 '07on0OB2cdPt79IFsgdRUy',
 '11C4y2Yz1XbHmaQwO06s9f',
 '4rXLjWdF2ZZpXCVTfWcshS',
 '5ya0TmUQw2wHMkq36rPsnd',
 '4OMJGnvZfDvsePyCwRGO7X',
 '4RvWPyQ5RL0ao9LPZeSouE',
 '1rnYNAJPAb586NhetosdNW',
 '7aqfrAY2p9BUSiupwk3svU',
 '41WQUSINanQHfhfKpFDsms',
 '3OHfY25tqY28d16oZczHc8',
 '0J1YRLYhTG1MBosjidD7OI',
 '2gT2iF1YK5r54A2PDEXVv8',
 '0R6NfOiLzLj4O5VbYSJAjf',
 '1o8Z7GD1CeOaVBEyuzu4HO',
 '73NzYbpaYvQ7JfpjztFESL',
 '3vkCueOmm7xQDoJ17W1Pm3',
 

Once the desired song is located, **the function should return the href/id/uri of the song to the code** (not to the user) to get the audio features.

* Create a function **get_audio_features(list_of_song_ids)** to obtain the audio features of a given list of songs (the content of list_of_songs can be the href/id/uri or a list with a single song IDs). 

In [22]:
def get_audio_features(list_of_song_ids: list):
    """
    Using the song IDs to get the audio features out of the Spotify Database.
    """
    
    import time 
    
    feature_list = []
    feature_df = pd.DataFrame()
    # define a chunk size
    chunk_size = 50
    
    for start in range(0, len(list_of_song_ids), chunk_size):
        #for start in range(0,100,50) -> chunk_size is the increment. second loop will start from chunk_size
        try:
            features = sp.audio_features(tracks=list_of_song_ids[start:start+chunk_size])
            #features = sp.audio_features(tracks=[1,2,3,4,…50])
            
            for f in features:
                df_temp = pd.DataFrame([f])
                feature_df = pd.concat([feature_df, df_temp], ignore_index=True)
        
        except:
            #print(f'Error processing tracks ({i} : {i+chunk_size}) -> {err}')
            print("Error processing tracks")
    print("Sleeping a bit before getting the next ids")
    time.sleep(20)
    
    return feature_df

In [24]:
hot_song_feature_df = get_audio_features(hot_song_id_list)

Sleeping a bit before getting the next ids


In [25]:
hot_song_feature_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.943,0.558,2,-4.911,1,0.0568,0.0026,2e-06,0.0937,0.606,104.983,audio_features,4xhsWYTOGcal8zt0J161CU,spotify:track:4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4
1,0.552,0.702,9,-5.707,1,0.157,0.117,2.1e-05,0.105,0.564,169.994,audio_features,1BxfuPKGuaTgP7aM0Bbdwr,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,https://api.spotify.com/v1/tracks/1BxfuPKGuaTg...,https://api.spotify.com/v1/audio-analysis/1Bxf...,178427,4
2,0.75,0.733,6,-3.18,0,0.0319,0.256,0.0,0.114,0.844,111.018,audio_features,3rUGC1vUpkDG9CZFHMur1t,spotify:track:3rUGC1vUpkDG9CZFHMur1t,https://api.spotify.com/v1/tracks/3rUGC1vUpkDG...,https://api.spotify.com/v1/audio-analysis/3rUG...,131872,1
3,0.868,0.538,5,-8.603,1,0.174,0.269,3e-06,0.0901,0.732,99.968,audio_features,2IGMVunIBsBLtEQyoI1Mu7,spotify:track:2IGMVunIBsBLtEQyoI1Mu7,https://api.spotify.com/v1/tracks/2IGMVunIBsBL...,https://api.spotify.com/v1/audio-analysis/2IGM...,231750,4
4,0.429,0.453,0,-7.746,1,0.0459,0.554,2e-06,0.102,0.155,77.639,audio_features,4KULAymBBJcPRpk1yO4dOG,spotify:track:4KULAymBBJcPRpk1yO4dOG,https://api.spotify.com/v1/tracks/4KULAymBBJcP...,https://api.spotify.com/v1/audio-analysis/4KUL...,227196,4


In [27]:
hot_song_feature_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.943,0.558,2,-4.911,1,0.0568,0.00260,0.000002,0.0937,0.606,104.983,audio_features,4xhsWYTOGcal8zt0J161CU,spotify:track:4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4
1,0.552,0.702,9,-5.707,1,0.1570,0.11700,0.000021,0.1050,0.564,169.994,audio_features,1BxfuPKGuaTgP7aM0Bbdwr,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,https://api.spotify.com/v1/tracks/1BxfuPKGuaTg...,https://api.spotify.com/v1/audio-analysis/1Bxf...,178427,4
2,0.750,0.733,6,-3.180,0,0.0319,0.25600,0.000000,0.1140,0.844,111.018,audio_features,3rUGC1vUpkDG9CZFHMur1t,spotify:track:3rUGC1vUpkDG9CZFHMur1t,https://api.spotify.com/v1/tracks/3rUGC1vUpkDG...,https://api.spotify.com/v1/audio-analysis/3rUG...,131872,1
3,0.868,0.538,5,-8.603,1,0.1740,0.26900,0.000003,0.0901,0.732,99.968,audio_features,2IGMVunIBsBLtEQyoI1Mu7,spotify:track:2IGMVunIBsBLtEQyoI1Mu7,https://api.spotify.com/v1/tracks/2IGMVunIBsBL...,https://api.spotify.com/v1/audio-analysis/2IGM...,231750,4
4,0.429,0.453,0,-7.746,1,0.0459,0.55400,0.000002,0.1020,0.155,77.639,audio_features,4KULAymBBJcPRpk1yO4dOG,spotify:track:4KULAymBBJcPRpk1yO4dOG,https://api.spotify.com/v1/tracks/4KULAymBBJcP...,https://api.spotify.com/v1/audio-analysis/4KUL...,227196,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.593,0.397,6,-8.309,1,0.0329,0.68400,0.000007,0.0982,0.320,76.703,audio_features,3EvZ03hGAFwGZ2Ebcu86YH,spotify:track:3EvZ03hGAFwGZ2Ebcu86YH,https://api.spotify.com/v1/tracks/3EvZ03hGAFwG...,https://api.spotify.com/v1/audio-analysis/3EvZ...,189053,4
96,0.728,0.589,7,-7.115,1,0.0376,0.42400,0.000008,0.1660,0.767,77.475,audio_features,6RcAHyC5sAUIbPTkhOQwd8,spotify:track:6RcAHyC5sAUIbPTkhOQwd8,https://api.spotify.com/v1/tracks/6RcAHyC5sAUI...,https://api.spotify.com/v1/audio-analysis/6RcA...,179013,4
97,0.730,0.849,1,-5.281,0,0.0299,0.00234,0.000026,0.3120,0.887,117.310,audio_features,4tKGFmENO69tZR9ahgZu48,spotify:track:4tKGFmENO69tZR9ahgZu48,https://api.spotify.com/v1/tracks/4tKGFmENO69t...,https://api.spotify.com/v1/audio-analysis/4tKG...,230013,4
98,0.920,0.696,6,-3.356,0,0.0742,0.18300,0.000000,0.1490,0.545,106.966,audio_features,505v13epFXodT9fVAJ6h8k,spotify:track:505v13epFXodT9fVAJ6h8k,https://api.spotify.com/v1/tracks/505v13epFXod...,https://api.spotify.com/v1/audio-analysis/505v...,170480,4


* Once the previous function has been created, create another function **add_audio_features(df, audio_features_df)** to concat a given dataframe with the audio features dataframe and return the extended data frame.

In [28]:
def add_audio_features(df, audio_features_df):
    """
    Concats a given dataframe with the audio features dataframe and return the extended data frame. 
    """
    
    final_df = pd.concat([df, audio_features_df], axis=1)
    
    return final_df

In [29]:
final_hot_songs = add_audio_features(hot_songs, hot_song_feature_df)

In [30]:
final_hot_songs.head()

Unnamed: 0,artist,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Jack Harlow,Lovin On Me,0.943,0.558,2,-4.911,1,0.0568,0.0026,2e-06,0.0937,0.606,104.983,audio_features,4xhsWYTOGcal8zt0J161CU,spotify:track:4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4
1,Taylor Swift,Cruel Summer,0.552,0.702,9,-5.707,1,0.157,0.117,2.1e-05,0.105,0.564,169.994,audio_features,1BxfuPKGuaTgP7aM0Bbdwr,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,https://api.spotify.com/v1/tracks/1BxfuPKGuaTg...,https://api.spotify.com/v1/audio-analysis/1Bxf...,178427,4
2,Tate McRae,Greedy,0.75,0.733,6,-3.18,0,0.0319,0.256,0.0,0.114,0.844,111.018,audio_features,3rUGC1vUpkDG9CZFHMur1t,spotify:track:3rUGC1vUpkDG9CZFHMur1t,https://api.spotify.com/v1/tracks/3rUGC1vUpkDG...,https://api.spotify.com/v1/audio-analysis/3rUG...,131872,1
3,Doja Cat,Paint The Town Red,0.868,0.538,5,-8.603,1,0.174,0.269,3e-06,0.0901,0.732,99.968,audio_features,2IGMVunIBsBLtEQyoI1Mu7,spotify:track:2IGMVunIBsBLtEQyoI1Mu7,https://api.spotify.com/v1/tracks/2IGMVunIBsBL...,https://api.spotify.com/v1/audio-analysis/2IGM...,231750,4
4,Zach Bryan Featuring Kacey Musgraves,I Remember Everything,0.429,0.453,0,-7.746,1,0.0459,0.554,2e-06,0.102,0.155,77.639,audio_features,4KULAymBBJcPRpk1yO4dOG,spotify:track:4KULAymBBJcPRpk1yO4dOG,https://api.spotify.com/v1/tracks/4KULAymBBJcP...,https://api.spotify.com/v1/audio-analysis/4KUL...,227196,4


In [31]:
not_hot_song_id_list = get_song_ids(not_hot_songs)

Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a bit before getting the next ids
Sleeping a 

In [32]:
len(not_hot_song_id_list)

1996

In [33]:
not_hot_song_feature_df = get_audio_features(not_hot_song_id_list)

Sleeping a bit before getting the next ids


In [36]:
final_not_hot_songs= add_audio_features(not_hot_songs, not_hot_song_feature_df)

In [37]:
final_hot_songs.to_csv('hot_songs.csv')
final_not_hot_songs.to_csv('not_hot_songs.csv')

In [38]:
function_file = 'my_function.py'  
with open(function_file, 'a') as file:
    file.write('''
    def get_song_ids(df: pd.DataFrame):
    """
    Get the ID of the songs
    """
    import time
    
    list_of_ids = []
    
    # First, we are creating chunks:
    chunk_size = 50
    
    for start in range(0, len(df), chunk_size):
        chunk = df[start:start+chunk_size]
        
        for index, row in chunk.iterrows():
            try:
                #search_song = sp.search(q="tracks:"+df['title'][s]+" artist:"+df['artist'][s],limit=1)
                search_song = sp.search(q=row['title'], limit=1)
                song_id = search_song['tracks']['items'][0]['id']
                list_of_ids.append(song_id)
            
            except:
                print("Song not found!")
                list_of_ids.append("")
                
        print("Sleeping a bit before getting the next ids")
        time.sleep(20)
        
    return list_of_ids
    
    def get_audio_features(list_of_song_ids: list):
    """
    """
    
    import time 
    
    feature_list = []
    
    # First, we are creating chunks:
    chunk_size = 50
    
    for start in range(0, len(list_of_song_ids), chunk_size):
        chunk = list_of_song_ids[start:start+chunk_size]
        
        for i in chunk:
            try:
                my_dict = sp.audio_features([i])[0]
                #my_dict_new = {key : [my_dict[key]] for key in my_dict.keys()}
                feature_list.append(my_dict)
                
            except:
                print("Error retrieving features for song:", i)
    
        print("Sleeping a bit before getting the next ids")
        time.sleep(20)
        
    feature_df = pd.DataFrame(feature_list)
    
    return feature_df
    
    def add_audio_features(df, audio_features_df):
    """
    Concats a given dataframe with the audio features dataframe and return the extended data frame. 
    """
    
    final_df = pd.concat([df, audio_features_df], axis=1)
    
    return final_df
    ''')