# Lab | API wrappers - Create your collection of songs & audio features

To move forward with the project, you need to create a collection of songs with their audio features - as large as possible!


These are the songs that we will cluster. And, later, when the user inputs a song, we will find the cluster to which the song belongs and recommend a song from the same cluster. The more songs you have, the more accurate and diverse recommendations you'll be able to give. Although... you might want to make sure the collected songs are "curated" in a certain way.


Try to find playlists of songs that are diverse, but also that meet certain standards.


The process of sending hundreds or thousands of requests can take some time - it's normal if you have to wait a few minutes (or, if you're ambitious, even hours) to get all the data you need.


An idea for collecting as many songs as possible is to start with all the songs of a big, diverse playlist and then go to every artist present in the playlist and grab every song of every album of that artist. The amount of songs you'll be collecting per playlist will grow exponentially!

In [51]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
import random
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd


In [52]:
secrets_file = open("secrets_spotify.txt","r")
string = secrets_file.read()

In [53]:
secrets_dict={} # create a dictionary

for line in string.split('\n'):
    if len(line) > 0: #this is used in case we have empty lines
        secrets_dict[line.split(':')[0]]=line.split(':')[1]

In [54]:

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['cid'],
                                                           client_secret=secrets_dict['csecret']))

In [55]:
# Function to get playlist songs and audio features 

def playlist_songs(playlist_id):
    
    df = pd.DataFrame()
    df_audio = pd.DataFrame()
    
    music_list = []
    artist_list = []
    album_list = []
    audio_features_list = []
    
    
    results = sp.user_playlist_tracks("spotify",playlist_id)
    playlist = results['items']
    
    # Get all items
    while results['next']!=None:
        results = sp.next(results)
        playlist = playlist + results['items']
        sleep(randint(1,3))
     
    
    # Extract song name, artist and album name
    for track in playlist:
        music_list.append(track['track']['name']) #list all songs
        artist_list.append(track['track']['artists'][0]['name']) #list all artists. QUESTION = how to get multiple artists???
        album_list.append(track['track']['album']['name']) #list the name of the album
        
        audio_features_list.append(sp.audio_features(track['track']['uri'])) #audio features
        sleep(randint(1,3))
    
    

    
    
    #create df columns
    df['music'] = music_list
    df['artist'] = artist_list
    df['album'] = album_list
    
    #create df for audio features
    df_audio['audio_features'] = audio_features_list
    
    
    # flatten audio features column
    flat_list = ['audio_features']
    flat_list0 = ['audio_features_0']

    #flatten to first remove list
    for column in flat_list:
        flattened = pd.DataFrame(dict(df_audio[column])).transpose()
        columns = [str(col) for col in flattened.columns]
        flattened.columns = [column + '_' + colname for colname in columns]
        df_audio = pd.concat([df_audio, flattened], axis=1)
        df_audio = df_audio.drop(column, axis=1)
    
    
    #flatten - then remove dict
    for column in flat_list0:
        flattened = pd.DataFrame(dict(df_audio[column])).transpose()
        columns = [str(col) for col in flattened.columns]
        flattened.columns = [column + '_' + colname for colname in columns]
        df_audio = pd.concat([df_audio, flattened], axis=1)
        df_audio = df_audio.drop(column, axis=1)
    
    
    #create a single dataframe with songs and audio features 
    playlist_db = pd.concat([df,df_audio], axis = 1).reset_index()
    
    
    
    return playlist_db

In [56]:
#6FTVlz76p3Bmmw4vwKJgHy
playlist_db_hippie_fish = playlist_songs('6FTVlz76p3Bmmw4vwKJgHy')
playlist_db_hippie_fish.shape

(99, 22)

In [57]:
#37i9dQZF1EIfjV34mXjFem
playlist_db_indie_folk = playlist_songs('37i9dQZF1EIfjV34mXjFem')
playlist_db_indie_folk.shape

(50, 22)

In [58]:
#37i9dQZF1DXdVyc8LtLi96
playlist_db_sunrise_yoga = playlist_songs('37i9dQZF1DXdVyc8LtLi96')
playlist_db_sunrise_yoga.shape

(298, 22)

In [59]:
#10f57QMSHHWcnoRTHuQSRV
playlist_db_oxigenio_radio = playlist_songs('10f57QMSHHWcnoRTHuQSRV')
playlist_db_oxigenio_radio.shape

(592, 22)

In [60]:
#37i9dQZF1DXb1cKZ3eM1zf
playlist_db_pinkpop = playlist_songs('37i9dQZF1DXb1cKZ3eM1zf')
playlist_db_pinkpop.shape

ReadTimeout: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)

In [None]:
#3x6xRt9FnSbfPbgiO6incZ
playlist_db_french_cafe_house = playlist_songs('3x6xRt9FnSbfPbgiO6incZ')
playlist_db_french_cafe_house.shape

In [None]:
#37i9dQZF1DX76cnmxfAAhD
playlist_db_90road_trip = playlist_songs('37i9dQZF1DX76cnmxfAAhD')
playlist_db_90road_trip.shape

In [None]:
#37i9dQZF1DX6z20IXmBjWI
playlist_db_infinite_acoustic = playlist_songs('37i9dQZF1DX6z20IXmBjWI')
playlist_db_infinite_acoustic.shape

In [None]:
#37i9dQZF1DX4bw5oBAFpuz
playlist_db_working_at_home = playlist_songs('37i9dQZF1DX4bw5oBAFpuz')
playlist_db_working_at_home.shape

In [None]:
#37i9dQZF1DWSqmBTGDYngZ
playlist_db_songs_shower = playlist_songs('37i9dQZF1DWSqmBTGDYngZ')
playlist_db_songs_shower.shape

In [None]:
#3DMHr8z77MrapnXBXasDlF
playlist_db_vodafone_fm = playlist_songs('3DMHr8z77MrapnXBXasDlF')
playlist_db_vodafone_fm.shape

In [None]:
# create a single list with all playlists

pdList = [playlist_db_hippie_fish, playlist_db_indie_folk, playlist_db_sunrise_yoga, playlist_db_oxigenio_radio,
         playlist_db_pinkpop,playlist_db_french_cafe_house, playlist_db_90road_trip, playlist_db_infinite_acoustic,
         playlist_db_working_at_home, playlist_db_songs_shower, playlist_db_vodafone_fm]  # List of your dataframes
playlist_db_all = pd.concat(pdList, axis=0)
playlist_db_all = playlist_db_all.drop(['index'], axis = 1)
playlist_db_all.shape

In [None]:
playlist_db_all.to_excel('playlist_db_all.xlsx', sheet_name='spotify_list')  