# Collecting songs and their properties for the music recommender

#### Importing libraries

In [1]:
import os
from itertools import islice
from random import randint
from time import sleep

import pandas as pd
import spotipy
from pandas import json_normalize
from spotipy.oauth2 import SpotifyClientCredentials

#### Getting access data for the API

In [2]:
# Read the raw credentials text; the `with` block guarantees the file handle
# is closed again as soon as the content has been read.
with open("Access.txt","r") as secrets_file:
    string = secrets_file.read()
# string

In [3]:
# Creating a dictionary with the credentials.
# Every non-empty line of the file is expected to look like "key:value".
secrets_dict = {}
for line in string.splitlines():
    # Split on the first ':' only, so a value that itself contains ':' stays intact
    key, separator, value = line.partition(':')
    if separator:
        # Drop surrounding spaces / carriage returns that would corrupt the credentials
        secrets_dict[key.strip()] = value.strip()
# secrets_dict

#### Initialising the connection

In [4]:
# Build the credentials manager from the parsed secrets first, then hand it
# to the Spotify client that every later cell uses as `sp`.
credentials_manager = SpotifyClientCredentials(
    client_id=secrets_dict['clientid'],
    client_secret=secrets_dict['clientsecret'],
)
sp = spotipy.Spotify(auth_manager=credentials_manager)

## Getting data

First I will create a small dataframe from this playlist
'5fo41o54DPTvdPO2uMTDH1'
with just about 190 songs. 
When I am sure that everything works, I will apply my functions to a bigger playlist.

#### Example data

In [5]:
# Getting the playlist (this single call returns only the first page of items,
# as the two cells below show: 'total' is larger than the number of items)
playlist = sp.user_playlist_tracks("spotify", "5fo41o54DPTvdPO2uMTDH1")

In [6]:
# Total number of tracks reported for the playlist
playlist['total']

192

In [7]:
# Number of items actually contained in this one response
len(playlist['items'])

100

In [8]:
# playlist.keys()

In [9]:
# playlist['offset']

In [10]:
# playlist['items'][1]

In [11]:
# playlist['items'][0]['track']

In [12]:
# List the fields that are available on a track entry
playlist['items'][0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [13]:
# Duration of the first track (milliseconds, going by the key name)
playlist['items'][0]['track']['duration_ms']

147410

### What do we want to do?
1. We will iterate over the playlist, putting all songs into a list.
2. We will pull out the desired features into a dataframe.
3. We will get the audiofeatures for all songs in the dataframe.

What to consider: We have to be aware of the possibility of timeouts during our API requests and take that into account when writing our functions.

We create a dataframe in which we will finally store our songs
- columns = ['uri','title','artist_name','artist_id','album_id','album_name','length','explicit','popularity']

#### 1. Iterating over the playlist, pulling all songs out.

In [14]:
# We need to get everything!
def get_playlist_tracks(playlist_id):
    """Collect all items of a playlist by following the paginated API responses.

    Parameters
    ----------
    playlist_id : str
        Spotify ID of the playlist to read.

    Returns
    -------
    list
        The entries of every response's ``'items'`` list, in the order the
        pages were received.
    """
    results = sp.user_playlist_tracks("spotify", playlist_id)
    # Copy the first page so that extending the list never mutates the response
    tracks = list(results['items'])
    while results['next'] is not None:
        # Nap *before* each follow-up request, so no time is wasted after the last page
        sleep(randint(1,3)) # respectful nap
        results = sp.next(results)
        # extend() appends in place instead of rebuilding the whole list per page
        tracks.extend(results['items'])
    return tracks

In [15]:
# Fetch the complete example playlist and check how many items were collected
playlist = get_playlist_tracks('5fo41o54DPTvdPO2uMTDH1')
len(playlist)

192

#### 1.2 Getting all songfeatures from the playlist

In [16]:
# Using the function from the lesson, modified so that it only returns the artists
def get_name_artists_from_track(track):
    """Return the names of all artists listed on a track, in their original order."""
    artist_names = []
    for performer in track["artists"]:
        artist_names.append(performer["name"])
    return artist_names

In [17]:
# Same as above, but for the artist ids
def get_name_artists_id_from_track(track):
    """Return the ids of all artists listed on a track, in their original order."""
    return list(map(lambda performer: performer["id"], track["artists"]))

In [18]:
def get_tracks(playlist):
    """Reduce playlist items to flat rows holding the fields we want to keep.

    Parameters
    ----------
    playlist : list of dict
        Playlist items; each item is expected to carry its track under the
        ``'track'`` key.

    Returns
    -------
    list of list
        One row per usable item, in the order
        ``[uri, title, artist_name, artist_id, album_id, album_name,
        length, explicit, popularity]``. Only the first listed artist is kept.
        Items without a track object are skipped; a track without any artist
        gets ``None`` for artist name and id.
    """
    tracklist = []
    for item in playlist:
        track = item.get('track')
        if not track:
            # An item may come back with no track object at all; such an
            # entry has nothing to extract, so leave it out instead of crashing
            continue
        artists = track.get('artists') or []
        # Only the first artist is stored; fall back to an empty dict so the
        # lookups below yield None when the artist list is empty
        first_artist = artists[0] if artists else {}
        album = track['album']
        tracklist.append([
            track['uri'],
            track['name'],
            first_artist.get('name'),
            first_artist.get('id'),
            album['id'],
            album['name'],
            track['duration_ms'],
            track['explicit'],
            track['popularity'],
        ])
    return tracklist

In [19]:
# Turn every playlist item into one flat row of the fields we keep
songs = get_tracks(playlist)

#### 2. We will pull out the desired features into a dataframe.

In [20]:
# Column names, in the same order as the values of each row built by get_tracks
song_columns = [
    'uri',
    'title',
    'artist_name',
    'artist_id',
    'album_id',
    'album_name',
    'length',
    'explicit',
    'popularity',
]
df_songs = pd.DataFrame(data=songs, columns=song_columns)

In [21]:
# Show the assembled song table
display(df_songs)

Unnamed: 0,uri,title,artist_name,artist_id,album_id,album_name,length,explicit,popularity
0,spotify:track:5JVA0t7r2Y7m9NaHmgaeiC,Remedy,Leony,2NpPlwwDVYR5dIj0F31EcC,4nKguZWie1WQuxFspIwHOY,Remedy,147410,False,72
1,spotify:track:6Hj9jySrnFppAI0sEMCZpJ,Robbery,Juice WRLD,4MCBfE4596Uoi2O4DtmEMz,1GYVNOzwhx1nMcIJDogSNp,Death Race For Love,240050,True,10
2,spotify:track:7gpSIL1cTo9hRaJWzJ366l,Shots,HBz,7I2JG3CcPawkeQPE7uypHJ,1usEYgVkSLHg5nWQcbZTpT,Family,167230,False,62
3,spotify:track:0RSZ8EmUPEN3ySfCgytPke,Auf & Ab,Montez,5ZY4M2aGiTaZQEP6HfqeJc,0HSkqlRyseGrpCBrrTv773,Auf & Ab,165476,False,6
4,spotify:track:6CDzDgIUqeDY5g8ujExx2f,Heat Waves,Glass Animals,4yvcSjfu4PC0CYQyLy4wSq,69K1zrf6TkXHdYUO8n2qVi,Heat Waves,238805,False,84
...,...,...,...,...,...,...,...,...,...
187,spotify:track:7qqZZt1ey8D2OBRkccCakO,LEERE AUGEN,FiNCH,1ZyqnbV7Brg5LgyS4EZCUD,6ik1OcdVYKnuJjGWciH8Gw,LEERE AUGEN,178818,False,56
188,spotify:track:194lppukv9RRDj2yqHrO6N,Dance Dance x Hung Up - Remix,NVBR,2SK1xoft5PyuLAxTjK9RVe,1oonpEPBsvmCdRgndYYcLZ,Dance Dance x Hung Up (Remix),223006,False,68
189,spotify:track:0oFHlnUwZVhizux1FOAsfr,Never Forget,GASHI,0JOxt5QOwq0czoJxvSc5hS,3mbsaMY2Dp9qtV44xrM9wW,Cabin Fever,289485,True,48
190,spotify:track:0b6wdul3A5sQNpIOv03OxP,Ocean Drive,Duke Dumont,61lyPtntblHJvA7FMMhi7E,5APvbPGki6FOQO6rNEuXCv,Ocean Drive,206320,False,72


#### 3. We will get the audiofeatures for all songs in the dataframe.

In [22]:
# We pull out the features and wait a bit beforehand, so that we don't access
# the API too often when applying this to a whole column
def get_features_delayed(uri):
    """Pause for 1 or 2 seconds, then return ``sp.audio_features(uri)``.

    Note: a later cell defines this function again with error handling added;
    once that cell has run, it replaces this version.
    """
    pause_seconds = randint(1,2)
    sleep(pause_seconds) # respectful nap
    features = sp.audio_features(uri)
    return features

In [23]:
# Commented out, takes too long to run over and over again

# df_songs['features'] = df_songs['uri'].apply(get_features_delayed).copy()
# df_songs

In [24]:
# print(get_features_delayed('spotify:track:1qQWAmVIfqIEGH7BNBrbPO'))

# Making the functions resilient to errors

#### Feature extraction

In [25]:
# We have to make the function resilient to errors in case the connection breaks
def get_features_delayed(uri, max_retries=5):
    """Fetch the audio features for a track, retrying when the request fails.

    Parameters
    ----------
    uri : str
        URI (or id) of the track, passed on to ``sp.audio_features``.
    max_retries : int, optional
        How many additional attempts are made after a failed request before
        giving up.

    Returns
    -------
    The value returned by ``sp.audio_features(uri)``.

    Raises
    ------
    Exception
        The error of the last attempt is re-raised once ``max_retries``
        retries have failed as well.
    """
    failures = 0
    # A loop instead of recursion: persistent failures can no longer pile up
    # stack frames until a RecursionError occurs
    while True:
        try:
            sleep(randint(1,2)) # respectful nap
            return sp.audio_features(uri)
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working
        except Exception:
            print('Error occured while getting feature names')
            failures += 1
            if failures > max_retries:
                # Give up explicitly rather than retrying forever
                raise

In [26]:
# df_songs.head(3)

In [27]:
def flatten_features(df):
    """Expand the nested ``'features'`` column into one column per audio feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'features'`` column. Each cell is expected to be a
        sequence whose first element is a dict of audio features.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (minus ``'features'``) followed
        by the eleven audio-feature columns. Rows whose features are missing
        or malformed get 0 in every feature column. The index of ``df`` is
        preserved.
    """
    feature_names = ['danceability','energy','key','loudness','mode','speechiness',
                     'acousticness','instrumentalness','liveness','valence','tempo']
    featurelist = []
    # Iterate over the values directly: positional, so it also works when the
    # index of df is not 0..n-1
    for entry in df['features']:
        try:
            audio = entry[0]
            featurelist.append([audio[name] for name in feature_names])
        except (TypeError, IndexError, KeyError):
            # No usable feature dict for this row (e.g. None, empty list, [None])
            featurelist.append([0] * len(feature_names))
    # Reuse df's index so the column-wise concat lines the rows up correctly
    featureframe = pd.DataFrame(featurelist, columns=feature_names, index=df.index)
    return pd.concat([df.drop('features', axis=1), featureframe], axis=1)

In [28]:
# For testing the function, commented it out, takes too long
# df_songs2 = df_songs.copy()
# df_test = flatten_features(df_songs)
# df_test.head(5)

#### Getting all songs

In [29]:
def _save_playlist_batch(items, filenumber):
    """Build the song/feature table for one page of playlist items and save it as CSV.

    The frame is written to ``Data/part<filenumber>.csv`` and also returned.
    """
    columns = ['uri','title','artist_name','artist_id','album_id','album_name','length','explicit','popularity']
    # Making our dataframe
    frame = pd.DataFrame(data = get_tracks(items), columns = columns)
    # Get features
    frame['features'] = frame['uri'].apply(get_features_delayed)
    # Flatten the features
    frame = flatten_features(frame)
    # Saving result; make sure the target folder exists before writing
    os.makedirs('Data', exist_ok=True)
    frame.to_csv('Data/part'+str(filenumber)+'.csv', index=False)
    return frame


def get_playlist_tracks_festures_to_dataframe_csv(playlist_id):
    """Download a whole playlist page by page, with audio features, saving each page to CSV.

    Every page is turned into a dataframe (song fields plus flattened audio
    features) and written to ``Data/part<N>.csv``. Progress and errors are
    printed while running.

    Parameters
    ----------
    playlist_id : str
        Spotify ID of the playlist to process.

    Returns
    -------
    pandas.DataFrame
        All successfully processed pages concatenated into one frame.

    Notes
    -----
    - A failing request for the next page is retried; after several
      consecutive failures the loop stops and the data gathered so far is
      returned.
    - If processing a fetched page fails, the error is reported and that page
      is skipped.
    - An error in the very first page is not caught and propagates to the caller.
    """
    max_fetch_retries = 5  # consecutive failed page requests tolerated

    # We get the first chunk of the playlist
    results = sp.user_playlist_tracks("spotify",playlist_id)
    filenumber = 1
    # Some things to monitor the progress
    errorcount = 0
    failed_fetches = 0

    # Saving first batch
    frames = [_save_playlist_batch(results['items'], filenumber)]
    # Count what was really received instead of assuming full pages of 100
    fetched = len(results['items'])
    print('First batch successfull')

    # We iterate over the whole playlist
    while results['next'] is not None:
        sleep(randint(1,3)) # respectful nap

        # Getting the next batch; `results` is only replaced on success, so a
        # failed request is tried again in the next iteration
        try:
            next_results = sp.next(results)
        except Exception:
            errorcount += 1
            failed_fetches += 1
            print('Error while fetching. # '+str(errorcount))
            if failed_fetches > max_fetch_retries:
                print('Giving up after repeated fetch errors, returning what was collected so far')
                break
            continue
        if next_results is None:
            # Nothing further to read
            break
        results = next_results
        failed_fetches = 0

        # Process and save the page; on failure report it and move on to the next page
        try:
            frames.append(_save_playlist_batch(results['items'], filenumber + 1))
        except Exception:
            errorcount += 1
            print('Error while fetching. # '+str(errorcount))
            continue

        # If it succeeds we report:
        filenumber += 1
        fetched += len(results['items'])
        print('Fetched '+str(fetched)+' out of '+str(results['total'])+' '+str(round(100*(fetched/results['total']), 1))+'%')

    # Return everything that was collected, not just the last page
    return pd.concat(frames, ignore_index=True)

In [30]:
# test = get_playlist_tracks_festures_to_dataframe_csv('5fo41o54DPTvdPO2uMTDH1')

In [31]:
# Load the second saved batch file to inspect what was written to disk
test = pd.read_csv('./Data/part2.csv')
test.head(20)

Unnamed: 0,uri,title,artist_name,artist_id,album_id,album_name,length,explicit,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:0fmbjLzenF8z20WN0QPSXl,999 (Remix),MYT,0q9PBbZ4ZWpyY4hK6IpXui,49QJjwNEWrL9Vl9SO8qXGq,999 (Remix),208000,True,59,0.669,0.452,1,-12.112,1,0.231,0.42,0.291,0.0999,0.106,165.035
1,spotify:track:5fQllnHCmwjhmhnpFAcknW,High sein,HBz,7I2JG3CcPawkeQPE7uypHJ,76UbTO36WLsNpyaGQ54JV0,High sein,133943,False,64,0.669,0.873,1,-6.767,1,0.179,0.0405,0.00181,0.252,0.442,142.081
2,spotify:track:43cTBSgL0p3JfAqVwk3SBR,Falsche Liebe,K-Fly,4jDdEv3oafVcJfrywVDi4p,4VvOcsfEsqT4RszOHUU3Px,Falsche Liebe,215327,False,43,0.452,0.628,8,-14.86,0,0.445,0.221,0.0,0.12,0.359,90.632
3,spotify:track:5YdnOm5990Kfq1Jodws98B,Call It Love,Felix Jaehn,4bL2B6hmLlMWnUEZnorEtG,5c3YGhnf058Op4YviM73wn,Call It Love,154560,False,84,0.616,0.841,5,-4.779,0,0.076,0.0559,0.00217,0.417,0.714,110.029
4,spotify:track:5uSFGgIfHMT3osrAd9n9ym,Forget Me,Lewis Capaldi,4GNC7GD6oZMSxPGyXy4MNB,50IWCes196EP2nWO6i4I67,Forget Me,203472,False,86,0.669,0.742,1,-3.518,0,0.0441,0.296,0.0,0.355,0.716,101.982
5,spotify:track:6kvW6EI5UuqCmlp7L7HD9O,Someone Else,ClockClock,4NSzuIc0eGOftqr0tEOhJk,5HbBzk9KEgIAi2mWj4ubCh,Someone Else,173423,False,74,0.817,0.784,5,-4.971,0,0.0463,0.119,0.0,0.13,0.537,131.995
6,spotify:track:0R3QYRCLL1UryJhxXQM07e,Adilettenstyle,Specktakel,1ErIf1pAAaw5upKsCbqhzz,36rPPcwcImUfUmll4PxX7T,Adilettenstyle,213745,False,42,0.663,0.934,0,-1.9,1,0.138,0.00682,0.0,0.195,0.614,139.87
7,spotify:track:3EJRsDYQdoBW8S2uj6EhQq,Drop Drop Drop,THOVI,55E3g7oUV2dhqCWao7kzm3,6kAIOQEpuhgPQ7P2Hv5CMZ,Drop Drop Drop,122081,False,58,0.754,0.885,2,-5.926,1,0.232,0.102,0.00316,0.236,0.701,149.959
8,spotify:track:2cvwCpqdCeL6CtXTd2JJNQ,MÜNCHEN,Tream,6vNAKgK5d74N1I0zTxRPDp,5jCZm7SNgFxihxrtcZvRH0,MÜNCHEN,157221,False,53,0.68,0.951,0,-4.8,0,0.349,0.365,0.0,0.15,0.536,169.983
9,spotify:track:6HUWZBldbQhzvBMqaNkNYM,AUF DIE ZEITEN DIE MAL WAREN,Tream,6vNAKgK5d74N1I0zTxRPDp,399uH7NgKhqgX4vOIf6fS4,AUF DIE ZEITEN DIE MAL WAREN,122666,True,54,0.59,0.732,1,-4.605,1,0.0713,0.286,0.0,0.124,0.457,89.969


# Applying the working mechanism to a long playlist

In [32]:
"5S8SJdl1BDc0ugpkEvFsIL"

'5S8SJdl1BDc0ugpkEvFsIL'

In [33]:
# Run the whole pipeline on the large playlist; progress is printed batch by batch
final = get_playlist_tracks_festures_to_dataframe_csv('5S8SJdl1BDc0ugpkEvFsIL')

First batch successfull
Fetched 200 out of 10000 2.0%
Fetched 300 out of 10000 3.0%
Fetched 400 out of 10000 4.0%
Fetched 500 out of 10000 5.0%
Fetched 600 out of 10000 6.0%
Fetched 700 out of 10000 7.000000000000001%
Fetched 800 out of 10000 8.0%
Fetched 900 out of 10000 9.0%
Fetched 1000 out of 10000 10.0%
Fetched 1100 out of 10000 11.0%
Fetched 1200 out of 10000 12.0%
Fetched 1300 out of 10000 13.0%
Fetched 1400 out of 10000 14.000000000000002%
Fetched 1500 out of 10000 15.0%
Fetched 1600 out of 10000 16.0%
Fetched 1700 out of 10000 17.0%
Fetched 1800 out of 10000 18.0%
Fetched 1900 out of 10000 19.0%
Fetched 2000 out of 10000 20.0%
Fetched 2100 out of 10000 21.0%
Fetched 2200 out of 10000 22.0%
Fetched 2300 out of 10000 23.0%
Fetched 2400 out of 10000 24.0%
Fetched 2500 out of 10000 25.0%
Fetched 2600 out of 10000 26.0%
Fetched 2700 out of 10000 27.0%
Fetched 2800 out of 10000 28.000000000000004%
Fetched 2900 out of 10000 28.999999999999996%
Fetched 3000 out of 10000 30.0%
Fetched 

Expected id of type track but found type In+the+Shadow+of+the+Valley spotify:local:::In+the+Shadow+of+the+Valley:187


Fetched 4400 out of 10000 44.0%
Fetched 4500 out of 10000 45.0%
Fetched 4600 out of 10000 46.0%
Fetched 4700 out of 10000 47.0%


Expected id of type track but found type April+Sweatpants spotify:local:::April+Sweatpants:166


Fetched 4800 out of 10000 48.0%
Fetched 4900 out of 10000 49.0%


Expected id of type track but found type Gee+%28Korean+Ver.+MP3+only%29 spotify:local:SNSD::Gee+%28Korean+Ver.+MP3+only%29:205
Expected id of type track but found type Hero spotify:local:::Hero:194
Expected id of type track but found type Beneath+the+Mask+%5BWith+Lyrics%5D+-+Persona+5 spotify:local:::Beneath+the+Mask+%5BWith+Lyrics%5D+-+Persona+5:285
Expected id of type track but found type Last+Surprise+%5BWith+Lyrics%5D+-+Persona+5 spotify:local:::Last+Surprise+%5BWith+Lyrics%5D+-+Persona+5:236


Fetched 5000 out of 10000 50.0%
Fetched 5100 out of 10000 51.0%
Fetched 5200 out of 10000 52.0%
Fetched 5300 out of 10000 53.0%
Fetched 5400 out of 10000 54.0%
Fetched 5500 out of 10000 55.00000000000001%
Fetched 5600 out of 10000 56.00000000000001%
Fetched 5700 out of 10000 56.99999999999999%
Fetched 5800 out of 10000 57.99999999999999%
Error occured while getting feature names
Fetched 5900 out of 10000 59.0%
Fetched 6000 out of 10000 60.0%
Fetched 6100 out of 10000 61.0%
Fetched 6200 out of 10000 62.0%
Fetched 6300 out of 10000 63.0%
Fetched 6400 out of 10000 64.0%
Fetched 6500 out of 10000 65.0%


Expected id of type track but found type Vampire+Weekend++Ottoman spotify:local:::Vampire+Weekend++Ottoman:244


Fetched 6600 out of 10000 66.0%
Fetched 6700 out of 10000 67.0%
Fetched 6800 out of 10000 68.0%


Expected id of type track but found type Vampire+Weekend+-+Ottoman spotify:local:::Vampire+Weekend+-+Ottoman:234


Fetched 6900 out of 10000 69.0%
Fetched 7000 out of 10000 70.0%
Fetched 7100 out of 10000 71.0%
Fetched 7200 out of 10000 72.0%
Fetched 7300 out of 10000 73.0%
Fetched 7400 out of 10000 74.0%
Fetched 7500 out of 10000 75.0%


Expected id of type track but found type Father+Christmas spotify:local:Harry+Gregson-Williams:The+Chronicles+of+Narnia%3A+The+Lion%2C+The+Witch+and+the+Wardrobe+%28Soundtrack+from+the+Motion+Picture%29:Father+Christmas:200
Expected id of type track but found type To+Aslan%27s+Camp spotify:local:Harry+Gregson-Williams:The+Chronicles+of+Narnia%3A+The+Lion%2C+The+Witch+and+the+Wardrobe+%28Soundtrack+from+the+Motion+Picture%29:To+Aslan%27s+Camp:192
Expected id of type track but found type Only+the+Beginning+of+the+Adventure spotify:local:Harry+Gregson-Williams:The+Chronicles+of+Narnia%3A+The+Lion%2C+The+Witch+and+the+Wardrobe+%28Soundtrack+from+the+Motion+Picture%29:Only+the+Beginning+of+the+Adventure:348
Expected id of type track but found type Evacuating+London spotify:local:Harry+Gregson-Williams:The+Chronicles+of+Narnia%3A+The+Lion%2C+The+Witch+and+the+Wardrobe+%28Soundtrack+from+the+Motion+Picture%29:Evacuating+London:218
Expected id of type track but found type The+Battle spotify:lo

Fetched 7600 out of 10000 76.0%
Fetched 7700 out of 10000 77.0%
Fetched 7800 out of 10000 78.0%
Fetched 7900 out of 10000 79.0%
Fetched 8000 out of 10000 80.0%
Error occured while getting feature names
Fetched 8100 out of 10000 81.0%
Fetched 8200 out of 10000 82.0%
Fetched 8300 out of 10000 83.0%
Fetched 8400 out of 10000 84.0%


Expected id of type track but found type I+Am+Not+A+Robot+%28Clock+Opera+Remix%29 spotify:local:Marina+%26+The+Diamonds:I+Am+Not+a+Robot:I+Am+Not+A+Robot+%28Clock+Opera+Remix%29:276


Fetched 8500 out of 10000 85.0%


Expected id of type track but found type Guilt spotify:local:Nero:Guilt:Guilt:273


Fetched 8600 out of 10000 86.0%


Expected id of type track but found type Bomb+The+Bass+%2F+Bug+Powder+Dust+%28K%26D+Session%29 spotify:local:Kruder+%26+Dorfmeister:The+K%26D+Sessions+CD1:Bomb+The+Bass+%2F+Bug+Powder+Dust+%28K%26D+Session%29:440


Fetched 8700 out of 10000 87.0%
Fetched 8800 out of 10000 88.0%
Error occured while getting feature names
Fetched 8900 out of 10000 89.0%
Fetched 9000 out of 10000 90.0%
Fetched 9100 out of 10000 91.0%
Fetched 9200 out of 10000 92.0%
Fetched 9300 out of 10000 93.0%
Fetched 9400 out of 10000 94.0%
Fetched 9500 out of 10000 95.0%


Expected id of type track but found type Agust+D spotify:local:Agust+D:Agust+D:Agust+D:234
Expected id of type track but found type %EC%95%BD%EC%86%8D+By+JIMIN+Of+BTS spotify:local:BTS:%EC%95%BD%EC%86%8D+By+JIMIN+Of+BTS:%EC%95%BD%EC%86%8D+By+JIMIN+Of+BTS:151
Expected id of type track but found type Seesaw+X+I+NEED+U+REMIX spotify:local:BTS:Seesaw+X+I+NEED+U+REMIX:Seesaw+X+I+NEED+U+REMIX:191


Fetched 9600 out of 10000 96.0%
Fetched 9700 out of 10000 97.0%


Expected id of type track but found type In+A+Gadda+Da+Vida+%28Live%29 spotify:local:Iron+Butterfly:Divers:In+A+Gadda+Da+Vida+%28Live%29:1129


Fetched 9800 out of 10000 98.0%
Fetched 9900 out of 10000 99.0%
Fetched 10000 out of 10000 100.0%
