### Import dependencies

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import timeit
import pandas as pd
import csv

cid ="c5af2d460a7a4f02b32b78b1ae7c7b96" 
secret = "e9de66f3f6a4468d843a41f7d0f431cd"

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### This API call gets the top 10,000 tracks based on popularity

In [2]:
# timeit library to measure the time needed to run this code
start = timeit.default_timer()

# create empty lists where the results are going to be stored
artist_name = []
track_name = []
popularity = []
track_id = []

for i in range(0,10000,50):
    track_results = sp.search(q='year:2018', type='track', limit=50,offset=i)
    for i, t in enumerate(track_results['tracks']['items']):
        artist_name.append(t['artists'][0]['name'])
        track_name.append(t['name'])
        track_id.append(t['id'])
        popularity.append(t['popularity'])
      

stop = timeit.default_timer()
print ('Time to run this code (in seconds):', stop - start)

Time to run this code (in seconds): 156.58990143439522


In [3]:
# Throw the information into a dataframe
df_tracks = pd.DataFrame({'artist_name':artist_name,'track_name':track_name,'track_id':track_id,'popularity':popularity})
print(df_tracks.shape)
df_tracks.head()

(10000, 4)


Unnamed: 0,artist_name,track_name,track_id,popularity
0,Ariana Grande,"thank u, next",2rPE9A1vEgShuZxxzR2tZH,100
1,Los Unidades,E-Lo (feat. Jozzy),3eydp9rHJAskzOevEBK267,68
2,Travis Scott,SICKO MODE,2xLMifQCjDGFmkHkpNLD9h,95
3,Sheck Wes,Mo Bamba,1xzBco0xcoJEDXktl7Jxrr,93
4,Kodak Black,ZEZE (feat. Travis Scott & Offset),7l3E7lcozEodtVsSTCkcaA,94


### Get rid of duplicate titles

In [4]:
grouped = df_tracks.groupby(['artist_name','track_name'], as_index=True).size()
grouped[grouped > 1].count()

2315

In [5]:
df_tracks.drop_duplicates(subset=['artist_name','track_name'], inplace=True)

In [6]:
# doing the same grouping as before to verify the solution
grouped_after_dropping = df_tracks.groupby(['artist_name','track_name'], as_index=True).size()
grouped_after_dropping[grouped_after_dropping > 1].count()

0

In [7]:
df_tracks[df_tracks.duplicated(subset=['artist_name','track_name'],keep=False)].count()

artist_name    0
track_name     0
track_id       0
popularity     0
dtype: int64

### This API call will send the track ID's we just collected out to generate audio analysis

In [8]:
# again measuring the time
start = timeit.default_timer()

# empty list, batchsize and the counter for None results
rows = []
batchsize = 100
None_counter = 0

for i in range(0,len(df_tracks['track_id']),batchsize):
    batch = df_tracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter + 1
        else:
            rows.append(t)
            
print('Number of tracks where no audio features were available:',None_counter)

stop = timeit.default_timer()
print ('Time to run this code (in seconds):',stop - start)

Number of tracks where no audio features were available: 72
Time to run this code (in seconds): 9.694773736015293


In [9]:
# Create dataframe for this set as well using the from_dict feature
df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
print("Shape of the dataset:", df_audio_features.shape)
df_audio_features.head()

Shape of the dataset: (6876, 18)


Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.28,https://api.spotify.com/v1/audio-analysis/2rPE...,0.724,207333,0.647,2rPE9A1vEgShuZxxzR2tZH,0.0,1,0.102,-5.642,1,0.0658,106.96,4,https://api.spotify.com/v1/tracks/2rPE9A1vEgSh...,audio_features,spotify:track:2rPE9A1vEgShuZxxzR2tZH,0.435
1,0.469,https://api.spotify.com/v1/audio-analysis/3eyd...,0.698,214720,0.797,3eydp9rHJAskzOevEBK267,0.00118,3,0.152,-5.125,0,0.0615,101.969,4,https://api.spotify.com/v1/tracks/3eydp9rHJAsk...,audio_features,spotify:track:3eydp9rHJAskzOevEBK267,0.53
2,0.00513,https://api.spotify.com/v1/audio-analysis/2xLM...,0.834,312820,0.73,2xLMifQCjDGFmkHkpNLD9h,0.0,8,0.124,-3.714,1,0.222,155.008,4,https://api.spotify.com/v1/tracks/2xLMifQCjDGF...,audio_features,spotify:track:2xLMifQCjDGFmkHkpNLD9h,0.446
3,0.194,https://api.spotify.com/v1/audio-analysis/1xzB...,0.729,183907,0.625,1xzBco0xcoJEDXktl7Jxrr,0.00986,4,0.248,-5.266,1,0.0315,146.034,4,https://api.spotify.com/v1/tracks/1xzBco0xcoJE...,audio_features,spotify:track:1xzBco0xcoJEDXktl7Jxrr,0.261
4,0.0515,https://api.spotify.com/v1/audio-analysis/7l3E...,0.861,228654,0.637,7l3E7lcozEodtVsSTCkcaA,3e-06,8,0.0967,-5.692,0,0.24,98.044,4,https://api.spotify.com/v1/tracks/7l3E7lcozEod...,audio_features,spotify:track:7l3E7lcozEodtVsSTCkcaA,0.474


In [10]:
# get rid of columns we dont want/need
df_audio_features = df_audio_features.drop(columns=['track_href', 'uri', 'type', 'analysis_url'])

# rename id column for merging purposes
df_audio_features.rename(columns={'id': 'track_id'}, inplace=True)

In [11]:
# Make data in the mode column readable
df_audio_features['mode'].replace(to_replace=[0, 1], value=['Major', 'Minor'], inplace=True)

df_audio_features['mode'].unique()

# df['First Season'] = (df['First Season'] > 1990).astype(int)

array(['Minor', 'Major'], dtype=object)

In [12]:
# make data in the key column readable
df_audio_features['key'].replace(to_replace=[0,1,2,3,4,5,6,7,8,9,10,11], 
                                 value=['C','C♯, D♭','D','D♯, E♭','E','F','F♯, G♭','G','G♯, A♭','A','A♯, B♭','B'], 
                                 inplace=True)

df_audio_features['key'].unique()

array(['C♯, D♭', 'D♯, E♭', 'G♯, A♭', 'E', 'F♯, G♭', 'D', 'F', 'A', 'G',
       'A♯, B♭', 'C', 'B'], dtype=object)

In [13]:
df_audio_features.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,track_id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0.28,0.724,207333,0.647,2rPE9A1vEgShuZxxzR2tZH,0.0,"C♯, D♭",0.102,-5.642,Minor,0.0658,106.96,4,0.435
1,0.469,0.698,214720,0.797,3eydp9rHJAskzOevEBK267,0.00118,"D♯, E♭",0.152,-5.125,Major,0.0615,101.969,4,0.53
2,0.00513,0.834,312820,0.73,2xLMifQCjDGFmkHkpNLD9h,0.0,"G♯, A♭",0.124,-3.714,Minor,0.222,155.008,4,0.446
3,0.194,0.729,183907,0.625,1xzBco0xcoJEDXktl7Jxrr,0.00986,E,0.248,-5.266,Minor,0.0315,146.034,4,0.261
4,0.0515,0.861,228654,0.637,7l3E7lcozEodtVsSTCkcaA,3e-06,"G♯, A♭",0.0967,-5.692,Major,0.24,98.044,4,0.474


In [14]:
df_audio_features['time_signature'].unique()

array([4, 3, 5, 1, 0], dtype=int64)

In [15]:
# out of curiousity, find the most danceable track
dance_monkey_dance = df_audio_features.sort_values(by=['speechiness'], ascending=False)
dance_monkey_dance.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,track_id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
737,0.993,0.636,35240,0.335,0Uv5Kp8sEnriJjkX4oLfNI,0.0,B,0.342,-13.327,Minor,0.966,161.68,4,0.561
6138,0.849,0.718,131227,0.479,4dOGhZDiyGNTuDHD0ia0Qe,0.0,A,0.531,-12.546,Minor,0.937,89.907,4,0.479
1134,0.81,0.657,120303,0.814,3ZVw1HR1QAQoKBygbhG4CE,0.0,B,0.445,-16.493,Major,0.937,136.142,3,0.0788
5614,0.829,0.675,31200,0.346,09IkGzxB5u6LWtKLKxjS2t,0.0,"C♯, D♭",0.159,-10.597,Major,0.912,89.93,4,0.568
3548,0.304,0.629,122093,0.166,3atCD3dVosUsd3KM2yzfeC,0.0,"C♯, D♭",0.161,-18.96,Minor,0.884,80.44,3,0.182


In [16]:
df = pd.merge(df_tracks,df_audio_features,on='track_id',how='inner')
print("Shape of the dataset:", df_audio_features.shape)
df.head()

Shape of the dataset: (6876, 14)


Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Ariana Grande,"thank u, next",2rPE9A1vEgShuZxxzR2tZH,100,0.28,0.724,207333,0.647,0.0,"C♯, D♭",0.102,-5.642,Minor,0.0658,106.96,4,0.435
1,Los Unidades,E-Lo (feat. Jozzy),3eydp9rHJAskzOevEBK267,68,0.469,0.698,214720,0.797,0.00118,"D♯, E♭",0.152,-5.125,Major,0.0615,101.969,4,0.53
2,Travis Scott,SICKO MODE,2xLMifQCjDGFmkHkpNLD9h,95,0.00513,0.834,312820,0.73,0.0,"G♯, A♭",0.124,-3.714,Minor,0.222,155.008,4,0.446
3,Sheck Wes,Mo Bamba,1xzBco0xcoJEDXktl7Jxrr,93,0.194,0.729,183907,0.625,0.00986,E,0.248,-5.266,Minor,0.0315,146.034,4,0.261
4,Kodak Black,ZEZE (feat. Travis Scott & Offset),7l3E7lcozEodtVsSTCkcaA,94,0.0515,0.861,228654,0.637,3e-06,"G♯, A♭",0.0967,-5.692,Major,0.24,98.044,4,0.474


In [17]:
df.to_csv('Spotify_Audio_Features.csv')