## Collecting Data

In [1]:
#Import necessary packages
import json
import os
import sys

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import config



In [2]:
#Define function to retrieve API keys from a Json file
def get_keys(path):
    """Read the JSON file at ``path`` and return its decoded content.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file; this notebook indexes
        the result by key, like a dict.
    """
    with open(path) as key_file:
        contents = json.load(key_file)
    return contents

In [3]:
#Retrieve personal keys for Spotify API
# "~" is expanded to the current user's home directory, so the notebook is not
# tied to one machine's absolute path. The JSON file must provide the
# 'client_id' and 'client_secret' entries read below.
keys = get_keys(os.path.expanduser("~/.secret/spotify_api.json"))
client_id = keys['client_id']
client_secret = keys['client_secret']

In [4]:
# Build the Spotipy client, authenticating with the client id / client secret
# pair loaded from the keys file.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
)

In [5]:
# playlist_id = 'spotify:user:spotifycharts:playlist:37i9dQZEVXbJiZcmkrIHGU'
# results = sp.playlist(playlist_id)
# print(json.dumps(results, indent=4))

Before we extract our dataset, we will investigate ways to search for artists, songs, albums, etc. through the Spotify API.

In [6]:
# Free-text search for "The Weeknd" using sp.search with its default options
# (the printed response URL shows a track search limited to 10 results).
search_str = 'The Weeknd'
result = sp.search(q=search_str)
print(result)

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=The+Weeknd&type=track&offset=0&limit=10', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Xyo4u8uXC1ZmMpatF05PJ'}, 'href': 'https://api.spotify.com/v1/artists/1Xyo4u8uXC1ZmMpatF05PJ', 'id': '1Xyo4u8uXC1ZmMpatF05PJ', 'name': 'The Weeknd', 'type': 'artist', 'uri': 'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ'}], 'available_markets': ['AD', 'AE', 'AL', 'AR', 'AT', 'AU', 'BA', 'BE', 'BG', 'BH', 'BO', 'BR', 'BY', 'CA', 'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HK', 'HN', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JO', 'JP', 'KR', 'KW', 'KZ', 'LB', 'LI', 'LT', 'LU', 'LV', 'MA', 'MC', 'MD', 'ME', 'MK', 'MT', 'MX', 'MY', 'NI', 'NL', 'NO', 'NZ', 'OM', 'PA', 'PE', 'PH', 'PL', 'PS', 'PT', 'PY', 'QA', 'RO', 'RS', 'RU', 'SA', 'SE', 'SG', 'SI', 'SK', 'SV', 'TH', 'TN', 'TR', 'TW', 'UA', 'US', 'UY',

In [7]:
# Search for tracks using the year:2020 field filter, asking for 50 results
# in a single request, then display the raw response.
track_results = sp.search(
    q='year:2020',
    type='track',
    limit=50,
)
track_results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=year%3A2020&type=track&offset=0&limit=50',
  'items': [{'album': {'album_type': 'single',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/7tYKF4w9nC0nq9CsPZTHyP'},
       'href': 'https://api.spotify.com/v1/artists/7tYKF4w9nC0nq9CsPZTHyP',
       'id': '7tYKF4w9nC0nq9CsPZTHyP',
       'name': 'SZA',
       'type': 'artist',
       'uri': 'spotify:artist:7tYKF4w9nC0nq9CsPZTHyP'}],
     'available_markets': ['AD',
      'AE',
      'AL',
      'AR',
      'AT',
      'AU',
      'BA',
      'BE',
      'BG',
      'BH',
      'BO',
      'BR',
      'BY',
      'CA',
      'CH',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DE',
      'DK',
      'DO',
      'DZ',
      'EC',
      'EE',
      'EG',
      'ES',
      'FI',
      'FR',
      'GB',
      'GR',
      'GT',
      'HK',
      'HN',
      'HR',
      'HU',
      'ID',
      'IE',
      'IL',
      'IN',
      'IS',

In [8]:
# artist_name = []
# track_name = []
# popularity = []
# track_id = []
# for i in range(0,10000):
#     track_results = sp.search(q='year:2018', type='track', limit=50,offset=i)
# #     for i, t in enumerate(track_results['tracks']['items']):
# #         artist_name.append(t['artists'][0]['name'])
# #         track_name.append(t['name'])
# #         track_id.append(t['id'])
# #         popularity.append(t['popularity'])

In [9]:
# import pandas as pd
# track_dataframe = pd.DataFrame({'artist_name' : artist_name, 'track_name' : track_name, 'track_id' : track_id, 'popularity' : popularity})
# print(track_dataframe.shape)
# track_dataframe.head()

Look at all the possible genres in the Spotify database

In [10]:
#Use the recommendation_genre_seeds method to fetch the available genre seeds
genres = sp.recommendation_genre_seeds()

In [11]:
#Display the list stored under the 'genres' key of the response
genres['genres']

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'bossanova',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'holidays',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie',
 'indie-pop',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metal-misc',
 'metalcore',
 'minimal-techno',
 'movies',
 'mpb',
 'new-age',
 'new-release',
 'opera',
 'pagode',

In [12]:
# Search tracks for the "acoustic" genre (5 results) and keep only the list of
# track items from the response.
# NOTE(review): the query has a space after "genre:" -- confirm it is still
# interpreted as a genre filter rather than as plain keywords.
acoustics = sp.search(
    q='genre: acoustic',
    limit=5,
    type='track',
)['tracks']['items']
# Index 3 is the fourth item of the result list (not the third).
acoustics[3]['name']

'Come On Get Higher'

In [13]:
#Create a variable called 'hugeplaylist' holding the "items" list of one page of a
#specific playlist, looked up by its playlist id; offset=100 starts the page after
#the first 100 playlist entries
hugeplaylist = sp.user_playlist_tracks(playlist_id="54nv8jbrm4JoHEZ49Qvjgl", offset=100)["items"]

# Other

In [14]:
#Name of the first track in hugeplaylist
hugeplaylist[0]['track']['name']

'Turning Tables'

In [15]:
#Release date of the album that the first track in hugeplaylist belongs to
hugeplaylist[0]['track']['album']['release_date']

'2011-01-19'

According to the Spotify API, the popularity of a track is defined by: 

"*The popularity of the track. The value will be between 0 and 100, with 100 being the most popular.
The popularity of a track is a value between 0 and 100, with 100 being the most popular. The popularity is calculated by algorithm and is based, in the most part, on the total number of plays the track has had and how recent those plays are.
Generally speaking, songs that are being played a lot now will have a higher popularity than songs that were played a lot in the past. Duplicate tracks (e.g. the same track from a single and an album) are rated independently. Artist and album popularity is derived mathematically from track popularity. Note that the popularity value may lag actual popularity by a few days: the value is not updated in real time.*"

This 'popularity' measure could potentially be useful in future modeling when determining the ratings of our songs. For now, let's see how we can extract the popularity metric for the song "Turning Tables" from our playlist.

In [16]:
#Popularity value of the first track in hugeplaylist
hugeplaylist[0]['track']['popularity']

44

It also may be interesting to see if a track is explicit or not. Let's analyze this feature as well.

In [17]:
#Explicit flag of the first track in hugeplaylist
hugeplaylist[0]['track']['explicit']

False

I will now extract data from one playlist on Spotify. The playlist was selected arbitrarily; it was chosen for its large volume (approximately 10,000 songs are in the playlist). I will use this playlist as the dataset for the remainder of my analysis. The playlist data can be retrieved from the API using the user_playlist_tracks method and passing in the playlist's ID.

In [18]:
#Page through the chosen Spotify playlist one request at a time, increasing the
#offset by the page size, to collect up to approximately 10,000 playlist items

#Playlist to download, items requested per call, and upper bound on items collected
PLAYLIST_ID = "54nv8jbrm4JoHEZ49Qvjgl"
PAGE_SIZE = 100
MAX_TRACKS = 10000

#Start with offset=0
offset = 0
#Create an empty playlist
playlist = []

#Fetch one page per iteration and append its items. Stop once the cap is reached,
#or earlier when the playlist is exhausted (empty page, or no 'next' page link in
#the response), so no request is wasted past the last page.
while offset < MAX_TRACKS:
    p1 = sp.user_playlist_tracks(playlist_id=PLAYLIST_ID, limit=PAGE_SIZE, offset=offset)
    playlist.extend(p1["items"])
    if not p1["items"] or not p1.get("next"):
        break
    offset += PAGE_SIZE
        

In [19]:
#Number of playlist items collected by the pagination loop
len(playlist)

9964

In [20]:
#Track id of the second collected playlist item
playlist[1]['track']['id']

'1Jx69b09LKTuBQxkEiFfVX'

In [21]:
#Id of the first listed artist of the first collected playlist item
playlist[0]['track']['artists'][0]['id']

'6jJ0s89eD6GaHleKKya26X'

## Putting Playlist into DataFrame

In [22]:
#Create a function to extract features of our playlist and append them to a DataFrame 
def df_playlist(playlist_id):
    """Flatten playlist items into a DataFrame with one row per track.

    Parameters
    ----------
    playlist_id : iterable of dict
        Despite the name, this is not an id: it is the collection of playlist
        item dicts to convert. Each item must have a "track" entry; items whose
        "track" is None are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per usable item, indexed 0..n-1, with the columns artist,
        artist_id, album, track_name, release_date, popularity, is_explicit
        and track_id. When no usable item is given, the frame is empty but
        still has those columns.

    Raises
    ------
    KeyError, IndexError
        If a non-None track lacks one of the fields read below.
    """
    # Column names (and column order) of the resulting DataFrame
    playlist_features_list = ["artist","artist_id", "album","track_name", "release_date", "popularity", "is_explicit", "track_id"]

    # Collect one plain dict per track and build the DataFrame once at the end;
    # concatenating a one-row frame per track copies the whole frame each time.
    records = []
    for item in playlist_id:
        track = item["track"]
        if track is None:
            continue
        album = track["album"]
        records.append({
            # NOTE(review): the name comes from the album's first artist while
            # the id comes from the track's first artist -- these can refer to
            # different artists; confirm which one is intended.
            "artist": album["artists"][0]["name"],
            "artist_id": track["artists"][0]["id"],
            "album": album["name"],
            "track_name": track["name"],
            "release_date": album["release_date"],
            "popularity": track["popularity"],
            "is_explicit": track["explicit"],
            "track_id": track["id"],
        })

    #Return final playlist dataframe
    return pd.DataFrame(records, columns=playlist_features_list)

In [23]:
#Apply df_playlist to the playlist items that were previously extracted and store the result in a new dataframe called df
df = df_playlist(playlist)

In [24]:
#Shape of the playlist DataFrame as (rows, columns)
df.shape

(9963, 8)

In [32]:
#Export the playlist DataFrame in CSV format to a file named 'playlistdf'
#(no file extension) in the current working directory
df.to_csv('playlistdf')

### Audio Features

In [26]:
#Request audio features for two example track ids; the result is a list with one dict per track
sp.audio_features(tracks=['76N7FdzCI9OsiUnzJVLY2m', '5G6x3QgKSzos6khVmDa3rI'])

[{'danceability': 0.618,
  'energy': 0.753,
  'key': 7,
  'loudness': -5.05,
  'mode': 0,
  'speechiness': 0.0451,
  'acousticness': 0.637,
  'instrumentalness': 0,
  'liveness': 0.0905,
  'valence': 0.557,
  'tempo': 120.041,
  'type': 'audio_features',
  'id': '76N7FdzCI9OsiUnzJVLY2m',
  'uri': 'spotify:track:76N7FdzCI9OsiUnzJVLY2m',
  'track_href': 'https://api.spotify.com/v1/tracks/76N7FdzCI9OsiUnzJVLY2m',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/76N7FdzCI9OsiUnzJVLY2m',
  'duration_ms': 221427,
  'time_signature': 4},
 {'danceability': 0.612,
  'energy': 0.692,
  'key': 8,
  'loudness': -5.544,
  'mode': 1,
  'speechiness': 0.279,
  'acousticness': 0.156,
  'instrumentalness': 0,
  'liveness': 0.0902,
  'valence': 0.544,
  'tempo': 135.05,
  'type': 'audio_features',
  'id': '5G6x3QgKSzos6khVmDa3rI',
  'uri': 'spotify:track:5G6x3QgKSzos6khVmDa3rI',
  'track_href': 'https://api.spotify.com/v1/tracks/5G6x3QgKSzos6khVmDa3rI',
  'analysis_url': 'https://api.spotify

In [27]:
#Create a list of just the track ids for each song in the playlist,
#skipping playlist items whose "track" entry is None
trackid_list = [item['track']['id'] for item in playlist if item['track'] is not None]

In [28]:
#Number of track ids collected (playlist items with a None track were skipped)
len(trackid_list)

9963

In [29]:
#Number of track ids sent per audio-features request (the endpoint accepts at most 100 ids)
AUDIO_FEATURES_BATCH_SIZE = 100

#Collect the audio features of every track id, one batch per request.
#Iterating over the actual length of trackid_list (rather than a fixed 10,000)
#never sends an empty batch and never drops ids beyond a hard-coded bound.
audiofeatures = []
for counter in range(0, len(trackid_list), AUDIO_FEATURES_BATCH_SIZE):
    batch_ids = trackid_list[counter:counter + AUDIO_FEATURES_BATCH_SIZE]
    audiofeatures.extend(sp.audio_features(tracks=batch_ids))
         

In [30]:
#Number of audio-feature entries collected; expected to equal len(trackid_list)
len(audiofeatures)

9963

In [31]:
#Build a DataFrame from the list of audio-feature dicts and preview the first rows
audio_data = pd.DataFrame(audiofeatures)
audio_data.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.638,0.826,8,-4.968,1,0.0479,0.139,0.0,0.0803,0.649,124.072,audio_features,4lCv7b86sLynZbXhfScfm2,spotify:track:4lCv7b86sLynZbXhfScfm2,https://api.spotify.com/v1/tracks/4lCv7b86sLyn...,https://api.spotify.com/v1/audio-analysis/4lCv...,227880,4
1,0.397,0.817,2,-5.495,1,0.042,0.0966,8e-06,0.316,0.416,158.004,audio_features,1Jx69b09LKTuBQxkEiFfVX,spotify:track:1Jx69b09LKTuBQxkEiFfVX,https://api.spotify.com/v1/tracks/1Jx69b09LKTu...,https://api.spotify.com/v1/audio-analysis/1Jx6...,266227,4
2,0.384,0.527,11,-5.294,0,0.0269,0.542,0.0,0.156,0.222,100.73,audio_features,4u83mwF5tUuWlXS86UOXdu,spotify:track:4u83mwF5tUuWlXS86UOXdu,https://api.spotify.com/v1/tracks/4u83mwF5tUuW...,https://api.spotify.com/v1/audio-analysis/4u83...,221200,3
3,0.609,0.629,10,-5.024,1,0.0264,0.425,0.0,0.0978,0.325,99.955,audio_features,4Musyaro0NM5Awx8b5c627,spotify:track:4Musyaro0NM5Awx8b5c627,https://api.spotify.com/v1/tracks/4Musyaro0NM5...,https://api.spotify.com/v1/audio-analysis/4Mus...,241467,4
4,0.729,0.756,8,-5.119,1,0.0294,0.131,0.0,0.0527,0.522,104.945,audio_features,1CkvWZme3pRgbzaxZnTl5X,spotify:track:1CkvWZme3pRgbzaxZnTl5X,https://api.spotify.com/v1/tracks/1CkvWZme3pRg...,https://api.spotify.com/v1/audio-analysis/1Ckv...,228293,4


In [33]:
#Export the audio-features DataFrame in CSV format to a file named 'dfaudio'
#(no file extension) in the current working directory
audio_data.to_csv('dfaudio')