In [31]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import spotipy.util as util
from tqdm.notebook import tqdm
import pprint
import pickle
from sp_client import Spotify_Client
import pandas as pd
sp = Spotify_Client()

pp = pprint.PrettyPrinter(indent=4)

In [2]:
# #gets URIs of every premade Spotify playlist

# uris = set()
# offset = 0
# more_playlists = True
# while more_playlists:
#     print(offset)
#     retrieved = {x['uri'] for x in sp.user_playlists('spotify', offset = offset)['items']}
#     if not retrieved:
#         more_playlists = False
#     else:
#         uris.update(retrieved)
#         offset += 50
# uris = list(uris)
# #save 
# with open('spotify_playlists.pkl', 'wb') as f:
#     pickle.dump(uris, f)

In [3]:
with open('spotify_playlists.pkl', 'rb') as f:
    playlist_uris = pickle.load(f)

In [23]:
def process_playlist(playlist_uri, processed_songs = set()):
    playlist_songs = sp.playlist_tracks(playlist_uri)['items']
    song_uris = []    
    song_names = []
    song_artists = []
    out = []
    
    for song_dict in playlist_songs:
        if song_dict and song_dict['track']:
            song_uri = song_dict['track']['uri']
            if song_uri not in processed_songs:
                song_uris.append(song_uri)
                song_names.append(song_dict['track']['name'])
                song_artists.append(','.join(x['name'] for x in song_dict['track']['artists']))
    
    if len(song_uris) > 100: #can only query 100 uris at a time
        song_uris = song_uris[:100]
    data = sp.audio_features(song_uris)
    
    for song_features, song_uri, song_name, song_artist in zip(data, song_uris, song_names, song_artists):
        if song_features and song_features['instrumentalness'] < 0.5: #skip songs that are mostly instrumentals
            song_features['name'] = song_name
            song_features['artist'] = song_artist
            out.append(song_features)
            processed_songs.add(song_uri)
    return out, processed_songs

In [24]:
def process_all_playlists(playlist_uris, processed_songs = set()):
    data = []
    for p_uri in tqdm(playlist_uris):
        d, processed_songs = process_playlist(p_uri, processed_songs)
        data.extend(d)
    return pd.DataFrame(data), processed_songs

In [6]:
path = r"C:\Users\chris\Documents\GitHub\cs4300sp2021-rad338-jsh328-rpp62-cmc447\sample_data/"
ref_df = pd.read_csv(path + "SpotifyAudioFeaturesApril2019.csv")
processed = set(ref_df['track_id'])

In [7]:
df, processed = process_all_playlists(playlist_uris[:300], processed)
df.to_csv("playlists0_300.csv", index = False)

  0%|          | 0/300 [00:00<?, ?it/s]

Expected id of type track but found type Blues+fr%C3%A5n+Sverige spotify:local:Tomas+Andersson-Wij:En+Introduktion+Till+Tomas+Andersson+Wij:Blues+fr%C3%A5n+Sverige:292
Expected id of type track but found type Bass+Down+Low+%28feat.+The+Cataracs%29 spotify:local:Dev:Bass+Down+Low+-+Single:Bass+Down+Low+%28feat.+The+Cataracs%29:210
Expected id of type track but found type I+Still+Care+For+You spotify:local:Ray+LaMontagne:Gossip+In+The+Grain:I+Still+Care+For+You:354


In [17]:
df1, processed = process_all_playlists(playlist_uris[300:600], processed)
df1.to_csv("playlists300_600.csv", index = False)

  0%|          | 0/300 [00:00<?, ?it/s]

Expected id of type track but found type Jag+kommer spotify:local:Veronica+Maggio:2011+-+Satan+i+gatan:Jag+kommer:200


In [25]:
df2, processed = process_all_playlists(playlist_uris[600:900], processed)
df2.to_csv("playlists600_900.csv", index = False)

  0%|          | 0/300 [00:00<?, ?it/s]

Expected id of type track but found type Lusaka+By+Night spotify:local:John+Wizards:XFM:Lusaka+By+Night:180
Expected id of type track but found type episode spotify:episode:0ufaVne0slHc9dE28EciFC
Expected id of type track but found type episode spotify:episode:6Pf5ioiO4SALW4wNehAtW8
Expected id of type track but found type episode spotify:episode:5szkvxcWisREix8i34QmjH
Expected id of type track but found type episode spotify:episode:7DpLJaDx6Lst2ogdqEL9Ld
Expected id of type track but found type episode spotify:episode:2PCUu5xJs8mTeMGvM532Ob
Expected id of type track but found type episode spotify:episode:1FzdxfMvZ6a1SjmDBYFOzN
Expected id of type track but found type episode spotify:episode:3ORDDslop6jwYs1skgoAS5
Expected id of type track but found type episode spotify:episode:01z76dsMSRSNUW90Q14L4B
Expected id of type track but found type episode spotify:episode:31Ve9dl2Ti17WRKgB9QfKr
Expected id of type track but found type episode spotify:episode:77iBDvb6t3P6o4uG2rzQ4b
Expected id 

In [32]:
df3, processed = process_all_playlists(playlist_uris[900:], processed)
df3.to_csv("playlists900_end.csv", index = False)

  0%|          | 0/498 [00:00<?, ?it/s]

Expected id of type track but found type Witch+Doctor spotify:local:Hit+Crew:Kids%27+Party+Fun:Witch+Doctor:136
Expected id of type track but found type Youre+Still+The+One spotify:local:Shania+Twain::Youre+Still+The+One:212


In [38]:
all_df = pd.concat([df, df1, df2, df3]).drop_duplicates(subset = ['uri']).reset_index(drop = True).drop(columns = ['type', 'id', 'track_href', 'analysis_url'])

In [43]:
all_df = all_df.loc[:, ['name', 'artist'] + list(all_df.columns[:-2])]

In [45]:
all_df.to_csv("all_playlists.csv", index = False)