# Fetching album and audio features for Spotify MPD

In [None]:
# Install spotipy with the interpreter that runs this kernel (sys.executable),
# so the package lands in the environment the notebook actually imports from.
import sys
!{sys.executable} -m pip install --user spotipy

In [1]:
import csv
import glob
import os
import pickle
import time

# Spotify API credentials must not be stored in the notebook. The
# SpotifyClientCredentials() calls further down take no arguments and pick up
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET from the process environment, so
# export both before starting the kernel (shell profile, git-ignored .env, ...).
# NOTE(review): the key pair that used to be hard-coded here was shared with
# the notebook and should be revoked in the Spotify developer dashboard.
import warnings

SPOTIFY_CREDENTIAL_VARS = ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET")


def missing_spotify_credentials(environ=None):
    """Return the Spotify credential variable names that are unset or empty.

    Args:
        environ: Mapping to inspect; defaults to ``os.environ``.

    Returns:
        A list with the missing names, in the order of
        ``SPOTIFY_CREDENTIAL_VARS``; empty when both are present.
    """
    env = os.environ if environ is None else environ
    return [name for name in SPOTIFY_CREDENTIAL_VARS if not env.get(name)]


_missing_credentials = missing_spotify_credentials()
if _missing_credentials:
    # Warn here instead of failing so the Spark cells can still be run; the
    # Spotify client cells cannot authenticate until the variables are set.
    warnings.warn(
        "Spotify credentials not configured; set {} in the environment "
        "before creating the Spotify client".format(", ".join(_missing_credentials))
    )

from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials

In [7]:
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.feature import VectorAssembler, StringIndexer, RegexTokenizer, \
    StopWordsRemover, CountVectorizer, StringIndexerModel, CountVectorizerModel, IDF
from pyspark.sql.functions import collect_set, udf, col, mean, first, year, lower, \
    explode, explode_outer, lag, to_timestamp, regexp_replace, expr, max as sparkMax
from pyspark.sql import SparkSession, Window, Row

# Settings for the YARN client-mode session, applied to the builder in order.
spark_settings = [
    ("spark.submit.deployMode", "client"),
    ("spark.dynamicAllocation.enabled", "false"),
    ("spark.executor.memory", "25G"),
    ("spark.yarn.executor.memoryOverhead", "5G"),
    ("spark.executor.instances", "5"),
    ("spark.executor.cores", "5"),
    ("spark.kryoserializer.buffer.max", "256m"),
    ("spark.executor.heartbeatInterval", "100000"),
    ("spark.network.timeout", "1000000"),
    ("spark.maxRemoteBlockSizeFetchToMem", "256m"),
    ("spark.driver.maxResultSize", "8G"),
]

session_builder = SparkSession.builder.appName("Fetching data for Spotify MPD").master("yarn")
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)

# Reuses an already running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

sc = spark.sparkContext

In [8]:
# HDFS glob selecting the mpd.slice.*.json files that loadData() reads.
dataPath = "hdfs:/user/app/2018S/public/recsys_spotify_2018/mpd.v1/mpd.slice.*.json"


def loadData(s, dataPath):
    """Load the playlist JSON files and flatten them to one row per playlist track.

    Args:
        s: SparkSession used for reading.
        dataPath: Path or glob of the multi-line JSON files; each file is
            expected to hold a top-level ``playlists`` array.

    Returns:
        A DataFrame with the columns ``pid`` (cast to integer), ``name``
        (lower-cased), ``modified_at_year``, ``track_uri``, ``album_uri``,
        ``artist_uri`` and ``pos``.
    """
    # One row per playlist, keeping only the fields used downstream.
    playlists = (
        s.read.option("multiLine", "true").json(dataPath)
        .select(explode(col("playlists")).alias("playlist"))
        .select("playlist.pid", "playlist.name", "playlist.modified_at", "playlist.tracks")
    )

    # One row per track; explode_outer is used so that a playlist with no
    # tracks still yields a row (with a null track).
    playlist_tracks = playlists.select(
        col("pid").cast("integer"),
        lower(col("name")).alias("name"),
        year(to_timestamp("modified_at")).alias("modified_at_year"),
        explode_outer(col("tracks")).alias("track"),
    )

    return playlist_tracks.select(
        "pid", "name", "modified_at_year",
        "track.track_uri", "track.album_uri", "track.artist_uri", "track.pos",
    )


# Flattened playlist/track DataFrame from which the album and track URIs are collected.
data = loadData(spark, dataPath)

In [38]:
# Bring the distinct album URIs to the driver (a list of single-column Rows).
album_uris = data.select("album_uri").distinct().collect()

In [39]:
len(album_uris)

734684

In [42]:
# Unwrap the single-column Rows returned by collect() into plain URI values.
# Strings are passed through untouched so that re-running this cell is
# harmless: indexing an already-unwrapped URI with [0] would silently reduce
# it to its first character.
album_uris = [a if isinstance(a, str) else a[0] for a in album_uris]

In [9]:
# Bring the distinct track URIs to the driver (a list of single-column Rows).
track_uris = data.select("track_uri").distinct().collect()

In [5]:
len(track_uris)

2262292

In [10]:
# Unwrap the single-column Rows returned by collect() into plain URI values.
# Strings are passed through untouched so that re-running this cell is
# harmless: indexing an already-unwrapped URI with [0] would silently reduce
# it to its first character.
track_uris = [t if isinstance(t, str) else t[0] for t in track_uris]

In [17]:
# Shut down the SparkContext; the cells below only work on the collected
# Python lists (album_uris / track_uris) and no longer touch Spark.
sc.stop()

In [51]:
# Download the album objects for all collected album URIs, 20 URIs per request.
# Results accumulate in `albums`; every 100000 URIs the accumulated list is
# pickled to spotify-albums-<i>.pickle and reset, so whatever is fetched after
# the last checkpoint is still in `albums` when the loop ends and has to be
# saved separately.
spotify = Spotify(client_credentials_manager=SpotifyClientCredentials())

albums = []
for i in range(0, len(album_uris), 20):
    albums += spotify.albums(album_uris[i:i+20])["albums"]
    # Throttle: pause 1 s after every 10 requests (10 * 20 = 200 URIs).
    # The previous test `if i % 10:` was never true because i advances in
    # steps of 20, so the loop never slept; this matches the audio-feature loops.
    if i % 200 == 0:
        time.sleep(1)
    if i % 10000 == 0:
        print("{} of {} albums fetched".format(i, len(album_uris)))
    if i % 100000 == 0:
        # Checkpoint to disk so an interrupted run keeps the finished part.
        with open("spotify-albums-{}.pickle".format(i), "wb") as f:
            pickle.dump(albums, f)
            albums = []

0 of 734684 albums fetched
10000 of 734684 albums fetched
20000 of 734684 albums fetched
30000 of 734684 albums fetched
40000 of 734684 albums fetched
50000 of 734684 albums fetched
60000 of 734684 albums fetched
70000 of 734684 albums fetched
80000 of 734684 albums fetched
90000 of 734684 albums fetched
100000 of 734684 albums fetched
110000 of 734684 albums fetched
120000 of 734684 albums fetched
130000 of 734684 albums fetched
140000 of 734684 albums fetched
150000 of 734684 albums fetched
160000 of 734684 albums fetched
170000 of 734684 albums fetched
180000 of 734684 albums fetched
190000 of 734684 albums fetched
200000 of 734684 albums fetched
210000 of 734684 albums fetched
220000 of 734684 albums fetched
230000 of 734684 albums fetched
240000 of 734684 albums fetched
250000 of 734684 albums fetched
260000 of 734684 albums fetched
270000 of 734684 albums fetched
280000 of 734684 albums fetched
290000 of 734684 albums fetched
300000 of 734684 albums fetched
310000 of 734684 album

In [52]:
# Persist the albums still held in memory (those not yet written to a
# numbered checkpoint file). The file name has no placeholder, so the former
# .format(i) call was a no-op that only tied this cell to the loop variable.
with open("spotify-albums-end.pickle", "wb") as f:
    pickle.dump(albums, f)

In [53]:
len(albums)

34664

In [54]:
album_uris[-1]

'spotify:album:4QN0ym0NxtdORwj9je8Mbt'

In [55]:
albums[-1]

{'album_type': 'single',
 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0QyJDdX61vOFda8pXrgp0g'},
   'href': 'https://api.spotify.com/v1/artists/0QyJDdX61vOFda8pXrgp0g',
   'id': '0QyJDdX61vOFda8pXrgp0g',
   'name': 'Waldo',
   'type': 'artist',
   'uri': 'spotify:artist:0QyJDdX61vOFda8pXrgp0g'}],
 'available_markets': [],
 'copyrights': [{'text': '2017 Independent', 'type': 'C'},
  {'text': '2017 Independent', 'type': 'P'}],
 'external_ids': {'upc': '859721555333'},
 'external_urls': {'spotify': 'https://open.spotify.com/album/4QN0ym0NxtdORwj9je8Mbt'},
 'genres': [],
 'href': 'https://api.spotify.com/v1/albums/4QN0ym0NxtdORwj9je8Mbt',
 'id': '4QN0ym0NxtdORwj9je8Mbt',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab67616d0000b27331e5c9f7ede362629a39d76d',
   'width': 640},
  {'height': 300,
   'url': 'https://i.scdn.co/image/ab67616d00001e0231e5c9f7ede362629a39d76d',
   'width': 300},
  {'height': 64,
   'url': 'https://i.scdn.co/image/ab67

In [5]:
# Flatten all pickled album checkpoints into one CSV with a row per album.
# newline="" lets the csv writer control line endings itself, as the csv
# module requires; without it, platforms that translate "\n" emit blank rows.
with open("spotify-albums.csv", "w", newline="") as out_f:
    csv_out = csv.writer(out_f)
    csv_out.writerow(["id", "release_date", "release_date_precision"])
    # Sorted so the row order does not depend on directory listing order.
    for in_name in sorted(glob.glob("spotify-albums-*.pickle")):
        # NOTE: pickle can execute code on load; only read checkpoint files
        # produced by this notebook, never files from an untrusted source.
        with open(in_name, "rb") as in_f:
            albums = pickle.load(in_f)
        # Entries can be None (presumably URIs the API could not resolve); skip them.
        csv_out.writerows(
            (a["id"], a["release_date"], a["release_date_precision"])
            for a in albums if a is not None
        )

In [8]:
# Trial request: fetch audio features for the first 20 track URIs only.
credentials_manager = SpotifyClientCredentials()
spotify = Spotify(client_credentials_manager=credentials_manager)
audio_features = spotify.audio_features(track_uris[:20])

In [None]:
# Fetch audio features for every track URI in batches of 20, pickling the
# accumulated results to spotify-audio-<i>.pickle every 100000 URIs.
spotify = Spotify(client_credentials_manager=SpotifyClientCredentials())

audio_features = []
total_tracks = len(track_uris)
for i in range(0, total_tracks, 20):
    batch = track_uris[i:i + 20]
    audio_features.extend(spotify.audio_features(batch))
    # Rate limiting: wait one second after every 10 requests (200 URIs).
    if i % 200 == 0:
        time.sleep(1)
    if i % 10000 == 0:
        print("{} of {} audio features fetched".format(i, total_tracks))
    if i % 100000 == 0:
        # Write a checkpoint, then start a fresh list for the next chunk.
        checkpoint_name = "spotify-audio-{}.pickle".format(i)
        with open(checkpoint_name, "wb") as f:
            pickle.dump(audio_features, f)
        audio_features = []

0 of 2262292 audio features fetched
10000 of 2262292 audio features fetched
20000 of 2262292 audio features fetched
30000 of 2262292 audio features fetched
40000 of 2262292 audio features fetched
50000 of 2262292 audio features fetched
60000 of 2262292 audio features fetched
70000 of 2262292 audio features fetched
80000 of 2262292 audio features fetched
90000 of 2262292 audio features fetched
100000 of 2262292 audio features fetched
110000 of 2262292 audio features fetched
120000 of 2262292 audio features fetched
130000 of 2262292 audio features fetched
140000 of 2262292 audio features fetched
150000 of 2262292 audio features fetched
160000 of 2262292 audio features fetched
170000 of 2262292 audio features fetched
180000 of 2262292 audio features fetched
190000 of 2262292 audio features fetched
200000 of 2262292 audio features fetched
210000 of 2262292 audio features fetched
220000 of 2262292 audio features fetched
230000 of 2262292 audio features fetched
240000 of 2262292 audio featur

In [11]:
# Resume point: keep only the URIs from index 2200000 onward for the
# follow-up fetch. NOTE: this rebinds track_uris, so running the cell a second
# time slices the already-shortened list again (here that leaves it empty).
track_uris = track_uris[2200000:]

In [13]:
# Fetch audio features for the remaining track URIs in batches of 20.
# No intermediate checkpoints here: everything stays in `audio_features`.
spotify = Spotify(client_credentials_manager=SpotifyClientCredentials())

audio_features = []
total_tracks = len(track_uris)
for i in range(0, total_tracks, 20):
    batch = track_uris[i:i + 20]
    audio_features.extend(spotify.audio_features(batch))
    # Rate limiting: wait one second after every 10 requests (200 URIs).
    if i % 200 == 0:
        time.sleep(1)
    if i % 10000 == 0:
        print("{} of {} audio features fetched".format(i, total_tracks))

0 of 62292 audio features fetched
10000 of 62292 audio features fetched
20000 of 62292 audio features fetched
30000 of 62292 audio features fetched
40000 of 62292 audio features fetched
50000 of 62292 audio features fetched
60000 of 62292 audio features fetched


In [15]:
# Store the audio features currently held in memory as the final pickle.
with open("spotify-audio-end.pickle", mode="wb") as end_file:
    pickle.dump(audio_features, end_file)

In [14]:
audio_features[-1]

{'danceability': 0.479,
 'energy': 0.55,
 'key': 7,
 'loudness': -7.665,
 'mode': 0,
 'speechiness': 0.0522,
 'acousticness': 0.709,
 'instrumentalness': 0,
 'liveness': 0.915,
 'valence': 0.581,
 'tempo': 95.26,
 'type': 'audio_features',
 'id': '5XsVjQF9KmNxZjVZEVpbd3',
 'uri': 'spotify:track:5XsVjQF9KmNxZjVZEVpbd3',
 'track_href': 'https://api.spotify.com/v1/tracks/5XsVjQF9KmNxZjVZEVpbd3',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5XsVjQF9KmNxZjVZEVpbd3',
 'duration_ms': 156507,
 'time_signature': 4}

In [16]:
# Flatten all pickled audio-feature checkpoints into one CSV, one row per track.
# newline="" lets the csv writer control line endings itself, as the csv
# module requires; without it, platforms that translate "\n" emit blank rows.
with open("spotify-audio.csv", "w", newline="") as out_f:
    csv_out = csv.writer(out_f)
    csv_out.writerow(["id",
                      "danceability", "energy", "valence", "acousticness",
                      "key", "loudness", "mode", "tempo"])
    # Sorted so the row order does not depend on directory listing order.
    for in_name in sorted(glob.glob("spotify-audio-*.pickle")):
        # NOTE: pickle can execute code on load; only read checkpoint files
        # produced by this notebook, never files from an untrusted source.
        with open(in_name, "rb") as in_f:
            features = pickle.load(in_f)
        # Entries can be None (presumably tracks without audio features); skip them.
        csv_out.writerows(
            (ft["id"],
             ft["danceability"], ft["energy"], ft["valence"], ft["acousticness"],
             ft["key"], ft["loudness"], ft["mode"], ft["tempo"])
            for ft in features if ft is not None
        )