In [7]:
import spotipy
import spotipy.util as util
import numpy as np
import pandas as pd
import os
import csv
import warnings
import re 

from sklearn.utils import shuffle
from itertools import chain
from collections import defaultdict

In [8]:
# Location of the project's data files.
# The project root can be overridden with the MUSIC_CLASSIFICATION_ROOT
# environment variable so the notebook also runs on machines where the data
# does not live on the original author's F: drive; without the override the
# original location is used unchanged.
absolute_path = os.environ.get(
    "MUSIC_CLASSIFICATION_ROOT", "f:\\AI Projects\\Music Classification"
)
# Built with os.path.join instead of a literal backslash so the separator is
# correct on every platform (on Windows this is still "Data\\Spotify").
relative_path = os.path.join("Data", "Spotify")
full_path = os.path.join(absolute_path, relative_path)

In [9]:
# Spotify API credentials are read from the environment so that no secret is
# stored in (or shared with) the notebook. SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET are the variable names spotipy itself looks up when a
# credential is passed as None, so an unset variable surfaces as spotipy's own
# "no client_id" error when the auth manager is built.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be rotated in the Spotify developer dashboard.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")

In [10]:
# Build the Spotify Web API client used by the rest of the notebook.
# Authentication goes through spotipy's client-credentials auth manager,
# configured with the application's id/secret pair.
auth_manager = spotipy.oauth2.SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

Get all required data

In [11]:
# Load the three CSV exports that live in the data directory.
top3tracks = pd.read_csv(os.path.join(full_path, "top3tracks.csv"))
# The first column of this file is a duplicate of the index, so it is sliced
# off immediately after loading.
largest_playlist = pd.read_csv(
    os.path.join(full_path, "largest_playlist_ever.csv")
).iloc[:, 1:]
largest_playlist_artists = pd.read_csv(
    os.path.join(full_path, "largest_playlist_ever_artists.csv")
)

# The existing 'genres' column is removed from both track tables; genres are
# re-derived later by mapping them onto a small set of broad genres.
largest_playlist = largest_playlist.drop(columns="genres")
top3tracks = top3tracks.drop(columns="genres")

Look up every artist that appears in the dataframes on Spotify and collect their metadata

In [None]:
# Silence only FutureWarning (the category pandas uses for deprecations such
# as DataFrame.append) instead of hiding every warning in the session.
# This cell itself no longer triggers one; the filter is kept in case other
# cells still depend on it being installed.
warnings.filterwarnings('ignore', category=FutureWarning)

columns = ["name", "uri", "followers", "popularity", "genres"]

# Distinct artist names from both track tables, in first-seen order: playlist
# artists first, then any top-3 artists that were not already listed.
# These lists get their own names on purpose: `largest_playlist_artists` is
# the DataFrame loaded from CSV and is indexed with .iloc further down, so it
# must not be rebound to a plain list here.
playlist_artist_names = largest_playlist.loc[:, "artist"].unique().tolist()
top3_artist_names = top3tracks.loc[:, "artist"].unique().tolist()
all_artists = list(dict.fromkeys(playlist_artist_names + top3_artist_names))

# Rows are accumulated in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
artist_records = []
# Different search strings can resolve to the same Spotify artist. The URI
# identifies the artist, so an exact-match set is enough to keep one row each.
seen_uris = set()

for art in all_artists:
    if not isinstance(art, str):
        # A missing artist value (NaN) cannot be concatenated into a query.
        continue
    print(art)
    results = sp.search(q='artist: ' + art, type='artist')
    items = results["artists"]["items"]
    if not items:
        # The search returned no artist; skip instead of raising IndexError.
        continue
    top_hit = items[0]  # only the first search hit is used
    uri = top_hit["uri"]
    if uri in seen_uris:
        continue
    seen_uris.add(uri)
    artist_records.append({
        "name": top_hit["name"],
        "uri": uri,
        "followers": top_hit["followers"]["total"],
        "popularity": top_hit["popularity"],
        "genres": top_hit["genres"],
    })

# Passing `columns` keeps the expected schema even when nothing was found.
artists_df = pd.DataFrame(artist_records, columns=columns)


In [20]:
# Save the collected artist table into the data directory twice: once as CSV
# and once as an Excel workbook. The row index is left out of both files.
artists_csv_path = os.path.join(full_path, "full_artists.csv")
artists_xlsx_path = os.path.join(full_path, "full_artists.xlsx")
artists_df.to_csv(artists_csv_path, index=False)
artists_df.to_excel(artists_xlsx_path, index=False)

Map each genre to a broad genre using pattern matching

In [14]:
# Maps a regular expression over an artist's genre text to a broad genre label.
#
# * Every alternative is wrapped in `.*` because the patterns are applied with
#   re.match, which anchors at the start of the string; the leading `.*` lets a
#   keyword match anywhere in the text.
# * One genre text may match several entries, so an artist can receive more
#   than one broad genre. Labels are produced in the order the entries are
#   listed here (dict insertion order).
# * Matching is by substring, so a keyword also hits longer words containing it.
genre_map = {
    r'.*gregorian.*|.*medieval.*|.*madrigal.*|.*renaissance.*': "early",
    # `.*romanticism.*` previously read `.*romanticism,*` (comma typo).
    r'.*classical.*|.*romanticism.*|.*streichquartett.*|.*orchestral.*|.*romantic era.*|.*compositional.*': "classical",
    r'.*country.*|.*bluegrass.*|.*sertanejo.*|.*americana.*': "country",
    r'.*dance.*|.*edm.*|.*disco.*|.*house.*|.*techno.*|.*rave.*|.*dubstep.*|.*hardstyle.*|.*rawstyle.*': "dance",
    r'.*electronic.*|.*wave.*|.*big beat.*|.*downtempo.*|.*nightcore.*|.*electro.*|.*bossbeat.*|.*ambeat.*|.*otacore.*': "electronic",
    r'.*folk.*|.*indiecoustica.*|.*americana.*': "folk",
    r'.*hip hop.*|.*hip-hop.*|.*lo-fi.*|.*chillhop.*': "hip hop",
    r'.*rap.*': "rap",
    # The duplicated `.*jazz.*` alternative was removed; matching is unchanged.
    r'.*jazz.*|.*stride.*|.*blues.*|.*adult standards.*': "jazz",
    r'.*metal.*': "metal",
    r'.*rock.*|.*indiecoustica.*|.*indie.*|.*americana.*|.*otacore.*': "rock",
    r'.*r&b.*|.*beats.*': "r&b",
    r'.*soul.*': "soul",
    r'.*soundtrack.*': "soundtrack",
    r'.*reggae.*': "reggae",
    r'.*pop.*|.*adult standards.*|.*indie.*|.*otacore.*|.*a cappella.*': "pop",
    r'.*world.*': "world",
}

In [15]:
# %%capture
# (uncomment the magic above to hide the long printout of this cell)
# Columns of the artist table are addressed by position: column 0 supplies the
# artist and column 2 the genre text that the patterns are matched against.
artist_column = largest_playlist_artists.iloc[:, 0]
genre_column = largest_playlist_artists.iloc[:, 2]

for artist, genre in zip(artist_column, genre_column):
    # Collect every broad genre whose pattern matches, in genre_map order and
    # ignoring case.
    broad_genres = [
        broad_genre
        for pattern, broad_genre in genre_map.items()
        if re.match(pattern, genre, re.IGNORECASE)
    ]
    print(artist, genre, broad_genres)

Aerosmith ['album rock', 'classic rock', 'hard rock', 'rock'] ['rock']
Heart ['album rock', 'classic rock', 'hard rock', 'heartland rock', 'mellow gold', 'new wave pop', 'rock', 'soft rock'] ['electronic', 'rock', 'pop']
Foo Fighters ['alternative metal', 'alternative rock', 'modern rock', 'permanent wave', 'post-grunge', 'rock'] ['electronic', 'metal', 'rock']
Bon Jovi ['glam metal', 'rock'] ['metal', 'rock']
ZZ Top ['album rock', 'blues rock', 'classic rock', 'country rock', 'hard rock', 'rock'] ['country', 'jazz', 'rock']
David Bowie ['art rock', 'classic rock', 'glam rock', 'permanent wave', 'rock'] ['electronic', 'rock']
The White Stripes ['alternative rock', 'blues rock', 'detroit rock', 'garage rock', 'modern blues rock', 'modern rock', 'permanent wave', 'punk blues', 'rock'] ['electronic', 'jazz', 'rock']
Fleetwood Mac ['album rock', 'classic rock', 'rock', 'soft rock', 'yacht rock'] ['rock']
AC/DC ['australian rock', 'hard rock', 'rock'] ['rock']
Def Leppard ['album rock', 'ha