In [2]:
import spotipy
import spotipy.util as util
import numpy as np
import pandas as pd
import os
import csv
import warnings
import re 

from sklearn.utils import shuffle
from itertools import chain
from collections import defaultdict

In [3]:
# Project root. Set the MUSIC_CLASSIFICATION_ROOT environment variable to run
# the notebook on another machine; the default is the original location.
absolute_path = os.environ.get(
    "MUSIC_CLASSIFICATION_ROOT", "f:\\AI Projects\\Music Classification"
)
# Built from components so the separator matches the host OS
# (on Windows this is still "data\\processed\\Spotify").
relative_path = os.path.join("data", "processed", "Spotify")
# Directory that every CSV/XLSX in this notebook is read from and written to.
full_path = os.path.join(absolute_path, relative_path)

In [3]:
# Spotify API credentials are read from the environment so that they never live
# in the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable
# names spotipy itself looks for.
# NOTE(review): a client id/secret pair used to be hard-coded here; treat that
# pair as leaked and rotate it in the Spotify developer dashboard.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")

In [4]:
# Build the Spotify client (`sp`) used by the artist lookup further down,
# authenticating with the application's client id and secret.
auth_manager = spotipy.oauth2.SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

Get all required data

In [5]:
# Load the two playlist exports and the saved artist list from the processed
# Spotify directory.
top3tracks_csv = os.path.join(full_path, "top3tracks.csv")
largest_playlist_csv = os.path.join(full_path, "largest_playlist_ever.csv")
largest_playlist_artists_csv = os.path.join(full_path, "largest_playlist_ever_artists.csv")

top3tracks = pd.read_csv(top3tracks_csv)
# The first CSV column is a saved index that duplicates the row index, so it is sliced off.
largest_playlist = pd.read_csv(largest_playlist_csv).iloc[:, 1:]
largest_playlist_artists = pd.read_csv(largest_playlist_artists_csv)

# The original fine-grained genres are discarded; broad genres are mapped in later.
largest_playlist = largest_playlist.drop(columns='genres')
top3tracks = top3tracks.drop(columns='genres')

Look up every playlist artist on Spotify and collect their details (name, URI, followers, popularity, genres)

In [None]:
# Build `artists_df`: one row per distinct Spotify artist found for the artists
# named in the two playlists.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. DataFrame.append (used previously) was deprecated in pandas 1.4 and
# removed in pandas 2.0, so no warning filter is needed any more.
columns = ["name", "uri", "followers", "popularity", "genres"]

largest_playlist_artists = largest_playlist.loc[:, "artist"].unique().tolist()
top3tracks_artists = top3tracks.loc[:, "artist"].unique().tolist()
# Combine artists from both lists while preserving the order and dropping any duplicates.
# Non-string entries (e.g. NaN for a missing artist) are skipped: they cannot be
# concatenated into a search query.
all_artists = [
    art
    for art in dict.fromkeys(largest_playlist_artists + top3tracks_artists)
    if isinstance(art, str)
]

artist_rows = []
seen_uris = set()  # URIs already recorded, for constant-time duplicate checks

for art in all_artists:
    print(art)
    results = sp.search(q='artist: ' + art, type='artist')
    items = results["artists"]["items"]
    if not items:
        # The search returned no artist at all; there is nothing to record.
        continue

    # Only the first (top-ranked) hit is used for each queried name.
    best_match = items[0]
    uri = best_match["uri"]
    if uri in seen_uris:
        # Different playlist spellings can resolve to the same Spotify artist.
        continue
    seen_uris.add(uri)

    artist_rows.append({
        "name": best_match["name"],
        "uri": uri,
        "followers": best_match["followers"]["total"],
        "popularity": best_match["popularity"],
        "genres": best_match["genres"],
    })

# Passing `columns` keeps the expected schema even when no artist was found.
artists_df = pd.DataFrame(artist_rows, columns=columns)


In [20]:
# Persist the collected artist table (without the row index) as both CSV and Excel.
full_artists_csv = os.path.join(full_path, "full_artists.csv")
full_artists_xlsx = os.path.join(full_path, "full_artists.xlsx")

artists_df.to_csv(full_artists_csv, index=False)
artists_df.to_excel(full_artists_xlsx, index=False)

Map each genre to a broad genre using pattern matching

In [6]:
# Maps a regular expression over an artist's genre text to a broad genre label.
#
# Every alternative is wrapped in `.*` because the patterns are applied with
# `re.match`, which anchors at the start of the string. A genre text may match
# several patterns and therefore receive several broad genres; some keywords
# (e.g. "americana", "indie", "otacore") are deliberately listed under more
# than one label. Dictionary order determines the order of the resulting labels.
genre_map = {
    r'.*gregorian.*|.*medieval.*|.*madrigal.*|.*renaissance.*': "early",
    # Fixed typo: `.*romanticism,*` -> `.*romanticism.*` (same matches under re.match).
    r'.*classical.*|.*romanticism.*|.*streichquartett.*|.*orchestral.*|.*romantic era.*|.*compositional.*': "classical",
    r'.*country.*|.*bluegrass.*|.*sertanejo.*|.*americana.*': "country",
    r'.*dance.*|.*edm.*|.*disco.*|.*house.*|.*techno.*|.*rave.*|.*dubstep.*|.*hardstyle.*|.*rawstyle.*': "dance",
    r'.*electronic.*|.*wave.*|.*big beat.*|.*downtempo.*|.*nightcore.*|.*electro.*|.*bossbeat.*|.*ambeat.*|.*otacore.*': "electronic",
    r'.*folk.*|.*indiecoustica.*|.*americana.*': "folk",
    r'.*hip hop.*|.*hip-hop.*|.*lo-fi.*|.*chillhop.*': "hip hop",
    r'.*rap.*': "rap",
    # The duplicated `.*jazz.*` alternative was removed.
    r'.*jazz.*|.*stride.*|.*blues.*|.*adult standards.*': "jazz",
    r'.*metal.*': "metal",
    r'.*rock.*|.*indiecoustica.*|.*indie.*|.*americana.*|.*otacore.*': "rock",
    r'.*r&b.*|.*beats.*': "r&b",
    r'.*soul.*': "soul",
    r'.*soundtrack.*': "soundtrack",
    r'.*reggae.*': "reggae",
    r'.*pop.*|.*adult standards.*|.*indie.*|.*otacore.*|.*a cappella.*': "pop",
    r'.*world.*': "world",
}

In [16]:
# Add a `broad_genres` column to the saved artist table and write the result
# to full_artists_mapped.csv.
artists_df = pd.read_csv(os.path.join(full_path,"full_artists.csv"))

# Compile each pattern once instead of looking it up again for every row.
_compiled_genre_map = [
    (re.compile(pattern, re.IGNORECASE), broad_genre)
    for pattern, broad_genre in genre_map.items()
]


def _match_broad_genres(genre_text):
    """Return the broad genres whose pattern matches `genre_text`.

    `genre_text` is the raw cell from the `genres` column, i.e. the text of
    the whole genre list. Labels are returned in `genre_map` order. A
    non-string cell (e.g. NaN for an empty CSV field) yields an empty list
    instead of raising.
    """
    if not isinstance(genre_text, str):
        return []
    return [
        broad_genre
        for compiled, broad_genre in _compiled_genre_map
        if compiled.match(genre_text)
    ]


# Stored as the string form of the list (e.g. "['rock', 'pop']") so that it
# survives the CSV round trip; an artist with no match gets "[]".
artists_df["broad_genres"] = artists_df["genres"].map(_match_broad_genres).map(str)

artists_df.to_csv(os.path.join(full_path,"full_artists_mapped.csv"),index=False)

Combine the artist and playlist data

In [13]:
# Reload the mapped artists and both playlists from disk so this cell does not
# depend on frames modified by earlier cells.
artists_df = pd.read_csv(os.path.join(full_path, "full_artists_mapped.csv"))
# top3tracks names its album column 'album_name'; rename it to 'album'.
top3tracks = pd.read_csv(os.path.join(full_path, "top3tracks.csv")).rename(
    columns={'album_name': 'album'}
)
# The first CSV column is dropped by position.
largest_playlist = pd.read_csv(os.path.join(full_path, "largest_playlist_ever.csv")).iloc[:, 1:]

# Stack the two playlists and discard the columns that are not carried forward.
playlist_tracks = pd.concat([largest_playlist, top3tracks]).drop(
    columns=["genres", "added_date", "explicit"]
)

# Attach artist details to every track; how='right' keeps tracks even when the
# artist has no entry in artists_df (their artist columns become NaN).
full_df = pd.merge(
    left=artists_df,
    right=playlist_tracks,
    left_on="name",
    right_on="artist",
    how='right',
)

# Cells matching "[]" (an empty genre list) are turned into NaN, then every row
# with a NaN in any column is removed and the index is renumbered.
# Note that the pattern is applied to all columns, not only broad_genres.
full_df = (
    full_df
    .replace(r'\[\]', float('NaN'), regex=True)
    .dropna()
    .reset_index(drop=True)
)

full_df.to_csv(os.path.join(full_path, "complete_dataframe_1.csv"), index=False)
full_df.to_excel(os.path.join(full_path, "complete_dataframe_1.xlsx"), index=False)

In [31]:
import ast
from collections import Counter

# Count how many tracks carry each broad genre in the combined dataset.
full_df = pd.read_csv(os.path.join(full_path,"complete_dataframe_1.csv"))

# Each cell holds the text form of a list, e.g. "['rock', 'pop']".
broad_genres_col = full_df.loc[:,"broad_genres"].tolist()

# Parse each cell on its own and tally the labels directly. The earlier
# str()/literal_eval round trip over the whole column only rebuilt the same
# list of strings, so it was removed.
broad_genres = Counter(
    genre
    for cell in broad_genres_col
    for genre in ast.literal_eval(cell)
)
broad_genres


Counter({'rock': 2108,
         'pop': 5794,
         'dance': 2793,
         'rap': 2083,
         'hip hop': 1870,
         'electronic': 1012,
         'reggae': 95,
         'soul': 302,
         'r&b': 411,
         'metal': 326,
         'classical': 518,
         'soundtrack': 48,
         'country': 199,
         'folk': 253,
         'jazz': 821,
         'early': 29,
         'world': 3})