In [1]:
# %%capture
!pip install datasets transformers sentence-transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [3]:
# Import libraries
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Load your artist data (same as before)
def load_artists(path="filtered_artists.json"):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

artists = load_artists()

available_artist_wiki_ids = [artist['id'] for artist in artists]

In [4]:
import gzip

# Unzip Artist Genre Data
with gzip.open('artist_genres.csv.gz', 'rb') as f, open('artist_genres.csv', 'wb') as fw:
    fw.write(f.read())

# Introduction

This experiment will explore the merits of using the cosine similarity between sentence embeddings as a metric for recommending artists.

First, we have to load the models. We will initialize one for wikipedia descriptions and one for artist names.

In [5]:
# Load model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Convert text to embeddings
texts = [artist['wikipedia_content'] for artist in artists]
embeddings_list = model.encode(texts, show_progress_bar=True)

embeddings_list.shape

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/672 [00:00<?, ?it/s]

(21473, 768)

In [7]:
import pandas as pd

genre_df = pd.read_csv('artist_genres.csv')
names = genre_df['artist_name']
pairs = list(genre_df[['artist_id', 'artist_name']].itertuples(index=False, name=None))

name_embeddings_list = model.encode([pair[1] for pair in pairs], show_progress_bar=True)
name_embeddings_list.shape

available_artist_name_ids = [pair[0] for pair in pairs]

Batches:   0%|          | 0/10285 [00:00<?, ?it/s]

In [8]:
# Map artist ID to corresponding Wiki embedding
embeddings = {}
for artist, embedding in zip(artists, embeddings_list):
    artist_id = artist['id']
    embeddings[artist_id] = embedding

# Map artist ID to corresponding Name embedding
name_embeddings = {}
for pair, name_embedding in zip(pairs, name_embeddings_list):
    artist_id = pair[0]
    name_embeddings[artist_id] = name_embedding

# Map artist ID to name
id_to_name = {}
for artist in artists:
    id_to_name[artist['id']] = artist['name']

In [13]:
from uuid import UUID
from dataclasses import dataclass
from typing import Optional, List


@dataclass
class SimilarArtist:
    id: UUID
    score: float


@dataclass
class RecommendedArtist:
    id: UUID
    score: float
    recsplains: Optional[List[SimilarArtist]] = None


class EmbeddingRecommender:

    def __init__(self, _embeddings):
        self.embeddings = _embeddings

    def recommend(self, seed_ids, all_ids, top_n=10) -> List[RecommendedArtist]:
        # Calculate average seed embedding
        all_vecs = np.vstack([self.embeddings[i] for i in all_ids])

        seed_vecs = [self.embeddings[seed_id] for seed_id in seed_ids]
        avg_vec   = np.mean(seed_vecs, axis=0, keepdims=True)

        # Compute cosine simulatity
        sims = cosine_similarity(avg_vec, all_vecs)[0]

        # Sort & Pick top_n artists
        ranked = sorted(zip(all_ids, sims), key=lambda x: -x[1])
        results = []
        for artist_id, score in ranked:
            if artist_id in seed_ids:
                continue
            results.append(RecommendedArtist(artist_id, float(score), None))
            if len(results) >= top_n:
                break
        return results

wiki_recommender = EmbeddingRecommender(embeddings)
name_recommender = EmbeddingRecommender(name_embeddings)

# Evaluation

For evaluation, we will use an AUC metric that operates over a list of relevances scores (0 or 1, depending on whether or not an artist has genres overlapping with the seeds for that recommendation).
The goal is to rank all of the relevant artists at the front of the list, and all of the irrelevant artists at the back.

# Evaluation

In [10]:
from random import shuffle
from typing import Tuple

def pick_random_tags(all_tags, num_seed_tags=2, num_local_tags=8) -> Tuple[List[str], List[str]]:
    shuffle(all_tags)

    seed_tags = all_tags[0:num_seed_tags]
    local_tags = all_tags[0:num_local_tags]
    return seed_tags, local_tags


def calc_auc_score(rank_relevance):
    """
    usage : result = model.calc_auc_score([1,0,1,0,0,0,1,0,0,0,0,0])
    :param rank_relevance: list of 1s (relevant) and 0s (not relevant)
    :return: AUC score between 0 and 1. 0.5 is random. 1.0 is perfect (all relevant items at the top.)
    """
    num_true = sum(rank_relevance)
    num_false = len(rank_relevance) - num_true

    if num_true == 0 or num_false == 0:
        return -1

    tpr = 0
    total = 0
    for val in rank_relevance:
        if val:
            tpr += 1
        else:
            total += tpr

    auc = total / (num_true * num_false)
    return auc

In [11]:
from math import sqrt
import uuid


class EvalUtils:
    def __init__(self,
                 model, df, genre_data,
                 use_random_tags=True, lower_popularity_bound=20,
                 higher_popularity_bound=80,
                 popularity_step_size=5,
                 n_trials=225,
                 tag_source='lastfm', aggregate=True,
                 min_score=0.5,
                 n_seeds_per_tag=10,
                 n_locals_per_tag=10,
                 n_seed_tags=2,
                 n_local_tags=8,
                 quiet=False,
                 tags_cleaned=None):
        """
        :param use_random_tags: whether to pick random tags or use hardcoded ones
        :param lower_popularity_bound:
        :param higher_popularity_bound:
        :param n_trials:
        :param tag_source: 'lastfm', 'spotify', or 'all'
        :param aggregate:
        """
        self.df = df
        if type(tag_source) is tuple:
            self.tag_source = tag_source
        else:
            if tag_source not in ['lastfm', 'spotify', 'all']:
                print("tag_source invalid. Falling back to 'lastfm, spotify'.")
                self.tag_source = 'lastfm, spotify'
            else:
                self.tag_source = (tag_source,)
        # properties used in other functions
        self.use_random_tags = use_random_tags
        self.low_popularity_bound = lower_popularity_bound
        self.high_popularity_bound = higher_popularity_bound
        self.popularity_step_size = popularity_step_size
        self.n_trials = n_trials
        self.aggregate = aggregate
        self.n_seeds_per_tag = n_seeds_per_tag
        self.n_locals_per_tag = n_locals_per_tag
        self.min_score = min_score
        self.quiet = quiet

        self.n_seed_tags = n_seed_tags
        self.n_local_tags = n_local_tags

        self.model = model
        self.results = []

        self.df_ndx = self.df.reset_index().set_index('genre_name').sort_index()

        self.genre_data = genre_data

        if tags_cleaned is None:
            self.tags_cleaned = ['pop', 'rock', 'rap', 'hip hop', 'country', 'r&b', 'latin', 'folk', 'jazz',
                'metal', 'edm', 'soul', 'funk', 'reggae', 'disco', 'punk', 'classical',
                'house', 'techno', 'grunge', 'indie rock', 'alternative rock']
        else:
            self.tags_cleaned = tags_cleaned

    def run_eval(self):
        trial_datas = []
        for popularity_lower_bound in range(self.low_popularity_bound,
                                            self.high_popularity_bound,
                                            self.popularity_step_size):
            trial_datas.append(self._do_step(popularity_lower_bound))
        return trial_datas

    def _do_step(self, step_val):
        aucs = []
        n_successes = 0
        for x in range(self.n_trials):
            seed_tags, local_tags = pick_random_tags(self.tags_cleaned, self.n_seed_tags, self.n_local_tags)

            seed_dict, local_dict = self.get_artists_from_tags(seed_tags, local_tags, step_val)

            if len(seed_dict) != 0 and len(local_dict) != 0:
                auc_res = self._do_trial(seed_dict, local_dict, seed_tags)
                if auc_res != -1:
                    aucs.append(auc_res)
                    n_successes += 1
                    if not self.quiet:
                        print(
                            f'[{step_val}->{step_val + self.popularity_step_size}, {n_successes}] AUC: {auc_res}, Avg: {sum(aucs) / n_successes}')

        total = sum(aucs)
        avg = total / n_successes if n_successes > 0 else 0
        stdev = np.std(aucs)
        stderr = stdev / sqrt(n_successes)

        trial_data = {'low_pop': step_val, 'high_pop': f'{step_val + self.popularity_step_size}',
                      'total': total, 'count': n_successes, 'avg': avg, 'stdev': stdev,
                      'stderr': stderr}
        return trial_data

    def get_artists_from_tags(self, seed_tags, local_tags, popularity):
        """
        :param popularity: seed artist minimum popularity
        :param seed_tags: list of seed tags
        :param local_tags: list of local tags
        :return:
        """
        df = self.df

        # local artist match conditions
        genre_src_sel = df['data_source'].isin(set(self.tag_source))
        above_min_score = df['score'] >= self.min_score
        # in_local_tags = df['genre_name'].isin(local_tags)
        spotify_at_least_pop = df['spotify_popularity'] >= popularity
        spotify_at_most_pop = df['spotify_popularity'] <= popularity + self.popularity_step_size

        local_tags = self.df_ndx.loc[local_tags].reset_index()
        local_tags.set_index('index', inplace=True)

        selected_rows = local_tags.loc[genre_src_sel & above_min_score & spotify_at_least_pop & spotify_at_most_pop]

        sel_rows = selected_rows.groupby('genre_name', group_keys=False)

        sel_rows = sel_rows.sample(n=self.n_locals_per_tag, replace=True).reset_index(drop=True)

        local_artists = sel_rows.set_index('artist_id').to_dict()['genre_name']

        # seed artists
        seed_tags = self.df_ndx.loc[seed_tags].reset_index()
        seed_tags.set_index('index', inplace=True)

        seed_grouped = (
            seed_tags
                .loc[genre_src_sel & above_min_score]
                .sort_values(['genre_name','spotify_popularity'],
                            ascending=[True, False])
                .groupby('genre_name', group_keys=True)
                .head(self.n_seeds_per_tag)
            )
        seed_artists = seed_grouped.set_index('artist_id').to_dict()['genre_name']
        [seed_artists.pop(key, None) for key in local_artists]

        return seed_artists, local_artists

    # Process the data for a single trial.
    def _do_trial(self, seed_artist_dict, local_artist_dict, seed_tags):
        local_artists = list(local_artist_dict.keys())
        seed_artists = list(seed_artist_dict.keys())

        recs = self.model.recommend(seed_artists, local_artists + seed_artists)

        sorted_recs = sorted(recs, key=lambda d: d.score, reverse=True)

        sorted_rec_ids = [x.id for x in sorted_recs]

        local_artist_uuids = [uuid.UUID(x) for x in local_artists]
        not_included_in_rec_ids = list(set(local_artist_uuids).difference(set(sorted_rec_ids)))
        shuffle(not_included_in_rec_ids)  # randomly break ties for artists that are not recommended
        sorted_rec_ids += not_included_in_rec_ids

        relevances = []
        for artist_id in sorted_rec_ids:
            ar_genre = local_artist_dict[str(artist_id)]

            if len(self.genre_data[str(artist_id)].intersection(set(seed_tags))) > 0:
                relevances.append(1)
            else:
                relevances.append(0)

        result = calc_auc_score(relevances)
        return result


def eval_model(pop_low, pop_high, step, model, valid_artist_ids):
    df = pd.read_csv('artist_genres.csv')
    df = df[df['artist_id'].isin(valid_artist_ids)]

    clean_tags = ['pop', 'rock', 'rap', 'hip hop', 'country', 'r&b', 'latin', 'folk', 'jazz',
                  'metal', 'edm', 'soul', 'funk', 'reggae', 'disco', 'punk', 'classical',
                  'house', 'techno', 'grunge', 'indie rock', 'alternative rock']

    clean_tags = [tag for tag in clean_tags if tag in set(df['genre_name'].unique())]

    with open('artist_genres.csv') as file:
        lines = file.read().split("\n")
        genre_data = {}
        for line in lines[1:]:
            if len(line) > 0:
                parts = line.split(',')
                artist_id = parts[0]
                genre_name = parts[2]
                if genre_name in clean_tags and artist_id in valid_artist_ids:
                    if artist_id in genre_data:
                        genre_data[artist_id].add(genre_name)
                    else:
                        genre_data[artist_id] = set()
                        genre_data[artist_id].add(genre_name)

    eval_utils = EvalUtils(model, df, genre_data,
                           aggregate=True,
                           n_trials=200,
                           n_seed_tags=2,
                           n_local_tags=8,
                           popularity_step_size=step,
                           lower_popularity_bound=pop_low,
                           higher_popularity_bound=pop_high,
                           quiet=False,
                           tag_source=('spotify', 'bandsintown'),  # , 'lastfm', 'bandsintown', 'None'
                           tags_cleaned=clean_tags
                           )

    res = eval_utils.run_eval()
    return res


In [14]:
print(eval_model(5, 90, 5, wiki_recommender, available_artist_wiki_ids))
print(eval_model(45, 90, 55, wiki_recommender, available_artist_wiki_ids))
print(eval_model(75, 90, 15, wiki_recommender, available_artist_wiki_ids))

[5->10, 1] AUC: 0.5, Avg: 0.5
[5->10, 2] AUC: 0.625, Avg: 0.5625
[5->10, 3] AUC: 0.625, Avg: 0.5833333333333334
[5->10, 4] AUC: 0.375, Avg: 0.53125
[5->10, 5] AUC: 0.6875, Avg: 0.5625
[5->10, 6] AUC: 0.5, Avg: 0.5520833333333334
[5->10, 7] AUC: 0.375, Avg: 0.5267857142857143
[5->10, 8] AUC: 0.25, Avg: 0.4921875
[5->10, 9] AUC: 0.5, Avg: 0.4930555555555556
[5->10, 10] AUC: 0.5, Avg: 0.49375
[5->10, 11] AUC: 0.5833333333333334, Avg: 0.5018939393939393
[5->10, 12] AUC: 0.75, Avg: 0.5225694444444444
[5->10, 13] AUC: 0.4166666666666667, Avg: 0.5144230769230769
[5->10, 14] AUC: 0.75, Avg: 0.53125
[5->10, 15] AUC: 0.625, Avg: 0.5375
[5->10, 16] AUC: 0.375, Avg: 0.52734375
[5->10, 17] AUC: 0.625, Avg: 0.5330882352941176
[5->10, 18] AUC: 0.4166666666666667, Avg: 0.5266203703703703
[5->10, 19] AUC: 0.5, Avg: 0.525219298245614
[5->10, 20] AUC: 0.25, Avg: 0.5114583333333333
[5->10, 21] AUC: 0.5833333333333334, Avg: 0.5148809523809523
[5->10, 22] AUC: 0.5, Avg: 0.5142045454545454
[5->10, 23] AUC: 0

With wikipedia descriptions, the model achieves around 58% accuracy on average. This varies somewhat depending on the popularity band. The more popular artists clearly do better in the recommendations.

In [15]:
print(eval_model(5, 90, 5, name_recommender, available_artist_name_ids))
print(eval_model(45, 90, 55, name_recommender, available_artist_name_ids))
print(eval_model(75, 90, 15, name_recommender, available_artist_name_ids))

[5->10, 1] AUC: 0.46111111111111114, Avg: 0.46111111111111114
[5->10, 2] AUC: 0.45542635658914726, Avg: 0.45826873385012923
[5->10, 3] AUC: 0.5295429208472687, Avg: 0.4820267961825091
[5->10, 4] AUC: 0.4270833333333333, Avg: 0.46829093047021514
[5->10, 5] AUC: 0.26157407407407407, Avg: 0.4269475591909869
[5->10, 6] AUC: 0.6925531914893617, Avg: 0.47121516457404944
[5->10, 7] AUC: 0.5487012987012987, Avg: 0.4822846123065136
[5->10, 8] AUC: 0.372, Avg: 0.4684990357681994
[5->10, 9] AUC: 0.4392156862745098, Avg: 0.4652453302689006
[5->10, 10] AUC: 0.5026737967914439, Avg: 0.4689881769211549
[5->10, 11] AUC: 0.4180602006688963, Avg: 0.4643583608982223
[5->10, 12] AUC: 0.5171171171171172, Avg: 0.4687549239164636
[5->10, 13] AUC: 0.5778388278388278, Avg: 0.47714599344895314
[5->10, 14] AUC: 0.5716666666666667, Avg: 0.4838974701073612
[5->10, 15] AUC: 0.4686046511627907, Avg: 0.48287794884438984
[5->10, 16] AUC: 0.5693333333333334, Avg: 0.4882814103749488
[5->10, 17] AUC: 0.6043478260869565, 

With artist names, the model only gets around 52%-53% accuracy.

# Qualitative Analysis

In [18]:
#Initialize vectors
all_ids = list(embeddings.keys())

In [19]:
# Rappers
test_seeds = [
    "2cca00c0-db1f-4630-b119-d937d1635024",   #Drake
    "bbb6c760-16e8-4c28-b3d6-e7b295a2cadc",   #Bad Bunny
    "c87f2137-16d8-4399-9e5f-77dec6102560",   #Metro Boomin
]

top10 = wiki_recommender.recommend(test_seeds, all_ids, top_n=10)
for artist in top10:
    print(f"{id_to_name[artist.id]} ({artist.id}): similarity {artist.score:.3f}")

Big Sean (5e8056e9-239e-488f-a45e-0f3feefa29cd): similarity 0.683
Hit-Boy (86e2af2c-acc1-4efd-88bd-761bd95f0ea9): similarity 0.682
YoungBoy Never Broke Again (2f037e18-0b79-4c07-b3bf-c8079dd3a2c4): similarity 0.678
Kanye West (1f2d636c-12a6-4fc9-9734-a47b1df0a28c): similarity 0.675
Lil Wayne (71c66910-584b-4739-a464-2e9ec0fbf339): similarity 0.665
Future (44006268-82b1-4352-9c97-43c9c5f2b0d0): similarity 0.657
T-Wayne (f5275ba3-5d5c-4524-8065-8487cf4be099): similarity 0.656
Diddy (bde4a79c-e2d7-48ad-ad0f-921f015bf2de): similarity 0.634
Tyga (05d86f03-2b3a-4d2c-8a2f-4110ef12dadb): similarity 0.633
XXXTENTACION (5509b8c3-d952-4b58-9524-31e98e89c66a): similarity 0.633


In [20]:
# K-Pop
test_seeds = [
    "31f28501-1b65-4b86-890e-65e125b26892",   #BlackPink
    "1b5d838d-3369-430a-92c2-3695fcbc838d",   #Lisa
    "0109d633-21d1-46aa-a762-117c2c633149",   #BTS
]

top10 = wiki_recommender.recommend(test_seeds, all_ids, top_n=10)
for artist in top10:
    print(f"{id_to_name[artist.id]} ({artist.id}): similarity {artist.score:.3f}")

Brave Girls (1ae0129d-6575-45c7-9dc5-9822f4559519): similarity 0.665
B.A.P (2aa54be9-664b-49e8-9d75-f5131cee3538): similarity 0.658
BTOB (4a513167-872a-413a-88e6-0f610797c04d): similarity 0.650
BgA (a321382d-40f9-4917-b559-f3ab11a15b3d): similarity 0.650
KARA (ba012266-f87a-4ef3-9f82-2d1740d41db5): similarity 0.644
2NE1 (08a8e126-a86b-4ad2-b2f2-9e3c0cc1da3e): similarity 0.637
AOA (98719a8a-62d5-4f58-b246-056fb1024b09): similarity 0.635
Stellar (1cc26eec-5807-474a-9439-48b3afcc76fd): similarity 0.634
Girls' Generation (03128249-fdec-4441-a039-f70e4782a874): similarity 0.632
BIGBANG (875f4377-851b-4b49-b640-f072b6b280c5): similarity 0.628


In [21]:
# Pop
test_seeds = [
    "1b72e4a5-5d2e-419a-beca-8a32b8e6f32c",   #Taylor Swift
    "0edd3ee3-6fa5-444d-ade4-791fb6d23e22",   #Billie Eilish
    "5bc0af0c-f5df-4aa0-80c0-d08fc190bbd1",   #Dua Lipa
    "564d7b65-b58a-4283-a9a7-4fe2b9f54b90",   #Tate McRae
    "01452137-0f9f-4809-9f57-130b628701d9",   #Beyoncé
    "73a18448-74fe-4e77-9d77-29393bc5597d",   #Ariana Grande
]

top10 = wiki_recommender.recommend(test_seeds, all_ids, top_n=10)
for artist in top10:
    print(f"{id_to_name[artist.id]} ({artist.id}): similarity {artist.score:.3f}")

Avril Lavigne (e430f538-4fa7-4b6f-a8ea-c648276a0ddb): similarity 0.686
Meghan Trainor (1436c057-115a-405d-abec-4f8f3e4a5b62): similarity 0.683
Kelly Clarkson (c8672196-6d6c-4f87-a4a2-058902df0a72): similarity 0.683
Christina Aguilera (48f4fb07-b7d3-4d35-9cb5-2302d53213cc): similarity 0.682
Keyshia Cole (f7392184-9f79-42bb-8beb-6d17754c4dd2): similarity 0.678
Alanis Morissette (92ab30ba-e4c3-48ea-8f69-bb58750b1927): similarity 0.675
Michelle Williams (fcba9242-386c-46f2-916d-a77800348022): similarity 0.671
Mariah Carey (997dfdfd-850f-40c4-b2ca-8424c5c8ecfa): similarity 0.669
Rita Ora (b09bf343-e832-447b-8c27-84636a501c68): similarity 0.668
Lauren Daigle (bccdf16a-a174-4930-ba70-5d7ff25e3a06): similarity 0.662


These various qualitative analyses show that the model does have some concept of artist similarity, since it appears that when you use (eg.) famous pop artists as seeds, you get recommended famous pop artists. However, the quantitative analysis is much less promising.