In [1]:
# %%capture
!pip install datasets transformers sentence-transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [22]:
# Import libraries
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Load your artist data (same as before)
def load_artists(path="filtered_artists.json"):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

artists = load_artists()

available_artist_ids = [artist['id'] for artist in artists]

In [4]:
# Load model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Convert text to embeddings
texts = [artist['wikipedia_content'] for artist in artists]
embeddings_list = model.encode(texts, show_progress_bar=True)

embeddings_list.shape

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/325 [00:00<?, ?it/s]

(10388, 768)

In [5]:
# Map artist ID to corresponding embedding
embeddings = {}
for artist, embedding in zip(artists, embeddings_list):
    artist_id = artist['id']
    embeddings[artist_id] = embedding

# Map artist ID to name
id_to_name = {}
for artist in artists:
    id_to_name[artist['id']] = artist['name']

In [16]:
from uuid import UUID
from dataclasses import dataclass


@dataclass
class SimilarArtist:
    id: UUID
    score: float


@dataclass
class RecommendedArtist:
    id: UUID
    score: float
    recsplains: Optional[List[SimilarArtist]] = None


def recommend(seed_ids, all_ids, all_vecs, top_n=10) -> List[RecommendedArtist]:

    # Calculate average seed embedding
    seed_vecs = [embeddings[seed_id] for seed_id in seed_ids]
    avg_vec   = np.mean(seed_vecs, axis=0, keepdims=True)

    # Compute cosine simulatity
    sims = cosine_similarity(avg_vec, all_vecs)[0]

    # Sort & Pick top_n artists
    ranked = sorted(zip(all_ids, sims), key=lambda x: -x[1])
    results = []
    for artist_id, score in ranked:
        if artist_id in seed_ids:
            continue
        results.append(RecommendedArtist(artist_id, float(score), None))
        if len(results) >= top_n:
            break
    return results

# Evaluation

In [15]:
import gzip

# Unzip Artist Data
with gzip.open('artist_genres.csv.gz', 'rb') as f, open('artist_genres.csv', 'wb') as fw:
    fw.write(f.read())

with gzip.open('artists.csv.gz', 'rb') as f, open('artists.csv', 'wb') as fw:
    fw.write(f.read())

In [33]:
from random import shuffle
from typing import Tuple, List, Optional

def pick_random_tags(all_tags, num_seed_tags=2, num_local_tags=8) -> Tuple[List[str], List[str]]:
    shuffle(all_tags)

    seed_tags = all_tags[0:num_seed_tags]
    local_tags = all_tags[0:num_local_tags]
    return seed_tags, local_tags


def calc_auc_score(rank_relevance):
    """
    usage : result = model.calc_auc_score([1,0,1,0,0,0,1,0,0,0,0,0])
    :param rank_relevance: list of 1s (relevant) and 0s (not relevant)
    :return: AUC score between 0 and 1. 0.5 is random. 1.0 is perfect (all relevant items at the top.)
    """
    num_true = sum(rank_relevance)
    num_false = len(rank_relevance) - num_true

    if num_true == 0 or num_false == 0:
        return -1

    tpr = 0
    total = 0
    for val in rank_relevance:
        if val:
            tpr += 1
        else:
            total += tpr

    auc = total / (num_true * num_false)
    return auc

In [63]:
from math import sqrt
import pandas as pd
import uuid


class EvalUtils:
    def __init__(self,
                 recommendMethod, df, genre_data,
                 use_random_tags=True, lower_popularity_bound=20,
                 higher_popularity_bound=80,
                 popularity_step_size=5,
                 n_trials=225,
                 tag_source='lastfm', aggregate=True,
                 min_score=0.5,
                 n_seeds_per_tag=10,
                 n_locals_per_tag=10,
                 n_seed_tags=2,
                 n_local_tags=8,
                 quiet=False,
                 tags_cleaned=None):
        """
        :param use_random_tags: whether to pick random tags or use hardcoded ones
        :param lower_popularity_bound:
        :param higher_popularity_bound:
        :param n_trials:
        :param tag_source: 'lastfm', 'spotify', or 'all'
        :param aggregate:
        """
        self.df = df
        if type(tag_source) is tuple:
            self.tag_source = tag_source
        else:
            if tag_source not in ['lastfm', 'spotify', 'all']:
                print("tag_source invalid. Falling back to 'lastfm, spotify'.")
                self.tag_source = 'lastfm, spotify'
            else:
                self.tag_source = (tag_source,)
        # properties used in other functions
        self.use_random_tags = use_random_tags
        self.low_popularity_bound = lower_popularity_bound
        self.high_popularity_bound = higher_popularity_bound
        self.popularity_step_size = popularity_step_size
        self.n_trials = n_trials
        self.aggregate = aggregate
        self.n_seeds_per_tag = n_seeds_per_tag
        self.n_locals_per_tag = n_locals_per_tag
        self.min_score = min_score
        self.quiet = quiet

        self.n_seed_tags = n_seed_tags
        self.n_local_tags = n_local_tags

        self.recommend = recommendMethod
        self.results = []

        self.df_ndx = self.df.reset_index().set_index('genre_name').sort_index()

        self.genre_data = genre_data

        if tags_cleaned is None:
            self.tags_cleaned = ['pop', 'rock', 'rap', 'hip hop', 'country', 'r&b', 'latin', 'folk', 'jazz',
                'metal', 'edm', 'soul', 'funk', 'reggae', 'disco', 'punk', 'classical',
                'house', 'techno', 'grunge', 'indie rock', 'alternative rock']
        else:
            self.tags_cleaned = tags_cleaned

    def run_eval(self):
        trial_datas = []
        for popularity_lower_bound in range(self.low_popularity_bound,
                                            self.high_popularity_bound,
                                            self.popularity_step_size):
            trial_datas.append(self._do_step(popularity_lower_bound))
        return trial_datas

    def _do_step(self, step_val):
        aucs = []
        n_successes = 0
        for x in range(self.n_trials):
            seed_tags, local_tags = pick_random_tags(self.tags_cleaned, self.n_seed_tags, self.n_local_tags)

            seed_dict, local_dict = self.get_artists_from_tags(seed_tags, local_tags, step_val)

            if len(seed_dict) != 0 and len(local_dict) != 0:
                auc_res = self._do_trial(seed_dict, local_dict, seed_tags)
                if auc_res != -1:
                    aucs.append(auc_res)
                    n_successes += 1
                    if not self.quiet:
                        print(
                            f'[{step_val}->{step_val + self.popularity_step_size}, {n_successes}] AUC: {auc_res}, Avg: {sum(aucs) / n_successes}')

        total = sum(aucs)
        avg = total / n_successes if n_successes > 0 else 0
        stdev = np.std(aucs)
        stderr = stdev / sqrt(n_successes)

        trial_data = {'low_pop': step_val, 'high_pop': f'{step_val + self.popularity_step_size}',
                      'total': total, 'count': n_successes, 'avg': avg, 'stdev': stdev,
                      'stderr': stderr}
        return trial_data

    def get_artists_from_tags(self, seed_tags, local_tags, popularity):
        """
        :param popularity: seed artist minimum popularity
        :param seed_tags: list of seed tags
        :param local_tags: list of local tags
        :return:
        """
        df = self.df

        # local artist match conditions
        genre_src_sel = df['data_source'].isin(set(self.tag_source))
        above_min_score = df['score'] >= self.min_score
        # in_local_tags = df['genre_name'].isin(local_tags)
        spotify_at_least_pop = df['spotify_popularity'] >= popularity
        spotify_at_most_pop = df['spotify_popularity'] <= popularity + self.popularity_step_size

        local_tags = self.df_ndx.loc[local_tags].reset_index()
        local_tags.set_index('index', inplace=True)

        selected_rows = local_tags.loc[genre_src_sel & above_min_score & spotify_at_least_pop & spotify_at_most_pop]

        sel_rows = selected_rows.groupby('genre_name', group_keys=False)

        sel_rows = sel_rows.sample(n=self.n_locals_per_tag, replace=True).reset_index(drop=True)

        local_artists = sel_rows.set_index('artist_id').to_dict()['genre_name']

        # seed artists
        seed_tags = self.df_ndx.loc[seed_tags].reset_index()
        seed_tags.set_index('index', inplace=True)

        seed_grouped = (
            seed_tags
                .loc[genre_src_sel & above_min_score]
                .sort_values(['genre_name','spotify_popularity'],
                            ascending=[True, False])
                .groupby('genre_name', group_keys=True)
                .head(self.n_seeds_per_tag)
            )
        seed_artists = seed_grouped.set_index('artist_id').to_dict()['genre_name']
        [seed_artists.pop(key, None) for key in local_artists]

        return seed_artists, local_artists

    # Process the data for a single trial.
    def _do_trial(self, seed_artist_dict, local_artist_dict, seed_tags):
        local_artists = list(local_artist_dict.keys())
        seed_artists = list(seed_artist_dict.keys())
        artist_vecs = np.vstack([embeddings[i] for i in local_artists] + [embeddings[i] for i in seed_artists])

        recs = self.recommend(seed_artists, local_artists, artist_vecs)

        sorted_recs = sorted(recs, key=lambda d: d.score, reverse=True)

        sorted_rec_ids = [x.id for x in sorted_recs]

        local_artist_uuids = [uuid.UUID(x) for x in local_artists]
        not_included_in_rec_ids = list(set(local_artist_uuids).difference(set(sorted_rec_ids)))
        shuffle(not_included_in_rec_ids)  # randomly break ties for artists that are not recommended
        sorted_rec_ids += not_included_in_rec_ids

        relevances = []
        for artist_id in sorted_rec_ids:
            ar_genre = local_artist_dict[str(artist_id)]
            # if ar_genre in seed_tags:
            if len(self.genre_data[str(artist_id)].intersection(set(seed_tags))) > 0:
                relevances.append(1)
            else:
                relevances.append(0)

        result = calc_auc_score(relevances)
        return result


def eval_model(pop_low, pop_high, step):
    df = pd.read_csv('artist_genres.csv')
    df = df[df['artist_id'].isin(available_artist_ids)]

    clean_tags = ['pop', 'rock', 'rap', 'hip hop', 'country', 'r&b', 'latin', 'folk', 'jazz',
                  'metal', 'edm', 'soul', 'funk', 'reggae', 'disco', 'punk', 'classical',
                  'house', 'techno', 'grunge', 'indie rock', 'alternative rock']

    clean_tags = [tag for tag in clean_tags if tag in set(df['genre_name'].unique())]

    with open('artist_genres.csv') as file:
        lines = file.read().split("\n")
        genre_data = {}
        for line in lines[1:]:
            if len(line) > 0:
                parts = line.split(',')
                artist_id = parts[0]
                genre_name = parts[2]
                if genre_name in clean_tags and artist_id in available_artist_ids:
                    if artist_id in genre_data:
                        genre_data[artist_id].add(genre_name)
                    else:
                        genre_data[artist_id] = set()
                        genre_data[artist_id].add(genre_name)

    eval_utils = EvalUtils(recommend, df, genre_data,
                           aggregate=True,
                           n_trials=200,
                           n_seed_tags=2,
                           n_local_tags=8,
                           popularity_step_size=step,
                           lower_popularity_bound=pop_low,
                           higher_popularity_bound=pop_high,
                           quiet=False,
                           tag_source=('spotify', 'bandsintown'),  # , 'lastfm', 'bandsintown', 'None'
                           tags_cleaned=clean_tags
                           )

    res = eval_utils.run_eval()
    return res


In [66]:
print(eval_model(45, 90, 5))
print(eval_model(45, 90, 55))
print(eval_model(75, 90, 15))

[45->50, 1] AUC: 0.577922077922078, Avg: 0.577922077922078
[45->50, 2] AUC: 0.48872180451127817, Avg: 0.5333219412166781
[45->50, 3] AUC: 0.4392156862745098, Avg: 0.5019531895692887
[45->50, 4] AUC: 0.21428571428571427, Avg: 0.43003632074839504
[45->50, 5] AUC: 0.626984126984127, Avg: 0.4694258819955414
[45->50, 6] AUC: 0.5837209302325581, Avg: 0.4884750567017109
[45->50, 7] AUC: 0.5885167464114832, Avg: 0.5027667266602498
[45->50, 8] AUC: 0.6488095238095238, Avg: 0.5210220763039091
[45->50, 9] AUC: 0.5352941176470588, Avg: 0.5226078586753702
[45->50, 10] AUC: 0.7043189368770764, Avg: 0.5407789664955407
[45->50, 11] AUC: 0.6550724637681159, Avg: 0.5511692844294113
[45->50, 12] AUC: 0.5125, Avg: 0.5479468440602936
[45->50, 13] AUC: 0.5212121212121212, Avg: 0.5458903269181266
[45->50, 14] AUC: 0.5188172043010753, Avg: 0.54395653244548
[45->50, 15] AUC: 0.6979591836734694, Avg: 0.5542233758606794
[45->50, 16] AUC: 0.4111111111111111, Avg: 0.5452788593138314
[45->50, 17] AUC: 0.22093023255

# Testing

In [55]:
#Initialize vectors
all_ids = list(embeddings.keys())
all_vecs = np.vstack([embeddings[i] for i in all_ids])

In [58]:
# Rappers
test_seeds = [
    "2cca00c0-db1f-4630-b119-d937d1635024",   #Drake
    "bbb6c760-16e8-4c28-b3d6-e7b295a2cadc",   #Bad Bunny
    "c87f2137-16d8-4399-9e5f-77dec6102560",   #Metro Boomin
]

top10 = recommend(test_seeds, all_ids, all_vecs, top_n=10)
for artist in top10:
    print(f"{id_to_name[artist.id]} ({artist.id}): similarity {artist.score:.3f}")

Big Sean (5e8056e9-239e-488f-a45e-0f3feefa29cd): similarity 0.683
Hit-Boy (86e2af2c-acc1-4efd-88bd-761bd95f0ea9): similarity 0.682
YoungBoy Never Broke Again (2f037e18-0b79-4c07-b3bf-c8079dd3a2c4): similarity 0.678
Kanye West (1f2d636c-12a6-4fc9-9734-a47b1df0a28c): similarity 0.675
Lil Wayne (71c66910-584b-4739-a464-2e9ec0fbf339): similarity 0.665
Future (44006268-82b1-4352-9c97-43c9c5f2b0d0): similarity 0.657
Diddy (bde4a79c-e2d7-48ad-ad0f-921f015bf2de): similarity 0.634
Tyga (05d86f03-2b3a-4d2c-8a2f-4110ef12dadb): similarity 0.633
XXXTENTACION (5509b8c3-d952-4b58-9524-31e98e89c66a): similarity 0.633
Young Thug (c848ae09-f65e-41e7-a9d8-00119f845432): similarity 0.633


In [59]:
# K-Pop
test_seeds = [
    "31f28501-1b65-4b86-890e-65e125b26892",   #BlackPink
    "1b5d838d-3369-430a-92c2-3695fcbc838d",   #Lisa
    "0109d633-21d1-46aa-a762-117c2c633149",   #BTS
]

top10 = recommend(test_seeds, all_ids, all_vecs, top_n=10)
for artist in top10:
    print(f"{id_to_name[artist.id]} ({artist.id}): similarity {artist.score:.3f}")

Brave Girls (1ae0129d-6575-45c7-9dc5-9822f4559519): similarity 0.665
BTOB (4a513167-872a-413a-88e6-0f610797c04d): similarity 0.650
KARA (ba012266-f87a-4ef3-9f82-2d1740d41db5): similarity 0.644
2NE1 (08a8e126-a86b-4ad2-b2f2-9e3c0cc1da3e): similarity 0.637
AOA (98719a8a-62d5-4f58-b246-056fb1024b09): similarity 0.635
Stellar (1cc26eec-5807-474a-9439-48b3afcc76fd): similarity 0.634
Girls' Generation (03128249-fdec-4441-a039-f70e4782a874): similarity 0.632
BIGBANG (875f4377-851b-4b49-b640-f072b6b280c5): similarity 0.628
(G)I-DLE (9644f780-3394-4faa-b6a8-d9de5dcb3044): similarity 0.624
LOONA (f639b0c4-2fa7-45c5-9872-fbdab0b4e9c7): similarity 0.621


In [62]:
# Pop
test_seeds = [
    "1b72e4a5-5d2e-419a-beca-8a32b8e6f32c",   #Taylor Swift
    "0edd3ee3-6fa5-444d-ade4-791fb6d23e22",   #Billie Eilish
    "5bc0af0c-f5df-4aa0-80c0-d08fc190bbd1",   #Dua Lipa
    "564d7b65-b58a-4283-a9a7-4fe2b9f54b90",   #Tate McRae
    "01452137-0f9f-4809-9f57-130b628701d9",   #Beyoncé
    "73a18448-74fe-4e77-9d77-29393bc5597d",   #Ariana Grande
]

top10 = recommend(test_seeds, all_ids, all_vecs, top_n=10)
for artist in top10:
    print(f"{id_to_name[artist.id]} ({artist.id}): similarity {artist.score:.3f}")

Avril Lavigne (e430f538-4fa7-4b6f-a8ea-c648276a0ddb): similarity 0.686
Meghan Trainor (1436c057-115a-405d-abec-4f8f3e4a5b62): similarity 0.683
Kelly Clarkson (c8672196-6d6c-4f87-a4a2-058902df0a72): similarity 0.683
Christina Aguilera (48f4fb07-b7d3-4d35-9cb5-2302d53213cc): similarity 0.682
Keyshia Cole (f7392184-9f79-42bb-8beb-6d17754c4dd2): similarity 0.678
Alanis Morissette (92ab30ba-e4c3-48ea-8f69-bb58750b1927): similarity 0.675
Michelle Williams (fcba9242-386c-46f2-916d-a77800348022): similarity 0.671
Mariah Carey (997dfdfd-850f-40c4-b2ca-8424c5c8ecfa): similarity 0.669
Rita Ora (b09bf343-e832-447b-8c27-84636a501c68): similarity 0.668
Lauren Daigle (bccdf16a-a174-4930-ba70-5d7ff25e3a06): similarity 0.662
