In [1]:
# Import libraries
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Load your artist data (same as before)
def load_artists(path="artists.json"):
    """Read the artist records from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file. Defaults to "artists.json", which
            is resolved relative to the current working directory.

    Returns:
        Whatever the JSON document decodes to. Later cells iterate it as a
        sequence of dicts and read the 'id', 'name' and 'wikipedia_content'
        keys of each entry.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        records = json.load(handle)
    return records

# Load the records once; later cells read each record's 'id', 'name' and
# 'wikipedia_content' keys.
artists = load_artists()

In [2]:
# Instantiate the sentence-embedding model used for every artist text
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Collect one Wikipedia text per artist, then encode them all in one call.
# The next cell pairs `artists` with the encoded rows by position.
texts = [record['wikipedia_content'] for record in artists]
embeddings_list = model.encode(texts, show_progress_bar=True)

# Cell output: shape of the encoded array
embeddings_list.shape

Batches:   0%|          | 0/325 [00:00<?, ?it/s]

(10388, 768)

In [3]:
# Artist ID -> embedding; `artists` and `embeddings_list` are paired by position
embeddings = dict(zip((record['id'] for record in artists), embeddings_list))

# Artist ID -> display name
id_to_name = {record['id']: record['name'] for record in artists}

In [4]:
def recommend(seed_ids, all_ids, all_vecs, top_n=10, normalize_vecs=False):
    """Rank candidate artists by cosine similarity to the mean seed embedding.

    Seed vectors are looked up in the notebook-level ``embeddings`` dict and
    display names in ``id_to_name``; both must be defined before calling.

    Args:
        seed_ids: Artist IDs whose embeddings are averaged into the query
            vector. Seeds are never returned as recommendations.
        all_ids: Candidate artist IDs, aligned row-for-row with ``all_vecs``.
        all_vecs: 2-D array of candidate embeddings, one row per entry of
            ``all_ids``.
        top_n: Maximum number of recommendations to return. Values <= 0
            yield an empty list.
        normalize_vecs: If True, pass the query and the candidate matrix
            through ``sklearn.preprocessing.normalize`` before scoring.
            NOTE(review): scikit-learn documents cosine similarity as a
            normalized dot product, so this flag is expected to change
            scores only by floating-point rounding -- confirm it is needed.

    Returns:
        A list of ``(artist_id, name, score)`` tuples ordered from most to
        least similar, containing at most ``top_n`` entries.

    Raises:
        ValueError: If ``seed_ids`` is empty.
        KeyError: If a seed ID is missing from ``embeddings`` or a returned
            ID is missing from ``id_to_name``.
    """
    seed_ids = list(seed_ids)
    if not seed_ids:
        # An empty mean would be NaN and fail later with an opaque message.
        raise ValueError("seed_ids must contain at least one artist ID")
    seed_set = set(seed_ids)  # O(1) membership checks while filtering below

    # Calculate average seed embedding (kept 2-D for the sklearn helpers)
    seed_vecs = [embeddings[seed_id] for seed_id in seed_ids]
    avg_vec = np.mean(seed_vecs, axis=0, keepdims=True)

    # Optionally normalize the vectors. The flag is tested here, not the
    # imported `normalize` function, which is always truthy.
    if normalize_vecs:
        avg_vec = normalize(avg_vec, axis=1)
        all_vecs = normalize(all_vecs, axis=1)

    # Compute cosine similarity between the query and every candidate
    sims = cosine_similarity(avg_vec, all_vecs)[0]

    # Sort by descending score and pick the first top_n non-seed artists
    ranked = sorted(zip(all_ids, sims), key=lambda pair: -pair[1])
    results = []
    for artist_id, score in ranked:
        # Checking before the append makes top_n <= 0 return nothing.
        if len(results) >= top_n:
            break
        if artist_id in seed_set:
            continue
        results.append((artist_id, id_to_name[artist_id], float(score)))
    return results

# Testing

In [5]:
# Candidate pool: every artist ID plus a matrix whose rows follow the same order
all_ids = list(embeddings)
all_vecs = np.vstack([embeddings[artist_key] for artist_key in all_ids])

test_seed_1 = [
    "1b72e4a5-5d2e-419a-beca-8a32b8e6f32c",
    "bbb6c760-16e8-4c28-b3d6-e7b295a2cadc",
]

print("Test One")
top10_1 = recommend(test_seed_1, all_ids, all_vecs, top_n=10)
for art_id, name, score in top10_1:
    print(f"{name} ({art_id}): similarity {score:.3f}")

Test One
Agus Padilla (8c60339b-1cf5-431d-a54b-3386e5948cf6): similarity 0.576
Snow Tha Product (b01522ca-6b5c-4692-8471-1dd8362b71e2): similarity 0.574
Rico Nasty (7a6e3e39-be7d-449b-a7af-8d4d6914c9a1): similarity 0.572
070 Shake (464abc49-32ec-4ab4-83d6-64bac9f4d735): similarity 0.569
Arca (c3cdf4ff-2ecf-4bd0-977d-9fac7f06ed61): similarity 0.568
Tiago PZK (168ce981-cbb3-4450-9dd7-2de4efebfc89): similarity 0.563
Britney Spears (4f02731a-2807-47f9-a604-584ca3192675): similarity 0.561
Preston Pablo (69b89325-cf09-4e8e-80f4-3b964cbe6573): similarity 0.558
Myriam Hernandez (ff98c316-c169-46f7-a5cc-9c7aea736e55): similarity 0.557
Erykah Badu (957ab85e-306c-4de1-a01f-3461048095f3): similarity 0.556


In [None]:
test_seed_2 = [
    "4948bf06-91fb-411b-ba43-b77183f29246",   # The Weeknd
    "2cca00c0-db1f-4630-b119-d937d1635024",   # Drake
    "bbb6c760-16e8-4c28-b3d6-e7b295a2cadc",   # Bad Bunny
    "c87f2137-16d8-4399-9e5f-77dec6102560",   # Metro Boomin
]
print("\nTest Two")
# The flag must be passed by keyword: a bare positional `True` after
# `top_n=10` is a SyntaxError (positional argument follows keyword argument).
top10_2 = recommend(test_seed_2, all_ids, all_vecs, top_n=10, normalize_vecs=True)
for art_id, name, score in top10_2:
    print(f"{name} ({art_id}): similarity {score:.3f}")
    