In [1]:
# Import libraries
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Load your artist data (same as before)
def load_artists(path="artists.json"):
    """Read the artist records stored as JSON at ``path``.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON document, exactly as ``json.load`` produces it.
    """
    with open(path, mode='r', encoding='utf-8') as source:
        records = json.load(source)
    return records

# Records loaded from the default path; later cells read each record's
# 'id', 'name' and 'wikipedia_content' keys.
artists = load_artists()

In [2]:
# Pretrained sentence-embedding model used to vectorise the artist texts
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode every artist's Wikipedia text; row i of the result belongs to artists[i]
# NOTE(review): long articles may exceed the model's maximum input length and
# be truncated before encoding -- confirm against the model card.
texts = [record['wikipedia_content'] for record in artists]
embeddings_list = model.encode(texts, show_progress_bar=True)

embeddings_list.shape

Batches:   0%|          | 0/325 [00:00<?, ?it/s]

(10388, 768)

In [3]:
# Artist ID -> embedding; pairs each record with the row at the same position
# in embeddings_list (a repeated ID keeps the last occurrence)
embeddings = {
    artist['id']: vector
    for artist, vector in zip(artists, embeddings_list)
}

# Artist ID -> display name
id_to_name = {artist['id']: artist['name'] for artist in artists}

In [12]:
def recommend(seed_ids, all_ids, all_vecs, top_n=10):
    """Recommend the artists most similar to a set of seed artists.

    The seed embeddings are averaged into one query vector, which is scored
    against every row of ``all_vecs`` with cosine similarity.

    Note: seed vectors are looked up in the notebook-level ``embeddings`` dict
    and names in the notebook-level ``id_to_name`` dict, so both must be
    defined before this function is called.

    Args:
        seed_ids: Iterable of artist IDs forming the query. Every ID must be a
            key of ``embeddings``.
        all_ids: Candidate artist IDs; the i-th ID labels ``all_vecs[i]``.
        all_vecs: 2-D array of candidate embeddings, one row per candidate.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` ``(artist_id, name, score)`` tuples in
        descending order of similarity. Seed artists are never included.
        Empty when ``top_n`` is zero or negative.

    Raises:
        ValueError: If ``seed_ids`` is empty.
        KeyError: If a seed ID has no entry in ``embeddings``, or a returned
            candidate has no entry in ``id_to_name``.
    """
    # Materialise once so generators work and are not exhausted by the lookups
    seed_ids = list(seed_ids)
    if not seed_ids:
        # Averaging nothing yields NaN and an opaque shape error downstream
        raise ValueError("seed_ids must contain at least one artist ID")
    if top_n <= 0:
        # Previously the length check ran only after the first append, so
        # top_n=0 still returned one result
        return []

    # Calculate average seed embedding (keepdims -> shape (1, dim) for sklearn)
    seed_vecs = [embeddings[seed_id] for seed_id in seed_ids]
    avg_vec = np.mean(seed_vecs, axis=0, keepdims=True)

    # Compute cosine similarity between the query and every candidate
    sims = cosine_similarity(avg_vec, all_vecs)[0]

    # Sort by descending score; the sort is stable, so ties keep all_ids order
    ranked = sorted(zip(all_ids, sims), key=lambda pair: pair[1], reverse=True)

    # Set membership keeps seed exclusion O(1) per candidate
    excluded = set(seed_ids)
    results = []
    for artist_id, score in ranked:
        if artist_id in excluded:
            continue
        results.append((artist_id, id_to_name[artist_id], float(score)))
        if len(results) >= top_n:
            break
    return results

# Testing

In [29]:
# Candidate pool for recommend(): IDs and matrix rows share the dict's order,
# so all_ids[i] labels all_vecs[i]
all_ids = list(embeddings)
all_vecs = np.vstack(list(embeddings.values()))

In [21]:
# Test case: rappers as seed artists
test_seeds = [
    "2cca00c0-db1f-4630-b119-d937d1635024",   # Drake
    "bbb6c760-16e8-4c28-b3d6-e7b295a2cadc",   # Bad Bunny
    "c87f2137-16d8-4399-9e5f-77dec6102560",   # Metro Boomin
]

# Score every artist against the averaged seed embedding and print the ten
# highest-ranked non-seed artists with their cosine similarity
top10 = recommend(test_seeds, all_ids, all_vecs, top_n=10)
for art_id, name, score in top10:
    print(f"{name} ({art_id}): similarity {score:.3f}")

YoungBoy Never Broke Again (2f037e18-0b79-4c07-b3bf-c8079dd3a2c4): similarity 0.706
Hit-Boy (86e2af2c-acc1-4efd-88bd-761bd95f0ea9): similarity 0.701
Future (44006268-82b1-4352-9c97-43c9c5f2b0d0): similarity 0.680
Kanye West (1f2d636c-12a6-4fc9-9734-a47b1df0a28c): similarity 0.679
Big Sean (5e8056e9-239e-488f-a45e-0f3feefa29cd): similarity 0.677
Kid Ink (6cdbd8d2-9ed4-41bb-8e5a-580add8be98a): similarity 0.657
XXXTENTACION (5509b8c3-d952-4b58-9524-31e98e89c66a): similarity 0.651
NGHTMRE (fbbfd79b-7ce7-49d1-9e1c-c9888a3aba88): similarity 0.650
Young Thug (c848ae09-f65e-41e7-a9d8-00119f845432): similarity 0.648
2 Chainz (c9180fc4-6029-41e2-8bac-eb5d8175e156): similarity 0.646


In [22]:
# Test case: K-Pop acts as seed artists
test_seeds = [
    "31f28501-1b65-4b86-890e-65e125b26892",   # BlackPink
    "1b5d838d-3369-430a-92c2-3695fcbc838d",   # Lisa
    "0109d633-21d1-46aa-a762-117c2c633149",   # BTS
]

# Score every artist against the averaged seed embedding and print the ten
# highest-ranked non-seed artists with their cosine similarity
top10 = recommend(test_seeds, all_ids, all_vecs, top_n=10)
for art_id, name, score in top10:
    print(f"{name} ({art_id}): similarity {score:.3f}")

Brave Girls (1ae0129d-6575-45c7-9dc5-9822f4559519): similarity 0.665
BTOB (4a513167-872a-413a-88e6-0f610797c04d): similarity 0.650
KARA (ba012266-f87a-4ef3-9f82-2d1740d41db5): similarity 0.644
2NE1 (08a8e126-a86b-4ad2-b2f2-9e3c0cc1da3e): similarity 0.637
AOA (98719a8a-62d5-4f58-b246-056fb1024b09): similarity 0.635
Stellar (1cc26eec-5807-474a-9439-48b3afcc76fd): similarity 0.634
Girls' Generation (03128249-fdec-4441-a039-f70e4782a874): similarity 0.632
BIGBANG (875f4377-851b-4b49-b640-f072b6b280c5): similarity 0.628
(G)I-DLE (9644f780-3394-4faa-b6a8-d9de5dcb3044): similarity 0.624
LOONA (f639b0c4-2fa7-45c5-9872-fbdab0b4e9c7): similarity 0.621


In [28]:
# Test case: pop artists as seeds (larger seed set than the other cases)
test_seeds = [
    "1b72e4a5-5d2e-419a-beca-8a32b8e6f32c",   # Taylor Swift
    "0edd3ee3-6fa5-444d-ade4-791fb6d23e22",   # Billie Eilish
    "5bc0af0c-f5df-4aa0-80c0-d08fc190bbd1",   # Dua Lipa
    "564d7b65-b58a-4283-a9a7-4fe2b9f54b90",   # Tate McRae
    "01452137-0f9f-4809-9f57-130b628701d9",   # Beyoncé
    "73a18448-74fe-4e77-9d77-29393bc5597d",   # Ariana Grande
]

# Score every artist against the averaged seed embedding and print the ten
# highest-ranked non-seed artists with their cosine similarity
top10 = recommend(test_seeds, all_ids, all_vecs, top_n=10)
for art_id, name, score in top10:
    print(f"{name} ({art_id}): similarity {score:.3f}")

Avril Lavigne (e430f538-4fa7-4b6f-a8ea-c648276a0ddb): similarity 0.686
Meghan Trainor (1436c057-115a-405d-abec-4f8f3e4a5b62): similarity 0.683
Kelly Clarkson (c8672196-6d6c-4f87-a4a2-058902df0a72): similarity 0.683
Christina Aguilera (48f4fb07-b7d3-4d35-9cb5-2302d53213cc): similarity 0.682
Keyshia Cole (f7392184-9f79-42bb-8beb-6d17754c4dd2): similarity 0.678
Alanis Morissette (92ab30ba-e4c3-48ea-8f69-bb58750b1927): similarity 0.675
Michelle Williams (fcba9242-386c-46f2-916d-a77800348022): similarity 0.671
Mariah Carey (997dfdfd-850f-40c4-b2ca-8424c5c8ecfa): similarity 0.669
Rita Ora (b09bf343-e832-447b-8c27-84636a501c68): similarity 0.668
Lauren Daigle (bccdf16a-a174-4930-ba70-5d7ff25e3a06): similarity 0.662
