In [19]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import normalize
from gensim.models import Word2Vec
from datasets import load_dataset
import hopsworks
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sentence_transformers import SentenceTransformer
import pickle

In [20]:
with open('../secrets/hopsworks_api_key.txt', 'r') as file:
    HOPSWORKS_API_KEY = file.readline().strip()

with open('../secrets/spotify_client_id.txt', 'r') as file:
    SPOTIFY_CLIENT_ID = file.readline().strip()

with open('../secrets/spotify_client_secret.txt', 'r') as file:
    SPOTIFY_CLIENT_SECRET = file.readline().strip()

In [21]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [22]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

2025-01-03 09:25:15,772 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-03 09:25:15,788 INFO: Initializing external client
2025-01-03 09:25:15,789 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-03 09:25:17,239 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [23]:
def get_embeddings(genres, artists, model):
    """
    Generate embeddings for genres and artists using a SentenceTransformer model.
    """
    # Combine genres and artists into a single list for embedding
    inputs = genres + artists
    
    # Generate embeddings
    embeddings = model.encode(inputs, show_progress_bar=True)
    
    # Split the embeddings back into genres and artists
    genre_embeddings = embeddings[:len(genres)]
    artist_embeddings = embeddings[len(genres):]
    
    return genre_embeddings, artist_embeddings

In [35]:
def generate_user_embedding(user_playlists, transformer_model, top_artist_count, playlists_count):
    print("Generating user embedding...")
    all_genres = []
    all_artists = []
    all_release_years = []
    playlist_features = []

    per_playlist_genre_embeddings = []  # Collect genre embeddings for each playlist

    for playlist in user_playlists[:playlists_count]:  # Limit to first playlists_count playlists
        print(f"Processing playlist: {playlist['name']}")
        playlist_name = playlist.get("name", "Unknown")
        playlist_id = playlist["id"]

        # Fetch tracks in the playlist
        tracks = sp.playlist_tracks(playlist_id)["items"]
        print(f"Number of tracks: {len(tracks)}")

        genres = []
        popularity = []
        release_years = []
        explicit_flags = []
        artist_names = []
        artist_ids = []

        # Collect all artist IDs for batch processing
        for item in tracks:
            track = item["track"]
            if not track or track["is_local"]:
                continue
            artist_ids.append(track["artists"][0]["id"])  # Only taking the first artist for simplicity
            release_date = track["album"]["release_date"]

            # Extract year from release date
            release_year = release_date.split('-')[0]
            release_years.append(int(release_year))

            popularity.append(track.get("popularity", 0))
            explicit_flags.append(track.get("explicit", False))

        # Batch the artist IDs for the Get Several Artists API call
        batch_size = 50
        artist_info = []
        for i in range(0, len(artist_ids), batch_size):
            batch = artist_ids[i:i + batch_size]
            response = sp.artists(batch)
            artist_info.extend(response["artists"])

        # Process artist information
        for artist in artist_info:
            artist_name = artist.get("name", "Unknown")
            track_genres = artist.get("genres", [])

            artist_names.append(artist_name)
            genres.extend(track_genres)

        # Generate per-playlist genre embedding
        if genres:
            genre_embeddings = transformer_model.encode(genres, show_progress_bar=False)
            playlist_genre_embedding = np.mean(genre_embeddings, axis=0)  # Average embedding for this playlist
        else:
            playlist_genre_embedding = np.zeros(384)

        per_playlist_genre_embeddings.append(playlist_genre_embedding)

        # Playlist-level features
        playlist_features.append({
            "playlist_name": playlist_name,
            "num_tracks": len(tracks),
            "avg_popularity": np.mean(popularity) if popularity else 0,
            "explicit_ratio": np.mean(explicit_flags) if explicit_flags else 0
        })

        all_genres.extend(genres)
        all_artists.extend(artist_names)
        all_release_years.extend(release_years)

    # Combine per-playlist genre embeddings using playlist sizes as weights
    if per_playlist_genre_embeddings:
        playlist_sizes = [p["num_tracks"] for p in playlist_features]
        playlist_weights = normalize(np.array(playlist_sizes).reshape(1, -1))[0]
        playlist_embedding = np.sum(
            [playlist_weights[i] * per_playlist_genre_embeddings[i] for i in range(len(per_playlist_genre_embeddings))],
            axis=0
        )
    else:
        playlist_embedding = np.zeros(384)

    # Generate overall artist and genre embeddings
    print("Generating contextual embeddings...")

    # Genre Embeddings
    genre_embeddings = transformer_model.encode(all_genres, show_progress_bar=False) if all_genres else np.zeros((1, 384))
    genre_embedding = np.mean(genre_embeddings, axis=0) if len(genre_embeddings) > 0 else np.zeros(384)

    # Artist Embeddings
    artist_counter = Counter(all_artists)
    top_artists = [artist for artist, _ in artist_counter.most_common(top_artist_count)]
    artist_embeddings = transformer_model.encode(top_artists, show_progress_bar=False) if top_artists else np.zeros((1, 384))
    artist_embedding = np.mean(artist_embeddings, axis=0) if len(artist_embeddings) > 0 else np.zeros(384)

    # Release year embedding
    release_year_embedding = np.array([np.mean(all_release_years)]) if all_release_years else np.zeros(1)

    # print("User embedding generated successfully!")
    # print("Genre embedding shape:", genre_embedding.shape)
    # print("Artist embedding shape:", artist_embedding.shape)
    # print("Playlist embedding shape:", playlist_embedding.shape)
    # print("Release year embedding shape:", release_year_embedding.shape)

    # Return individual embeddings
    return genre_embedding, artist_embedding, playlist_embedding, release_year_embedding


In [41]:
profiles_count = 50  # Number of profiles to process
top_artist_count = 5  # Number of top artists to embed
playlists_count = 5  # Number of playlists to consider per user

# Load dataset
dataset = load_dataset("erenfazlioglu/spotifyuserids")
print(f"Loaded dataset with {len(dataset['train'])} profiles")
rows = dataset["train"][:profiles_count]

Loaded dataset with 1710459 profiles


In [42]:
transformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # You can replace this with another model if needed

# Collect embeddings in a DataFrame
embeddings = []
for spotify_id in rows["spotify_id"]:
    print(f"Processing user: {spotify_id}")
    try:
        # Fetch user playlists
        playlists = sp.user_playlists(spotify_id)["items"]
        if not playlists:
            print(f"No playlists found for user {spotify_id}")
            continue

        # Generate individual embeddings
        genre_embedding, artist_embedding, playlist_embedding, release_year_embedding = generate_user_embedding(
            playlists, transformer_model, top_artist_count, playlists_count
        )

        # Append embeddings to list as a dictionary
        embeddings.append({
            "user_id": spotify_id,
            "genre_embedding": genre_embedding.tolist(),
            "artist_embedding": artist_embedding.tolist(),
            "playlist_embedding": playlist_embedding.tolist(),
            "release_year_embedding": release_year_embedding.tolist()
        })

    except Exception as e:
        print(f"Error processing user {spotify_id}: {e}")

# Create a DataFrame from the embeddings
print(f"Embeddings for {len(embeddings)} users:")
embeddings


2025-01-03 10:37:27,022 INFO: Use pytorch device_name: mps


2025-01-03 10:37:27,023 INFO: Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
Processing user: 31h7ml3xiavflj5n7d4av5u5xaie
Error processing user 31h7ml3xiavflj5n7d4av5u5xaie: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Processing user: 31jftrtdt3442xn74mb44j4l4lf4
Error processing user 31jftrtdt3442xn74mb44j4l4lf4: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Processing user: 31tgsl3dejcqihle3pv7o6eeng2a
Generating user embedding...
Processing playlist: Canciones Lindas

Number of tracks: 1
Processing playlist: Livii y Taylor Swift 🪩
Number of tracks: 18
Processing playlist: Mi playlist # 3
Number of tracks: 7
Processing playlist: Mi playlist # 2
Number of tracks: 10
Processing playlist: Mi playlist # 1
Number of tracks: 39
Generating contextual embeddings...
Processing user: 31imc4msmvetbl26gly5n55jbkka
Generating user embedding...
Processing playlist: De Dentro Pra Fora

Num

[{'user_id': '31tgsl3dejcqihle3pv7o6eeng2a',
  'genre_embedding': [-0.3863714933395386,
   -0.5621631145477295,
   0.10337795317173004,
   -0.7364705204963684,
   -0.07512132078409195,
   -0.18781925737857819,
   0.7260070443153381,
   -0.01263619028031826,
   0.07371838390827179,
   -0.09231123328208923,
   0.4327949285507202,
   0.042067840695381165,
   0.2121342420578003,
   -0.21269774436950684,
   0.7843590378761292,
   0.01441043708473444,
   -0.07273326069116592,
   0.023834368214011192,
   0.00814073160290718,
   -0.10195628553628922,
   0.08815795183181763,
   -0.4711640775203705,
   -0.06812021881341934,
   -0.25078532099723816,
   -0.5275488495826721,
   0.1519177407026291,
   -0.352340966463089,
   0.10231530666351318,
   -0.02990412339568138,
   0.03343770653009415,
   0.45600396394729614,
   0.5348583459854126,
   0.3585558533668518,
   -0.32004910707473755,
   -0.25237366557121277,
   0.31209924817085266,
   -0.3864402174949646,
   0.20191524922847748,
   -0.134623542428

In [43]:
df_embeddings = pd.DataFrame(embeddings)

print(f"Embeddings shape: {df_embeddings.shape}")
df_embeddings.head()

Embeddings shape: (27, 5)


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31tgsl3dejcqihle3pv7o6eeng2a,"[-0.3863714933395386, -0.5621631145477295, 0.1...","[0.3942939341068268, -0.33002883195877075, 0.3...","[-0.7064403295516968, -0.9494979381561279, 0.1...",[2019.9066666666668]
1,31imc4msmvetbl26gly5n55jbkka,"[0.1219930648803711, 0.1214483454823494, -0.27...","[0.28332215547561646, -0.15212738513946533, -0...","[0.1219930648803711, 0.1214483454823494, -0.27...",[2018.0]
2,31frxab22c2ez34gnfggtqqsnope,"[0.14112409949302673, 0.12898339331150055, -0....","[-0.010701656341552734, 0.32390376925468445, -...","[0.26713827252388, 0.21868111193180084, -0.577...",[2017.2272727272727]
3,31fg5ma4zjh37mcqzto3xt2sxc3a,"[-0.31446775794029236, -0.2531762421131134, 0....","[-0.29996415972709656, 0.5447441339492798, 0.0...","[-0.31446775794029236, -0.2531762421131134, 0....",[2019.0]
4,vvzx3nq79szk5qfmkznpm230n,"[-0.05180082842707634, -0.16555051505565643, 0...","[-0.1893274188041687, 0.4208906590938568, -0.4...","[-0.10432210564613342, -0.23293562233448029, 0...",[2021.1666666666667]


In [44]:
fs = project.get_feature_store()

In [45]:
feature_store = project.get_feature_store()
feature_group = feature_store.get_or_create_feature_group(
    name="spotify_user_embeddings",
    version=2,
    primary_key=["user_id"],
    description="Spotify user embeddings based on playlists"
)
feature_group.insert(df_embeddings)

Uploading Dataframe: 100.00% |██████████| Rows 27/27 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: spotify_user_embeddings_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1208515/jobs/named/spotify_user_embeddings_2_offline_fg_materialization/executions


(Job('spotify_user_embeddings_2_offline_fg_materialization', 'SPARK'), None)