In [None]:
import numpy as np
import tensorflow as tf
import hopsworks
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pickle
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import time

In [6]:
# Load credentials
with open('../secrets/hopsworks_api_key.txt', 'r') as file:
    HOPSWORKS_API_KEY = file.readline().strip()

with open('../secrets/spotify_client_id.txt', 'r') as file:
    SPOTIFY_CLIENT_ID = file.readline().strip()

with open('../secrets/spotify_client_secret.txt', 'r') as file:
    SPOTIFY_CLIENT_SECRET = file.readline().strip()

In [7]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Log spotify Bearer Token
print(client_credentials_manager.get_access_token())



{'access_token': 'BQCeJy9BVHYGhGmN9rPbTWnG9sbCeUDROx9-5zxx7gf76F24tCSFdNFQVnKRqC3WnuiZvp_YQ59I1tLfnPBLS9PR6AKindsqHLph1pxP8yAY5uviMJU', 'token_type': 'Bearer', 'expires_in': 3600, 'expires_at': 1735892213}


In [8]:
# Connect to the project and feature store
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

2025-01-03 08:22:52,366 INFO: Initializing external client
2025-01-03 08:22:52,367 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-03 08:22:53,787 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [9]:
# Get the model registry
mr = project.get_model_registry()

# Retrieve the genre embedding model
genre_model_registry = mr.get_model("genre_embedding_model", version=1)
genre_model_file_path = genre_model_registry.download()
with open(genre_model_file_path + '/genre_embedding_model.pkl', "rb") as f:
    genre_embedding_model = pickle.load(f)

# Retrieve the artist embedding model
artist_model_registry = mr.get_model("artist_embedding_model", version=1)
artist_model_file_path = artist_model_registry.download()
with open(artist_model_file_path + '/artist_embedding_model.pkl', "rb") as f:
    artist_embedding_model = pickle.load(f)

# Retrieve the Keras model from the model registry
model_registry = mr.get_model("two_tower_recommender", version=1)
model_file_path = model_registry.download()
model = tf.keras.models.load_model(model_file_path + '/two_tower_model.keras')

print("Models loaded successfully!")

Models loaded successfully!(0 dirs, 1 files)... DONE


In [None]:
def generate_user_embedding(user_playlists, genre_embedding_model, artist_embedding_model, top_artist_count, playlists_count):
    print("Generating user embedding...")
    all_genres = []
    all_artists = []
    all_release_years = []
    playlist_features = []

    for playlist in user_playlists[:playlists_count]:  # Limit to first playlists_count playlists
        print(f"Processing playlist: {playlist['name']}")
        playlist_name = playlist.get("name", "Unknown")
        playlist_id = playlist["id"]

        # Fetch tracks in the playlist
        tracks = sp.playlist_tracks(playlist_id)["items"]

        print(f"Number of tracks: {len(tracks)}")

        genres = []
        popularity = []
        release_years = []
        explicit_flags = []
        artist_names = []
        artist_ids = []

        # Collect all artist IDs for batch processing
        for item in tracks:
            track = item["track"]
            if not track or track["is_local"]:
                continue
            artist_ids.append(track["artists"][0]["id"])  # Only taking the first artist for simplicity
            release_date = track["album"]["release_date"]

            # Extract year from release date
            release_year = release_date.split('-')[0]
            release_years.append(int(release_year))

            popularity.append(track.get("popularity", 0))
            explicit_flags.append(track.get("explicit", False))

        # Batch the artist IDs for the Get Several Artists API call
        batch_size = 50
        artist_info = []
        for i in range(0, len(artist_ids), batch_size):
            batch = artist_ids[i:i + batch_size]
            response = sp.artists(batch)
            artist_info.extend(response["artists"])

        # Process artist information
        for artist in artist_info:
            artist_name = artist.get("name", "Unknown")
            track_genres = artist.get("genres", [])

            artist_names.append(artist_name)
            genres.extend(track_genres)

        # Playlist-level features
        playlist_features.append({
            "playlist_name": playlist_name,
            "num_tracks": len(tracks),
            "avg_popularity": np.mean(popularity) if popularity else 0,
            "explicit_ratio": np.mean(explicit_flags) if explicit_flags else 0
        })

        all_genres.extend(genres)
        all_artists.extend(artist_names)
        all_release_years.extend(release_years)

    # Top artist embedding
    artist_counter = Counter(all_artists)
    top_artists = [artist for artist, _ in artist_counter.most_common(top_artist_count)]
    artist_embedding = np.mean(
        [artist_embedding_model.wv[artist] for artist in top_artists if artist in artist_embedding_model.wv],
        axis=0
    ) if top_artists else np.zeros(100)

    # Genre embedding
    genre_vectors = [
        genre_embedding_model.wv[genre] for genre in all_genres if genre in genre_embedding_model.wv
    ]
    genre_embedding = np.mean(genre_vectors, axis=0) if genre_vectors else np.zeros(100)

    # Aggregated playlist embedding
    playlist_sizes = [p["num_tracks"] for p in playlist_features]
    playlist_weights = normalize(np.array(playlist_sizes).reshape(1, -1))[0]
    playlist_embedding = np.sum([playlist_weights[i] * genre_embedding for i in range(len(playlist_features))], axis=0)

    # Release year embedding
    release_year_embedding = np.array([np.mean(all_release_years)])

    print("User embedding generated successfully!")
    print("Genre embedding shape:", genre_embedding.shape)
    print("Artist embedding shape:", artist_embedding.shape)
    print("Playlist embedding shape:", playlist_embedding.shape)
    print("Release year embedding shape:", release_year_embedding.shape)

    # Return individual embeddings
    return genre_embedding, artist_embedding, playlist_embedding, release_year_embedding

In [13]:
# Inference function
def get_best_matching_user(user_id, genre_embedding_model, artist_embedding_model, top_artist_count, playlists_count):
    # Fetch user playlists
    playlists = sp.user_playlists(user_id)["items"]
    if not playlists:
        print(f"No playlists found for user {user_id}")
        return None

    # Generate the user's embedding
    genre_embedding, artist_embedding, playlist_embedding, release_year_embedding = generate_user_embedding(
        playlists, genre_embedding_model, artist_embedding_model, top_artist_count, playlists_count
    )

    print("User embeddings generated successfully!")

    # Concatenate all embeddings into a single vector
    user_embedding = np.concatenate([genre_embedding, artist_embedding, playlist_embedding, release_year_embedding])

    # Get all user embeddings from the database (assuming these are already stored in the feature store)
    user_embeddings_fg = fs.get_feature_group(name="spotify_user_embeddings", version=1)
    all_user_embeddings = user_embeddings_fg.read()

    # Normalize all embeddings
    normalized_user_embeddings = np.array(all_user_embeddings["normalized_embedding"].tolist())
    user_embedding_normalized = normalize(user_embedding.reshape(1, -1))

    # Compute cosine similarity for all users
    similarities = cosine_similarity(user_embedding_normalized, normalized_user_embeddings).flatten()

    # Get the index of the most similar user
    best_match_index = np.argmax(similarities)
    best_match_user_id = all_user_embeddings.iloc[best_match_index]["user_id"]

    return best_match_user_id, similarities[best_match_index]

In [27]:
user_id = "minifixiowow"
top_artist_count = 5
playlists_count = 5

best_match_user_id, similarity_score = get_best_matching_user(user_id, genre_embedding_model, artist_embedding_model, top_artist_count, playlists_count)

print(f"The best match for user {user_id} is user {best_match_user_id} with a similarity score of {similarity_score}")

Generating user embedding...
Processing playlist: spoiled little brat

Number of tracks: 36
Processing playlist: ⁿᵒLoVé
Number of tracks: 41
Processing playlist: FAST LIFE & FAMILLE
Number of tracks: 21
Processing playlist: force à nous
Number of tracks: 10
Processing playlist: être moi
Number of tracks: 28
Top 5 artists: ['Oklou', 'underscores', 'Fred again..', 'Tiakola', 'Lomepal']


User embedding generated successfully!
Genre embedding shape: (100,)
Artist embedding shape: ()
Playlist embedding shape: (100,)
Release year embedding shape: (1,)
User embeddings generated successfully!


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 0 dimension(s)