In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import normalize
from gensim.models import Word2Vec
from datasets import load_dataset
import hopsworks
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pickle

In [2]:
def _read_first_line(path):
    """Return the first line of the text file at ``path``, stripped of surrounding whitespace."""
    with open(path, 'r') as handle:
        return handle.readline().strip()


# Credentials live in one-line text files under ../secrets and are never hard-coded here.
HOPSWORKS_API_KEY = _read_first_line('../secrets/hopsworks_api_key.txt')
SPOTIFY_CLIENT_ID = _read_first_line('../secrets/spotify_client_id.txt')
SPOTIFY_CLIENT_SECRET = _read_first_line('../secrets/spotify_client_secret.txt')

In [3]:
# Build the Spotify client from the client id / secret pair loaded earlier.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [4]:
# Log in to Hopsworks with the API key read from the secrets file; the returned
# project handle is what later cells use to reach the model registry and feature store.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

2025-01-02 18:31:21,132 INFO: Initializing external client
2025-01-02 18:31:21,132 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-02 18:31:22,414 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [16]:
# Generate embedding for a user's profile
def generate_user_embedding(user_playlists, genre_embedding_model, artist_embedding_model, top_artist_count, playlists_count, spotify_client=None):
    """Build genre, artist, playlist and release-year embeddings for one user.

    Args:
        user_playlists: Sequence of playlist dicts; each must have an ``"id"`` key.
        genre_embedding_model: Model exposing ``.wv`` (supports ``in`` and
            ``[]`` lookup by token); ``.vector_size`` is used when present.
        artist_embedding_model: Same contract as ``genre_embedding_model``,
            looked up by artist name.
        top_artist_count: Number of most frequent artists averaged into the
            artist embedding.
        playlists_count: Only the first ``playlists_count`` playlists are read.
        spotify_client: Optional object providing ``playlist_tracks(id)`` and
            ``artist(id)``. Defaults to the notebook-level ``sp`` client.

    Returns:
        Tuple ``(genre_embedding, artist_embedding, playlist_embedding,
        release_year_embedding)`` of NumPy arrays. The first three fall back to
        zero vectors when nothing could be embedded; the release-year array
        holds a single mean year, or NaN when no release year was available.

    Raises:
        KeyError: If a playlist dict has no ``"id"``.
        Exception: Whatever the Spotify client raises on a failed request.
    """
    client = sp if spotify_client is None else spotify_client

    # Fall back to the previously hard-coded size when the model does not expose one.
    default_dim = 100
    genre_dim = getattr(genre_embedding_model, "vector_size", default_dim)
    artist_dim = getattr(artist_embedding_model, "vector_size", default_dim)

    all_genres = []
    all_artists = []
    all_release_years = []
    playlist_sizes = []
    artist_cache = {}  # artist id -> artist payload, so each artist is fetched once

    for playlist in user_playlists[:playlists_count]:  # Limit to the first `playlists_count` playlists
        # NOTE(review): only the items returned by a single playlist_tracks call are
        # used; if the API pages its results, longer playlists are truncated — confirm.
        tracks = client.playlist_tracks(playlist["id"])["items"]

        # Size counts every returned item, including entries whose track is missing.
        playlist_sizes.append(len(tracks))

        for item in tracks:
            track = item.get("track")
            if not track:
                continue

            # Some tracks carry no artist id; skip the artist lookup for those
            # instead of failing the whole user.
            artists = track.get("artists") or []
            artist_id = artists[0].get("id") if artists else None
            if artist_id:
                if artist_id not in artist_cache:
                    artist_cache[artist_id] = client.artist(artist_id)
                artist = artist_cache[artist_id]
                all_genres.extend(artist.get("genres", []))
                all_artists.append(artist.get("name", "Unknown"))

            # Extract year from release date; ignore missing or malformed dates.
            release_date = (track.get("album") or {}).get("release_date") or ""
            year_text = release_date.split("-")[0]
            if year_text.isdigit():
                all_release_years.append(int(year_text))

    # Top artist embedding: mean vector of the most frequent artists known to the model.
    top_artists = [name for name, _ in Counter(all_artists).most_common(top_artist_count)]
    artist_vectors = [
        artist_embedding_model.wv[name] for name in top_artists if name in artist_embedding_model.wv
    ]
    artist_embedding = np.mean(artist_vectors, axis=0) if artist_vectors else np.zeros(artist_dim)

    # Genre embedding: mean vector over every genre occurrence known to the model.
    genre_vectors = [
        genre_embedding_model.wv[genre] for genre in all_genres if genre in genre_embedding_model.wv
    ]
    genre_embedding = np.mean(genre_vectors, axis=0) if genre_vectors else np.zeros(genre_dim)

    # Aggregated playlist embedding: playlist sizes are L2-normalised into weights
    # and each weight scales the user-level genre embedding.
    # NOTE(review): the result is a scalar multiple of genre_embedding; a per-playlist
    # genre vector may have been intended. Kept as-is so stored features stay comparable.
    if playlist_sizes:
        sizes = np.asarray(playlist_sizes, dtype=float)
        norm = np.linalg.norm(sizes)
        playlist_weights = sizes / norm if norm else sizes  # all-zero sizes stay zero
        playlist_embedding = np.sum([weight * genre_embedding for weight in playlist_weights], axis=0)
    else:
        playlist_embedding = np.zeros_like(genre_embedding)

    # Release year embedding: mean year, NaN when no year could be parsed.
    mean_year = np.mean(all_release_years) if all_release_years else np.nan
    release_year_embedding = np.array([mean_year])

    # Return individual embeddings
    return genre_embedding, artist_embedding, playlist_embedding, release_year_embedding

In [15]:
# Run configuration
profiles_count = 50  # how many user profiles are taken from the dataset
top_artist_count = 5  # how many of a user's most frequent artists are embedded
playlists_count = 10  # how many playlists per user are considered

# Fetch the user-id dataset and keep only the first `profiles_count` rows of its train split
dataset = load_dataset("erenfazlioglu/spotifyuserids")
train_split = dataset["train"]
print(f"Loaded dataset with {len(train_split)} profiles")
rows = train_split[:profiles_count]

Loaded dataset with 1710459 profiles


In [9]:
# Train custom Word2Vec model for genres and artists
# Each corpus entry is one "sentence" per track: the primary artist's genres
# followed by the artist's name.
corpus = []
artist_cache = {}  # artist id -> artist payload, so each artist is fetched only once
for spotify_id in rows["spotify_id"]:
    print(f"Processing user {spotify_id}")

    try:
        # Fetch user playlists
        playlists = sp.user_playlists(spotify_id)["items"]

        # Process each playlist
        for playlist in playlists:
            if not playlist:
                continue
            tracks = sp.playlist_tracks(playlist["id"])["items"]

            # Process each track in the playlist
            for item in tracks:
                track = item.get("track")
                if not track:
                    continue

                # Skip tracks without a usable artist id instead of letting the
                # lookup fail and abandon the rest of this user's playlists.
                artists = track.get("artists") or []
                artist_id = artists[0].get("id") if artists else None
                if not artist_id:
                    continue

                # Fetch artist details (cached across tracks, playlists and users)
                if artist_id not in artist_cache:
                    artist_cache[artist_id] = sp.artist(artist_id)
                artist = artist_cache[artist_id]

                # Fetch genres and artist name
                track_genres = artist.get("genres", [])
                artist_name = artist.get("name", "Unknown")

                # Add genres and artist name to the corpus (concatenation builds a
                # new list, so the cached payload is never mutated)
                corpus.append(track_genres + [artist_name])

    except spotipy.exceptions.SpotifyException as e:
        # Handle 404 errors or any Spotify API error
        if e.http_status == 404:
            print(f"User {spotify_id} not found or no playlists available.")
        else:
            print(f"Error processing user {spotify_id}: {e}")

    except Exception as e:
        # General error handling: report and move on to the next user
        print(f"An unexpected error occurred with user {spotify_id}: {e}")

# The corpus holds one sentence per track, not per playlist.
print(f"Training Word2Vec model on {len(corpus)} track sentences")

# Train Word2Vec model
# NOTE(review): both models are trained on the same corpus with identical settings —
# confirm whether separate genre-only / artist-only corpora were intended.
genre_embedding_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
artist_embedding_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

Processing user 31h7ml3xiavflj5n7d4av5u5xaie
Processing user 31jftrtdt3442xn74mb44j4l4lf4
Processing user 31tgsl3dejcqihle3pv7o6eeng2a
Processing user 31imc4msmvetbl26gly5n55jbkka
Processing user 31frxab22c2ez34gnfggtqqsnope
Processing user 31fg5ma4zjh37mcqzto3xt2sxc3a
Processing user 31vbhry6lnvr5jdwbvzkmwmpoyaq
Processing user 31yfhhcsm7gfze67kyakl2htpsqe
Processing user vvzx3nq79szk5qfmkznpm230n
Processing user 312klqvvav432mq4562g7qebwu6q
Processing user 314nnom4dpapiu6dd2ffoska7eza
Processing user l2qyufju6ub2s40t30xbk46ih
Processing user 31izr6gm4co2saq6vom5jmh6ghdy
Processing user 31kyyozssce7jzaaddtkoib3ezyu
Processing user 31wz2lctfb6ftrjybh4aaiq2cwui
Processing user vtu3lxe2uqkalv6pjnuw54hep
Processing user 31662nnwj4hzlceyhda5awuaynsu
2025-01-02 18:34:14,942 ERROR: HTTP Error for GET to https://api.spotify.com/v1/users/31662nnwj4hzlceyhda5awuaynsu/playlists with Params: {'limit': 50, 'offset': 0} returned 404 due to Resource not found
User 31662nnwj4hzlceyhda5awuaynsu not fo

In [13]:
mr = project.get_model_registry()

# Pickle both trained models into the working directory (genre first, then artist)
for pickle_path, trained_model in (
    ("genre_embedding_model.pkl", genre_embedding_model),
    ("artist_embedding_model.pkl", artist_embedding_model),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(trained_model, f)

# Create one registry entry per model; the metrics dict carries numeric values only
genre_model_registry = mr.python.create_model(
    name="genre_embedding_model",
    metrics={"vector_size": genre_embedding_model.vector_size},
    description="Trained Word2Vec model for genre embeddings",
)
artist_model_registry = mr.python.create_model(
    name="artist_embedding_model",
    metrics={"vector_size": artist_embedding_model.vector_size},
    description="Trained Word2Vec model for artist embeddings",
)

# Push the pickled files to Hopsworks under their registry entries
genre_model_registry.save("genre_embedding_model.pkl")
artist_model_registry.save("artist_embedding_model.pkl")

print("Word2Vec models saved to Hopsworks successfully!")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1673998 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1208515/models/genre_embedding_model/1


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1673998 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1208515/models/artist_embedding_model/1
Word2Vec models saved to Hopsworks successfully!


In [17]:
# Collect per-user embeddings as a list of records (one dict per user);
# a later cell turns this list into a DataFrame.
embeddings = []
for spotify_id in rows["spotify_id"]:
    try:
        # Fetch user playlists
        playlists = sp.user_playlists(spotify_id)["items"]
        if not playlists:
            print(f"No playlists found for user {spotify_id}")
            continue

        # Generate individual embeddings
        genre_embedding, artist_embedding, playlist_embedding, release_year_embedding = generate_user_embedding(
            playlists, genre_embedding_model, artist_embedding_model, top_artist_count, playlists_count
        )

        # Store the arrays as plain lists so each record is a simple dict of Python values
        embeddings.append({
            "user_id": spotify_id,
            "genre_embedding": genre_embedding.tolist(),
            "artist_embedding": artist_embedding.tolist(),
            "playlist_embedding": playlist_embedding.tolist(),
            "release_year_embedding": release_year_embedding.tolist()
        })

    except spotipy.exceptions.SpotifyException as e:
        # Same reporting as the corpus-building cell: 404 means the user is unavailable
        if e.http_status == 404:
            print(f"User {spotify_id} not found or no playlists available.")
        else:
            print(f"Error processing user {spotify_id}: {e}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next user
        print(f"Error processing user {spotify_id}: {e}")

# Show which users were embedded rather than dumping every vector into the cell output
print(f"Embeddings for {len(embeddings)} users:")
[record["user_id"] for record in embeddings]





KeyboardInterrupt: 

In [32]:
# Turn the list of per-user records into a table with one row per user
df_embeddings = pd.DataFrame(data=embeddings)

n_rows, n_cols = df_embeddings.shape
print(f"Embeddings shape: {(n_rows, n_cols)}")
df_embeddings.head()

Embeddings shape: (6, 5)


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.0009209662675857544, 0.0001077811539289541...","[0.0018118077423423529, 0.0005614424007944763,...","[-0.0012177809840068221, 0.0001425175287295133...",[2019.8850574712644]
1,31tgsl3dejcqihle3pv7o6eeng2a,"[5.25292671227362e-05, 0.00023966847220435739,...","[-0.0045182956382632256, 0.004658684134483337,...","[8.820458606351167e-05, 0.00040243961848318577...",[2019.9066666666668]
2,31imc4msmvetbl26gly5n55jbkka,"[-0.0036893084179610014, 0.002161552431061864,...","[0.003511666087433696, 0.007666133344173431, 0...","[-0.0036893084179610014, 0.002161552431061864,...",[2018.0]
3,31frxab22c2ez34gnfggtqqsnope,"[-0.0012761030811816454, 0.001490566530264914,...","[-0.0010105168912559748, 0.000450886000180617,...","[-0.0024445701856166124, 0.002855407539755106,...",[2018.175]
4,31fg5ma4zjh37mcqzto3xt2sxc3a,"[0.007611020002514124, 0.00913164857774973, 0....","[0.0019403230398893356, -0.0005995116662234068...","[0.007611020002514124, 0.00913164857774973, 0....",[2019.0]


In [34]:
# Handle to the project's feature store.
# NOTE(review): `fs` is not referenced by any cell visible here, and the following
# cell fetches the feature store again as `feature_store` — confirm this cell is still needed.
fs = project.get_feature_store()

In [35]:
feature_store = project.get_feature_store()

# Fetch version 1 of the embeddings feature group, creating it if it does not exist yet;
# rows are keyed by `user_id`.
feature_group = feature_store.get_or_create_feature_group(
    name="spotify_user_embeddings",
    description="Spotify user embeddings based on playlists",
    primary_key=["user_id"],
    version=1,
)

# Write the embeddings table into the feature group
feature_group.insert(df_embeddings)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1208515/fs/1195121/fg/1393458


Uploading Dataframe: 100.00% |██████████| Rows 6/6 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: spotify_user_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1208515/jobs/named/spotify_user_embeddings_1_offline_fg_materialization/executions


(Job('spotify_user_embeddings_1_offline_fg_materialization', 'SPARK'), None)