In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import normalize
from gensim.models import Word2Vec
from datasets import load_dataset
import hopsworks
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sentence_transformers import SentenceTransformer
import pickle

In [4]:
with open('../secrets/hopsworks_api_key.txt', 'r') as file:
    HOPSWORKS_API_KEY = file.readline().strip()

with open('../secrets/spotify_client_id.txt', 'r') as file:
    SPOTIFY_CLIENT_ID = file.readline().strip()

with open('../secrets/spotify_client_secret.txt', 'r') as file:
    SPOTIFY_CLIENT_SECRET = file.readline().strip()

In [5]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)

2025-01-05 17:01:11,644 INFO: Initializing external client
2025-01-05 17:01:11,645 INFO: Base URL: https://c.app.hopsworks.ai:443

Multiple projects found. 

	 (1) id2223_final_project
	 (2) lab1_mohamed_Emile
	 (3) id2223_lab1_group9



Enter number corresponding to the project to use:  1


2025-01-05 17:01:24,187 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [7]:
def get_embeddings(genres, artists, model):
    """
    Generate embeddings for genres and artists using a SentenceTransformer model.
    """
    # Combine genres and artists into a single list for embedding
    inputs = genres + artists
    
    # Generate embeddings
    embeddings = model.encode(inputs, show_progress_bar=True)
    
    # Split the embeddings back into genres and artists
    genre_embeddings = embeddings[:len(genres)]
    artist_embeddings = embeddings[len(genres):]
    
    return genre_embeddings, artist_embeddings

In [8]:
def generate_user_embedding(user_playlists, transformer_model, top_artist_count, playlists_count):
    print("Generating user embedding...")
    all_genres = []
    all_artists = []
    all_release_years = []
    playlist_features = []

    per_playlist_genre_embeddings = []  # Collect genre embeddings for each playlist

    for playlist in user_playlists[:playlists_count]:  # Limit to first playlists_count playlists
        print(f"Processing playlist: {playlist['name']}")
        playlist_name = playlist.get("name", "Unknown")
        playlist_id = playlist["id"]

        # Fetch tracks in the playlist
        tracks = sp.playlist_tracks(playlist_id)["items"]
        print(f"Number of tracks: {len(tracks)}")

        genres = []
        popularity = []
        release_years = []
        explicit_flags = []
        artist_names = []
        artist_ids = []

        # Collect all artist IDs for batch processing
        for item in tracks:
            track = item["track"]
            if not track or track["is_local"]:
                continue
            artist_ids.append(track["artists"][0]["id"])  # Only taking the first artist for simplicity
            release_date = track["album"]["release_date"]

            # Extract year from release date
            release_year = release_date.split('-')[0]
            release_years.append(int(release_year))

            popularity.append(track.get("popularity", 0))
            explicit_flags.append(track.get("explicit", False))

        # Batch the artist IDs for the Get Several Artists API call
        batch_size = 50
        artist_info = []
        for i in range(0, len(artist_ids), batch_size):
            batch = artist_ids[i:i + batch_size]
            response = sp.artists(batch)
            artist_info.extend(response["artists"])

        # Process artist information
        for artist in artist_info:
            artist_name = artist.get("name", "Unknown")
            track_genres = artist.get("genres", [])

            artist_names.append(artist_name)
            genres.extend(track_genres)

        # Generate per-playlist genre embedding
        if genres:
            genre_embeddings = transformer_model.encode(genres, show_progress_bar=False)
            playlist_genre_embedding = np.mean(genre_embeddings, axis=0)  # Average embedding for this playlist
        else:
            playlist_genre_embedding = np.zeros(384)

        per_playlist_genre_embeddings.append(playlist_genre_embedding)

        # Playlist-level features
        playlist_features.append({
            "playlist_name": playlist_name,
            "num_tracks": len(tracks),
            "avg_popularity": np.mean(popularity) if popularity else 0,
            "explicit_ratio": np.mean(explicit_flags) if explicit_flags else 0
        })

        all_genres.extend(genres)
        all_artists.extend(artist_names)
        all_release_years.extend(release_years)

    # Combine per-playlist genre embeddings using playlist sizes as weights
    if per_playlist_genre_embeddings:
        playlist_sizes = [p["num_tracks"] for p in playlist_features]
        playlist_weights = normalize(np.array(playlist_sizes).reshape(1, -1))[0]
        playlist_embedding = np.sum(
            [playlist_weights[i] * per_playlist_genre_embeddings[i] for i in range(len(per_playlist_genre_embeddings))],
            axis=0
        )
    else:
        playlist_embedding = np.zeros(384)

    # Generate overall artist and genre embeddings
    print("Generating contextual embeddings...")

    # Genre Embeddings
    genre_embeddings = transformer_model.encode(all_genres, show_progress_bar=False) if all_genres else np.zeros((1, 384))
    genre_embedding = np.mean(genre_embeddings, axis=0) if len(genre_embeddings) > 0 else np.zeros(384)

    # Artist Embeddings
    artist_counter = Counter(all_artists)
    top_artists = [artist for artist, _ in artist_counter.most_common(top_artist_count)]
    artist_embeddings = transformer_model.encode(top_artists, show_progress_bar=False) if top_artists else np.zeros((1, 384))
    artist_embedding = np.mean(artist_embeddings, axis=0) if len(artist_embeddings) > 0 else np.zeros(384)

    # Release year embedding
    release_year_embedding = np.array([np.mean(all_release_years)]) if all_release_years else np.zeros(1)

    # print("User embedding generated successfully!")
    # print("Genre embedding shape:", genre_embedding.shape)
    # print("Artist embedding shape:", artist_embedding.shape)
    # print("Playlist embedding shape:", playlist_embedding.shape)
    # print("Release year embedding shape:", release_year_embedding.shape)

    # Return individual embeddings
    return genre_embedding, artist_embedding, playlist_embedding, release_year_embedding


In [9]:
profiles_offset = 50  # Offset to start processing profiles
profiles_count = 100  # Number of profiles to process
top_artist_count = 5  # Number of top artists to embed
playlists_count = 5  # Number of playlists to consider per user

# Load dataset
dataset = load_dataset("erenfazlioglu/spotifyuserids")
print(f"Loaded dataset with {len(dataset['train'])} profiles")
rows = dataset["train"][profiles_offset:profiles_offset + profiles_count]

README.md:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

spotify.csv:   0%|          | 0.00/49.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1710459 [00:00<?, ? examples/s]

Loaded dataset with 1710459 profiles


In [11]:
transformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # You can replace this with another model if needed

# Collect embeddings in a DataFrame
embeddings = []
for spotify_id in rows["spotify_id"]:
    print(f"Processing user: {spotify_id}")
    try:
        # Fetch user playlists
        playlists = sp.user_playlists(spotify_id)["items"]
        if not playlists:
            print(f"No playlists found for user {spotify_id}")
            continue

        # Generate individual embeddings
        genre_embedding, artist_embedding, playlist_embedding, release_year_embedding = generate_user_embedding(
            playlists, transformer_model, top_artist_count, playlists_count
        )

        # Append embeddings to list as a dictionary
        embeddings.append({
            "user_id": spotify_id,
            "genre_embedding": genre_embedding.tolist(),
            "artist_embedding": artist_embedding.tolist(),
            "playlist_embedding": playlist_embedding.tolist(),
            "release_year_embedding": release_year_embedding.tolist()
        })

    except Exception as e:
        print(f"Error processing user {spotify_id}: {e}")

# Create a DataFrame from the embeddings
# print(f"Embeddings for {len(embeddings)} users:")
# embeddings


2025-01-05 17:24:59,361 INFO: Use pytorch device_name: cpu
2025-01-05 17:24:59,363 INFO: Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
Processing user: x0zwtofskj7dez3lu556v0zb3
No playlists found for user x0zwtofskj7dez3lu556v0zb3
Processing user: 31qr5mtzfd3sdt5afqxthoyi7u5a
Generating user embedding...
Processing playlist: Para Vi e Ju❤️
Number of tracks: 1
Processing playlist: Viagem#4
Number of tracks: 11
Processing playlist: Especiais
Number of tracks: 51
Processing playlist: Zuzu
Number of tracks: 51
Processing playlist: Teca
Number of tracks: 44
Generating contextual embeddings...
Processing user: 31omaiq7zc7oayhypkfhjm2uf5hi
Generating user embedding...
Processing playlist: Guárdame
Number of tracks: 1
Processing playlist: Favorite
Number of tracks: 36
Processing playlist: Confieso
Number of tracks: 13
Processing playlist: Mio
Number of tracks: 33
Processing playlist: My playlist #2
Number of tracks: 1
Generating contextual embeddings...
Processing user: 31q3uwv

In [12]:
# Create a DataFrame from the embeddings
df_embeddings = pd.DataFrame(embeddings)

print(f"Embeddings shape: {df_embeddings.shape}")
df_embeddings.head()

Embeddings shape: (74, 5)


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31qr5mtzfd3sdt5afqxthoyi7u5a,"[-0.22496947646141052, -0.13784313201904297, 0...","[0.13576865196228027, -0.06823533773422241, 0....","[-0.3518819212913513, -0.2729158103466034, 0.2...",[1997.1518987341772]
1,31omaiq7zc7oayhypkfhjm2uf5hi,"[0.11507690697908401, -0.18045610189437866, 0....","[0.0375894196331501, 0.4050615429878235, -0.19...","[0.19645436108112335, -0.31298553943634033, 0....",[2011.297619047619]
2,317m5vatgab3nzfflyw3zlj4scxa,"[-0.5591443777084351, 0.46316635608673096, -0....","[0.21053551137447357, 0.588090717792511, -0.15...","[-0.5591443777084351, 0.46316635608673096, -0....",[2019.0]
3,31d6gsnobxsbkta4m7ks2gclxoe4,"[-0.6133443713188171, -0.575651228427887, 0.10...","[-0.4078860580921173, 0.31861019134521484, 0.0...","[-0.6133443713188171, -0.575651228427887, 0.10...",[2015.0]
4,a45xxy3t0gb2yt5db8vdaw0e0,"[-0.04495399445295334, 0.4432730972766876, -0....","[0.22953471541404724, 0.172836571931839, -0.03...","[-0.005955148488283157, 0.4861380457878113, -0...",[2019.9541984732825]


In [56]:
fs = project.get_feature_store()

In [57]:
feature_store = project.get_feature_store()
feature_group = feature_store.get_or_create_feature_group(
    name="spotify_user_embeddings",
    version=2,
    primary_key=["user_id"],
    description="Spotify user embeddings based on playlists"
)
feature_group.insert(df_embeddings)

Uploading Dataframe: 100.00% |██████████| Rows 74/74 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: spotify_user_embeddings_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1208515/jobs/named/spotify_user_embeddings_2_offline_fg_materialization/executions


(Job('spotify_user_embeddings_2_offline_fg_materialization', 'SPARK'), None)