In [19]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import normalize
from gensim.models import Word2Vec
from datasets import load_dataset
import hopsworks
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
with open('../secrets/hopsworks_api_key.txt', 'r') as file:
    HOPSWORKS_API_KEY = file.readline().strip()

with open('../secrets/spotify_client_id.txt', 'r') as file:
    SPOTIFY_CLIENT_ID = file.readline().strip()

with open('../secrets/spotify_client_secret.txt', 'r') as file:
    SPOTIFY_CLIENT_SECRET = file.readline().strip()

In [21]:
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [28]:
# Normalize release dates to a range [0, 1]
def normalize_dates(release_dates):
    if not release_dates:
        return 0.0
    dates = [datetime.strptime(date, "%Y-%m-%d").timestamp() for date in release_dates if date]
    return np.mean(dates) / datetime.now().timestamp()

# Generate embedding for a user's profile
def generate_user_embedding(user_playlists, genre_embedding_model, artist_embedding_model, top_artist_count):
    all_genres = []
    all_artists = []
    all_release_years = []
    playlist_features = []

    for playlist in user_playlists[:10]:  # Limit to first 10 playlists
        playlist_name = playlist.get("name", "Unknown")
        playlist_id = playlist["id"]

        # Fetch tracks in the playlist
        tracks = sp.playlist_tracks(playlist_id)["items"]

        genres = []
        popularity = []
        release_years = []
        explicit_flags = []
        artist_names = []

        for item in tracks:
            track = item["track"]
            if not track:
                continue

            # Extract artist information
            artist_id = track["artists"][0]["id"]
            artist = sp.artist(artist_id)
            track_genres = artist.get("genres", [])
            artist_name = artist.get("name", "Unknown")

            # Append track features
            genres.extend(track_genres)
            artist_names.append(artist_name)
            release_date = track["album"]["release_date"]
            
            # Extract year from release date
            release_year = release_date.split('-')[0]
            release_years.append(int(release_year))

            popularity.append(track.get("popularity", 0))
            explicit_flags.append(track.get("explicit", False))

        # Playlist-level features
        playlist_features.append({
            "playlist_name": playlist_name,
            "num_tracks": len(tracks),
            "avg_popularity": np.mean(popularity) if popularity else 0,
            "explicit_ratio": np.mean(explicit_flags) if explicit_flags else 0
        })

        all_genres.extend(genres)
        all_artists.extend(artist_names)
        all_release_years.extend(release_years)

    # Top artist embedding
    artist_counter = Counter(all_artists)
    top_artists = [artist for artist, _ in artist_counter.most_common(top_artist_count)]
    artist_embedding = np.mean(
        [artist_embedding_model.wv[artist] for artist in top_artists if artist in artist_embedding_model.wv],
        axis=0
    ) if top_artists else np.zeros(100)

    # Genre embedding
    genre_vectors = [
        genre_embedding_model.wv[genre] for genre in all_genres if genre in genre_embedding_model.wv
    ]
    genre_embedding = np.mean(genre_vectors, axis=0) if genre_vectors else np.zeros(100)

    # Aggregated playlist embedding
    playlist_sizes = [p["num_tracks"] for p in playlist_features]
    playlist_weights = normalize(np.array(playlist_sizes).reshape(1, -1))[0]
    playlist_embedding = np.sum([playlist_weights[i] * genre_embedding for i in range(len(playlist_features))], axis=0)

    # Release year embedding
    release_year_embedding = np.array([np.mean(all_release_years)])

    # Return individual embeddings
    return genre_embedding, artist_embedding, playlist_embedding, release_year_embedding

In [23]:
# Store embeddings in HopsWorks
def store_in_hopsworks(df):
    project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
    feature_store = project.get_feature_store()
    feature_group = feature_store.get_or_create_feature_group(
        name="spotify_user_embeddings",
        version=1,
        primary_key=["user_id"],
        description="Spotify user embeddings based on playlists"
    )
    feature_group.insert(df)
    print("Stored embeddings in HopsWorks")    

In [29]:
profiles_count = 10  # Number of profiles to process
top_artist_count = 5  # Number of top artists to embed

# Load dataset
dataset = load_dataset("erenfazlioglu/spotifyuserids")
print(f"Loaded dataset with {len(dataset['train'])} profiles")
rows = dataset["train"][:profiles_count]

Loaded dataset with 1710459 profiles


In [30]:
# Train custom Word2Vec model for genres and artists
corpus = []
for spotify_id in rows["spotify_id"]:
    playlists = sp.user_playlists(spotify_id)["items"]
    for playlist in playlists:
        tracks = sp.playlist_tracks(playlist["id"])["items"]
        for item in tracks:
            track = item["track"]
            if not track:
                continue
            artist_id = track["artists"][0]["id"]
            artist = sp.artist(artist_id)
            track_genres = artist.get("genres", [])
            artist_name = artist.get("name", "Unknown")
            corpus.append(track_genres + [artist_name])

print(f"Training Word2Vec model on {len(corpus)} playlists")

# Train Word2Vec model
genre_embedding_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
artist_embedding_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)


Training Word2Vec model on 210 playlists
2025-01-02 14:21:04,245 INFO: collecting all words and their counts
2025-01-02 14:21:04,248 INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-01-02 14:21:04,248 INFO: collected 192 word types from a corpus of 533 raw words and 210 sentences
2025-01-02 14:21:04,249 INFO: Creating a fresh vocabulary
2025-01-02 14:21:04,251 INFO: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 192 unique words (100.00% of original 192, drops 0)', 'datetime': '2025-01-02T14:21:04.251129', 'gensim': '4.3.3', 'python': '3.10.13 (main, Nov  9 2024, 19:45:20) [Clang 15.0.0 (clang-1500.3.9.4)]', 'platform': 'macOS-14.5-arm64-arm-64bit', 'event': 'prepare_vocab'}
2025-01-02 14:21:04,252 INFO: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 533 word corpus (100.00% of original 533, drops 0)', 'datetime': '2025-01-02T14:21:04.252095', 'gensim': '4.3.3', 'python': '3.10.13 (main, Nov  9 2024, 19:45:20) [Clang 15.0.0 

In [31]:
# Collect embeddings in a DataFrame
embeddings = []
for spotify_id in rows["spotify_id"]:
    try:
        # Fetch user playlists
        playlists = sp.user_playlists(spotify_id)["items"]
        if not playlists:
            print(f"No playlists found for user {spotify_id}")
            continue

        # Generate individual embeddings
        genre_embedding, artist_embedding, playlist_embedding, release_year_embedding = generate_user_embedding(
            playlists, genre_embedding_model, artist_embedding_model, top_artist_count
        )

        # Append embeddings to list as a dictionary
        embeddings.append({
            "user_id": spotify_id,
            "genre_embedding": genre_embedding.tolist(),
            "artist_embedding": artist_embedding.tolist(),
            "playlist_embedding": playlist_embedding.tolist(),
            "release_year_embedding": release_year_embedding.tolist()
        })

    except Exception as e:
        print(f"Error processing user {spotify_id}: {e}")

# Create a DataFrame from the embeddings
print(f"Embeddings for {len(embeddings)} users:")
embeddings


No playlists found for user 31jftrtdt3442xn74mb44j4l4lf4




No playlists found for user 31vbhry6lnvr5jdwbvzkmwmpoyaq
No playlists found for user 31yfhhcsm7gfze67kyakl2htpsqe

No playlists found for user 312klqvvav432mq4562g7qebwu6q
Embeddings for 6 users:


[{'user_id': '31h7ml3xiavflj5n7d4av5u5xaie',
  'genre_embedding': [-0.0009209662675857544,
   0.00010778115392895415,
   0.0018662126967683434,
   0.0019467115635052323,
   -0.0005725488299503922,
   -0.0021639298647642136,
   0.0021171628031879663,
   0.0029985541477799416,
   -0.0005112177459523082,
   -0.0034737084060907364,
   0.003272007452324033,
   -0.0004962360253557563,
   0.0010165980784222484,
   -0.00023131075431592762,
   0.0008223623153753579,
   0.00028773638769052923,
   0.002305330941453576,
   -0.0023181342985481024,
   -0.0023326100781559944,
   -0.00259407632984221,
   0.0010191642213612795,
   0.001282733865082264,
   0.0033413649071007967,
   0.0004952771705575287,
   -0.001523803686723113,
   0.0014668346848338842,
   0.0002575753314886242,
   0.0019251361954957247,
   -0.0011050069006159902,
   0.0005507166497409344,
   -0.0009846361353993416,
   -0.00035011241561733186,
   0.0015152987325564027,
   -0.0009564967476762831,
   3.550488327164203e-05,
   0.00324124

In [32]:
df_embeddings = pd.DataFrame(embeddings)

print(f"Embeddings shape: {df_embeddings.shape}")
df_embeddings.head()

Embeddings shape: (6, 5)


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.0009209662675857544, 0.0001077811539289541...","[0.0018118077423423529, 0.0005614424007944763,...","[-0.0012177809840068221, 0.0001425175287295133...",[2019.8850574712644]
1,31tgsl3dejcqihle3pv7o6eeng2a,"[5.25292671227362e-05, 0.00023966847220435739,...","[-0.0045182956382632256, 0.004658684134483337,...","[8.820458606351167e-05, 0.00040243961848318577...",[2019.9066666666668]
2,31imc4msmvetbl26gly5n55jbkka,"[-0.0036893084179610014, 0.002161552431061864,...","[0.003511666087433696, 0.007666133344173431, 0...","[-0.0036893084179610014, 0.002161552431061864,...",[2018.0]
3,31frxab22c2ez34gnfggtqqsnope,"[-0.0012761030811816454, 0.001490566530264914,...","[-0.0010105168912559748, 0.000450886000180617,...","[-0.0024445701856166124, 0.002855407539755106,...",[2018.175]
4,31fg5ma4zjh37mcqzto3xt2sxc3a,"[0.007611020002514124, 0.00913164857774973, 0....","[0.0019403230398893356, -0.0005995116662234068...","[0.007611020002514124, 0.00913164857774973, 0....",[2019.0]


In [33]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
project

2025-01-02 14:21:48,708 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-02 14:21:48,734 INFO: Initializing external client
2025-01-02 14:21:48,735 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-02 14:21:49,882 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


Project('id2223_final_project', 'minifixio@gmail.com', 'Default project')

In [34]:
fs = project.get_feature_store()

In [35]:
feature_store = project.get_feature_store()
feature_group = feature_store.get_or_create_feature_group(
    name="spotify_user_embeddings",
    version=1,
    primary_key=["user_id"],
    description="Spotify user embeddings based on playlists"
)
feature_group.insert(df_embeddings)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1208515/fs/1195121/fg/1393458


Uploading Dataframe: 100.00% |██████████| Rows 6/6 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: spotify_user_embeddings_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1208515/jobs/named/spotify_user_embeddings_1_offline_fg_materialization/executions


(Job('spotify_user_embeddings_1_offline_fg_materialization', 'SPARK'), None)