In [98]:
import numpy as np
import tensorflow as tf
import hopsworks
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pickle
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import time
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Model

In [99]:
# Load credentials: every secret is stored in its own text file and only the
# first line of that file is used (surrounding whitespace is stripped).
def _read_secret(path):
    """Return the first line of the file at ``path`` without surrounding whitespace."""
    with open(path, 'r') as handle:
        return handle.readline().strip()


HOPSWORKS_API_KEY = _read_secret('../secrets/hopsworks_api_key.txt')
SPOTIFY_CLIENT_ID = _read_secret('../secrets/spotify_client_id.txt')
SPOTIFY_CLIENT_SECRET = _read_secret('../secrets/spotify_client_secret.txt')

In [100]:
# Build the Spotify API client from the client id / secret loaded above.
# `sp` is the handle the later cells use for every Spotify request.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [101]:
# Connect to the Hopsworks project and get a handle to its feature store.
# `hopsworks.login` is given the API key that was read from the secrets file;
# `project` is reused later to reach the model registry, `fs` to read feature groups.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

2025-01-09 18:10:07,864 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-09 18:10:07,880 INFO: Initializing external client
2025-01-09 18:10:07,881 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-09 18:10:09,351 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [102]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch

class EmbeddingDataset(Dataset):
    """Dataset wrapper around a dataframe holding one row of embeddings per user.

    Indexing returns the tuple
    ``(user_id, genre_embedding, artist_embedding, playlist_embedding)``
    taken from the row at the requested position.
    """

    # Columns returned by ``__getitem__``, in output order.
    _ITEM_COLUMNS = ('user_id', 'genre_embedding', 'artist_embedding', 'playlist_embedding')

    def __init__(self, dataframe):
        # The dataframe is stored by reference, not copied.
        self.data = dataframe

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the four item fields of the row at position ``idx``."""
        # Positional lookup; the row is materialised once and reused for each column.
        row = self.data.iloc[idx]
        return tuple(row[column] for column in self._ITEM_COLUMNS)

class Tower(nn.Module):
    """Small feed-forward block: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Size of the last dimension of the output tensor.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_dim = 128  # width of the single hidden layer
        # The attribute name `fc` and the layer order determine the state_dict
        # keys (fc.0.*, fc.2.*), so both are kept exactly as they were.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the feed-forward stack and return the result."""
        projected = self.fc(x)
        return projected

class TwoTowerModel(nn.Module):
    """Combine genre, artist and playlist embeddings into one user embedding.

    Each of the three inputs goes through its own ``Tower``; the three tower
    outputs are concatenated along the last dimension and passed through a
    merge network that produces a vector of size ``output_dim``.

    Args:
        embedding_dim: Size of the last dimension of each input embedding.
        output_dim: Size of each tower output and of the final embedding.
    """

    def __init__(self, embedding_dim, output_dim):
        super().__init__()
        # One tower per embedding type. Attribute names and creation order are
        # kept because they define the keys of the saved state_dict.
        self.genre_fc = Tower(input_dim=embedding_dim, output_dim=output_dim)
        self.artist_fc = Tower(input_dim=embedding_dim, output_dim=output_dim)
        self.playlist_fc = Tower(input_dim=embedding_dim, output_dim=output_dim)

        # Merge network: takes the three concatenated tower outputs.
        merge_layers = [
            nn.Linear(output_dim * 3, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
        ]
        self.fc_merge = nn.Sequential(*merge_layers)

    def forward(self, genre, artist, playlist):
        """Return the merged embedding for the given genre/artist/playlist inputs."""
        tower_outputs = (
            self.genre_fc(genre),
            self.artist_fc(artist),
            self.playlist_fc(playlist),
        )
        # Concatenate on the feature axis, then project down to `output_dim`.
        return self.fc_merge(torch.cat(tower_outputs, dim=-1))

    def compute_similarity(self, query_embedding, database_embedding):
        """Cosine similarity between ``query_embedding`` and ``database_embedding``."""
        cosine = torch.nn.functional.cosine_similarity
        return cosine(query_embedding, database_embedding)

In [103]:
import os
mr = project.get_model_registry()

# Retrieve the PyTorch model entry from the model registry and download its files
# to a local directory.
model_registry = mr.get_model("two_tower_model_torch", version=1)  # Adjust version as needed
model_file_path = model_registry.download()

# Load the checkpoint onto the CPU. Without `map_location`, torch tries to restore
# every tensor on the device it was saved from, which fails when that device
# (e.g. CUDA or MPS) is not available on the machine running this notebook.
checkpoint = torch.load(
    os.path.join(model_file_path, 'two_tower_model_torch.pth'),
    map_location='cpu',
)

# Recreate the model architecture from the hyper-parameters stored in the checkpoint
model = TwoTowerModel(
    embedding_dim=checkpoint['embedding_dim'],
    output_dim=checkpoint['output_dim']
)

# Copy the trained weights into the freshly built model
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # Set to evaluation mode

print("Model loaded successfully!")

Model loaded successfully! (0 dirs, 2 files)... DONE


In [104]:
# Read the stored user embeddings from the feature store.
user_embeddings_fg = fs.get_feature_group(name="spotify_user_embeddings", version=2)
user_embeddings_df = user_embeddings_fg.read()


def _to_float_tensor(values):
    """Convert one dataframe cell into a float32 torch tensor."""
    return torch.tensor(values, dtype=torch.float)


# Vector-valued columns: each cell is converted as-is.
for _column in ('genre_embedding', 'artist_embedding', 'playlist_embedding'):
    user_embeddings_df[_column] = user_embeddings_df[_column].apply(_to_float_tensor)

# The release-year cell is wrapped in a list first, giving a one-element tensor per row.
user_embeddings_df['release_year_embedding'] = user_embeddings_df['release_year_embedding'].apply(
    lambda year: _to_float_tensor([year])
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.50s) 



In [105]:
def get_embeddings(genres, artists, model):
    """
    Encode genres and artists in a single ``model.encode`` call.

    Args:
        genres: List of genre strings.
        artists: List of artist strings.
        model: Encoder exposing ``encode(inputs, show_progress_bar=...)``.

    Returns:
        ``(genre_embeddings, artist_embeddings)``: the leading ``len(genres)``
        entries of the encoder output and the remaining entries, respectively.
    """
    genre_count = len(genres)

    # One batch for both kinds of text; genres come first so the output can be
    # split by position afterwards.
    encoded = model.encode(genres + artists, show_progress_bar=True)

    return encoded[:genre_count], encoded[genre_count:]

In [106]:
def generate_user_embedding(user_playlists, transformer_model, top_artist_count, playlists_count):
    """Build genre, artist, playlist and release-year embeddings for one user.

    For each of the first ``playlists_count`` playlists the tracks are fetched
    through the module-level Spotify client ``sp``. The first artist of every
    usable track is looked up in batches, and the genres of those artists are
    encoded with ``transformer_model``.

    Malformed items (null track, local track, missing album / release date,
    artist without id, null artist in the lookup response) are skipped instead
    of raising.

    Args:
        user_playlists: Sequence of playlist dicts; each needs an ``"id"`` key
            and may carry a ``"name"`` key.
        transformer_model: Encoder exposing
            ``encode(list_of_str, show_progress_bar=...)``.
        top_artist_count: How many of the most frequent artist names to encode.
        playlists_count: Maximum number of playlists to process.

    Returns:
        Tuple ``(genre_embedding, artist_embedding, playlist_embedding,
        release_year_embedding)``. The first three are the mean (or weighted
        sum) of encoder outputs, or a zero vector of length 384 when there is
        nothing to encode. The last is a one-element array holding the mean
        release year, or a single zero when no year was collected.
    """
    print("Generating user embedding...")

    embedding_dim = 384  # size of the zero vector used when there is nothing to encode
    artist_batch_size = 50  # number of artist ids sent per `sp.artists` call

    all_genres = []
    all_artists = []
    all_release_years = []
    playlist_features = []
    per_playlist_genre_embeddings = []  # one genre vector per processed playlist

    for playlist in user_playlists[:playlists_count]:  # Limit to first playlists_count playlists
        if not playlist:
            # Guard against null entries in the playlist listing.
            continue
        playlist_name = playlist.get("name", "Unknown")
        playlist_id = playlist["id"]

        # NOTE(review): only the items of this single response are used; further
        # pages, if the API returns any, are not requested.
        tracks = sp.playlist_tracks(playlist_id)["items"]

        genres = []
        popularity = []
        release_years = []
        explicit_flags = []
        artist_names = []
        artist_ids = []

        # Collect the first artist id of every usable track for the batch lookup below.
        for item in tracks:
            track = item.get("track") if item else None
            if not track or track.get("is_local"):
                continue

            # Only the first artist is considered, for simplicity.
            track_artists = track.get("artists") or []
            first_artist = track_artists[0] if track_artists else None
            first_artist_id = first_artist.get("id") if first_artist else None
            if first_artist_id:
                artist_ids.append(first_artist_id)

            # The year is the part of the release date before the first '-'.
            release_date = (track.get("album") or {}).get("release_date") or ""
            release_year = release_date.split('-')[0]
            if release_year.isdigit():
                release_years.append(int(release_year))

            popularity.append(track.get("popularity") or 0)
            explicit_flags.append(bool(track.get("explicit")))

        # Look the artists up in batches. Ids are deliberately not de-duplicated,
        # so an artist contributes once per track it appears on.
        artist_info = []
        for start in range(0, len(artist_ids), artist_batch_size):
            response = sp.artists(artist_ids[start:start + artist_batch_size])
            artist_info.extend(response["artists"])

        for artist in artist_info:
            if not artist:
                # Skip null entries in the lookup response.
                continue
            artist_names.append(artist.get("name", "Unknown"))
            genres.extend(artist.get("genres") or [])

        # Per-playlist genre embedding: mean of the encoded genre strings.
        if genres:
            genre_embeddings = transformer_model.encode(genres, show_progress_bar=False)
            playlist_genre_embedding = np.mean(genre_embeddings, axis=0)
        else:
            playlist_genre_embedding = np.zeros(embedding_dim)

        per_playlist_genre_embeddings.append(playlist_genre_embedding)

        # Playlist-level features; `num_tracks` counts every returned item,
        # including the ones skipped above.
        playlist_features.append({
            "playlist_name": playlist_name,
            "num_tracks": len(tracks),
            "avg_popularity": np.mean(popularity) if popularity else 0,
            "explicit_ratio": np.mean(explicit_flags) if explicit_flags else 0
        })

        all_genres.extend(genres)
        all_artists.extend(artist_names)
        all_release_years.extend(release_years)

    # Combine the per-playlist genre embeddings, weighted by playlist size.
    if per_playlist_genre_embeddings:
        playlist_sizes = [features["num_tracks"] for features in playlist_features]
        # NOTE(review): `normalize` is called with its defaults here, so the weights
        # are not guaranteed to sum to 1. Presumably the stored user embeddings were
        # built with the same weighting - confirm before changing it.
        playlist_weights = normalize(np.array(playlist_sizes).reshape(1, -1))[0]
        playlist_embedding = np.sum(
            [weight * vector for weight, vector in zip(playlist_weights, per_playlist_genre_embeddings)],
            axis=0
        )
    else:
        playlist_embedding = np.zeros(embedding_dim)

    # Generate overall artist and genre embeddings
    print("Generating contextual embeddings...")

    # Genre embedding: mean over every collected genre string.
    if all_genres:
        genre_embedding = np.mean(transformer_model.encode(all_genres, show_progress_bar=False), axis=0)
    else:
        genre_embedding = np.zeros(embedding_dim)

    # Artist embedding: mean over the names of the most frequent artists.
    top_artists = [name for name, _ in Counter(all_artists).most_common(top_artist_count)]
    if top_artists:
        artist_embedding = np.mean(transformer_model.encode(top_artists, show_progress_bar=False), axis=0)
    else:
        artist_embedding = np.zeros(embedding_dim)

    # Release year embedding: a single number, the mean release year.
    release_year_embedding = np.array([np.mean(all_release_years)]) if all_release_years else np.zeros(1)

    print("User embedding generated successfully!")
    print("Genre embedding shape:", genre_embedding.shape)
    print("Artist embedding shape:", artist_embedding.shape)
    print("Playlist embedding shape:", playlist_embedding.shape)
    print("Release year embedding shape:", release_year_embedding.shape)

    # Return individual embeddings
    return genre_embedding, artist_embedding, playlist_embedding, release_year_embedding


In [107]:
# Inference function
def get_best_matching_users(user_id, transformer_model, top_artist_count, playlists_count, top_k):
    """Find the stored users whose embeddings are most similar to ``user_id``.

    The user's playlists are fetched through the module-level Spotify client
    ``sp`` and turned into embeddings with ``generate_user_embedding``. The
    module-level ``model`` maps both the query and every row of
    ``user_embeddings_df`` to a final embedding, and rows are ranked by cosine
    similarity. Rows belonging to ``user_id`` itself are excluded from the result.

    Args:
        user_id: Spotify user id to find matches for.
        transformer_model: Text encoder forwarded to ``generate_user_embedding``.
        top_artist_count: Forwarded to ``generate_user_embedding``.
        playlists_count: Forwarded to ``generate_user_embedding``.
        top_k: Maximum number of similar users to return.

    Returns:
        ``None`` when the user has no playlists. Otherwise a tuple
        ``(rows, scores)``: ``rows`` is the slice of ``user_embeddings_df`` for
        the best matches, ordered from most to least similar, and ``scores`` is
        a 1-D tensor of the corresponding cosine similarities. Fewer than
        ``top_k`` entries are returned when the database is too small.
    """
    # Fetch user playlists
    playlists = sp.user_playlists(user_id)["items"]
    if not playlists:
        print(f"No playlists found for user {user_id}")
        return None

    # Generate the user's embedding (the release-year part is not used by the model)
    genre_embedding, artist_embedding, playlist_embedding, _release_year_embedding = generate_user_embedding(
        playlists, transformer_model, top_artist_count, playlists_count
    )

    # Convert to PyTorch tensors
    genre_embedding = torch.tensor(genre_embedding, dtype=torch.float)
    artist_embedding = torch.tensor(artist_embedding, dtype=torch.float)
    playlist_embedding = torch.tensor(playlist_embedding, dtype=torch.float)

    print("User embeddings generated successfully!")

    # Nothing to compare against: return an empty result instead of failing in torch.stack.
    if len(user_embeddings_df) == 0:
        return user_embeddings_df.iloc[[]], torch.empty(0)

    limit = max(int(top_k), 0)

    # Positions (not index labels) of the query user's own rows. The user may be
    # absent from the database, in which case this set is empty.
    own_positions = set(
        np.flatnonzero((user_embeddings_df['user_id'] == user_id).to_numpy()).tolist()
    )

    model.eval()
    with torch.no_grad():
        query_embedding = model(
            genre_embedding.unsqueeze(0),
            artist_embedding.unsqueeze(0),
            playlist_embedding.unsqueeze(0),
        )

        # Compute embeddings for all database entries
        db_genres = torch.stack(user_embeddings_df['genre_embedding'].tolist())
        db_artists = torch.stack(user_embeddings_df['artist_embedding'].tolist())
        db_playlists = torch.stack(user_embeddings_df['playlist_embedding'].tolist())
        db_embeddings = model(db_genres, db_artists, db_playlists)

        # Compute similarities
        similarities = torch.nn.functional.cosine_similarity(query_embedding, db_embeddings)

        # Ask for enough candidates that `limit` remain after dropping the user's
        # own rows, but never more than the database holds.
        candidate_count = min(limit + len(own_positions), similarities.shape[0])
        best = torch.topk(similarities, k=candidate_count)

        keep_mask = torch.tensor(
            [position not in own_positions for position in best.indices.tolist()],
            dtype=torch.bool,
            device=best.indices.device,
        )
        top_k_indices = best.indices[keep_mask][:limit]
        scores = best.values[keep_mask][:limit]

    return user_embeddings_df.iloc[top_k_indices.tolist()], scores

In [108]:
user_id = "minifixiowow"
top_artist_count = 5
playlists_count = 5

transformer_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # You can replace this with another model if needed

# Get the top K most similar users
top_k = 5
matches = get_best_matching_users(user_id, transformer_model, top_artist_count, playlists_count, top_k)

if matches is None:
    # The lookup returns None (after printing why) when the user has no playlists;
    # unpacking that directly would raise a TypeError.
    similar_users, similarity_scores = None, None
else:
    similar_users, similarity_scores = matches

    print(f"Top {top_k} similar users:")

    # Rows and scores are in the same order, so they can be walked in lockstep.
    for matched_user_id, score in zip(similar_users['user_id'], similarity_scores):
        print(matched_user_id, score.item())


2025-01-09 18:10:20,707 INFO: Use pytorch device_name: mps
2025-01-09 18:10:20,708 INFO: Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
Generating user embedding...

Generating contextual embeddings...
User embedding generated successfully!
Genre embedding shape: (384,)
Artist embedding shape: (384,)
Playlist embedding shape: (384,)
Release year embedding shape: (1,)
User embeddings generated successfully!
Top 5 similar users:
zsuska_82 0.9920231699943542
11182303216 0.9918758273124695
ufmshw1g5mvuo04vs3bda3amp 0.991561770439148
31qr5mtzfd3sdt5afqxthoyi7u5a 0.9897653460502625
11179757726 0.9894211292266846
