In [38]:
from sklearn.preprocessing import normalize
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import hopsworks
import os

In [39]:
if os.getenv('HOPSWORKS_API_KEY') is not None:
    HOPSWORKS_API_KEY = os.getenv('HOPSWORKS_API_KEY')
else:
    with open('../secrets/hopsworks_api_key.txt', 'r') as file:
        HOPSWORKS_API_KEY = file.readline().strip()

In [40]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store() 

2025-01-09 18:14:51,945 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-09 18:14:52,068 INFO: Initializing external client
2025-01-09 18:14:52,069 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-09 18:14:53,453 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [41]:
user_embeddings_fg = fs.get_feature_group(
    name='spotify_user_embeddings',
    version=2,
)

user_embeddings_df = user_embeddings_fg.read()
print(f"A total of {len(user_embeddings_df)} user embeddings are available.")
user_embeddings_df.head()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.83s) 
A total of 192 user embeddings are available.


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31imc4msmvetbl26gly5n55jbkka,"[0.1219930648803711, 0.1214483454823494, -0.27...","[0.28332215547561646, -0.15212738513946533, -0...","[0.1219930648803711, 0.1214483454823494, -0.27...",[2018.0]
1,31tgsl3dejcqihle3pv7o6eeng2a,"[-0.3863714933395386, -0.5621631145477295, 0.1...","[0.3942939341068268, -0.33002883195877075, 0.3...","[-0.7064403295516968, -0.9494979381561279, 0.1...",[2019.9066666666668]
2,31fg5ma4zjh37mcqzto3xt2sxc3a,"[-0.31446775794029236, -0.2531762421131134, 0....","[-0.29996415972709656, 0.5447441339492798, 0.0...","[-0.31446775794029236, -0.2531762421131134, 0....",[2019.0]
3,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.3711507022380829, -0.19814574718475342, -0...","[-0.09817744046449661, 0.1808864027261734, -0....","[-0.5167202949523926, -0.22468358278274536, -0...",[2019.8850574712644]
4,31frxab22c2ez34gnfggtqqsnope,"[0.14112409949302673, 0.12898339331150055, -0....","[-0.010701656341552734, 0.32390376925468445, -...","[0.26713827252388, 0.21868111193180084, -0.577...",[2017.2272727272727]


In [42]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import json
import torch.nn as nn

In [43]:
user_embeddings_df['genre_embedding'] = user_embeddings_df['genre_embedding'].apply(
    lambda x: torch.tensor(x, dtype=torch.float)
)
user_embeddings_df['artist_embedding'] = user_embeddings_df['artist_embedding'].apply(
    lambda x: torch.tensor(x, dtype=torch.float)
)
user_embeddings_df['playlist_embedding'] = user_embeddings_df['playlist_embedding'].apply(
    lambda x: torch.tensor(x, dtype=torch.float)
)
user_embeddings_df['release_year_embedding'] = user_embeddings_df['release_year_embedding'].apply(lambda x: torch.tensor([x], dtype=torch.float))




In [44]:
class EmbeddingDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        user_id = self.data.iloc[idx]['user_id']
        genre_embedding = self.data.iloc[idx]['genre_embedding']
        artist_embedding = self.data.iloc[idx]['artist_embedding']
        playlist_embedding = self.data.iloc[idx]['playlist_embedding']
        return user_id, genre_embedding, artist_embedding, playlist_embedding


class Tower(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Tower, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

class TwoTowerModel(nn.Module):
    def __init__(self, embedding_dim, output_dim):
        super(TwoTowerModel, self).__init__()
        # Separate processing for each embedding type
        self.genre_fc = Tower(input_dim=embedding_dim, output_dim=output_dim)
        self.artist_fc = Tower(input_dim=embedding_dim, output_dim=output_dim)
        self.playlist_fc = Tower(input_dim=embedding_dim, output_dim=output_dim)
        
        # Joint Tower for final embedding
        self.fc_merge = nn.Sequential(
            nn.Linear(output_dim * 3, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )
        
    def forward(self, genre, artist, playlist):
        genre_embed = self.genre_fc(genre)
        artist_embed = self.artist_fc(artist)
        playlist_embed = self.playlist_fc(playlist)
        
        # Concatenate embeddings and pass through final layers
        combined = torch.cat([genre_embed, artist_embed, playlist_embed], dim=-1)
        final_embed = self.fc_merge(combined)
        return final_embed

    def compute_similarity(self, query_embedding, database_embedding):
        # Cosine similarity for comparison
        return torch.nn.functional.cosine_similarity(query_embedding, database_embedding)



In [45]:
# Initialize model
df = user_embeddings_df
print(df['genre_embedding'][0].shape)
embedding_dim = len(df['genre_embedding'][0])  # Assuming all embeddings have the same dimension
output_dim = 64
margin = 0.5

model = TwoTowerModel(embedding_dim=embedding_dim, output_dim=output_dim)
criterion = nn.CosineEmbeddingLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1/10e9, weight_decay=1e-5)

# Prepare dataset and dataloader
train_dataset = EmbeddingDataset(df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Dummy negative sampling (replace with actual negatives)
def negative_sample(batch_size, df):
    # Randomly sample other embeddings as negatives
    sampled = df.sample(batch_size)  # Ensure the sample size matches the batch size
    return torch.stack(sampled['genre_embedding'].tolist()), \
           torch.stack(sampled['artist_embedding'].tolist()), \
           torch.stack(sampled['playlist_embedding'].tolist())


# Training loop
for epoch in range(100):
    total_loss = 0
    for user_ids, genres, artists, playlists in train_loader:
        # Ensure embeddings are converted to tensors
        genres = torch.tensor(genres.tolist(), dtype=torch.float32)
        artists = torch.tensor(artists.tolist(), dtype=torch.float32)
        playlists = torch.tensor(playlists.tolist(), dtype=torch.float32)
        
        # Generate negative samples
        neg_genres, neg_artists, neg_playlists = negative_sample(len(genres), df)
        
        # Forward pass for positives and negatives
        positive_embed = model(genres, artists, playlists)
        negative_embed = model(neg_genres, neg_artists, neg_playlists)
        
        # Create labels for the current batch size
        labels = torch.ones(positive_embed.size(0))
        
        # Calculate loss - note we're using just one pair of embeddings and their labels
        loss = criterion(positive_embed, negative_embed, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")


torch.Size([384])
Epoch 1, Loss: 0.14455203711986542
Epoch 2, Loss: 0.13911998830735683
Epoch 3, Loss: 0.146803455427289
Epoch 4, Loss: 0.13988189585506916
Epoch 5, Loss: 0.14928926154971123
Epoch 6, Loss: 0.14081386663019657
Epoch 7, Loss: 0.14487585425376892
Epoch 8, Loss: 0.1427348032593727
Epoch 9, Loss: 0.14560957998037338
Epoch 10, Loss: 0.147005470469594
Epoch 11, Loss: 0.1354917697608471
Epoch 12, Loss: 0.13710320927202702
Epoch 13, Loss: 0.1370595544576645
Epoch 14, Loss: 0.14349603280425072
Epoch 15, Loss: 0.13999547064304352
Epoch 16, Loss: 0.1417810656130314
Epoch 17, Loss: 0.14545726589858532
Epoch 18, Loss: 0.14155980199575424
Epoch 19, Loss: 0.14047716185450554
Epoch 20, Loss: 0.13836096972227097
Epoch 21, Loss: 0.1461825668811798
Epoch 22, Loss: 0.1480455193668604
Epoch 23, Loss: 0.1432174388319254
Epoch 24, Loss: 0.14932548627257347
Epoch 25, Loss: 0.1438207607716322
Epoch 26, Loss: 0.14593925885856152
Epoch 27, Loss: 0.14979504235088825
Epoch 28, Loss: 0.1427450478076

In [50]:
def find_similar_embedding(query_genre, query_artist, query_playlist, database, model, top_k=5):
    model.eval()
    with torch.no_grad():
        query_embedding = model(query_genre.unsqueeze(0), query_artist.unsqueeze(0), query_playlist.unsqueeze(0))
        
        # Compute embeddings for all database entries
        db_genres = torch.stack(database['genre_embedding'].tolist())
        db_artists = torch.stack(database['artist_embedding'].tolist())
        db_playlists = torch.stack(database['playlist_embedding'].tolist())
        db_embeddings = model(db_genres, db_artists, db_playlists)
        
        # Compute similarities
        similarities = torch.nn.functional.cosine_similarity(query_embedding, db_embeddings)
        top_k_indices = torch.topk(similarities, k=top_k).indices
        return top_k_indices, similarities[top_k_indices]

# Example usage
index_to_query = 5
query_genre = df['genre_embedding'][index_to_query]
query_artist = df['artist_embedding'][index_to_query]
query_playlist = df['playlist_embedding'][index_to_query]

top_k_indices, scores = find_similar_embedding(query_genre, query_artist, query_playlist, df, model)
# Remove index_to_query from the top_k_indices and also remove its score
index_to_query_index = np.where(top_k_indices == index_to_query)[0][0]
top_k_indices = np.delete(top_k_indices, index_to_query_index)
scores = np.delete(scores, index_to_query_index)

print("Top K Similar Embeddings:", top_k_indices)
print("Similarity Scores:", scores)

Top K Similar Embeddings: tensor([ 19, 173,   8, 121])
Similarity Scores: tensor([0.9877, 0.9866, 0.9846, 0.9837])


In [47]:
model_dir = "torch_model"
os.makedirs(model_dir, exist_ok=True)

# Save the model and metadata
torch.save({
    'model_state_dict': model.state_dict(),
    'embedding_dim': embedding_dim,
    'output_dim': output_dim,
}, os.path.join(model_dir, 'two_tower_model_torch.pth'))

In [48]:
# Get the model registry handle
mr = project.get_model_registry()
model_registry = mr.get_model("two_tower_model_torch", version=1) 
model_registry.delete()

# Create the model metadata object
torch_model = mr.torch.create_model(
    name="two_tower_model_torch",
    metrics={'final_loss': total_loss},  # You can add your training metrics here
    description="Two-tower model for music recommendations",
    version=1,
    input_example={
        'genre_embedding': genres[0].numpy().tolist(),
        'artist_embedding': artists[0].numpy().tolist(),
        'playlist_embedding': playlists[0].numpy().tolist()
    }
)

# Save the model to the registry
torch_model.save(model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/828414 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/828294 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/24563 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1208515/models/two_tower_model_torch/1


Model(name: 'two_tower_model_torch', version: 1)