In [51]:
from sklearn.preprocessing import normalize
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics.pairwise import cosine_similarity
import random
import numpy as np
import hopsworks

In [52]:
with open('../secrets/hopsworks_api_key.txt', 'r') as file:
    HOPSWORKS_API_KEY = file.readline().strip()

In [53]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store() 

2025-01-03 10:59:25,120 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-03 10:59:25,234 INFO: Initializing external client
2025-01-03 10:59:25,239 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-03 10:59:27,145 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [54]:
user_embeddings_fg = fs.get_feature_group(
    name='spotify_user_embeddings',
    version=2,
)

user_embeddings_df = user_embeddings_fg.read()
user_embeddings_df.head()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.41s) 


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31imc4msmvetbl26gly5n55jbkka,"[0.1219930648803711, 0.1214483454823494, -0.27...","[0.28332215547561646, -0.15212738513946533, -0...","[0.1219930648803711, 0.1214483454823494, -0.27...",[2018.0]
1,31tgsl3dejcqihle3pv7o6eeng2a,"[-0.3863714933395386, -0.5621631145477295, 0.1...","[0.3942939341068268, -0.33002883195877075, 0.3...","[-0.7064403295516968, -0.9494979381561279, 0.1...",[2019.9066666666668]
2,31fg5ma4zjh37mcqzto3xt2sxc3a,"[-0.31446775794029236, -0.2531762421131134, 0....","[-0.29996415972709656, 0.5447441339492798, 0.0...","[-0.31446775794029236, -0.2531762421131134, 0....",[2019.0]
3,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.3711507022380829, -0.19814574718475342, -0...","[-0.09817744046449661, 0.1808864027261734, -0....","[-0.5167202949523926, -0.22468358278274536, -0...",[2019.8850574712644]
4,31frxab22c2ez34gnfggtqqsnope,"[0.14477893710136414, 0.14487089216709137, -0....","[0.05995742231607437, 0.09421724081039429, -0....","[0.2765352241694927, 0.26221722178161144, -0.5...",[2018.175]


In [55]:
user_embeddings_df['full_embedding'] = user_embeddings_df.apply(
    lambda row: np.concatenate(
        [row['genre_embedding'], row['artist_embedding'], row['playlist_embedding'], row['release_year_embedding']]
    ),
    axis=1
)
normalized_embeddings = normalize(np.array(user_embeddings_df['full_embedding'].tolist()))
user_embeddings_df['normalized_embedding'] = normalized_embeddings.tolist()
user_embeddings_df

Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding,full_embedding,normalized_embedding
0,31imc4msmvetbl26gly5n55jbkka,"[0.1219930648803711, 0.1214483454823494, -0.27...","[0.28332215547561646, -0.15212738513946533, -0...","[0.1219930648803711, 0.1214483454823494, -0.27...",[2018.0],"[0.1219930648803711, 0.1214483454823494, -0.27...","[6.0451476847805844e-05, 6.018155091299676e-05..."
1,31tgsl3dejcqihle3pv7o6eeng2a,"[-0.3863714933395386, -0.5621631145477295, 0.1...","[0.3942939341068268, -0.33002883195877075, 0.3...","[-0.7064403295516968, -0.9494979381561279, 0.1...",[2019.9066666666668],"[-0.3863714933395386, -0.5621631145477295, 0.1...","[-0.00019127666431846473, -0.00027830387905733..."
2,31fg5ma4zjh37mcqzto3xt2sxc3a,"[-0.31446775794029236, -0.2531762421131134, 0....","[-0.29996415972709656, 0.5447441339492798, 0.0...","[-0.31446775794029236, -0.2531762421131134, 0....",[2019.0],"[-0.31446775794029236, -0.2531762421131134, 0....","[-0.0001557508143566418, -0.000125394113988495..."
3,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.3711507022380829, -0.19814574718475342, -0...","[-0.09817744046449661, 0.1808864027261734, -0....","[-0.5167202949523926, -0.22468358278274536, -0...",[2019.8850574712644],"[-0.3711507022380829, -0.19814574718475342, -0...","[-0.00018374682362942734, -9.809667997751179e-..."
4,31frxab22c2ez34gnfggtqqsnope,"[0.14477893710136414, 0.14487089216709137, -0....","[0.05995742231607437, 0.09421724081039429, -0....","[0.2765352241694927, 0.26221722178161144, -0.5...",[2018.175],"[0.14477893710136414, 0.14487089216709137, -0....","[7.17360648309936e-05, 7.178162736024418e-05, ..."
5,vvzx3nq79szk5qfmkznpm230n,"[-0.05180082842707634, -0.16555051505565643, 0...","[-0.1893274188041687, 0.4208906590938568, -0.4...","[-0.10432210564613342, -0.23293562233448029, 0...",[2021.1666666666667],"[-0.05180082842707634, -0.16555051505565643, 0...","[-2.562872517099519e-05, -8.190696521873343e-0..."


In [56]:
def build_user_tower(input_dim, embedding_dim=128):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128, activation='relu')(x)
    user_embedding = layers.Dense(embedding_dim, activation=None)(x)  # Final user embedding
    return Model(inputs, user_embedding, name="UserTower")

def build_candidate_tower(input_dim, embedding_dim=128):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128, activation='relu')(x)
    candidate_embedding = layers.Dense(embedding_dim, activation=None)(x)  # Final candidate embedding
    return Model(inputs, candidate_embedding, name="CandidateTower")

In [57]:
# Instantiate towers
input_dim = len(normalized_embeddings[0])  # Dimensionality of the concatenated embedding
embedding_dim = 128

user_tower = build_user_tower(input_dim, embedding_dim)
candidate_tower = build_candidate_tower(input_dim, embedding_dim)

# Compute cosine similarity
user_embedding = user_tower.output
candidate_embedding = candidate_tower.output
cosine_similarity_model = tf.keras.layers.Dot(axes=1, normalize=True)([user_embedding, candidate_embedding])

# Final model
model = tf.keras.Model(inputs=[user_tower.input, candidate_tower.input], outputs=cosine_similarity_model)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy']
)

In [58]:
def generate_pairs(embeddings, similarity_threshold=0.8, negative_ratio=1):
    pairs = []
    labels = []

    # Compute cosine similarity for all pairs
    similarity_matrix = cosine_similarity(embeddings)  # This is a valid pairwise similarity matrix

    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            if similarity_matrix[i, j] > similarity_threshold:
                # Positive pair
                pairs.append((embeddings[i], embeddings[j]))
                labels.append(1)

                # Generate negative pairs
                for _ in range(negative_ratio):
                    negative_index = np.random.choice(len(embeddings))
                    while negative_index == i or negative_index == j:
                        negative_index = np.random.choice(len(embeddings))
                    pairs.append((embeddings[i], embeddings[negative_index]))
                    labels.append(0)
    
    return np.array(pairs), np.array(labels)

In [59]:
# Generate training data
pairs, labels = generate_pairs(normalized_embeddings)
user_1 = np.array([pair[0] for pair in pairs])
user_2 = np.array([pair[1] for pair in pairs])

history = model.fit(
    [user_1, user_2],  # Input: pairs of user embeddings
    labels,            # Output: similarity labels
    batch_size=32,
    epochs=10,
    validation_split=0.2
)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5000 - loss: 1.4633 - val_accuracy: 0.5000 - val_loss: 0.6952
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.5000 - loss: 0.7098 - val_accuracy: 0.5000 - val_loss: 0.7249
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.4167 - loss: 0.6920 - val_accuracy: 0.5000 - val_loss: 0.7668
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.5000 - loss: 0.7346 - val_accuracy: 0.5000 - val_loss: 0.7772
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.5000 - loss: 0.7351 - val_accuracy: 0.5000 - val_loss: 0.7650
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.5000 - loss: 0.7365 - val_accuracy: 0.5000 - val_loss: 0.7436
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━

In [60]:
mr = project.get_model_registry()
model.save("two_tower_model.keras")

# Create a new model version
model_dir = "two_tower_model.keras"
model_name = "two_tower_recommender"

model_registry = mr.python.create_model(
    name=model_name,
    metrics={"accuracy": history.history["accuracy"][-1]},  # Log the final accuracy
    description="Two-Tower Recommender Model for User Similarity",
)

model_registry.save(model_dir)
print(f"Model '{model_name}' uploaded to Hopsworks!")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/8329233 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1208515/models/two_tower_recommender/3
Model 'two_tower_recommender' uploaded to Hopsworks!
