In [27]:
from sklearn.preprocessing import normalize
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics.pairwise import cosine_similarity
import random
import numpy as np
import hopsworks

In [2]:
with open('../secrets/hopsworks_api_key.txt', 'r') as file:
    HOPSWORKS_API_KEY = file.readline().strip()

with open('../secrets/spotify_client_id.txt', 'r') as file:
    SPOTIFY_CLIENT_ID = file.readline().strip()

with open('../secrets/spotify_client_secret.txt', 'r') as file:
    SPOTIFY_CLIENT_SECRET = file.readline().strip()

In [5]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store() 

2025-01-02 16:49:00,462 INFO: Initializing external client
2025-01-02 16:49:00,463 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-02 16:49:12,131 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1208515


In [8]:
user_embeddings_fg = fs.get_feature_group(
    name='spotify_user_embeddings',
    version=1,
)

user_embeddings_df = user_embeddings_fg.read()
user_embeddings_df.head()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.61s) 


Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding
0,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.0009209662675857544, 0.0001077811539289541...","[0.0018118077423423529, 0.0005614424007944763,...","[-0.0012177809840068221, 0.0001425175287295133...",[2019.8850574712644]
1,31imc4msmvetbl26gly5n55jbkka,"[-0.0036893084179610014, 0.002161552431061864,...","[0.003511666087433696, 0.007666133344173431, 0...","[-0.0036893084179610014, 0.002161552431061864,...",[2018.0]
2,31tgsl3dejcqihle3pv7o6eeng2a,"[5.25292671227362e-05, 0.00023966847220435739,...","[-0.0045182956382632256, 0.004658684134483337,...","[8.820458606351167e-05, 0.00040243961848318577...",[2019.9066666666668]
3,31fg5ma4zjh37mcqzto3xt2sxc3a,"[0.007611020002514124, 0.00913164857774973, 0....","[0.0019403230398893356, -0.0005995116662234068...","[0.007611020002514124, 0.00913164857774973, 0....",[2019.0]
4,31frxab22c2ez34gnfggtqqsnope,"[-0.0012761030811816454, 0.001490566530264914,...","[-0.0010105168912559748, 0.000450886000180617,...","[-0.0024445701856166124, 0.002855407539755106,...",[2018.175]


In [18]:
user_embeddings_df['full_embedding'] = user_embeddings_df.apply(
    lambda row: np.concatenate(
        [row['genre_embedding'], row['artist_embedding'], row['playlist_embedding'], row['release_year_embedding']]
    ),
    axis=1
)
normalized_embeddings = normalize(np.array(user_embeddings_df['full_embedding'].tolist()))
user_embeddings_df['normalized_embedding'] = normalized_embeddings.tolist()
user_embeddings_df

Unnamed: 0,user_id,genre_embedding,artist_embedding,playlist_embedding,release_year_embedding,full_embedding,normalized_embedding
0,31h7ml3xiavflj5n7d4av5u5xaie,"[-0.0009209662675857544, 0.0001077811539289541...","[0.0018118077423423529, 0.0005614424007944763,...","[-0.0012177809840068221, 0.0001425175287295133...",[2019.8850574712644],"[-0.0009209662675857544, 0.0001077811539289541...","[-4.559498393319837e-07, 5.336004319218629e-08..."
1,31imc4msmvetbl26gly5n55jbkka,"[-0.0036893084179610014, 0.002161552431061864,...","[0.003511666087433696, 0.007666133344173431, 0...","[-0.0036893084179610014, 0.002161552431061864,...",[2018.0],"[-0.0036893084179610014, 0.002161552431061864,...","[-1.8282004037257978e-06, 1.071135990665108e-0..."
2,31tgsl3dejcqihle3pv7o6eeng2a,"[5.25292671227362e-05, 0.00023966847220435739,...","[-0.0045182956382632256, 0.004658684134483337,...","[8.820458606351167e-05, 0.00040243961848318577...",[2019.9066666666668],"[5.25292671227362e-05, 0.00023966847220435739,...","[2.6005789250768823e-08, 1.1865324074743354e-0..."
3,31fg5ma4zjh37mcqzto3xt2sxc3a,"[0.007611020002514124, 0.00913164857774973, 0....","[0.0019403230398893356, -0.0005995116662234068...","[0.007611020002514124, 0.00913164857774973, 0....",[2019.0],"[0.007611020002514124, 0.00913164857774973, 0....","[3.7696978667723223e-06, 4.522857140342114e-06..."
4,31frxab22c2ez34gnfggtqqsnope,"[-0.0012761030811816454, 0.001490566530264914,...","[-0.0010105168912559748, 0.000450886000180617,...","[-0.0024445701856166124, 0.002855407539755106,...",[2018.175],"[-0.0012761030811816454, 0.001490566530264914,...","[-6.323054641243813e-07, 7.385714960069648e-07..."
5,vvzx3nq79szk5qfmkznpm230n,"[0.003747842973098159, 0.005087693687528372, 0...","[-0.007172995246946812, 0.00010589948215056211...","[0.006009918637573719, 0.008158460259437561, 0...",[2021.1666666666667],"[0.003747842973098159, 0.005087693687528372, 0...","[1.854296843351263e-06, 2.517206407109234e-06,..."


In [19]:
def build_user_tower(input_dim, embedding_dim=128):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128, activation='relu')(x)
    user_embedding = layers.Dense(embedding_dim, activation=None)(x)  # Final user embedding
    return Model(inputs, user_embedding, name="UserTower")

def build_candidate_tower(input_dim, embedding_dim=128):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128, activation='relu')(x)
    candidate_embedding = layers.Dense(embedding_dim, activation=None)(x)  # Final candidate embedding
    return Model(inputs, candidate_embedding, name="CandidateTower")

In [30]:
# Instantiate towers
input_dim = len(normalized_embeddings[0])  # Dimensionality of the concatenated embedding
embedding_dim = 128

user_tower = build_user_tower(input_dim, embedding_dim)
candidate_tower = build_candidate_tower(input_dim, embedding_dim)

# Compute cosine similarity
user_embedding = user_tower.output
candidate_embedding = candidate_tower.output
cosine_similarity_model = tf.keras.layers.Dot(axes=1, normalize=True)([user_embedding, candidate_embedding])

# Final model
model = tf.keras.Model(inputs=[user_tower.input, candidate_tower.input], outputs=cosine_similarity_model)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy']
)

In [31]:
def generate_pairs(embeddings, similarity_threshold=0.8, negative_ratio=1):
    pairs = []
    labels = []

    # Compute cosine similarity for all pairs
    similarity_matrix = cosine_similarity(embeddings)  # This is a valid pairwise similarity matrix

    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            if similarity_matrix[i, j] > similarity_threshold:
                # Positive pair
                pairs.append((embeddings[i], embeddings[j]))
                labels.append(1)

                # Generate negative pairs
                for _ in range(negative_ratio):
                    negative_index = np.random.choice(len(embeddings))
                    while negative_index == i or negative_index == j:
                        negative_index = np.random.choice(len(embeddings))
                    pairs.append((embeddings[i], embeddings[negative_index]))
                    labels.append(0)
    
    return np.array(pairs), np.array(labels)

In [32]:
print(type(normalized_embeddings))

# Generate training data
pairs, labels = generate_pairs(normalized_embeddings)
user_1 = np.array([pair[0] for pair in pairs])
user_2 = np.array([pair[1] for pair in pairs])

history = model.fit(
    [user_1, user_2],  # Input: pairs of user embeddings
    labels,            # Output: similarity labels
    batch_size=32,
    epochs=10,
    validation_split=0.2
)


<class 'numpy.ndarray'>
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 976ms/step - accuracy: 0.5000 - loss: 8.0614 - val_accuracy: 0.5000 - val_loss: 8.0590
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.5000 - loss: 8.0590 - val_accuracy: 0.5000 - val_loss: 8.0590
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.5000 - loss: 8.0590 - val_accuracy: 0.5000 - val_loss: 8.0590
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.5000 - loss: 8.0590 - val_accuracy: 0.5000 - val_loss: 8.0590
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.5000 - loss: 8.0590 - val_accuracy: 0.5000 - val_loss: 8.0590
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.5000 - loss: 8.0590 - val_accuracy: 0.5000 - val_loss: 8.0590
Epoch 7/10
[1m1/1[0

In [35]:
mr = project.get_model_registry()
model.save("two_tower_model.keras")

# Create a new model version
model_dir = "two_tower_model.keras"
model_name = "two_tower_recommender"

model_registry = mr.python.create_model(
    name=model_name,
    metrics={"accuracy": history.history["accuracy"][-1]},  # Log the final accuracy
    description="Two-Tower Recommender Model for User Similarity",
)

model_registry.save(model_dir)
print(f"Model '{model_name}' uploaded to Hopsworks!")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/3094525 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1208515/models/two_tower_recommender/1
Model 'two_tower_recommender' uploaded to Hopsworks!
