In [1]:
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import tensorflow_datasets as tfds
import numpy
from tensorflow.python.ops.gen_dataset_ops import shuffle_dataset
from sklearn.model_selection import train_test_split

dataset, info = tfds.load("movielens/100k-ratings", split=["train"], with_info=True)

data_set = dataset[0]

def preprocess_data(dataset):

    user_id = tf.strings.to_number(dataset["user_id"], out_type=tf.int32)
    movie_id = tf.strings.to_number(dataset["movie_id"], out_type=tf.int32)

    #casting user rating into 0 and 1 (0 means didn't rate this movie, 1 means did rate this movie)

    did_rate = tf.cast(dataset["user_rating"] > 0, tf.int64)

    features = {
        "user_id": user_id,
        "movie_id": movie_id,
        "user_rating":  did_rate
    }

    return features

data_set = data_set.map(preprocess_data)

print(len(data_set))

num_client = 5
userS_dataset = []

dataset_size = len(data_set)

for i in range(num_client):

    seed = numpy.random.seed(i)
    data_set = data_set.shuffle(buffer_size=dataset_size, seed= seed)
    userS_dataset.append(data_set)

data_set_1 =  data_set.shuffle(buffer_size=dataset_size, seed=42)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/incomplete.CW0OIF_0.1.1/movielens-train.tfrecord*..…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.
100000


In [2]:
#size of training data and testing data
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

#extract training data and testing data
train_data_1 = data_set_1.take(int(train_size))
test_data_1 = data_set_1.skip(int(train_size))

train_data = []
test_data = []

for i in range(num_client):

    train_data.append(userS_dataset[i].take(int(train_size)))
    test_data.append(userS_dataset[i].skip(int(train_size)))


In [3]:
userIDs = []
movieIDs = []
ratings = []

unique_user_movie_pair = set()
number = 0

for example in data_set:

    # Convert TensorFlow tensor to a NumPy value
    user_id = example["user_id"].numpy()
    movie_id = example["movie_id"].numpy()
    rating = example["user_rating"].numpy()

    # Add movie pair only if it is uniuqe
    if (user_id, movie_id) not in unique_user_movie_pair:
        userIDs.append(user_id)
        movieIDs.append(movie_id)
        ratings.append(rating)
        unique_user_movie_pair.add((user_id, movie_id))
        number += 1

print(f"Number of unique movie pairs: {number}")

# Convert lists to NumPy arrays for later processing
userIDs = numpy.array(userIDs)
movieIDs = numpy.array(movieIDs)
ratings = numpy.array(ratings)

movieIDs = numpy.unique(movieIDs)
userIDs = numpy.unique(userIDs)

#size of user and movie in training data
num_user = len(userIDs)
num_movie = len(movieIDs)
rating = len(ratings)

print(f"Number of unique users: {num_user}")
print(f"Number of unique movies: {num_movie}")
print(f"Number of ratings: {rating}")


Number of unique movie pairs: 100000
Number of unique users: 943
Number of unique movies: 1682
Number of ratings: 100000


In [4]:
triplets = list(zip(userIDs, movieIDs, ratings))

train_triplets, test_triplets = train_test_split(triplets, test_size=0.2, random_state=50)
train_matrix = numpy.zeros((num_user, num_movie), dtype=numpy.int32)
test_matrix = numpy.zeros((num_user, num_movie), dtype=numpy.int32)

def integrate_feature_into_matrix(userIDs, movieIDs, ratings, num_user, num_movie, matrix):

    # Create a 2D matrix filled with zeros
    # Populate the matrix
    for userID, movieID, rating in zip(userIDs, movieIDs, ratings):
        matrix[int(userID), int(movieID)] = rating

    return matrix

for user_id, movie_id, rating in train_triplets:
    train_matrix[user_id-1, movie_id-1] = rating

for user_id, movie_id, rating in test_triplets:
    test_matrix[user_id-1, movie_id-1] = rating

print(train_matrix.shape)
print(test_matrix.shape)


print(train_matrix.shape)
print(test_matrix.shape)

movie_popularity = numpy.sum(train_matrix, axis=0)
movie_popularity_normalized = movie_popularity / numpy.max(movie_popularity)
movie_label = movie_popularity_normalized.reshape((1, movie_popularity_normalized.shape[0]))

train_matrix = train_matrix.reshape((1, num_user, num_movie, 1))


(943, 1682)
(943, 1682)
(943, 1682)
(943, 1682)


In [5]:
# X_train_user, X_test_user, X_train_movie, X_test_movie, y_train, y_test = train_test_split(
#     userIDs, movieIDs, ratings, test_size=0.2, random_state=42
# )

In [6]:
def build_cnn_model(num_user, num_movie):

    regularization = l2(0.001)

    input_matrix = tf.keras.layers.Input(shape=(num_user, num_movie, 1), name="user_id")

    # First Convolution Block
    cnn_layer = tf.keras.layers.Conv2D(164, kernel_size=(3, 3),strides = (1,1), padding = "SAME", kernel_regularizer=regularization, use_bias = False)(input_matrix)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.Activation('relu')(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides = (1,1),)(cnn_layer)
    # cnn_layer = tf.keras.layers.Dropout(0.3)(cnn_layer)

    # Second Convolution Block
    cnn_layer = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), strides = (1,1), padding = "SAME", kernel_regularizer=regularization, use_bias = False)(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.SeparableConv2D(64, (3, 3), padding='same',  use_bias=False)(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.Activation('relu')(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides = (2,2),)(cnn_layer)
    cnn_layer = tf.keras.layers.Dropout(0.2)(cnn_layer)

    # Third Convolution Block
    cnn_layer = tf.keras.layers.Conv2D(64, kernel_size=(2, 2), strides = (1,1), padding = "SAME", kernel_regularizer=regularization, use_bias = False)(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.SeparableConv2D(64, (3, 3), padding='same', use_bias=False)(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.Activation('relu')(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(3, 3), strides = (2,2),)(cnn_layer)
    cnn_layer = tf.keras.layers.Dropout(0.2)(cnn_layer)

    # cnn_layer = tf.keras.layers.GlobalAveragePooling2D()(cnn_layer)
    # cnn_layer = tf.keras.layers.GlobalAveragePooling2D()(cnn_layer)

    #cnn_layer = tf.keras.layers.Dense(128, activation='relu')(cnn_layer)
    #cnn_layer = tf.keras.layers.Dense(64, activation='relu')(cnn_layer)

    # cnn_layer = tf.keras.layers.Reshape((1, 1, 64))(cnn_layer)

    cnn_layer = tf.keras.layers.Conv2DTranspose(64, kernel_size=4, strides=1, padding="SAME", activation="relu")(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides = (1,1),)(cnn_layer)
    cnn_layer = tf.keras.layers.Dropout(0.3)(cnn_layer)

    cnn_layer = tf.keras.layers.Conv2DTranspose(64, kernel_size=3, strides=1, padding="SAME", activation="relu")(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides = (2,2),)(cnn_layer)
    cnn_layer = tf.keras.layers.Dropout(0.3)(cnn_layer)

    cnn_layer = tf.keras.layers.Conv2DTranspose(64, kernel_size=2, strides=1, padding="SAME", activation="relu")(cnn_layer)
    cnn_layer = tf.keras.layers.BatchNormalization()(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(3, 3), strides = (2,2),)(cnn_layer)
    cnn_layer = tf.keras.layers.Dropout(0.4)(cnn_layer)

    cnn_layer = tf.keras.layers.Flatten()(cnn_layer)

    # Dense Layer
    dense_layer = tf.keras.layers.Dense(64, activation='relu')(cnn_layer)

    # dense_layer = tf.keras.layers.Dropout(0.3)(dense_layer)

    dense_layer = tf.keras.layers.Dense(64, activation='relu')(dense_layer)

    # dense_layer = tf.keras.layers.Dropout(0.6)(dense_layer)

    dense_layer = tf.keras.layers.Dense(64, activation='relu')(dense_layer)

    #dense_layer = tf.keras.layers.Dropout(0.5)(dense_layer)

    # Output Layer
    output = tf.keras.layers.Dense(num_movie, activation='sigmoid', name="movie_scores")(cnn_layer)

    # Build and Compile the Model
    model = tf.keras.models.Model(inputs=input_matrix, outputs=output, name="MoviePopularityModel")

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=5000,  # Slower decay
        decay_rate=0.98,
        staircase=True
    )

    def aggregated_popularity_loss(y_true, y_pred):

        # Remove the last dimension: (batch_size, num_users, num_movies)
        y_true_squeezed = tf.squeeze(y_true, axis=-1)

        # Aggregate over the user axis (axis=1). Here we use sum, but you could also use tf.reduce_mean.
        true_popularity = tf.reduce_sum(y_true_squeezed, axis=1)  # Shape: (batch_size, num_movies)
        # changed from reduce sum to reduce mean

        # Normalize the aggregated popularity.
        # For each sample in the batch, divide by the maximum popularity value to bring scores to [0, 1].
        true_popularity_norm = (true_popularity - tf.reduce_min(true_popularity)) / (tf.reduce_max(true_popularity) - tf.reduce_min(true_popularity) + 1e-7)

        # Calculate the mean squared error between normalized true popularity and predictions.
        loss = tf.reduce_mean(tf.square(true_popularity_norm - y_pred))

        return loss

    def ranking_loss(y_true, y_pred):
        """Pairwise Ranking Loss: Higher-ranked items should have higher scores"""
        y_true = tf.squeeze(y_true, axis=-1)
        true_popularity = tf.reduce_sum(y_true, axis=1)

        diff = tf.expand_dims(true_popularity, -1) - tf.expand_dims(true_popularity, -2)
        pred_diff = tf.expand_dims(y_pred, -1) - tf.expand_dims(y_pred, -2)

        loss = tf.reduce_mean(tf.nn.relu(diff * pred_diff))
        return loss

    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate = 0.002, rho = 0.85), #, epsilon=1e-7),
        # optimizer=tf.keras.optimizers.AdamW(learning_rate = 0.0005),
        #loss = ranking_loss,
        loss= aggregated_popularity_loss,
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

    return model

# Build the model
model = build_cnn_model(num_user, num_movie)

# Train the model
model.fit(
    train_matrix,
    train_matrix,
    #movie_label,
    epochs= 100,
)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 44s/step - loss: 0.5838 - root_mean_squared_error: 0.6482
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423ms/step - loss: 0.2119 - root_mean_squared_error: 0.6601
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 427ms/step - loss: 0.1681 - root_mean_squared_error: 0.6650
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step - loss: 0.1567 - root_mean_squared_error: 0.6675
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426ms/step - loss: 0.1534 - root_mean_squared_error: 0.6686
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 418ms/step - loss: 0.1495 - root_mean_squared_error: 0.6689
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 418ms/step - loss: 0.1472 - root_mean_squared_error: 0.6691
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7bea4b6d4550>

In [7]:
test_matrix_reshaped = test_matrix.reshape((1, num_user, num_movie, 1))

# Predict movie scores
predictions = model.predict(test_matrix_reshaped)

# Aggregate scores across users
movie_scores = predictions.mean(axis=0)

# Get top 50 movie indices
# Top 50 in descending ordere
top_movies = numpy.argsort(-movie_scores)[:]  # Negative sign to get descending order

print("Predicted Movies Ranking(by index):")
for rank, movie_idx in enumerate(top_movies, 1):
    print(f"{rank}: Movie Index {movie_idx}, Score {movie_scores[movie_idx] } ")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 802ms/step
Predicted Movies Ranking(by index):
1: Movie Index 1461, Score 0.9999960660934448 
2: Movie Index 1634, Score 0.9999868869781494 
3: Movie Index 1124, Score 0.9999867677688599 
4: Movie Index 1230, Score 0.999982476234436 
5: Movie Index 90, Score 0.9999758005142212 
6: Movie Index 1309, Score 0.9999706745147705 
7: Movie Index 1068, Score 0.9998685121536255 
8: Movie Index 1532, Score 0.9998204112052917 
9: Movie Index 1004, Score 0.9998131394386292 
10: Movie Index 1291, Score 0.9997528195381165 
11: Movie Index 1422, Score 0.9996967315673828 
12: Movie Index 319, Score 0.9996578693389893 
13: Movie Index 1408, Score 0.9996292591094971 
14: Movie Index 1229, Score 0.9995606541633606 
15: Movie Index 1023, Score 0.9995157718658447 
16: Movie Index 741, Score 0.9994648098945618 
17: Movie Index 1504, Score 0.9994586110115051 
18: Movie Index 969, Score 0.9994242191314697 
19: Movie Index 1219, Score 0.999421954154

In [8]:
# loss, mae = model.evaluate(train_matrix, test_matrix)  # For self-supervised learning, use test_matrix as both inputs and targets
# print(f"Test Loss: {loss}")
# print(f"Test MAE: {mae}")


In [9]:
def calculate_caching_hit_rate(predicted_scores, test_matrix, cache_size_array):

    """
    Calculate the caching hit rate.

    :param predicted_scores: Array of predicted scores for all movies (shape: num_movies).
    :param test_data: Test dataset containing actual movie requests (e.g., movieIDs).
    :param cache_size: Number of movies to cache (top-k based on predicted scores).
    :return: Hit rate as a float.
    """

    predicted_scores = predicted_scores.flatten()

    num_user, num_movie = test_matrix.shape

    for cache_size in cache_size_array:

      # Get top-k movies based on predicted scores
      top_k_movies = numpy.argsort(predicted_scores)[-cache_size:][::-1]

      #Extract actual requested movie IDs from test_matrix
      requested_movie_ids = set()
      hits = 0

      for user_id in range(num_user):

          requested_movies = numpy.where(test_matrix[user_id] == 1)[0]
          requested_movie_ids.update(requested_movies)

      # Calculate hit rate
      for movie_id in requested_movie_ids:
          if movie_id in top_k_movies:
              hits += 1

      total_unique_movies = len(requested_movie_ids)
      print(f"Total unique movies: {total_unique_movies}, total hit: {hits}")

      hit_rate = hits / cache_size
      print(f"Hit Rate for cache size {cache_size}: {hit_rate:.2%}")
      print("")

    return hit_rate

# Example usage
cache_size_array = [50, 100, 150, 200, 250, 300]  # the number of cached top N movies
hit_rate = calculate_caching_hit_rate(predictions, test_matrix, cache_size_array)

Total unique movies: 189, total hit: 8
Hit Rate for cache size 50: 16.00%

Total unique movies: 189, total hit: 15
Hit Rate for cache size 100: 15.00%

Total unique movies: 189, total hit: 25
Hit Rate for cache size 150: 16.67%

Total unique movies: 189, total hit: 39
Hit Rate for cache size 200: 19.50%

Total unique movies: 189, total hit: 49
Hit Rate for cache size 250: 19.60%

Total unique movies: 189, total hit: 60
Hit Rate for cache size 300: 20.00%

