In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy
from tensorflow.python.ops.gen_dataset_ops import shuffle_dataset
from sklearn.model_selection import train_test_split


dataset, info = tfds.load("movielens/100k-ratings", split=["train"], with_info=True)

data_set = dataset[0]

def preprocess_data(dataset):

    user_id = tf.strings.to_number(dataset["user_id"], out_type=tf.int32)
    movie_id = tf.strings.to_number(dataset["movie_id"], out_type=tf.int32)

    #casting user rating into 0 and 1 (0 means didn't rate this movie, 1 means did rate this movie)

    did_rate = tf.cast(dataset["user_rating"] > 0, tf.int64)

    features = {
        "user_id": user_id,
        "movie_id": movie_id,
        "user_rating":  did_rate
    }

    return features

data_set = data_set.map(preprocess_data)

print(len(data_set))

num_client = 5
userS_dataset = []

dataset_size = len(data_set)

for i in range(num_client):

    seed = numpy.random.seed(i)
    data_set = data_set.shuffle(buffer_size=dataset_size, seed= seed)
    userS_dataset.append(data_set)

data_set_1 =  data_set.shuffle(buffer_size=dataset_size, seed=42)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/incomplete.P791UB_0.1.1/movielens-train.tfrecord*..…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.
100000


In [2]:
#size of training data and testing data
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

#extract training data and testing data
train_data_1 = data_set_1.take(int(train_size))
test_data_1 = data_set_1.skip(int(train_size))

train_data = []
test_data = []

for i in range(num_client):

    train_data.append(userS_dataset[i].take(int(train_size)))
    test_data.append(userS_dataset[i].skip(int(train_size)))


In [3]:
userIDs = []
movieIDs = []
ratings = []

unique_user_movie_pair = set()
number = 0
for example in data_set:

    # Convert TensorFlow tensor to a NumPy value
    user_id = example["user_id"].numpy()
    movie_id = example["movie_id"].numpy()
    rating = example["user_rating"].numpy()

    # Add movie pair only if it is uniuqe
    if (user_id, movie_id) not in unique_user_movie_pair:
        userIDs.append(user_id)
        movieIDs.append(movie_id)
        ratings.append(rating)
        unique_user_movie_pair.add((user_id, movie_id))
        number += 1

print(f"Number of unique movie pairs: {number}")

# Convert lists to NumPy arrays for later processing
userIDs = numpy.array(userIDs)
movieIDs = numpy.array(movieIDs)
ratings = numpy.array(ratings)

movieIDs = numpy.unique(movieIDs)
userIDs = numpy.unique(userIDs)

print(movieIDs)
#size of user and movie in training data
num_user = max(userIDs)
num_movie = max(movieIDs)

print(f"Number of unique users: {num_user}")
print(f"Number of unique movies: {num_movie}")


Number of unique movie pairs: 100000
[   1    2    3 ... 1680 1681 1682]
Number of unique users: 943
Number of unique movies: 1682


In [6]:
triplets = list(zip(userIDs, movieIDs, ratings))

train_triplets, test_triplets = train_test_split(triplets, test_size=0.2, random_state=42)
train_matrix = numpy.zeros((num_user, num_movie), dtype=numpy.int32)
test_matrix = numpy.zeros((num_user, num_movie), dtype=numpy.int32)

def integrate_feature_into_matrix(userIDs, movieIDs, ratings, num_user, num_movie, matrix):

    # Create a 2D matrix filled with zeros

    # Populate the matrix
    for userID, movieID, rating in zip(userIDs, movieIDs, ratings):
        matrix[int(userID), int(movieID)] = rating

    return matrix

for user_id, movie_id, rating in train_triplets:
    train_matrix[user_id-1, movie_id-1] = rating

for user_id, movie_id, rating in test_triplets:
    test_matrix[user_id-1, movie_id-1] = rating

print(train_matrix.shape)
print(test_matrix.shape)
train_matrix = train_matrix.reshape((1, num_user, num_movie, 1))

print(train_matrix.shape)
print(test_matrix.shape)

(943, 1682)
(943, 1682)
(1, 943, 1682, 1)
(943, 1682)


In [16]:
def build_cnn_model(num_user, num_movie):

    input_matrix = tf.keras.layers.Input(shape=(num_user, num_movie, 1),batch_size = 32, name="user_id")

    cnn_layer = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same')(input_matrix)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(4, 4))(cnn_layer) # Added MaxPooling

    cnn_layer = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same')(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(cnn_layer) # Added MaxPooling

    cnn_layer = tf.keras.layers.Flatten()(cnn_layer)

    dense_layer = tf.keras.layers.Dense(128, activation='relu')(cnn_layer)

    output = tf.keras.layers.Dense(num_movie, activation='sigmoid', name="movie_scores")(dense_layer)
    model = tf.keras.models.Model(inputs=input_matrix, outputs=output, name="MoviePopularityModel")

    model.compile(
        optimizer=tf.keras.optimizers.Adam,
        loss='mean_squared_error',  # For regression tasks
        metrics=['mae']  # Mean Absolute Error for monitoring
    )

    return model

model = build_cnn_model(num_user, num_movie)

model.fit(
    train_matrix,
    train_matrix,  # For self-supervised learning
    epochs=20,
    batch_size=32
)


ResourceExhaustedError: {{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[1572480,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomUniformV2] name: 

In [9]:
test_matrix_reshaped = test_matrix.reshape((1, num_user, num_movie, 1))
# Predict movie scores
predictions = model.predict(test_matrix_reshaped)

# Aggregate scores across users
movie_scores = predictions.mean(axis=0)

# Get top 50 movie indices
# Top 50 in descending ordere
movie_score = numpy.argsort(movie_scores)[:][::-1]

print("Predicted Movies Ranking(by index):")
for rank, movie_idx in enumerate(movie_score, 1):
    print(f"{rank}: Movie Index {movie_idx}, Score {movie_scores[movie_idx] } ")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted Movies Ranking(by index):
1: Movie Index 985, Score 0.912687361240387 
2: Movie Index 905, Score 0.8893370032310486 
3: Movie Index 941, Score 0.8692811131477356 
4: Movie Index 577, Score 0.8593295216560364 
5: Movie Index 485, Score 0.8466425538063049 
6: Movie Index 1587, Score 0.8214610815048218 
7: Movie Index 666, Score 0.8214443922042847 
8: Movie Index 605, Score 0.8181080222129822 
9: Movie Index 1444, Score 0.8106611371040344 
10: Movie Index 438, Score 0.7924216985702515 
11: Movie Index 914, Score 0.7918500900268555 
12: Movie Index 1390, Score 0.7885574102401733 
13: Movie Index 11, Score 0.787208080291748 
14: Movie Index 1665, Score 0.7755173444747925 
15: Movie Index 1586, Score 0.7687616348266602 
16: Movie Index 165, Score 0.7610960602760315 
17: Movie Index 1331, Score 0.7590896487236023 
18: Movie Index 674, Score 0.7541584968566895 
19: Movie Index 1436, Score 0.7527686357498169 
20: Mo

In [None]:
loss, mae = model.evaluate(train_matrix, test_matrix)  # For self-supervised learning, use test_matrix as both inputs and targets
print(f"Test Loss: {loss}")
print(f"Test MAE: {mae}")


In [13]:
def calculate_caching_hit_rate(predicted_scores, test_matrix, cache_size):

    """
    Calculate the caching hit rate.

    :param predicted_scores: Array of predicted scores for all movies (shape: num_movies).
    :param test_data: Test dataset containing actual movie requests (e.g., movieIDs).
    :param cache_size: Number of movies to cache (top-k based on predicted scores).
    :return: Hit rate as a float.
    """

    predicted_scores = predicted_scores.flatten()

    # Get top-k movies based on predicted scores
    top_k_movies = numpy.argsort(predicted_scores)[-cache_size:][::-1]

    print(top_k_movies)

    #Extract actual requested movie IDs from test_matrix
    requested_movie_ids = set()
    hits = 0

    num_user, num_movie = test_matrix.shape

    for user_id in range(num_user):

        requested_movies = numpy.where(test_matrix[user_id] == 1)[0]
        requested_movie_ids.update(requested_movies)

    print(requested_movie_ids)


    # Calculate hit rate
    for movie_id in requested_movie_ids:
        if movie_id in top_k_movies:
            hits += 1

    total_unique_movies = len(requested_movie_ids)
    print(f"Total unique movies: {total_unique_movies}, total hit: {hits}")

    hit_rate = hits / cache_size

    return hit_rate

# Example usage
cache_size = 200  # the number of cached top N movies
hit_rate = calculate_caching_hit_rate(predictions, test_matrix, cache_size)
print(f"Caching Hit Rate: {hit_rate:.2%}")

[ 985  905  941  577  485 1587  666  605 1444  438  914 1390   11 1665
 1586  165 1331  674 1436 1575   86 1379  719 1365 1509 1183  182   19
  948  214  670  267  522  244 1176 1489  249 1188  793  515 1199 1037
   48  792 1318  320  211  550  880 1493 1527  700  166 1387 1102  981
  971  545  632 1680 1606  118  135  701  240 1227  979 1114 1357  731
  194 1525  139  375  537  739 1279 1568 1641  493 1555 1166  514 1252
 1321 1399 1004  291 1602 1482  497 1248  529 1620  180   64  998 1569
 1234 1559  904  892 1302  779 1634 1504 1301 1276  162 1505  541  428
 1598 1281 1580  716  768  582 1596  813 1025 1679  323 1588 1020  568
   16 1233  853  928  960 1190  315  829  878 1165  509  327  147  664
 1605  124 1195  536 1530 1036  734 1664 1562 1653  894  387 1408   80
  660  421  170  530  273 1656  915  835 1499 1118 1099  167  715 1256
  431 1572  832  470 1119  226  620  859 1422  245  427  377  655  400
 1090  293  750 1235 1371  440  354 1518  687  961  106 1669 1334  307
  246 