In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy
from tensorflow.python.ops.gen_dataset_ops import shuffle_dataset
from sklearn.model_selection import train_test_split


dataset, info = tfds.load("movielens/100k-ratings", split=["train"], with_info=True)

data_set = dataset[0]

def preprocess_data(dataset):

    user_id = tf.strings.to_number(dataset["user_id"], out_type=tf.int32)
    movie_id = tf.strings.to_number(dataset["movie_id"], out_type=tf.int32)

    #casting user rating into 0 and 1 (0 means didn't rate this movie, 1 means did rate this movie)

    did_rate = tf.cast(dataset["user_rating"] > 0, tf.int64)

    features = {
        "user_id": user_id,
        "movie_id": movie_id,
        "user_rating":  did_rate
    }

    return features

data_set = data_set.map(preprocess_data)

print(len(data_set))

num_client = 5
userS_dataset = []

dataset_size = len(data_set)

for i in range(num_client):

    seed = numpy.random.seed(i)
    data_set = data_set.shuffle(buffer_size=dataset_size, seed= seed)
    userS_dataset.append(data_set)

data_set_1 =  data_set.shuffle(buffer_size=dataset_size, seed=42)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/incomplete.QFKS93_0.1.1/movielens-train.tfrecord*..…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.
100000


In [2]:
#size of training data and testing data
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

#extract training data and testing data
train_data_1 = data_set_1.take(int(train_size))
test_data_1 = data_set_1.skip(int(train_size))

train_data = []
test_data = []

for i in range(num_client):

    train_data.append(userS_dataset[i].take(int(train_size)))
    test_data.append(userS_dataset[i].skip(int(train_size)))


In [3]:
userIDs = []
movieIDs = []
ratings = []

unique_user_movie_pair = set()
number = 0
for example in data_set:

    # Convert TensorFlow tensor to a NumPy value
    user_id = example["user_id"].numpy()
    movie_id = example["movie_id"].numpy()
    rating = example["user_rating"].numpy()

    # Add movie pair only if it is uniuqe
    if (user_id, movie_id) not in unique_user_movie_pair:
        userIDs.append(user_id)
        movieIDs.append(movie_id)
        ratings.append(rating)
        unique_user_movie_pair.add((user_id, movie_id))
        number += 1

print(f"Number of unique movie pairs: {number}")

# Convert lists to NumPy arrays for later processing
userIDs = numpy.array(userIDs)
movieIDs = numpy.array(movieIDs)
ratings = numpy.array(ratings)

movieIDs = numpy.unique(movieIDs)
userIDs = numpy.unique(userIDs)

print(movieIDs)
#size of user and movie in training data
num_user = max(userIDs)
num_movie = max(movieIDs)

print(f"Number of unique users: {num_user}")
print(f"Number of unique movies: {num_movie}")


Number of unique movie pairs: 100000
[   1    2    3 ... 1680 1681 1682]
Number of unique users: 943
Number of unique movies: 1682


In [4]:
triplets = list(zip(userIDs, movieIDs, ratings))

train_triplets, test_triplets = train_test_split(triplets, test_size=0.2, random_state=42)
train_matrix = numpy.zeros((num_user, num_movie), dtype=numpy.int32)
test_matrix = numpy.zeros((num_user, num_movie), dtype=numpy.int32)

def integrate_feature_into_matrix(userIDs, movieIDs, ratings, num_user, num_movie, matrix):

    # Create a 2D matrix filled with zeros

    # Populate the matrix
    for userID, movieID, rating in zip(userIDs, movieIDs, ratings):
        matrix[int(userID), int(movieID)] = rating

    return matrix

for user_id, movie_id, rating in train_triplets:
    train_matrix[user_id-1, movie_id-1] = rating

for user_id, movie_id, rating in test_triplets:
    test_matrix[user_id-1, movie_id-1] = rating

print(train_matrix.shape)
print(test_matrix.shape)
train_matrix = train_matrix.reshape((1, num_user, num_movie, 1))

print(train_matrix.shape)
print(test_matrix.shape)

(943, 1682)
(943, 1682)
(1, 943, 1682, 1)
(943, 1682)


In [33]:
def build_cnn_model(num_user, num_movie):

    input_matrix = tf.keras.layers.Input(shape=(num_user, num_movie, 1),batch_size = 32, name="user_id")

    cnn_layer = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same')(input_matrix)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(4, 4))(cnn_layer) # Added MaxPooling
    cnn_layer = tf.keras.layers.Dropout(0.3)(cnn_layer)

    cnn_layer = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same')(cnn_layer)
    cnn_layer = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(cnn_layer) # Added MaxPooling

    cnn_layer = tf.keras.layers.Flatten()(cnn_layer)

    dense_layer = tf.keras.layers.Dense(128, activation='relu')(cnn_layer)

    output = tf.keras.layers.Dense(num_movie, activation='sigmoid', name="movie_scores")(dense_layer)
    model = tf.keras.models.Model(inputs=input_matrix, outputs=output, name="MoviePopularityModel")

    model.compile(
        optimizer='adam',
        loss='mean_squared_error',  # For regression tasks
        metrics=['mae']  # Mean Absolute Error for monitoring
    )

    return model

model = build_cnn_model(num_user, num_movie)

model.fit(
    train_matrix,
    train_matrix,  # For self-supervised learning
    epochs=20,
    batch_size=32
)


ResourceExhaustedError: {{function_node __wrapped__StatelessRandomUniformV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[1572480,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StatelessRandomUniformV2] name: 

In [23]:
test_matrix_reshaped = test_matrix.reshape((1, num_user, num_movie, 1))
# Predict movie scores
predictions = model.predict(test_matrix_reshaped)

# Aggregate scores across users
movie_scores = predictions.mean(axis=0)

# Get top 50 movie indices
# Top 50 in descending ordere
movie_score = numpy.argsort(movie_scores)[:][::-1]

print("Predicted Movies Ranking(by index):")
for rank, movie_idx in enumerate(movie_score, 1):
    print(f"{rank}: Movie Index {movie_idx}, Score {movie_scores[movie_idx] } ")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
Predicted Movies Ranking(by index):
1: Movie Index 1659, Score 0.013100138865411282 
2: Movie Index 1028, Score 0.012109708972275257 
3: Movie Index 1589, Score 0.011913652531802654 
4: Movie Index 353, Score 0.010444947518408298 
5: Movie Index 534, Score 0.010408624075353146 
6: Movie Index 676, Score 0.010210756212472916 
7: Movie Index 1294, Score 0.009926221333444118 
8: Movie Index 890, Score 0.008119500242173672 
9: Movie Index 396, Score 0.007179430220276117 
10: Movie Index 932, Score 0.006724418606609106 
11: Movie Index 283, Score 0.005351587664335966 
12: Movie Index 879, Score 0.0051581719890236855 
13: Movie Index 1660, Score 0.002998790703713894 
14: Movie Index 1376, Score 0.0029388039838522673 
15: Movie Index 1275, Score 0.002827979624271393 
16: Movie Index 604, Score 0.0026276244316250086 
17: Movie Index 400, Score 0.002451225882396102 
18: Movie Index 1517, Score 0.0021957899443805218 
19: Mo

In [7]:
loss, mae = model.evaluate(train_matrix, test_matrix)  # For self-supervised learning, use test_matrix as both inputs and targets
print(f"Test Loss: {loss}")
print(f"Test MAE: {mae}")


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 1
'y' sizes: 943


In [29]:
def calculate_caching_hit_rate(predicted_scores, test_matrix, cache_size):

    """
    Calculate the caching hit rate.

    :param predicted_scores: Array of predicted scores for all movies (shape: num_movies).
    :param test_data: Test dataset containing actual movie requests (e.g., movieIDs).
    :param cache_size: Number of movies to cache (top-k based on predicted scores).
    :return: Hit rate as a float.
    """

    predicted_scores = predicted_scores.flatten()

    # Get top-k movies based on predicted scores
    top_k_movies = numpy.argsort(predicted_scores)[-cache_size:][::-1]

    print(top_k_movies)

    #Extract actual requested movie IDs from test_matrix
    requested_movie_ids = set()
    hits = 0

    num_user, num_movie = test_matrix.shape

    for user_id in range(num_user):

        requested_movies = numpy.where(test_matrix[user_id] == 1)[0]
        requested_movie_ids.update(requested_movies)

    print(requested_movie_ids)

    # Calculate hit rate
    for movie_id in requested_movie_ids:
        if movie_id in top_k_movies:
            hits += 1

    total_unique_movies = len(requested_movie_ids)
    print(f"Total unique movies: {total_unique_movies}, total hit: {hits}")

    hit_rate = hits / cache_size

    return hit_rate

# Example usage
cache_size = 250  # the number of cached top N movies
hit_rate = calculate_caching_hit_rate(predictions, test_matrix, cache_size)
print(f"Caching Hit Rate: {hit_rate:.2%}")

[1659 1028 1589  353  534  676 1294  890  396  932  283  879 1660 1376
 1275  604  400 1517 1452 1482  504  468 1388  298  500 1456  487  326
  138   68  506  612  467  255  514 1188 1094  151  599  175 1102  453
 1627 1073  452   43 1200 1249  585 1677  472 1078 1669 1223 1631  835
  611 1019 1555  483 1606  340  873 1392  739  477  331 1296   45  951
  209  969 1346  427  730 1115 1158  240 1504 1592 1238  424 1251 1421
  684  587  579  971 1570   85 1090  185 1366  824  828 1305   93  844
  108   79 1574 1011  869  523 1017  921  862 1367  404 1170  674    8
  319  731 1580 1147  915  530 1581   27  216 1099  938  980 1432  263
 1007 1562  281 1368  238  701  648  115 1633  795  541  421 1303 1401
  987  247 1295 1329 1184 1069 1561 1656  602  202  347  682 1287  805
  451 1553   34 1565  827 1168   38 1417 1414  626 1458 1271  496 1304
 1132    2  383 1386   50 1143  318  578 1215  565 1177 1365  369 1339
 1038 1443  478 1328   97  157 1411 1205 1495  554  717  797 1166 1002
  908 