## Matrix Factorization

In [1]:
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load data
data_path = 'ml-1m/'
ratings = np.genfromtxt(data_path + 'ratings.dat', delimiter='::', dtype=int)

In [3]:
ratings

array([[        1,      1193,         5, 978300760],
       [        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [4]:
np.random.seed(42)
np.random.shuffle(ratings)
split_ratio = 0.8
split_idx = int(split_ratio * len(ratings))
train = ratings[:split_idx]
test = ratings[split_idx:]

In [5]:
train

array([[     5412,      2683,         2, 960243649],
       [     5440,       904,         5, 959995181],
       [      368,      3717,         4, 976311423],
       ...,
       [     3685,      2108,         4, 967121561],
       [     3312,      3616,         4, 983252174],
       [     4811,      1041,         4, 962933524]])

In [6]:
test

array([[     1841,      3717,         1, 974698076],
       [     3715,       880,         3, 966266200],
       [     2002,      3072,         4, 974678563],
       ...,
       [      854,      3102,         3, 975355597],
       [     4033,      3479,         5, 965525805],
       [      786,      1391,         4, 975429588]])

In [7]:
# Get number of users and items
num_users = max(max(train[:,0]), max(test[:,0]))
num_items = max(max(train[:,1]), max(test[:,1]))

In [8]:
# Create user-item matrix
train_matrix = np.zeros((num_users, num_items))
for i in range(train.shape[0]):
    user_id = train[i,0] - 1
    item_id = train[i,1] - 1
    rating = train[i,2]
    train_matrix[user_id][item_id] = rating

In [9]:
test_matrix = np.zeros((num_users, num_items))
for i in range(test.shape[0]):
    user_id = test[i,0] - 1
    item_id = test[i,1] - 1
    rating = test[i,2]
    test_matrix[user_id][item_id] = rating

### Dimensionality Setting

In [10]:
# Hyperparameters
num_factors = 20
learning_rate = 0.01
num_epochs = 10
lambda_reg = 0.1

### Random Initialization and Regularization

In [11]:
# Initialize user and item matrices
user_matrix = np.random.normal(size=(num_users, num_factors))
item_matrix = np.random.normal(size=(num_items, num_factors))


### Interpolation

In [12]:
# Train model
for epoch in range(num_epochs):
    for i in range(num_users):
        for j in range(num_items):
            if train_matrix[i][j] > 0:
                prediction = np.dot(user_matrix[i], item_matrix[j])
                error = train_matrix[i][j] - prediction
                
                # Update user and item matrices
                user_matrix[i] += learning_rate * (error * item_matrix[j] - lambda_reg * user_matrix[i])
                item_matrix[j] += learning_rate * (error * user_matrix[i] - lambda_reg * item_matrix[j])
    
    # Evaluate model
    predictions = np.dot(user_matrix, item_matrix.T)
    mse = mean_squared_error(test_matrix[test_matrix.nonzero()], predictions[test_matrix.nonzero()])
    rmse = sqrt(mse)
    print('Epoch %d, RMSE: %f' % (epoch+1, rmse))


Epoch 1, RMSE: 1.939659
Epoch 2, RMSE: 1.139167
Epoch 3, RMSE: 1.042253
Epoch 4, RMSE: 1.008161
Epoch 5, RMSE: 0.989313
Epoch 6, RMSE: 0.976660
Epoch 7, RMSE: 0.967078
Epoch 8, RMSE: 0.959215
Epoch 9, RMSE: 0.952476
Epoch 10, RMSE: 0.946604


In [13]:
# Print predictions
count = 0
for (i, j) in zip(*test_matrix.nonzero()):
    prediction = np.dot(user_matrix[i], item_matrix[j])
    print("User ID: %d, Movie ID: %d, Predicted Rating: %f" % (i+1, j, prediction))
    count += 1
    if count == 15:
        break

User ID: 1, Movie ID: 47, Predicted Rating: 3.270981
User ID: 1, Movie ID: 593, Predicted Rating: 4.167346
User ID: 1, Movie ID: 719, Predicted Rating: 4.206065
User ID: 1, Movie ID: 1021, Predicted Rating: 4.229602
User ID: 1, Movie ID: 1196, Predicted Rating: 4.435787
User ID: 1, Movie ID: 1720, Predicted Rating: 3.685748
User ID: 1, Movie ID: 1961, Predicted Rating: 3.986361
User ID: 1, Movie ID: 2790, Predicted Rating: 3.897736
User ID: 1, Movie ID: 2917, Predicted Rating: 4.009284
User ID: 1, Movie ID: 3407, Predicted Rating: 4.095823
User ID: 2, Movie ID: 94, Predicted Rating: 2.630974
User ID: 2, Movie ID: 379, Predicted Rating: 3.313437
User ID: 2, Movie ID: 433, Predicted Rating: 2.961054
User ID: 2, Movie ID: 514, Predicted Rating: 3.444213
User ID: 2, Movie ID: 589, Predicted Rating: 3.527575


## GMF

In [14]:
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from keras.layers import Concatenate, Dense, Dropout, Input, Embedding, Flatten

In [15]:
# Load the data
data = np.genfromtxt(data_path + "ratings.csv", delimiter=',', skip_header=1, dtype=int)

In [16]:
# Split the data into training and test sets
np.random.seed(42)
msk = np.random.rand(len(data)) < 0.8
train_data = data[msk]
test_data = data[~msk]

### Latent Vector

In [17]:
# Create user-item matrices for training and test sets
train_data_matrix = np.zeros((max(data[:, 0]), max(data[:, 1])))
for line in train_data:
    train_data_matrix[int(line[0])-1, int(line[1])-1] = line[2]

test_data_matrix = np.zeros((max(data[:, 0]), max(data[:, 1])))
for line in test_data:
    test_data_matrix[int(line[0])-1, int(line[1])-1] = line[2]

In [18]:
# Define the GMF model
num_users, num_items = train_data_matrix.shape
embedding_size = 8

user_input = keras.Input(shape=(1,), dtype='int32', name='user_input')
user_embedding = layers.Embedding(input_dim=num_users, output_dim=embedding_size, 
                                  input_length=1, name='user_embedding')(user_input)
user_vec = layers.Flatten(name='flatten_users')(user_embedding)

item_input = keras.Input(shape=(1,), dtype='int32', name='item_input')
item_embedding = layers.Embedding(input_dim=num_items, output_dim=embedding_size, 
                                  input_length=1, name='item_embedding')(item_input)
item_vec = layers.Flatten(name='flatten_items')(item_embedding)

Metal device set to: Apple M1


2023-04-19 15:41:34.208860: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-19 15:41:34.209034: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [19]:
len(train_data[:,2])

838594

In [20]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [20]:
num_users = max(np.max(train_data[:, 0]), np.max(test_data[:, 0])) + 1
num_movies = max(np.max(train_data[:, 1]), np.max(test_data[:, 1])) + 1

# Build the model
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=50, name='user_embedding')(user_input)
user_vec = Flatten(name='flatten_users')(user_embedding)

movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=50, name='movie_embedding')(movie_input)
movie_vec = Flatten(name='flatten_movies')(movie_embedding)

input_vecs = Concatenate(name='concatenation')([user_vec, movie_vec])
x = Dense(128, activation='relu')(input_vecs)
x = Dropout(0.3)(x)
y = Dense(1)(x)

model = Model(inputs=[user_input, movie_input], outputs=y)
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
history = model.fit(x=[train_data[:,0], train_data[:,1]], y=train_data[:,2], batch_size=64, epochs=10, 
                    validation_split=0.2, verbose=1)

Epoch 1/10


2023-03-23 18:16:10.019571: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-23 18:16:10.256830: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


   87/10483 [..............................] - ETA: 4:35 - loss: 8.3489

KeyboardInterrupt: 

In [None]:
# Save the model
model.save('trained_gmf.h5')

In [22]:
# Make predictions for the test set
predictions = model.predict([test_data[:,0], test_data[:,1]])


2023-04-19 15:41:43.388970: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-04-19 15:41:43.560924: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [23]:
# Calculate RMSE
rmse = sqrt(mean_squared_error(test_data[:,2], predictions))
print('RMSE:', rmse)

RMSE: 2.0108634061940194


In [26]:
# Display predictions as userid, movies, and predicted rating
results = pd.DataFrame({'userId': test_data[:,0], 'movieId': test_data[:,1], 'predicted_rating': predictions.flatten()})

In [27]:
results

Unnamed: 0,userId,movieId,predicted_rating
0,1,29,1.377288
1,1,223,1.272934
2,1,296,1.972474
3,1,318,2.568789
4,1,1201,1.299395
...,...,...,...
209976,7119,150,1.842330
209977,7119,344,1.533092
209978,7119,349,1.704445
209979,7119,356,2.051096


In [42]:
# # Print example predictions
# test_users = np.unique(test_data[:,0])
# for user in test_users[:10]:
#     user_ratings = test_data[test_data[:,0] == user]
#     user_predictions = model.predict([user_ratings[:,0], user_ratings[:,1]])
#     top_ratings_indices = user_predictions.flatten().argsort()[-3:][::-1]
#     recommended_item_ids = [test_data[i] for i in top_ratings_indices]
#     print('Prediction {}: {}'.format(user, recommended_item_ids))

In [21]:
# Load the model
from tensorflow.keras.models import load_model

model = load_model('trained_gmf.h5', compile=False)

## MLP

In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout
from tensorflow.keras.models import Model

In [28]:
# Load data
ratings_df = pd.read_csv('ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
1048570,7120,168,5.0,1175543061
1048571,7120,253,4.0,1175542225
1048572,7120,260,5.0,1175542035
1048573,7120,261,4.0,1175543376


In [29]:
# Split data into training and test sets
train_df, test_df = train_test_split(ratings_df, test_size=0.2)

In [30]:
# Create embedding layer for user IDs
user_id_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=train_df['userId'].max()+1, output_dim=32)(user_id_input)
user_flatten = Flatten()(user_embedding)

# Create embedding layer for movie IDs
movie_id_input = Input(shape=(1,))
movie_embedding = Embedding(input_dim=train_df['movieId'].max()+1, output_dim=32)(movie_id_input)
movie_flatten = Flatten()(movie_embedding)

# Concatenate user and movie embeddings
concatenate = Concatenate()([user_flatten, movie_flatten])

# Add dense layers to create MLP architecture
dense_1 = Dense(units=64, activation='relu')(concatenate)
dropout_1 = Dropout(0.2)(dense_1)
dense_2 = Dense(units=32, activation='relu')(dropout_1)
dropout_2 = Dropout(0.2)(dense_2)
dense_3 = Dense(units=16, activation='relu')(dropout_2)
output = Dense(units=1, activation='linear')(dense_3)

# Create model and compile
model = Model(inputs=[user_id_input, movie_id_input], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')

In [29]:
# Fit model to training data
history = model.fit(x=[train_df['userId'], train_df['movieId']], y=train_df['rating'], batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10


2023-03-23 18:17:49.522145: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-03-23 18:21:08.469320: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
# Save the model
model.save('trained_mlp.h5')

In [32]:
# Make predictions on test data
predictions = model.predict(x=[test_df['userId'], test_df['movieId']])

2023-04-19 15:44:12.741159: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [33]:
# Display predictions as userid, movies, and predicted rating
results = pd.DataFrame({'userId': test_df['userId'], 'movieId': test_df['movieId'], 'predicted_rating': predictions.flatten()})

In [34]:
results

Unnamed: 0,userId,movieId,predicted_rating
917937,6129,442,2.385502
148613,984,1569,3.519360
111216,768,457,3.700163
40324,309,2028,4.098413
779334,5187,1097,3.953411
...,...,...,...
299621,2051,6119,3.575153
704009,4673,1270,4.084152
585273,3921,2013,1.837890
284384,1959,4701,3.034858


In [31]:
# Load the model
from tensorflow.keras.models import load_model

model = load_model('trained_mlp.h5', compile=False)

## NeuMF

In [1]:
import pandas as pd
import numpy as np
from keras import Model
from keras.layers import Input, Dense, Concatenate, Dropout, Embedding, Flatten
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Multiply


In [2]:
from keras.regularizers import l2


In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
1048570,7120,168,5.0,1175543061
1048571,7120,253,4.0,1175542225
1048572,7120,260,5.0,1175542035
1048573,7120,261,4.0,1175543376


In [25]:
# Split data into train and test sets
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

In [26]:
# Create user-item matrix for train and test sets
train_matrix = train.pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_matrix = test.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [27]:
train_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,129235,129303,129350,129354,129428,129707,130073,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7116,4.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7117,4.0,0.0,4.0,0.0,0.0,5.0,3.0,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7119,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
test_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,127110,127196,128356,128510,128622,128648,128832,128842,130052,130219
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def NeuMF(num_users, num_items, mf_dim, layers, reg_mf, reg_layers):
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Embedding layers
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=mf_dim, name='mf_user_embedding',
                                  embeddings_initializer='random_normal', input_length=1)
    mf_item_embedding = Embedding(input_dim=num_items, output_dim=mf_dim, name='mf_item_embedding',
                                  embeddings_initializer='random_normal', input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=layers[0]//2, name='mlp_user_embedding',
                                   embeddings_initializer='random_normal', input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=layers[0]//2, name='mlp_item_embedding',
                                   embeddings_initializer='random_normal', input_length=1)

    # GMF branch
    mf_user_latent = Flatten()(mf_user_embedding(user_input))
    mf_item_latent = Flatten()(mf_item_embedding(item_input))
    mf_vector = Multiply()([mf_user_latent, mf_item_latent])

    # MLP branch
    mlp_user_latent = Flatten()(mlp_user_embedding(user_input))
    mlp_item_latent = Flatten()(mlp_item_embedding(item_input))
    mlp_vector = Concatenate()([mlp_user_latent, mlp_item_latent])
    for i in range(len(layers)):
        layer = Dense(layers[i], activation='relu', name=f'layer{i}', kernel_regularizer=l2(reg_layers[i]))
        mlp_vector = layer(mlp_vector)
        dropout_layer = Dropout(rate=0.2, name=f'dropout_layer{i}')
        mlp_vector = dropout_layer(mlp_vector)

    # Concatenate GMF and MLP branches
    concat_vector = Concatenate()([mf_vector, mlp_vector])

    # Output layer
    output_layer = Dense(1, activation='linear', name='output_layer', kernel_regularizer=l2(reg_mf))
    prediction = output_layer(concat_vector)

    # Define the model
    model = Model(inputs=[user_input, item_input], outputs=prediction)

    return model


In [30]:
# Set hyperparameters
num_users = len(ratings.userId.unique())
num_items = len(ratings.movieId.unique())


In [31]:
num_users

7120

In [32]:
num_items

14026

In [33]:
mf_dim = 8
layers = [64, 32, 16]
reg_mf = 0
reg_layers = [0, 0, 0, 0]
learning_rate = 0.001
batch_size = 256
epochs = 10

In [15]:
# Build and compile NeuMF model
model = NeuMF(num_users, num_items, mf_dim, layers, reg_mf, reg_layers)
#model.compile(optimizer=Adam(lr=learning_rate), loss='mean_squared_error', metrics=['accuracy'])
model.compile(loss='mean_squared_error', optimizer='adam')

# Train NeuMF model

Metal device set to: Apple M1


2023-05-11 18:23:34.960881: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-11 18:23:34.961452: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [51]:
history = model.fit([train.userId, train.movieId], train.rating, batch_size=batch_size, epochs=epochs, validation_data=([test.userId, test.movieId], test.rating))


Epoch 1/10


2023-04-19 15:45:20.554882: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-04-19 15:46:16.467130: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [52]:
model.save('trained_neumf.h5')

In [16]:
from keras.models import load_model
model = load_model('trained_neumf.h5')


In [17]:
def recommend_movies(model, user_id, movie_df, top_n=10):
    # Get list of all movie IDs
    all_movies = list(movie_df.movieId.unique())

    # Create list of tuples for user ID and each movie ID
    user = np.full(len(all_movies), user_id, dtype='int32')
    movies = np.array(all_movies, dtype='int32')
    user_movie_matrix = [user, movies]

    # Predict ratings for all movies
    ratings = model.predict(user_movie_matrix)
    ratings[ratings>5.0] = 5.0
   
    # Create DataFrame of predicted ratings for each movie
    ratings_df = pd.DataFrame({'movieId': movies, 'rating': ratings.flatten()})

    # Merge with movie DataFrame to get movie titles
    merged_df = pd.merge(ratings_df, movie_df, on='movieId')

    # Sort by predicted rating and return top n movies
    top_movies = merged_df.sort_values(by='rating', ascending=False).head(top_n)

    return top_movies[['movieId', 'title','rating']]

In [18]:
movies = pd.read_csv('movies.csv')

for i in range(0,5):
    # Generate recommendations for user i
    recommendations = recommend_movies(model, i, movies)
    # Print top 10 recommended movies for user i
    print("user ",i+1,":")
    print(recommendations)
    print()
    print("*******************************************************************************************************")
    print()

 16/853 [..............................] - ETA: 2s  

2023-04-23 07:50:01.105369: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-04-23 07:50:01.174755: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


user  1 :
      movieId                                              title    rating
7276     7388  Brother Sun, Sister Moon (Fratello sole, sorel...  3.989934
1505     1555         To Have, or Not (En avoir (ou pas)) (1995)  3.967968
4359     4454                                        More (1998)  3.915917
463       467                             Live Nude Girls (1995)  3.873938
7765     8364  Baadasssss! (How to Get the Man's Foot Outta Y...  3.868731
7237     7349             Broken Wings (Knafayim Shvurot) (2002)  3.865423
6858     6970                                    Desk Set (1957)  3.859289
5741     5840  My Mother's Castle (Château de ma mère, Le) (1...  3.856652
7987     8670  Testament of Dr. Mabuse, The (Das Testament de...  3.842881
4987     5083                                  Rare Birds (2001)  3.835602

*******************************************************************************************************

user  2 :
      movieId                                    

## Recurrent Neural Network

In [13]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
import pandas as pd

In [7]:
ratings_data = np.genfromtxt(data_path + 'ratings.csv', delimiter=',', skip_header=1)

In [8]:
# Extract the user IDs, movie IDs, and ratings from the data
user_ids = ratings_data[:, 0].astype(int)
movie_ids = ratings_data[:, 1].astype(int)
ratings = ratings_data[:, 2]

In [9]:
# Normalize the ratings between 0 and 1
max_rating = max(ratings)
min_rating = min(ratings)
ratings = (ratings - min_rating) / (max_rating - min_rating)

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    np.stack([user_ids, movie_ids], axis=1),
    ratings,
    test_size=0.2,
    random_state=42
)

In [21]:
# Define the RNN-based recommender system model
model = Sequential()
model.add(LSTM(64, input_shape=(1, 2)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam')

In [22]:
# Train the model
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10


2023-03-30 18:32:56.835011: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-30 18:32:56.919284: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-30 18:32:56.993507: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-03-30 18:34:25.798413: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-30 18:34:25.833136: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2ac78c4c0>

In [25]:
model.save('trained_rnn.h5')

In [26]:
# Generate recommendations for a user
user_id = 1
movie_ids = np.arange(1, 14026) # generate recommendations for the first 10 movies
np.random.shuffle(movie_ids) # shuffle the movie IDs
user_input = np.array([(user_id, movie_id) for movie_id in movie_ids])
user_input = np.expand_dims(user_input, axis=1)
predicted_normalized_ratings = model.predict(user_input).flatten()
predicted_ratings = predicted_normalized_ratings * (max_rating - min_rating) + min_rating
recommended_movies = pd.read_csv('movies.csv').iloc[movie_ids-1][['movieId', 'title']]
recommended_movies['rating'] = predicted_ratings

# Sort the recommended movies by rating in descending order
recommended_movies = recommended_movies.sort_values('rating', ascending=False)
recommended_movies = recommended_movies[:10]

# Print the recommended movies
print('Recommended movies for user', user_id, ':')
print(recommended_movies.to_string(index=False))

Recommended movies for user 1 :
 movieId                              title   rating
       1                   Toy Story (1995) 3.828001
       2                     Jumanji (1995) 3.743200
       3            Grumpier Old Men (1995) 3.670595
       4           Waiting to Exhale (1995) 3.617508
       5 Father of the Bride Part II (1995) 3.582061
      53                    Lamerica (1994) 3.570140
      52            Mighty Aphrodite (1995) 3.570138
      54              Big Green, The (1995) 3.570136
      51              Guardian Angel (1994) 3.570129
      55                     Georgia (1995) 3.570127


In [2]:
from keras.models import load_model
model = load_model('trained_rnn.h5')


2023-03-31 13:32:13.309203: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-31 13:32:13.309958: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1


## Evaluation

### K-Fold Cross Validation

In [56]:
import pandas as pd
import numpy as np
from keras import Model
from keras.layers import Input, Dense, Concatenate, Dropout, Embedding, Flatten
from keras.optimizers import Adam
from sklearn.model_selection import KFold
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Multiply

In [57]:
from keras.regularizers import l2

In [58]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Set hyperparameters
num_users = len(ratings.userId.unique())
num_items = len(ratings.movieId.unique())

mf_dim = 8
layers = [64, 32, 16]
reg_mf = 0
reg_layers = [0, 0, 0, 0]
learning_rate = 0.001
batch_size = 256
epochs = 10

In [59]:
# Initialize k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)


In [60]:
# Create empty list to store evaluation scores
scores = []

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kfold.split(ratings)):
    print(f'Fold {fold+1}')

    # Split data into train and validation sets
    train = ratings.iloc[train_idx]
    val = ratings.iloc[val_idx]

    # Create user-item matrix for train and validation sets
    train_matrix = train.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    val_matrix = val.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    # Build and compile NeuMF model
    model = NeuMF(num_users, num_items, mf_dim, layers, reg_mf, reg_layers)
    model.compile(loss='mean_squared_error', optimizer='adam')

    # Train NeuMF model on current fold
    history = model.fit([train.userId, train.movieId], train.rating, batch_size=batch_size, epochs=epochs, validation_data=([val.userId, val.movieId], val.rating), verbose=0)

    # Evaluate NeuMF model on validation set
    val_loss = model.evaluate([val.userId, val.movieId], val.rating, verbose=0)
    print(f'Validation loss: {val_loss:.4f}')
    scores.append(val_loss)

    print("*******************************************************************************************************")
    print()

Fold 1


2023-04-19 16:01:24.677887: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-19 16:02:11.174902: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Validation loss: 0.7788
*******************************************************************************************************

Fold 2


2023-04-19 16:09:16.472695: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-19 16:10:02.799401: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Validation loss: 0.7783
*******************************************************************************************************

Fold 3


2023-04-19 16:17:50.450565: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-19 16:18:38.308516: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Validation loss: 0.7895
*******************************************************************************************************

Fold 4


2023-04-19 16:26:40.044557: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-19 16:27:32.867256: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Validation loss: 0.7951
*******************************************************************************************************

Fold 5


2023-04-19 16:35:43.978611: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-04-19 16:36:39.259761: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Validation loss: 0.8644
*******************************************************************************************************



In [61]:
# Calculate mean and standard deviation of evaluation scores
print(f'Cross-validation loss: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

# Generate recommendations for each user using the entire dataset
for i in range(num_users):
    # Generate recommendations for user i
    recommendations = recommend_movies(model, i, movies)
    # Print top 10 recommended movies for user i
    print("user ",i+1,":")
    print(recommendations)
    print()
    print("*******************************************************************************************************")
    print()

Cross-validation loss: 0.8012 +/- 0.0322
  4/853 [..............................] - ETA: 15s 

2023-04-19 16:44:43.902698: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


user  1 :
      movieId                                              title    rating
5154     5251  Charter Trip, The (a.k.a. Package Tour, The) (...  3.912589
4085     4179                                      Pixote (1981)  3.872313
4136     4230                              Too Much Sleep (1997)  3.870593
3901     3995                                 Boys Life 3 (2000)  3.865779
5751     5850               Road Games (a.k.a. Roadgames) (1981)  3.840136
315       318                   Shawshank Redemption, The (1994)  3.835572
7044     7156  Fog of War: Eleven Lessons from the Life of Ro...  3.832339
1505     1555         To Have, or Not (En avoir (ou pas)) (1995)  3.831957
7356     7502                            Band of Brothers (2001)  3.825464
6759     6869                        Bus 174 (Ônibus 174) (2002)  3.823964

*******************************************************************************************************

user  2 :
      movieId                                    

user  10 :
      movieId                                       title  rating
5789     5888                       Brother (Brat) (1997)     5.0
7382     7581                    Fountainhead, The (1949)     5.0
6561     6671                Angel at My Table, An (1990)     5.0
5776     5875                    Personal Velocity (2002)     5.0
1505     1555  To Have, or Not (En avoir (ou pas)) (1995)     5.0
596       602               Great Day in Harlem, A (1994)     5.0
5721     5820    Standing in the Shadows of Motown (2002)     5.0
626       633                      Denise Calls Up (1995)     5.0
946       963               Inspector General, The (1949)     5.0
5684     5783                              Derrida (2002)     5.0

*******************************************************************************************************

user  11 :
      movieId                                              title    rating
79         80        White Balloon, The (Badkonake sefid) (1995)  4.537

user  19 :
      movieId                                              title    rating
1533     1585                               Love Serenade (1996)  5.000000
3554     3645            Cleo from 5 to 7 (Cléo de 5 à 7) (1962)  4.926860
826       841  Eyes Without a Face (Yeux sans visage, Les) (1...  4.872491
1764     1846                                Nil By Mouth (1997)  4.836774
4050     4144      In the Mood For Love (Fa yeung nin wa) (2000)  4.806373
7593     7979                                Monterey Pop (1968)  4.769898
940       957                         Scarlet Letter, The (1926)  4.764027
7572     7937                    Silence, The (Tystnaden) (1963)  4.762913
6842     6954  Barbarian Invasions, The (Les invasions barbar...  4.756328
6242     6341                        Shape of Things, The (2003)  4.745326

*******************************************************************************************************


KeyboardInterrupt: 

### A/B Testing

In [1]:
import numpy as np
import pandas as pd
from keras.models import load_model
from sklearn.model_selection import train_test_split

In [2]:
# Load the trained models
neumf_model = load_model('trained_neumf.h5')
rnn_model = load_model('trained_rnn.h5')

movie_df = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

train, test = train_test_split(ratings, test_size=0.2, random_state=42)

all_movies = list(movie_df.movieId.unique())

user = np.full(len(all_movies), 1, dtype='int32')
movies = np.array(all_movies, dtype='int32')
user_movie_matrix = [user, movies]

Metal device set to: Apple M1


2023-05-11 21:33:28.255382: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-11 21:33:28.255522: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Evaluate the models on the test data
neumf_predictions = neumf_model.predict([test.userId, test.movieId])
neumf_rmse = np.sqrt(np.mean(np.square(neumf_predictions - test.rating.values.reshape(-1, 1))))

  44/6554 [..............................] - ETA: 15s 

2023-05-11 21:33:50.108838: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-11 21:33:50.158075: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [8]:
rnn_predictions = rnn_model.predict(np.expand_dims(np.stack([test.userId, test.movieId], axis=1), axis=1))
rnn_predictions = rnn_predictions.flatten() * (5.0 - 0.5) + 0.5
rnn_rmse = np.sqrt(np.mean(np.square(rnn_predictions - test.rating.values)))



In [10]:
# Determine the winning model
if neumf_rmse < rnn_rmse:
    winning_model = 'NeuMF'
else:
    winning_model = 'RNN'

# Print the results
print('RMSE for NeuMF:', neumf_rmse)
print('RMSE for RNN:', rnn_rmse)
print('The winning model is:', winning_model)

RMSE for NeuMF: 0.8983061318014586
RMSE for RNN: 1.0522898400448653
The winning model is: NeuMF
