In [58]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [59]:
movies_df = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/movie_info.csv')
ratings_df = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_all_development_set.csv')
ratings_df = ratings_df.rename(columns={'user_id': 'userId', 'item_id': 'movieId'})
movies_df = movies_df.rename(columns={'item_id': 'movieId'})
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv') 

In [60]:
# Ensure required columns exist
assert 'userId' in ratings_df.columns and 'movieId' in ratings_df.columns and 'rating' in ratings_df.columns

# Encode user and movie IDs
user_encoder = LabelEncoder()
ratings_df['user_id_encoded'] = user_encoder.fit_transform(ratings_df['userId'])

movie_encoder = LabelEncoder()
ratings_df['movie_id_encoded'] = movie_encoder.fit_transform(ratings_df['movieId'])

In [61]:
# Copy the dataset and split into train and test sets
df = ratings_df.copy()
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Convert columns to NumPy arrays
train_user_ids = np.array(train['user_id_encoded'].values)
train_movie_ids = np.array(train['movie_id_encoded'].values)
train_ratings = np.array(train['rating'].values)

test_user_ids = np.array(test['user_id_encoded'].values)
test_movie_ids = np.array(test['movie_id_encoded'].values)
test_ratings = np.array(test['rating'].values)

In [62]:
# Define the number of unique users, movies, and embedding dimensions
num_users = df['user_id_encoded'].nunique()
num_movies = df['movie_id_encoded'].nunique()
embedding_dim = 128

# Define the model
# User input and embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
user_embedding = Flatten()(user_embedding)

# Movie input and embedding
movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
movie_embedding = Flatten()(movie_embedding)

# Dot product of embeddings and output layer
dot_product = Dot(axes=1)([user_embedding, movie_embedding])
output = Dense(1, activation='linear')(dot_product)

In [65]:
# Compile the model
model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mae')

# Add early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    [train_user_ids, train_movie_ids], train_ratings,
    epochs=6,
    batch_size=256,
    validation_data=([test_user_ids, test_movie_ids], test_ratings),
    callbacks=[early_stopping]
)

Epoch 1/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5258 - val_loss: 0.7372
Epoch 2/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4791 - val_loss: 0.7431
Epoch 3/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4326 - val_loss: 0.7459
Epoch 4/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3928 - val_loss: 0.7543


In [68]:
# Evaluate the model
test_loss = model.evaluate([test_user_ids, test_movie_ids], test_ratings)
print(f"Test loss (MAE): {test_loss}")

# Predict ratings for the test set
test_predictions = model.predict([test_user_ids, test_movie_ids])

[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495us/step - loss: 0.7404
Test loss (MAE): 0.7371749877929688
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 538us/step


In [70]:
# Create a DataFrame with predictions
test_results = pd.DataFrame({
    'original_user_id': test['userId'],
    'original_movie_id': test['movieId'],
    'actual_rating': test_ratings,
    'predicted_rating': test_predictions.flatten()
})
test_results['rating_difference'] = abs(test_results['actual_rating'] - test_results['predicted_rating'])

In [71]:
# Filter results for a specific user and sort by rating difference
filter_user_id = 772  # Replace with your desired user ID
filtered_results = test_results[test_results['original_user_id'] == filter_user_id]
filtered_results_sorted = filtered_results.sort_values(by='rating_difference')

print(f"Predictions for user {filter_user_id}, ordered by closest rating difference:")
print(filtered_results_sorted)

Predictions for user 772, ordered by closest rating difference:
       original_user_id  original_movie_id  actual_rating  predicted_rating  \
76753               772                230              2          1.981099   
10398               772                946              2          1.887526   
67026               772                893              2          2.120056   
525                 772                175              4          4.147466   
74901               772                187              3          3.206115   
21632               772                284              3          3.616620   
89978               772                728              3          3.672862   
62191               772                917              5          4.233611   
89249               772                391              2          2.768553   
8138                772                431              3          3.794891   
56211               772                170              5          

In [69]:
# Load leaderboard data
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv')  # Replace with your leaderboard file path
assert 'user_id' in leaderboard_data.columns and 'item_id' in leaderboard_data.columns

# Handle unseen user and movie IDs using mapping
user_mapping = dict(zip(user_encoder.classes_, user_encoder.transform(user_encoder.classes_)))
movie_mapping = dict(zip(movie_encoder.classes_, movie_encoder.transform(movie_encoder.classes_)))

# Map user and movie IDs, assigning unseen IDs to placeholder index
leaderboard_data['user_id_encoded'] = leaderboard_data['user_id'].map(
    lambda x: user_mapping.get(x, num_users)
).astype(int)

leaderboard_data['item_id_encoded'] = leaderboard_data['item_id'].map(
    lambda x: movie_mapping.get(x, num_movies)
).astype(int)

In [73]:
user_ids = leaderboard_data['user_id_encoded'].values
item_ids = leaderboard_data['item_id_encoded'].values
predictions = model.predict([user_ids, item_ids]).flatten()

# Save leaderboard predictions to a file
with open("predicted_ratings_leaderboard3.txt", "w") as f:
    for pred in predictions:
        f.write(f"{pred:.4f}\n")


[1m  1/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 14ms/step

2024-12-10 13:51:53.247407: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[28,0] = 1662 is not in [0, 1662)
	 [[{{node functional_6_1/movie_embedding_1/GatherV2}}]]


InvalidArgumentError: Graph execution error:

Detected at node functional_6_1/movie_embedding_1/GatherV2 defined at (most recent call last):
  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/runpy.py", line 86, in _run_code

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 701, in start

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 523, in process_one

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/4x/dslp7kh9487bqqc43wmz_bpm0000gn/T/ipykernel_58506/2803044761.py", line 6, in <module>

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 559, in predict

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 256, in one_step_on_data_distributed

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 246, in one_step_on_data

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 101, in predict_step

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/layers/layer.py", line 899, in __call__

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/models/functional.py", line 182, in call

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/models/functional.py", line 632, in call

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/layers/layer.py", line 899, in __call__

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/layers/core/embedding.py", line 140, in call

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/ops/numpy.py", line 5239, in take

  File "/Users/brandonmukadziwashe/micromamba/envs/cs135_env/lib/python3.10/site-packages/keras/src/backend/tensorflow/numpy.py", line 2063, in take

indices[28,0] = 1662 is not in [0, 1662)
	 [[{{node functional_6_1/movie_embedding_1/GatherV2}}]] [Op:__inference_one_step_on_data_distributed_95836]

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
# from sklearn.metrics import mean_absolute_error

# def build_and_train_model(num_users, num_movies, embedding_dim, learning_rate, batch_size, epochs, patience):
#     # Define the model
#     user_input = Input(shape=(1,), name='user_input')
#     user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
#     user_embedding = Flatten()(user_embedding)

#     movie_input = Input(shape=(1,), name='movie_input')
#     movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
#     movie_embedding = Flatten()(movie_embedding)

#     dot_product = Dot(axes=1)([user_embedding, movie_embedding])
#     output = Dense(1, activation='linear')(dot_product)

#     model = Model(inputs=[user_input, movie_input], outputs=output)
#     model.compile(optimizer='adam', loss='mae')

#     # Add early stopping
#     early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

#     # Train the model
#     history = model.fit(
#         [train_user_ids, train_movie_ids], train_ratings,
#         epochs=epochs,
#         batch_size=batch_size,
#         validation_data=([test_user_ids, test_movie_ids], test_ratings),
#         callbacks=[early_stopping],
#         verbose=0
#     )

#     # Evaluate the model
#     val_loss = model.evaluate([test_user_ids, test_movie_ids], test_ratings, verbose=0)
#     return model, val_loss

# # Define the hyperparameter grid
# embedding_dims = [16, 32, 64, 128]
# learning_rates = [0.001, 0.005, 0.01, 0.05]
# batch_sizes = [32, 64, 128, 256]
# epochs = 10
# patience = 3

# best_model = None
# best_params = None
# lowest_mae = float('inf')

# # Perform grid search
# for emb_dim, lr, batch_size in itertools.product(embedding_dims, learning_rates, batch_sizes):
#     print(f"Trying: Embedding Dim={emb_dim}, Learning Rate={lr}, Batch Size={batch_size}")
#     model, val_mae = build_and_train_model(num_users, num_movies, emb_dim, 0.001, batch_size, epochs, patience)
    
#     print(f"Validation MAE: {val_mae:.4f}")
    
#     if val_mae < lowest_mae:
#         lowest_mae = val_mae
#         best_model = model
#         best_params = (emb_dim, lr, batch_size)

# print(f"\nBest Model Found: Embedding Dim={best_params[0]}, Learning Rate={best_params[1]}, Batch Size={best_params[2]}")
# print(f"Lowest Validation MAE: {lowest_mae:.4f}")