In [42]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [43]:
movies_df = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/movie_info.csv')
ratings_df = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_all_development_set.csv')
ratings_df = ratings_df.rename(columns={'user_id': 'userId', 'item_id': 'movieId'})
movies_df = movies_df.rename(columns={'item_id': 'movieId'})
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv') 

In [44]:
# Ensure required columns exist
assert 'userId' in ratings_df.columns and 'movieId' in ratings_df.columns and 'rating' in ratings_df.columns

# Encode user and movie IDs
user_encoder = LabelEncoder()
ratings_df['user_id_encoded'] = user_encoder.fit_transform(ratings_df['userId'])

movie_encoder = LabelEncoder()
ratings_df['movie_id_encoded'] = movie_encoder.fit_transform(ratings_df['movieId'])

In [45]:
# Copy the dataset and split into train and test sets
df = ratings_df.copy()
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Convert columns to NumPy arrays
train_user_ids = np.array(train['user_id_encoded'].values)
train_movie_ids = np.array(train['movie_id_encoded'].values)
train_ratings = np.array(train['rating'].values)

test_user_ids = np.array(test['user_id_encoded'].values)
test_movie_ids = np.array(test['movie_id_encoded'].values)
test_ratings = np.array(test['rating'].values)

In [46]:
# Define the number of unique users, movies, and embedding dimensions
num_users = df['user_id_encoded'].nunique()
num_movies = df['movie_id_encoded'].nunique()
embedding_dim = 128

# Define the model
# User input and embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
user_embedding = Flatten()(user_embedding)

# Movie input and embedding
movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
movie_embedding = Flatten()(movie_embedding)

# Dot product of embeddings and output layer
dot_product = Dot(axes=1)([user_embedding, movie_embedding])
output = Dense(1, activation='linear')(dot_product)

In [47]:
# Compile the model
model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mae')

# Add early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    [train_user_ids, train_movie_ids], train_ratings,
    epochs=6,
    batch_size=256,
    validation_data=([test_user_ids, test_movie_ids], test_ratings),
    callbacks=[early_stopping]
)

Epoch 1/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3.4280 - val_loss: 1.9832
Epoch 2/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.2575 - val_loss: 0.8015
Epoch 3/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7564 - val_loss: 0.7628
Epoch 4/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7071 - val_loss: 0.7515
Epoch 5/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.6823 - val_loss: 0.7401
Epoch 6/6
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.6487 - val_loss: 0.7362


In [48]:
# Evaluate the model
test_loss = model.evaluate([test_user_ids, test_movie_ids], test_ratings)
print(f"Test loss (MAE): {test_loss}")

# Predict ratings for the test set
test_predictions = model.predict([test_user_ids, test_movie_ids])

[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454us/step - loss: 0.7383
Test loss (MAE): 0.7361571192741394
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 507us/step


In [49]:
# Create a DataFrame with predictions
test_results = pd.DataFrame({
    'original_user_id': test['userId'],
    'original_movie_id': test['movieId'],
    'actual_rating': test_ratings,
    'predicted_rating': test_predictions.flatten()
})
test_results['rating_difference'] = abs(test_results['actual_rating'] - test_results['predicted_rating'])

In [50]:
# Filter results for a specific user and sort by rating difference
filter_user_id = 772  # Replace with your desired user ID
filtered_results = test_results[test_results['original_user_id'] == filter_user_id]
filtered_results_sorted = filtered_results.sort_values(by='rating_difference')

print(f"Predictions for user {filter_user_id}, ordered by closest rating difference:")
print(filtered_results_sorted)

Predictions for user 772, ordered by closest rating difference:
       original_user_id  original_movie_id  actual_rating  predicted_rating  \
44904               772                 69              3          3.323037   
21632               772                284              3          3.335831   
10398               772                946              2          1.663316   
525                 772                175              4          3.655008   
76753               772                230              2          2.493973   
8138                772                431              3          3.526340   
74901               772                187              3          3.526487   
67026               772                893              2          2.668275   
74708               772                126              5          4.283081   
13524               772                565              2          2.717553   
89249               772                391              2          

In [51]:
# Load leaderboard data
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv')  # Replace with your leaderboard file path
assert 'user_id' in leaderboard_data.columns and 'item_id' in leaderboard_data.columns

# Handle unseen user and movie IDs using mapping
user_mapping = dict(zip(user_encoder.classes_, user_encoder.transform(user_encoder.classes_)))
movie_mapping = dict(zip(movie_encoder.classes_, movie_encoder.transform(movie_encoder.classes_)))

leaderboard_data['user_id_encoded'] = leaderboard_data['user_id'].map(user_mapping).fillna(-1).astype(int)
leaderboard_data['item_id_encoded'] = leaderboard_data['item_id'].map(movie_mapping).fillna(-1).astype(int)

In [53]:
# Predict leaderboard ratings, handling valid IDs only
user_ids = leaderboard_data['user_id_encoded'].values
item_ids = leaderboard_data['item_id_encoded'].values

valid_indices = (user_ids != -1) & (item_ids != -1)
predictions = np.full(len(user_ids), np.nan)  # Initialize with NaN
predictions[valid_indices] = ModuleNotFoundError.predict([user_ids[valid_indices], item_ids[valid_indices]]).flatten()

# Save leaderboard predictions to a file
with open("predicted_ratings_leaderboard4.txt", "w") as f:
    for pred in predictions:
        if np.isnan(pred):  # Handle invalid predictions
            f.write("Invalid\n")
        else:
            f.write(f"{pred}\n")

AttributeError: type object 'ModuleNotFoundError' has no attribute 'predict'

## Problem 2 Report Tasks

### Part A: Proposed Method Description

Our proposed method is a neural collaborative filtering approach that employs user and movie embeddings to predict ratings. The model is designed with embedding layers for users and movies, which are multiplied to capture latent interactions. A dense layer further refines these interactions, and the output is a single predicted rating.

We chose this method because neural collaborative filtering has demonstrated success in capturing complex user-item relationships in sparse datasets. Hyperparameters such as embedding dimensions, batch size, and learning rate were tuned using systematic experimentation. Early stopping was employed to prevent overfitting, ensuring robust performance across validation and test sets. This design balances simplicity and predictive power, making it suitable for our dataset.

In [None]:
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
# from sklearn.metrics import mean_absolute_error

# def build_and_train_model(num_users, num_movies, embedding_dim, learning_rate, batch_size, epochs, patience):
#     # Define the model
#     user_input = Input(shape=(1,), name='user_input')
#     user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
#     user_embedding = Flatten()(user_embedding)

#     movie_input = Input(shape=(1,), name='movie_input')
#     movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
#     movie_embedding = Flatten()(movie_embedding)

#     dot_product = Dot(axes=1)([user_embedding, movie_embedding])
#     output = Dense(1, activation='linear')(dot_product)

#     model = Model(inputs=[user_input, movie_input], outputs=output)
#     model.compile(optimizer='adam', loss='mae')

#     # Add early stopping
#     early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

#     # Train the model
#     history = model.fit(
#         [train_user_ids, train_movie_ids], train_ratings,
#         epochs=epochs,
#         batch_size=batch_size,
#         validation_data=([test_user_ids, test_movie_ids], test_ratings),
#         callbacks=[early_stopping],
#         verbose=0
#     )

#     # Evaluate the model
#     val_loss = model.evaluate([test_user_ids, test_movie_ids], test_ratings, verbose=0)
#     return model, val_loss

# # Define the hyperparameter grid
# embedding_dims = [16, 32, 64, 128]
# learning_rates = [0.001, 0.005, 0.01, 0.05]
# batch_sizes = [32, 64, 128, 256]
# epochs = 10
# patience = 3

# best_model = None
# best_params = None
# lowest_mae = float('inf')

# # Perform grid search
# for emb_dim, lr, batch_size in itertools.product(embedding_dims, learning_rates, batch_sizes):
#     print(f"Trying: Embedding Dim={emb_dim}, Learning Rate={lr}, Batch Size={batch_size}")
#     model, val_mae = build_and_train_model(num_users, num_movies, emb_dim, 0.001, batch_size, epochs, patience)
    
#     print(f"Validation MAE: {val_mae:.4f}")
    
#     if val_mae < lowest_mae:
#         lowest_mae = val_mae
#         best_model = model
#         best_params = (emb_dim, lr, batch_size)

# print(f"\nBest Model Found: Embedding Dim={best_params[0]}, Learning Rate={best_params[1]}, Batch Size={best_params[2]}")
# print(f"Lowest Validation MAE: {lowest_mae:.4f}")