In [37]:
# One LabelEncoder per ID column; each is fitted on the raw IDs and the
# resulting integer codes are stored as new columns on ratings_df (in place).
user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()

ratings_df['user_id_encoded'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movie_id_encoded'] = movie_encoder.fit_transform(ratings_df['movieId'])

In [38]:
# Independent copy of ratings_df; the train/test split and the user/movie
# counts in the following cells are computed from this copy.
df = ratings_df.copy()

#### Split the Train and Test Data

In [74]:
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Columns fed to the model, in the order (user, movie, rating)
train_user_ids, train_movie_ids, train_ratings = (
    np.array(train[column].values)
    for column in ('user_id_encoded', 'movie_id_encoded', 'rating')
)

test_user_ids, test_movie_ids, test_ratings = (
    np.array(test[column].values)
    for column in ('user_id_encoded', 'movie_id_encoded', 'rating')
)

#### Define and Train the Two-Tower Model

In [86]:
# Sizes of the two embedding tables: one row per distinct encoded ID
num_users = df['user_id_encoded'].nunique()
num_movies = df['movie_id_encoded'].nunique()
embedding_dim = 50  # Number of dimensions for the embedding

# User tower: a single integer ID -> embedding vector
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
user_embedding = Flatten()(user_embedding)

# Movie tower: a single integer ID -> embedding vector
movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
movie_embedding = Flatten()(movie_embedding)

# Dot product of user and movie embeddings to predict rating
dot_product = Dot(axes=1)([user_embedding, movie_embedding])

# Linear layer on top of the dot product (learned scale and offset)
output = Dense(1, activation='linear')(dot_product)

# Compile the model; the loss is mean absolute error
model2 = Model(inputs=[user_input, movie_input], outputs=output)
model2.compile(optimizer='adam', loss='mae')

from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fix: the callback was created but never handed to fit(), so it had no effect.
# NOTE(review): validation_data is the held-out test split, so early stopping
# and the reported test MAE are computed on the same rows.
history = model2.fit([train_user_ids, train_movie_ids], train_ratings,
                    epochs=6,
                    batch_size=64,
                    validation_data=([test_user_ids, test_movie_ids], test_ratings),
                    callbacks=[early_stopping])

#### Model Evaluation

In [87]:
# Loss of the trained model on the held-out split (the compiled loss is MAE)
test_loss = model2.evaluate(
    [test_user_ids, test_movie_ids],
    test_ratings,
)
print(f"Test loss (MAE): {test_loss}")

#### Prediction on Test Data

In [88]:
# Score every held-out (user, movie) pair with the trained model
test_predictions = model2.predict([test_user_ids, test_movie_ids])

# One row per held-out rating: raw IDs from the `test` frame, the true rating,
# and the model output flattened to one value per row
test_results = pd.DataFrame(
    dict(
        original_user_id=test['userId'],
        original_movie_id=test['movieId'],
        actual_rating=test_ratings,
        predicted_rating=test_predictions.flatten(),
    )
)

#### Sort predictions by rating difference (for a specific user)

In [89]:
# Absolute error of each prediction
test_results['rating_difference'] = (
    test_results['actual_rating'] - test_results['predicted_rating']
).abs()

# Keep only the rows of one user (raw, un-encoded ID)
filter_user_id = 772  # Replace with the desired user ID
is_selected_user = test_results['original_user_id'] == filter_user_id
filtered_results = test_results[is_selected_user]

# Smallest error first
filtered_results_sorted = filtered_results.sort_values(by='rating_difference')

# Display sorted results
print(f"Predictions for user {filter_user_id}, ordered by closest rating difference:")
print(filtered_results_sorted)

In [None]:
# Raw leaderboard IDs, kept under their original names
user_ids = np.array(leaderboard_data['user_id'].values)
item_ids = np.array(leaderboard_data['item_id'].values)

# model2 was trained on LabelEncoder codes, not raw IDs, so translate the raw
# IDs with the fitted encoders before predicting. IDs the encoders never saw
# map to NaN.
user_code = {raw_id: code for code, raw_id in enumerate(user_encoder.classes_)}
movie_code = {raw_id: code for code, raw_id in enumerate(movie_encoder.classes_)}
encoded_users = leaderboard_data['user_id'].map(user_code)
encoded_items = leaderboard_data['item_id'].map(movie_code)
known = (encoded_users.notna() & encoded_items.notna()).to_numpy()

# Pairs with an unseen user or movie have no embedding row; they get the mean
# training rating instead of a model prediction.
predictions = np.full(len(leaderboard_data), float(np.mean(train_ratings)))
if known.any():
    predictions[known] = model2.predict([
        encoded_users[known].to_numpy(dtype=int),
        encoded_items[known].to_numpy(dtype=int),
    ]).flatten()  # flatten: predict returns one single-element row per pair

# Save the predictions to a text file, one plain number per line
with open("predicted_ratings_leaderboard3.txt", "w") as f:
    for pred in predictions:
        f.write(f"{pred}\n")

In [96]:
# Largest, then smallest, user and item ID in the leaderboard arrays
print(*(ids.max() for ids in (user_ids, item_ids)))
print(*(ids.min() for ids in (user_ids, item_ids)))

In [97]:
# Largest, then smallest, encoded user and movie ID in the held-out split
print(*(ids.max() for ids in (test_user_ids, test_movie_ids)))
print(*(ids.min() for ids in (test_user_ids, test_movie_ids)))


In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# End-to-end two-tower pipeline: encode IDs, split, build and train the Keras
# model, evaluate it, and inspect the prediction errors for one user.

# Ensure required columns exist
assert 'userId' in ratings_df.columns and 'movieId' in ratings_df.columns and 'rating' in ratings_df.columns

# Encode user and movie IDs (adds two columns to ratings_df in place)
user_encoder = LabelEncoder()
ratings_df['user_id_encoded'] = user_encoder.fit_transform(ratings_df['userId'])

movie_encoder = LabelEncoder()
ratings_df['movie_id_encoded'] = movie_encoder.fit_transform(ratings_df['movieId'])

# Copy the dataset and split into train and test sets (80/20, fixed seed)
df = ratings_df.copy()
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Convert columns to NumPy arrays
train_user_ids = np.array(train['user_id_encoded'].values)
train_movie_ids = np.array(train['movie_id_encoded'].values)
train_ratings = np.array(train['rating'].values)

test_user_ids = np.array(test['user_id_encoded'].values)
test_movie_ids = np.array(test['movie_id_encoded'].values)
test_ratings = np.array(test['rating'].values)

# Define the number of unique users, movies, and embedding dimensions
# (the counts size the two embedding tables below)
num_users = df['user_id_encoded'].nunique()
num_movies = df['movie_id_encoded'].nunique()
embedding_dim = 50

# Define the model
# User input and embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
user_embedding = Flatten()(user_embedding)

# Movie input and embedding
movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
movie_embedding = Flatten()(movie_embedding)

# Dot product of embeddings and output layer
dot_product = Dot(axes=1)([user_embedding, movie_embedding])
output = Dense(1, activation='linear')(dot_product)

# Compile the model (loss is mean absolute error)
model2 = Model(inputs=[user_input, movie_input], outputs=output)
model2.compile(optimizer='adam', loss='mae')

# Add early stopping: watch val_loss, allow 3 epochs without improvement,
# and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
# NOTE(review): validation_data is the test split, so early stopping and the
# test MAE reported below are computed on the same rows.
history = model2.fit(
    [train_user_ids, train_movie_ids], train_ratings,
    epochs=6,
    batch_size=64,
    validation_data=([test_user_ids, test_movie_ids], test_ratings),
    callbacks=[early_stopping]
)

# Evaluate the model
test_loss = model2.evaluate([test_user_ids, test_movie_ids], test_ratings)
print(f"Test loss (MAE): {test_loss}")

# Predict ratings for the test set
test_predictions = model2.predict([test_user_ids, test_movie_ids])

# Create a DataFrame with predictions (raw IDs, true rating, model output)
test_results = pd.DataFrame({
    'original_user_id': test['userId'],
    'original_movie_id': test['movieId'],
    'actual_rating': test_ratings,
    'predicted_rating': test_predictions.flatten()
})
# Absolute error per row
test_results['rating_difference'] = abs(test_results['actual_rating'] - test_results['predicted_rating'])

# Filter results for a specific user and sort by rating difference
filter_user_id = 772  # Replace with your desired user ID
filtered_results = test_results[test_results['original_user_id'] == filter_user_id]
filtered_results_sorted = filtered_results.sort_values(by='rating_difference')

print(f"Predictions for user {filter_user_id}, ordered by closest rating difference:")
print(filtered_results_sorted)

# Load leaderboard data
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv')  # Replace with your leaderboard file path
assert 'user_id' in leaderboard_data.columns and 'item_id' in leaderboard_data.columns

# Raw ID -> encoded ID lookups, built from the fitted encoders
user_mapping = dict(zip(user_encoder.classes_, user_encoder.transform(user_encoder.classes_)))
movie_mapping = dict(zip(movie_encoder.classes_, movie_encoder.transform(movie_encoder.classes_)))

# IDs the encoders never saw become -1 so they can be masked out below
leaderboard_data['user_id_encoded'] = leaderboard_data['user_id'].map(user_mapping).fillna(-1).astype(int)
leaderboard_data['item_id_encoded'] = leaderboard_data['item_id'].map(movie_mapping).fillna(-1).astype(int)

user_ids = leaderboard_data['user_id_encoded'].values
item_ids = leaderboard_data['item_id_encoded'].values

# Only pairs where both IDs are known can be scored by the model
valid_indices = (user_ids != -1) & (item_ids != -1)

# Fix: rows that cannot be scored used to be written as the text "Invalid",
# which breaks any numeric reader of the output file. They now fall back to
# the global mean rating, the same fallback the cluster-based predictor uses.
fallback_rating = float(ratings_df['rating'].mean())
predictions = np.full(len(user_ids), fallback_rating)
if valid_indices.any():  # skip the model call when nothing can be scored
    predictions[valid_indices] = model2.predict([user_ids[valid_indices], item_ids[valid_indices]]).flatten()

# Save leaderboard predictions to a file, one number per line
with open("predicted_ratings_leaderboard3.txt", "w") as f:
    for pred in predictions:
        f.write(f"{pred:.4f}\n")


In [25]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
from train_valid_test_loader import load_train_valid_test_datasets

In [3]:
# Project-provided loader; unpacks into three data tuples plus two counts.
# NOTE(review): presumably train/validation/test splits and the number of
# users and items — confirm against train_valid_test_loader.
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()

In [4]:
# Movie metadata, with the item key renamed to `movieId`
movies_df = pd.read_csv(
    '/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/movie_info.csv'
).rename(columns={'item_id': 'movieId'})

# Development ratings, with keys renamed to `userId` / `movieId` and a
# constant-zero `timestamp` column appended
ratings_df = (
    pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_all_development_set.csv')
    .rename(columns={'user_id': 'userId', 'item_id': 'movieId'})
    .assign(timestamp=0)
)

print(movies_df)

In [None]:
# # import the dataset
# import pandas as pd
# movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
# ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [5]:
# Report (rows, columns) of both frames
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

In [6]:
# Preview the first rows of the movie metadata
movies_df.head()

In [7]:
# Preview the first rows of the ratings table
ratings_df.head()

In [6]:
# Lookup table: movieId -> title
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users / movies present in the ratings (these rebind n_users and n_items)
n_users = len(ratings_df['userId'].unique())
n_items = len(ratings_df['movieId'].unique())

print("Movie names:", movie_names)
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')

# Closing commentary, printed line by line
for remark in (
    "We have an incredibly sparse matrix to work with here.",
    "And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2",
    "You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.",
    "One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data",
):
    print(remark)

In [7]:
import torch
import numpy as np
import tqdm
from torch.autograd import Variable
# from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model.

    Every user and every item gets a learned vector of length ``n_factors``;
    the score of a (user, item) pair is the dot product of the two vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding tables act as lookup tables from an integer index to a vector
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights drawn uniformly from [0, 0.05)
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score pairs given as separate user and item index collections.

        Fix: this used to call ``self.forward(user, item)``, which always
        raised ``TypeError`` because ``forward`` takes a single ``data``
        argument. The two inputs are now stacked into the two-column layout
        that ``forward`` expects.

        Args:
            user: user index or indices (tensor, array, list, or scalar).
            item: item index or indices, same length as ``user``.

        Returns:
            1-D tensor with one score per (user, item) pair.
        """
        # Build the index tensors on the device that holds the embeddings
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))

In [8]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset built from the notebook-global ``ratings_df``.

    Raw user and movie IDs are re-indexed to contiguous integers in order of
    first appearance. Each sample is ``(tensor([user_idx, movie_idx]), rating)``.
    """

    def __init__(self):
        self.ratings = ratings_df.copy()

        # Distinct raw IDs, in order of first appearance
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()

        # Raw ID -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw ID
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace raw IDs with their contiguous indices in the working copy
        self.ratings['movieId'] = ratings_df['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings_df['userId'].map(self.userid2idx)

        # Fix: select the two feature columns by name instead of dropping
        # 'rating' and 'timestamp'. Other cells add columns to ratings_df
        # (encoded IDs, 'cluster'); dropping let those leak into x and could
        # turn the index tensor into floats. Column 0 is the user index and
        # column 1 the movie index, the layout MatrixFactorization.forward reads.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Number of rating rows."""
        return len(self.ratings)

In [9]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Factorization model with 8 latent factors per user / item
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the initial values of every trainable parameter
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available
if cuda:
    model = model.cuda()

# L1 loss (mean absolute error)
loss_fn = torch.nn.L1Loss()

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled batches of 128
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

In [10]:
# Capture the model's factor weights: the first trainable parameter goes into
# `uw`, every later one overwrites `iw`.
# NOTE(review): no training loop is visible between model construction and
# this cell — confirm the model has been trained before relying on these.
uw = 0
iw = 0
c = 0  # number of parameters stored in `uw` so far (0 or 1)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
    else:
        uw = param.data
        c += 1

In [11]:
# Item-embedding weight matrix, copied to the CPU and converted to a NumPy array
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [12]:
# Number of rows in the item-embedding matrix
len(trained_movie_embeddings) # unique movie factor weights

In [13]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights: 10 clusters over the
# item-embedding rows, with a fixed seed
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [14]:
# For each of the 10 clusters, print the titles of its ten most-rated movies.
# Author's note: movies that share a cluster tend to have similar genres, even
# though the algorithm never sees titles — it only sees the numbers describing
# how users responded to each movie.
for cluster in range(10):
    print("Cluster #{}".format(cluster))
    member_indices = np.where(kmeans.labels_ == cluster)[0]
    movs = []
    for movidx in member_indices:
        movid = train_set.idx2movieid[movidx]
        # Rating count for this movie: non-null count of the first column
        # among the rows that reference it
        rows_for_movie = ratings_df.loc[ratings_df['movieId'] == movid]
        rat_count = rows_for_movie.count().iloc[0]
        movs.append((movie_names[movid], rat_count))
    most_rated = sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]
    for mov in most_rated:
        print("\t", mov[0])

### Making Predictions

#### Step 1: Cluster Analysis

In [15]:
# Raw movieId -> cluster number (0-9), taken from the fitted k-means labels.
# Embedding row indices are translated back to raw IDs via train_set.
movie_clusters = {
    train_set.idx2movieid[movidx]: cluster
    for cluster in range(10)
    for movidx in np.where(kmeans.labels_ == cluster)[0]
}


#### Step 2: Enhance the Training Data: Add the cluster information to the ratings data:

In [16]:
# Add cluster information to ratings_df (in place): each rating row gets the
# cluster of its movie; movies absent from movie_clusters get NaN.
ratings_df['cluster'] = ratings_df['movieId'].map(movie_clusters)


#### Step 3: Train a Model for Each Cluster

In [17]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

# Cluster number -> SVD model fitted on that cluster's ratings
cluster_models = {}

# The reader is the same for every cluster, so build it once
reader = Reader(rating_scale=(0.5, 5))

# Train a model for each cluster
for cluster in range(10):
    cluster_data = ratings_df[ratings_df['cluster'] == cluster]

    if cluster_data.empty:
        continue  # Skip empty clusters

    # Prepare data for the surprise library
    data = Dataset.load_from_df(cluster_data[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()

    # Train an SVD model. Fix: it is bound to `cluster_model` rather than
    # `model`, so the notebook-global `model` (the factorization model whose
    # item_factors are read elsewhere) is no longer overwritten.
    cluster_model = SVD()
    cluster_model.fit(trainset)
    cluster_models[cluster] = cluster_model

#### Step 4: Predict Ratings for Leaderboard Data: For each movie-user pair in the leaderboard data, use the appropriate cluster model

In [18]:
# Load leaderboard data and display it as the cell output
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv')
leaderboard_data

In [19]:
# Load leaderboard data
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv')

# Fallback for movies with no cluster or no fitted cluster model.
# Computed once here instead of once per fallback row inside the loop.
global_mean_rating = ratings_df['rating'].mean()

# Predict ratings, one per leaderboard row, in file order
predicted_ratings = []

for row in leaderboard_data.itertuples(index=False):
    user_id = row.user_id   # leaderboard CSV column 'user_id'
    movie_id = row.item_id  # leaderboard CSV column 'item_id'

    cluster = movie_clusters.get(movie_id, None)

    if cluster is not None and cluster in cluster_models:
        # Use the model trained on this movie's cluster
        pred = cluster_models[cluster].predict(user_id, movie_id).est
    else:
        pred = global_mean_rating

    predicted_ratings.append(pred)

# Save predictions to a file, four decimals per line
np.savetxt('predicted_ratings_leaderboard.txt', predicted_ratings, fmt='%.4f')

## Alternative method

In [22]:
from CollabFilterOneVectorPerItem import *
from AbstractBaseCollabFilterSGD import *
from train_valid_test_loader import load_train_valid_test_datasets

In [21]:
# Reload the project-provided splits and counts (rebinds n_users / n_items)
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()
# Rebinds the notebook-global `model` to a collaborative-filtering model.
# NOTE(review): hyperparameter meanings (step_size, alpha) are defined in
# CollabFilterOneVectorPerItem, not visible here — confirm there.
model = CollabFilterOneVectorPerItem(n_epochs=10, batch_size=50, step_size=0.1, n_factors = 50, alpha=0.001)
# Parameters are initialised from the counts and the training tuple before fitting
model.init_parameter_dict(n_users, n_items, train_tuple)
model.fit(train_tuple, valid_tuple)

In [23]:
from CollabFilterOneVectorPerItem import *
from AbstractBaseCollabFilterSGD import *
from train_valid_test_loader import load_train_valid_test_datasets

In [24]:
# Reload the project-provided splits and counts (rebinds n_users / n_items)
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()
# Rebinds the notebook-global `model` to a collaborative-filtering model.
# NOTE(review): hyperparameter meanings (step_size, alpha) are defined in
# CollabFilterOneVectorPerItem, not visible here — confirm there.
model = CollabFilterOneVectorPerItem(n_epochs=10, batch_size=50, step_size=0.1, n_factors = 50, alpha=0.001)
# Parameters are initialised from the counts and the training tuple before fitting
model.init_parameter_dict(n_users, n_items, train_tuple)
model.fit(train_tuple, valid_tuple)

In [26]:
# Leaderboard ID columns as NumPy arrays
user_ids, item_ids = (
    leaderboard_data[column].to_numpy() for column in ('user_id', 'item_id')
)

# Score every (user, item) pair with the current `model`.
# NOTE(review): assumes `model` is the fitted collaborative-filtering model
# and that its predict() accepts raw leaderboard IDs — confirm.
predictions = model.predict(user_ids, item_ids)

# Write one prediction per line
with open("predicted_ratings_leaderboard.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predictions)

In [27]:
from CollabFilterOneVectorPerItem import *
from AbstractBaseCollabFilterSGD import *
from train_valid_test_loader import load_train_valid_test_datasets

In [28]:
# Reload the project-provided splits and counts (rebinds n_users / n_items)
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()
# Rebinds the notebook-global `model` to a collaborative-filtering model.
# NOTE(review): hyperparameter meanings (step_size, alpha) are defined in
# CollabFilterOneVectorPerItem, not visible here — confirm there.
model = CollabFilterOneVectorPerItem(n_epochs=10, batch_size=50, step_size=0.1, n_factors = 50, alpha=0.001)
# Parameters are initialised from the counts and the training tuple before fitting
model.init_parameter_dict(n_users, n_items, train_tuple)
model.fit(train_tuple, valid_tuple)

In [30]:
# Leaderboard ID columns as NumPy arrays
user_ids, item_ids = (
    leaderboard_data[column].to_numpy() for column in ('user_id', 'item_id')
)

# Score every (user, item) pair with the current `model`.
# NOTE(review): assumes `model` is the fitted collaborative-filtering model
# and that its predict() accepts raw leaderboard IDs — confirm.
predictions = model.predict(user_ids, item_ids)

# Write one prediction per line
with open("predicted_ratings_leaderboard2.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in predictions)

### Two-Tower Model

In [110]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# End-to-end two-tower pipeline: encode IDs, split, build and train the Keras
# model, evaluate it, and inspect the prediction errors for one user.

# Ensure required columns exist
assert 'userId' in ratings_df.columns and 'movieId' in ratings_df.columns and 'rating' in ratings_df.columns

# Encode user and movie IDs (adds two columns to ratings_df in place)
user_encoder = LabelEncoder()
ratings_df['user_id_encoded'] = user_encoder.fit_transform(ratings_df['userId'])

movie_encoder = LabelEncoder()
ratings_df['movie_id_encoded'] = movie_encoder.fit_transform(ratings_df['movieId'])

# Copy the dataset and split into train and test sets (80/20, fixed seed)
df = ratings_df.copy()
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Convert columns to NumPy arrays
train_user_ids = np.array(train['user_id_encoded'].values)
train_movie_ids = np.array(train['movie_id_encoded'].values)
train_ratings = np.array(train['rating'].values)

test_user_ids = np.array(test['user_id_encoded'].values)
test_movie_ids = np.array(test['movie_id_encoded'].values)
test_ratings = np.array(test['rating'].values)

# Define the number of unique users, movies, and embedding dimensions
# (the counts size the two embedding tables below)
num_users = df['user_id_encoded'].nunique()
num_movies = df['movie_id_encoded'].nunique()
embedding_dim = 50

# Define the model
# User input and embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
user_embedding = Flatten()(user_embedding)

# Movie input and embedding
movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim, name='movie_embedding')(movie_input)
movie_embedding = Flatten()(movie_embedding)

# Dot product of embeddings and output layer
dot_product = Dot(axes=1)([user_embedding, movie_embedding])
output = Dense(1, activation='linear')(dot_product)

# Compile the model (loss is mean absolute error)
model2 = Model(inputs=[user_input, movie_input], outputs=output)
model2.compile(optimizer='adam', loss='mae')

# Add early stopping: watch val_loss, allow 3 epochs without improvement,
# and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
# NOTE(review): validation_data is the test split, so early stopping and the
# test MAE reported below are computed on the same rows.
history = model2.fit(
    [train_user_ids, train_movie_ids], train_ratings,
    epochs=6,
    batch_size=64,
    validation_data=([test_user_ids, test_movie_ids], test_ratings),
    callbacks=[early_stopping]
)

# Evaluate the model
test_loss = model2.evaluate([test_user_ids, test_movie_ids], test_ratings)
print(f"Test loss (MAE): {test_loss}")

# Predict ratings for the test set
test_predictions = model2.predict([test_user_ids, test_movie_ids])

# Create a DataFrame with predictions (raw IDs, true rating, model output)
test_results = pd.DataFrame({
    'original_user_id': test['userId'],
    'original_movie_id': test['movieId'],
    'actual_rating': test_ratings,
    'predicted_rating': test_predictions.flatten()
})
# Absolute error per row
test_results['rating_difference'] = abs(test_results['actual_rating'] - test_results['predicted_rating'])

# Filter results for a specific user and sort by rating difference
filter_user_id = 772  # Replace with your desired user ID
filtered_results = test_results[test_results['original_user_id'] == filter_user_id]
filtered_results_sorted = filtered_results.sort_values(by='rating_difference')

print(f"Predictions for user {filter_user_id}, ordered by closest rating difference:")
print(filtered_results_sorted)

# Load leaderboard data
leaderboard_data = pd.read_csv('/Users/brandonmukadziwashe/CS135/cs135-24f-assignments/CS-135-Project-B/data_movie_lens_100k/ratings_masked_leaderboard_set.csv')  # Replace with your leaderboard file path
assert 'user_id' in leaderboard_data.columns and 'item_id' in leaderboard_data.columns

# Raw ID -> encoded ID lookups, built from the fitted encoders
user_mapping = dict(zip(user_encoder.classes_, user_encoder.transform(user_encoder.classes_)))
movie_mapping = dict(zip(movie_encoder.classes_, movie_encoder.transform(movie_encoder.classes_)))

# IDs the encoders never saw become -1 so they can be masked out below
leaderboard_data['user_id_encoded'] = leaderboard_data['user_id'].map(user_mapping).fillna(-1).astype(int)
leaderboard_data['item_id_encoded'] = leaderboard_data['item_id'].map(movie_mapping).fillna(-1).astype(int)

user_ids = leaderboard_data['user_id_encoded'].values
item_ids = leaderboard_data['item_id_encoded'].values

# Only pairs where both IDs are known can be scored by the model
valid_indices = (user_ids != -1) & (item_ids != -1)

# Fix: rows that cannot be scored used to be written as the text "Invalid",
# which breaks any numeric reader of the output file. They now fall back to
# the global mean rating, the same fallback the cluster-based predictor uses.
fallback_rating = float(ratings_df['rating'].mean())
predictions = np.full(len(user_ids), fallback_rating)
if valid_indices.any():  # skip the model call when nothing can be scored
    predictions[valid_indices] = model2.predict([user_ids[valid_indices], item_ids[valid_indices]]).flatten()

# Save leaderboard predictions to a file, one number per line
with open("predicted_ratings_leaderboard3.txt", "w") as f:
    for pred in predictions:
        f.write(f"{pred}\n")


Epoch 1/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 2.9844 - val_loss: 0.8454
Epoch 2/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.7902 - val_loss: 0.7691
Epoch 3/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.7209 - val_loss: 0.7538
Epoch 4/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.6834 - val_loss: 0.7448
Epoch 5/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.6479 - val_loss: 0.7432
Epoch 6/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.6098 - val_loss: 0.7412
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 566us/step - loss: 0.7421
Test loss (MAE): 0.7412182688713074
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 590us/step
Predictions for user 772, ordered by closest rating difference:
     