In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# Load the train/test rating splits (tab-separated files without a header row).
df_ratings_train = pd.read_csv("u1.base", sep='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
df_ratings_test = pd.read_csv("u1.test", sep='\t', names=['userId', 'movieId', 'rating', 'timestamp'])

# Per-movie feature table: the third CSV column becomes the index (it is
# matched against movieId below) and only the columns named "0".."79" are kept.
All_parts_objects = pd.read_csv(r'All_parts_objects.csv', index_col=2)
columns_to_keep = [str(i) for i in range(80)]
movies_features = All_parts_objects[columns_to_keep]
movies_metadata = pd.read_csv(r'movies_metadata_BERT_on_normal_tags_and_whisper.csv')

# Merge unique movie IDs from both train and test sets
unique_movieids_ratings_train = df_ratings_train['movieId'].unique()
unique_movieids_ratings_test = df_ratings_test['movieId'].unique()
unique_movieids_ratings = np.unique(np.concatenate((unique_movieids_ratings_train, unique_movieids_ratings_test)))

unique_movieids_movies_metadata = movies_metadata['movieId'].unique()
unique_movieids_movies_features = movies_features.index.unique()

# Only movies present in the ratings, the metadata AND the feature table are used.
common_movieids = list(set(unique_movieids_movies_metadata) & set(unique_movieids_ratings) & set(unique_movieids_movies_features))

# Filter dataframes based on common_movieids
df_ratings_train = df_ratings_train[df_ratings_train['movieId'].isin(common_movieids)]
df_ratings_test = df_ratings_test[df_ratings_test['movieId'].isin(common_movieids)]
movies_metadata = movies_metadata[movies_metadata['movieId'].isin(common_movieids)]

# Keep one feature row per movie and put the rows in exactly the order of
# common_movieids. cosine_similarity returns a purely positional matrix, so the
# rows must already be in that order before the result is labelled with
# common_movieids; a plain isin() filter keeps the file order instead and
# silently attaches similarities to the wrong movies.
movies_features = movies_features.loc[~movies_features.index.duplicated(keep='first')]
movies_features = movies_features.reindex(common_movieids)

# Reindex user-item matrix to include all common movie IDs
# (movies without any training rating become all-NaN columns).
user_item_matrix = df_ratings_train.pivot(index='userId', columns='movieId', values='rating')
user_item_matrix = user_item_matrix.reindex(columns=common_movieids)

# Method 1: Collaborative Filtering
# Rows of the transposed matrix follow common_movieids because of the reindex
# above; missing ratings are replaced by 0.5 before computing similarities.
item_user_matrix_filled = user_item_matrix.T.fillna(0.5)
item_similarity_cf = cosine_similarity(item_user_matrix_filled)
item_similarity_df_cf = pd.DataFrame(item_similarity_cf, index=common_movieids, columns=common_movieids)

# Method 2: Metadata-based Similarity
item_metadata_matrix_filled = movies_features.fillna(0)
item_similarity_metadata = cosine_similarity(item_metadata_matrix_filled)
item_similarity_df_metadata = pd.DataFrame(item_similarity_metadata, index=common_movieids, columns=common_movieids)
threshold = 0.3
# Zero out every similarity that is not strictly above the threshold.
# Vectorized replacement for the deprecated element-wise DataFrame.applymap.
item_similarity_df_metadata_thresholded = item_similarity_df_metadata.where(item_similarity_df_metadata > threshold, 0)

# Method 3: BERT-based Similarity
def string_to_array(s):
    """Parse a bracketed, whitespace-separated list of numbers into a float array.

    Leading/trailing '[' and ']' characters are stripped, the remainder is
    split on any whitespace, and every token is converted with float().
    """
    tokens = s.strip('[]').split()
    return np.array(list(map(float, tokens)))

# Work on an explicit copy: movies_metadata may be a filtered slice of the
# loaded frame, and assigning a column on a slice triggers
# SettingWithCopyWarning.
movies_metadata = movies_metadata.copy()
movies_metadata['bert_embedding'] = movies_metadata['bert_embedding'].apply(string_to_array)

# Pick exactly one embedding per movie, ordered like common_movieids.
# cosine_similarity is positional, so stacking the embeddings in the metadata
# file's row order and then labelling the result with common_movieids would
# attach similarities to the wrong movies (and fail outright on duplicate
# movieId rows). .loc raises KeyError if a common movie has no embedding.
bert_embedding_by_movie = (
    movies_metadata.drop_duplicates(subset='movieId')
    .set_index('movieId')['bert_embedding']
    .loc[common_movieids]
)
movie_embeddings = np.stack(bert_embedding_by_movie.values)
cosine_sim_bert = cosine_similarity(movie_embeddings, movie_embeddings)
item_similarity_df_bert = pd.DataFrame(cosine_sim_bert, index=common_movieids, columns=common_movieids)

# Function to get k-nearest neighbors
def get_k_nearest_neighbors(item_similarity_df, movie_id, k=255):
    """Return the labels of the k entries most similar to movie_id.

    The column item_similarity_df[movie_id] is sorted by descending similarity
    and the first k index labels are returned. An empty Index is returned when
    movie_id is not one of the columns.
    """
    if movie_id in item_similarity_df.columns:
        ranked = item_similarity_df[movie_id].sort_values(ascending=False)
        return ranked.iloc[:k].index
    return pd.Index([])

# Function to predict rating
def predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=255):
    """Predict a rating as the mean of the user's ratings on the k nearest movies.

    Args:
        user_item_matrix: DataFrame indexed by user, one column per movie,
            NaN where the user has not rated the movie.
        item_similarity_df: square movie-by-movie similarity DataFrame.
        user_id: row label of the user in user_item_matrix.
        movie_id: column label of the target movie in item_similarity_df.
        k: number of most-similar movies to average over.

    Returns:
        The unweighted mean of the user's non-missing ratings on the k nearest
        movies, or 0 when no prediction can be made (unknown user, unknown
        movie, or the user rated none of the neighbours).
    """
    # A user without a row in the matrix (for example because all of their
    # training ratings were filtered out) has nothing to average. Use the same
    # 0 fallback as the other "no information" cases instead of letting .loc
    # raise KeyError.
    if user_id not in user_item_matrix.index:
        return 0
    nearest_neighbors = get_k_nearest_neighbors(item_similarity_df, movie_id, k)
    if nearest_neighbors.empty:
        return 0
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    if neighbor_ratings.isna().all():
        return 0
    # Series.mean skips NaN, so only movies the user actually rated contribute.
    return neighbor_ratings.mean()

# Evaluate at K=255 for combined predictions
K = 255
y_true_all = []
y_pred_all = []

# Iterate the three needed columns directly; this avoids building a Series per
# row as iterrows does.
for user_id, movie_id, true_rating in zip(
    df_ratings_test['userId'], df_ratings_test['movieId'], df_ratings_test['rating']
):
    y_true_all.append(float(true_rating))

    # Predictions from the three methods
    pred_cf = predict_rating(user_item_matrix, item_similarity_df_cf, user_id, movie_id, k=K)
    pred_metadata = predict_rating(user_item_matrix, item_similarity_df_metadata_thresholded, user_id, movie_id, k=K)
    pred_bert = predict_rating(user_item_matrix, item_similarity_df_bert, user_id, movie_id, k=K)

    # Combine predictions by averaging
    combined_prediction = np.mean([pred_cf, pred_metadata, pred_bert])
    y_pred_all.append(float(combined_prediction))

# Generate RMSE
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
print("RMSE:", rmse)


  item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)


RMSE: 1.20082760408979


