In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from collections import Counter

In [3]:

ratings = pd.read_csv(r'ratings_small_filtered_2.csv', index_col=0)
All_parts_objects = pd.read_csv(r'All_parts_objects.csv', index_col=2)
columns_to_keep = [str(i) for i in range(80)]
movies_features = All_parts_objects[columns_to_keep]
movies_metadata = pd.read_csv(r'movies_metadata_BERT_on_normal_tags_and_whisper.csv')

unique_movieids_ratings = ratings['movieId'].unique()
unique_movieids_movies_metadata = movies_metadata['movieId'].unique()
unique_movieids_movies_features = movies_features.index.unique()

common_movieids = list(set(unique_movieids_movies_metadata) & set(unique_movieids_ratings) & set(unique_movieids_movies_features))

ratings = ratings[ratings['movieId'].isin(common_movieids)]
movies_metadata = movies_metadata[movies_metadata['movieId'].isin(common_movieids)]
movies_features = movies_features.iloc[movies_features.index.isin(common_movieids)]


In [4]:
# Method 1: Collaborative Filtering
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
item_user_matrix_filled = user_item_matrix.T.fillna(0.5)
item_similarity_cf = cosine_similarity(item_user_matrix_filled)
item_similarity_df_cf = pd.DataFrame(item_similarity_cf, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Method 2: Metadata-based Similarity
item_metadata_matrix_filled = movies_features.fillna(0)
item_similarity_metadata = cosine_similarity(item_metadata_matrix_filled)
item_similarity_df_metadata = pd.DataFrame(item_similarity_metadata, index=item_metadata_matrix_filled.index, columns=item_metadata_matrix_filled.index)
threshold = 0.3
item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)

# Method 3: BERT-based Similarity
def string_to_array(s):
    s = s.strip('[]')
    return np.array([float(x) for x in s.split()])

movies_metadata['bert_embedding'] = movies_metadata['bert_embedding'].apply(string_to_array)
movie_embeddings = np.stack(movies_metadata['bert_embedding'].values)
cosine_sim_bert = cosine_similarity(movie_embeddings, movie_embeddings)
item_similarity_df_bert = pd.DataFrame(cosine_sim_bert, index=movies_metadata['movieId'], columns=movies_metadata['movieId'])

# Split the ratings data into training and testing sets
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# Function to get k-nearest neighbors
def get_k_nearest_neighbors(item_similarity_df, movie_id, k=255):
    if movie_id not in item_similarity_df.columns:
        return pd.Index([])
    sim_scores = item_similarity_df[movie_id]
    nearest_neighbors = sim_scores.sort_values(ascending=False).index[:k]
    return nearest_neighbors

# Function to predict rating
def predict_rating(user_item_matrix, item_similarity_df, user_id, movie_id, k=255):
    nearest_neighbors = get_k_nearest_neighbors(item_similarity_df, movie_id, k)
    if nearest_neighbors.empty:
        return 0
    neighbor_ratings = user_item_matrix.loc[user_id, nearest_neighbors]
    if neighbor_ratings.isna().all():
        return 0
    return neighbor_ratings.mean()

# Evaluate at K=255 for combined predictions
K = 255


  item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > threshold else 0)


In [9]:
y_true_all = []
y_pred_all = []

for index, row in test_ratings.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    y_true_all.append(row['rating'])
    
    # Predictions from the three methods
    pred_cf = predict_rating(user_item_matrix, item_similarity_df_cf, user_id, movie_id, k=K)
    pred_metadata = predict_rating(user_item_matrix, item_similarity_df_metadata_thresholded, user_id, movie_id, k=K)
    pred_bert = predict_rating(user_item_matrix, item_similarity_df_bert, user_id, movie_id, k=K)
    
    # Combine predictions by averaging
    combined_prediction = np.mean([pred_cf, pred_metadata, pred_bert])
    y_pred_all.append(combined_prediction)

# Convert to floats
y_true_all = [float(val) for val in y_true_all]
y_pred_all = [float(val) for val in y_pred_all]

# Generate RMSE
rmse = mean_squared_error(y_true_all, y_pred_all, squared=False)
print("RMSE:", rmse)


RMSE: 0.7297277248364683


