In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
from collections import Counter, defaultdict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [8]:

ratings = pd.read_csv(r'ratings_small_filtered_2.csv', index_col=0)
All_parts_objects = pd.read_csv(r'All_parts_objects.csv', index_col=2)
columns_to_keep = [str(i) for i in range(80)]
movies_features = All_parts_objects[columns_to_keep]
movies_metadata = pd.read_csv(r'movies_metadata_BERT_on_normal_tags_and_whisper.csv')

unique_movieids_ratings = ratings['movieId'].unique()
unique_movieids_movies_metadata = movies_metadata['movieId'].unique()
unique_movieids_movies_features = movies_features.index.unique()
common_movieids = list(set(unique_movieids_movies_metadata) & set(unique_movieids_ratings) & set(unique_movieids_movies_features))

ratings = ratings[ratings['movieId'].isin(common_movieids)]
movies_metadata = movies_metadata[movies_metadata['movieId'].isin(common_movieids)]
movies_features = movies_features.iloc[movies_features.index.isin(common_movieids)]


# Assuming ratings, movies_features, movies_metadata DataFrames are already loaded

# Method 1: Collaborative Filtering
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
item_user_matrix_filled = user_item_matrix.T.fillna(0.5)
item_similarity_cf = cosine_similarity(item_user_matrix_filled)
item_similarity_df_cf = pd.DataFrame(item_similarity_cf, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Method 2: Metadata-based Similarity
item_metadata_matrix_filled = movies_features.fillna(0)
item_similarity_metadata = cosine_similarity(item_metadata_matrix_filled)
item_similarity_df_metadata = pd.DataFrame(item_similarity_metadata, index=item_metadata_matrix_filled.index, columns=item_metadata_matrix_filled.index)
item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > 0.3 else 0)

# Method 3: BERT-based Similarity
def string_to_array(s):
    s = s.strip('[]')
    return np.array([float(x) for x in s.split()])

movies_metadata['bert_embedding'] = movies_metadata['bert_embedding'].apply(string_to_array)
movie_embeddings = np.stack(movies_metadata['bert_embedding'].values)
cosine_sim_bert = cosine_similarity(movie_embeddings, movie_embeddings)
item_similarity_df_bert = pd.DataFrame(cosine_sim_bert, index=movies_metadata['movieId'], columns=movies_metadata['movieId'])

# Split the ratings data into training and testing sets
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

  item_similarity_df_metadata_thresholded = item_similarity_df_metadata.applymap(lambda x: x if x > 0.3 else 0)


In [9]:
# Function to get top-k recommendations from a similarity matrix
def get_top_k_recommendations(similarity_matrix, item_id, k):
    similar_items = similarity_matrix.loc[item_id].sort_values(ascending=False)[1:k+1].index.tolist()
    return similar_items

# Function to merge recommendations from all three methods
def get_merged_recommendations(item_id, k):
    rec_cf = get_top_k_recommendations(item_similarity_df_cf, item_id, k)
    rec_metadata = get_top_k_recommendations(item_similarity_df_metadata_thresholded, item_id, k)
    rec_bert = get_top_k_recommendations(item_similarity_df_bert, item_id, k)
    
    # Combine and remove duplicates
    merged_rec = list(dict.fromkeys(rec_cf + rec_metadata + rec_bert))
    return merged_rec[:k]

def precision_recall_at_k(test_data, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""
    
    user_est_true = defaultdict(list)
    for _, row in test_data.iterrows():
        uid, movie_id, true_r = row['userId'], row['movieId'], row['rating']
        recommended_items = get_merged_recommendations(movie_id, k)
        
        # Get the actual ratings for recommended items
        for rec_item in recommended_items:
            est_r = test_data[(test_data['userId'] == uid) & (test_data['movieId'] == rec_item)]['rating'].values
            if len(est_r) > 0:
                user_est_true[uid].append((est_r[0], true_r))
            else:
                user_est_true[uid].append((0, true_r))  # If no rating exists, assume 0
    
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )
        
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    
    return precisions, recalls

# Evaluate the model
k_values = [5, 10, 20]
threshold = 3.5

for k in k_values:
    precisions, recalls = precision_recall_at_k(test_ratings, k=k, threshold=threshold)
    
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    
    print(f"Results for k={k}:")
    print(f"Average Precision@{k}: {avg_precision:.4f}")
    print(f"Average Recall@{k}: {avg_recall:.4f}")
    print()

Results for k=5:
Average Precision@5: 0.1063
Average Recall@5: 0.0030

Results for k=10:
Average Precision@10: 0.1392
Average Recall@10: 0.0020

Results for k=20:
Average Precision@20: 0.1617
Average Recall@20: 0.0013



In [10]:
# Function to get top-k recommendations from a similarity matrix
def get_top_k_recommendations(similarity_matrix, item_id, k):
    similar_items = similarity_matrix.loc[item_id].sort_values(ascending=False)[1:k+1].index.tolist()
    return similar_items

# Function to merge recommendations from all three methods
def get_merged_recommendations(item_id, k):
    rec_cf = get_top_k_recommendations(item_similarity_df_cf, item_id, k)
    rec_metadata = get_top_k_recommendations(item_similarity_df_metadata_thresholded, item_id, k)
    rec_bert = get_top_k_recommendations(item_similarity_df_bert, item_id, k)
    
    # Combine and remove duplicates
    merged_rec = list(dict.fromkeys(rec_cf + rec_metadata + rec_bert))
    return merged_rec[:k]

def precision_recall_at_k(test_data, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""
    
    user_est_true = defaultdict(list)
    for _, row in test_data.iterrows():
        uid, movie_id, true_r = row['userId'], row['movieId'], row['rating']
        recommended_items = get_merged_recommendations(movie_id, k)
        
        # Get the actual ratings for recommended items
        for rec_item in recommended_items:
            est_r = test_data[(test_data['userId'] == uid) & (test_data['movieId'] == rec_item)]['rating'].values
            if len(est_r) > 0:
                user_est_true[uid].append((est_r[0], true_r))
            else:
                user_est_true[uid].append((0, true_r))  # If no rating exists, assume 0
    
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )
        
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    
    return precisions, recalls

# Evaluate the model
k_values = [50,100,150,255,350]
threshold = 3.5

for k in k_values:
    precisions, recalls = precision_recall_at_k(test_ratings, k=k, threshold=threshold)
    
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    
    print(f"Results for k={k}:")
    print(f"Average Precision@{k}: {avg_precision:.4f}")
    print(f"Average Recall@{k}: {avg_recall:.4f}")
    print()

Results for k=50:
Average Precision@50: 0.1901
Average Recall@50: 0.0006

Results for k=100:
Average Precision@100: 0.2126
Average Recall@100: 0.0003

Results for k=150:
Average Precision@150: 0.2201
Average Recall@150: 0.0002

Results for k=255:
Average Precision@255: 0.2305
Average Recall@255: 0.0002

Results for k=350:
Average Precision@350: 0.2410
Average Recall@350: 0.0001

