In [796]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np

In [885]:
#Block One: First Filtering
#reading the CSV files, change them on your personal computer to wherever you saved them
rate = pd.read_csv(r'ml-latest-small/ratings.csv')
movies = pd.read_csv(r'ml-latest-small/movies.csv')
ratings = pd.read_csv(r'ml-latest-small/ratings.csv')
tags = pd.read_csv(r'ml-latest-small/tags.csv')
userID = ratings
#dropping timestamp as it's unnecessary
tags = tags.drop(columns='timestamp')
ratings = ratings.drop(columns='timestamp') 

#Groups on userId and movieId and groups tags into a list
tags = tags.groupby(['movieId']).agg({'tag':set}).reset_index()

#Refining ratings to just be movieId and the average rating to only 1 decimal point
ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
ratings = ratings.rename(columns={'rating':'average_rating'})
ratings['average_rating'] = ratings['average_rating'].round(1)
filtered_data = pd.merge(movies,ratings, on='movieId')
filtered_data = pd.merge(filtered_data, tags, on='movieId')

In [886]:
#Block Two: Unique Genres and Second Filtering
# Split genres by '|' and get all unique genres
unique_genres = set()
for genres in movies['genres']:
    unique_genres.update(genres.split('|'))
unique_genres = sorted(unique_genres)  # Sort in alphabetic order
print(len(unique_genres))
for genre in unique_genres:
    filtered_data[genre] = filtered_data['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)
#filter out In Netflix Queue
filtered_data['tag'] = filtered_data['tag'].apply(lambda tag_set: {tag for tag in tag_set if tag != "In Netflix queue"})
# print(filtered_data.head())

# Drop the original 'genres' column (I think it's a good idea, no need to clutter the data yk?)
filtered_data= filtered_data.drop(columns=['genres'])

20


In [887]:
#Block Three: Vectorization
#I changed the vector size to 10 just cause it's smaller but if you wanna increase it by all means go for it
wordVector = Word2Vec(filtered_data['tag'].tolist(),vector_size=10, window=5, min_count=1, workers=4)

vector_size = 10  # Same size as Word2Vec vectors
padding_vector = np.zeros(vector_size)
# Function to generate a vector for up to 3 tags
def create_feature_vector(row):
    # Get word vectors for the tags, up to 3 tags, and pad if fewer
    tag_vectors = [
        wordVector.wv[tag] if tag in wordVector.wv else padding_vector
        for tag in list(row['tag'])[:3] # inlude if needed
    ]
    while len(tag_vectors) < 3:  # Pad with zero vectors if fewer than 3 tags
        tag_vectors.append(padding_vector)
    
    # Flatten the tag vectors (3 vectors of size 10 each -> 30 elements)
    tag_vector = np.concatenate(tag_vectors)
    
    # Add genre one-hot encoding
    genre_vector = row[unique_genres].values  # One-hot encoded genres
    
    # Combine tag vector and genre vector
    feature_vector = np.concatenate([tag_vector, genre_vector])
    return feature_vector

#wrote to file to see feature data
# filtered_data.to_csv('filtered_movies.csv',index=False)

In [888]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
tags_df = pd.read_csv('ml-latest-small/tags.csv')
tags_set = tags_df.groupby(['movieId']).agg({'tag':set}).reset_index()

"""      Genres_vector     """
# Initialize the genre_vector list
genre_vector = []

# Iterate over each movie's genres
for _, row in movies_df.iterrows():
    # Create a one-hot vector for the current movie
    genres = row['genres'].split('|')
    vector = [1 if genre in genres else 0 for genre in unique_genres]
    genre_vector.append(vector)

# Convert to a numpy array for further use
genre_vector = np.array(genre_vector)

# Add the genre_vector to the dataframe for reference (optional)
movies_df['genre_vector'] = list(genre_vector)
# print(f"\n ------This is genre-vec len: {len(movies_df['genre_vector'][0])} --------- \n")


"""       Tags_vector      """
# 3. Merge tags with movies, ensuring all movies are included
movies_df = movies_df.merge(tags_set, how='left', left_on='movieId', right_on='movieId')

# 4. Replace NaN tags with an empty set
movies_df['tag'] = movies_df['tag'].apply(lambda x: x if isinstance(x, set) else set())

# 5. Preprocess tags: Split and clean
movies_df['tag_list'] = movies_df['tag'].apply(lambda x: [tag.strip().lower() for tag in x])

# 3. Train word2vec model
# Combine all tag lists into one list of lists
tag_corpus = movies_df['tag_list'].tolist()

# Train a word2vec model
word2vec_model = Word2Vec(tag_corpus, vector_size=20, window=5, min_count=1, workers=4)

# 4. Create movie vectors by averaging tag vectors
def get_movie_vector(tags):
    tag_vectors = [word2vec_model.wv[tag] for tag in tags if tag in word2vec_model.wv]
    return np.mean(tag_vectors, axis=0) if tag_vectors else np.zeros(word2vec_model.vector_size)

movies_df['tag_vector'] = movies_df['tag_list'].apply(get_movie_vector)
# print(f"\n ------This is movie-vec len: {len(movies_df['tag_vector'][0])} --------- \n")

# 5. Drop unnecessary columns if needed
# movies_df = movies_df.drop(columns=['tags', 'tag_list'])

print(movies_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                        genre_vector  \
0  [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...   
1  [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...   
4  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                                 tag  \
0    

In [889]:
filtered_data_copy = filtered_data.copy()

# What we used before
filtered_data['feature_vector'] = filtered_data.apply(create_feature_vector, axis=1)

# What we are using now
movies_df['feature_vector'] = movies_df.apply(
    lambda row: np.concatenate([row['tag_vector'], row['genre_vector']]),
    axis=1
)

filtered_data1 = filtered_data_copy.merge(
    movies_df[['movieId', 'feature_vector']],  # Select only necessary columns
    on='movieId',  # Merge on the 'movieId' column
    how='left'  # Use a left join to keep all rows in filtered_data_df
)

#Okay so I mixed the tag vector and the genre vectors into one feature vector
#I am going to drop the original tag vector and the genres
filtered_data = filtered_data.drop(columns=list(unique_genres))
filtered_data = filtered_data.drop(columns='tag')

filtered_data1 = filtered_data1.drop(columns=list(unique_genres))
filtered_data1 = filtered_data1.drop(columns='tag')

print(len(filtered_data1['feature_vector'][6]))
print(filtered_data1.head())
filtered_data1.to_csv('filtered_movies.csv',index=False)


40
   movieId                               title  average_rating  \
0        1                    Toy Story (1995)             3.9   
1        2                      Jumanji (1995)             3.4   
2        3             Grumpier Old Men (1995)             3.3   
3        5  Father of the Bride Part II (1995)             3.1   
4        7                      Sabrina (1995)             3.2   

                                      feature_vector  
0  [-0.042944200336933136, -0.009570187889039516,...  
1  [-0.0005739312618970871, 0.024589821696281433,...  
2  [0.032066673040390015, -0.0006265472620725632,...  
3  [-0.013254916295409203, 0.037702854722738266, ...  
4  [-0.01430355291813612, 0.03986180201172829, 0....  


In [890]:
#Query is the movie we want to isolate
#Cosine Similarities
def knn_recommendation_cos(query,train_data, user_watched_movies, k=5):
    #Extract ratings and feature vectors from training set
    #Ratings are optional if you don't want them, I have them here to use as bias
    train_features = np.array(train_data['feature_vector'].tolist())
    train_ratings = train_data['average_rating'].values#Optional

    #kNN using cosine_similarity
    similarities = cosine_similarity([query],train_features).flatten()

    #Optional Bias
    weighted_similarities = similarities * train_ratings

    # #get the indicies of nearest movies (5)
    # kNN_indices = np.argsort(weighted_similarities)[-k:][::-1] #Sorted by weighted similariy in descending order

    # #Get the movies from the index
    # kNN_movies = train_data.iloc[kNN_indices]
        # Get the indices of nearest movies (more than k initially)
    kNN_indices = np.argsort(weighted_similarities)[-k*2:][::-1]  # Get twice the number of recommendations (more than needed)

    # Get the movies from the index
    kNN_movies = train_data.iloc[kNN_indices]

    # Remove the movies the user has already watched
    kNN_movies_filtered = kNN_movies[~kNN_movies['movieId'].isin(user_watched_movies)]

    # If we have fewer than k movies after filtering, fetch more from the remaining pool
    if len(kNN_movies_filtered) < k:
        # Find additional movies (not already recommended) by getting the remaining top recommendations
        remaining_movies = kNN_movies[~kNN_movies['movieId'].isin(kNN_movies_filtered['movieId'])]
        additional_movies_needed = k - len(kNN_movies_filtered)
        additional_movies = remaining_movies.head(additional_movies_needed)
        
        # Combine the filtered list and additional recommendations
        kNN_movies_filtered = pd.concat([kNN_movies_filtered, additional_movies])

    return kNN_movies_filtered.head(k)  # Ensure we return exactly k recommendations
    # return kNN_movies

In [891]:
#if you want to compare cosine to euclidean distance
def knn_recommendation_eucl(query,train_data,user_watched_movies,k=5):
    #Extract ratings and feature vectors from training set
    #Ratings are optional if you don't want them, I have them here to use as bias
    train_features = np.array(train_data['feature_vector'].tolist())
    train_ratings = train_data['average_rating'].values#Optional

    #kNN using euclidean distances
    distances = euclidean_distances([query],train_features).flatten()

    #Optional Bias
    weighted_distances= distances / train_ratings

    #get the indicies of nearest movies (5)
    # kNN_indices = np.argsort(weighted_distances)[:k] #Sorted by weighted distances ascending order
    kNN_indices = np.argsort(weighted_distances)[-k*2:][::-1]  # Get twice the number of recommendations (more than needed)

    # Get the movies from the index
    kNN_movies = train_data.iloc[kNN_indices]

    # Remove the movies the user has already watched
    kNN_movies_filtered = kNN_movies[~kNN_movies['movieId'].isin(user_watched_movies)]

    # If we have fewer than k movies after filtering, fetch more from the remaining pool
    if len(kNN_movies_filtered) < k:
        # Find additional movies (not already recommended) by getting the remaining top recommendations
        remaining_movies = kNN_movies[~kNN_movies['movieId'].isin(kNN_movies_filtered['movieId'])]
        additional_movies_needed = k - len(kNN_movies_filtered)
        additional_movies = remaining_movies.head(additional_movies_needed)
        
        # Combine the filtered list and additional recommendations
        kNN_movies_filtered = pd.concat([kNN_movies_filtered, additional_movies])

    return kNN_movies_filtered.head(k)  # Ensure we return exactly k recommendations
    #Get the movies from the index
    # kNN_movies = train_data.iloc[kNN_indices]
    # return kNN_movies

In [892]:
def test_knn(test_data,train_data,user_watched_movies, k=5):
    # Dictionary to store movie frequencies
    movie_frequency = {}

    for _,row in test_data.iterrows():
        query = np.array(row['feature_vector'])
        knn_movies = knn_recommendation_cos(query,train_data,user_watched_movies,k)
        # print(f"Current Movie searched: {row['title']}")
        # print("Recommended Movies: ")
        # print(knn_movies[['title','average_rating']])
        # print()

        # Update the frequency dictionary
        for _, movie_row in knn_movies.iterrows():
            movie_title = movie_row['title']
            movie_id = movie_row['movieId']
            if movie_title in movie_frequency:
                movie_frequency[movie_title][0] += 1
            else:
                movie_frequency[movie_title] = [1, movie_id]

    # Sort the movie_frequency dictionary by frequency (the first element of the list)
    sorted_dict = dict(sorted(movie_frequency.items(), key=lambda item: item[1][0], reverse=True))

    # Get the movie with the maximum frequency
    max_key = max(movie_frequency, key=lambda k: movie_frequency[k][0])  # Get the movie with the highest frequency
    max_value = movie_frequency[max_key]  # The value is a list: [frequency, movie_id]

    print(f"Maximum value: {max_value[0]} with Key: {max_key} and Movie ID: {max_value[1]}")

    # Print the top 5 recommended movies
    count = 0
    movieIds = []
    for movie, freq in sorted_dict.items():
        print(f"{movie}: Frequency = {freq[0]}, Movie ID = {freq[1]}")  # Display frequency and Movie ID
        movieIds.append(freq[1])
        if count == 5:
            break
        count += 1
        

    
    return movieIds

In [893]:
def test_knn_eucl(test_data,train_data,user_watched_movies, k=5):
    # Dictionary to store movie frequencies
    movie_frequency = {}

    for _,row in test_data.iterrows():
        query = np.array(row['feature_vector'])
        knn_movies = knn_recommendation_eucl(query,train_data,user_watched_movies,k)
        # print(f"Current Movie searched: {row['title']}")
        # print("Recommended Movies: ")
        # print(knn_movies[['title','average_rating']])
        # print()

        # Update the frequency dictionary
        for _, movie_row in knn_movies.iterrows():
            movie_title = movie_row['title']
            movie_id = movie_row['movieId']
            if movie_title in movie_frequency:
                movie_frequency[movie_title][0] += 1
            else:
                movie_frequency[movie_title] = [1, movie_id]

    # Sort the movie_frequency dictionary by frequency (the first element of the list)
    sorted_dict = dict(sorted(movie_frequency.items(), key=lambda item: item[1][0], reverse=True))

    # Get the movie with the maximum frequency
    max_key = max(movie_frequency, key=lambda k: movie_frequency[k][0])  # Get the movie with the highest frequency
    max_value = movie_frequency[max_key]  # The value is a list: [frequency, movie_id]

    print(f"Maximum value: {max_value[0]} with Key: {max_key} and Movie ID: {max_value[1]}")

    # Print the top 5 recommended movies
    count = 0
    movieIds = []
    for movie, freq in sorted_dict.items():
        print(f"{movie}: Frequency = {freq[0]}, Movie ID = {freq[1]}")  # Display frequency and Movie ID
        movieIds.append(freq[1])
        if count == 5:
            break
        count += 1
        

    
    return movieIds

In [894]:
def calculate_cosine_similarity(movie_id_1, movie_id_2, train_data):
    """
    Calculate the cosine similarity between two movies based on their feature vectors.

    """

    # Retrieve the feature vectors for the two movies
    movie_1_vector = train_data.loc[train_data['movieId'] == movie_id_1, 'feature_vector'].values[0]
    # print(movie_1_vector)
    movie_2_vector = train_data.loc[train_data['movieId'] == movie_id_2, 'feature_vector'].values[0]

    # Reshape the vectors to 2D arrays (required for cosine_similarity)
    movie_1_vector = movie_1_vector.reshape(1, -1)
    movie_2_vector = movie_2_vector.reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(movie_1_vector, movie_2_vector)[0][0]

    return similarity

In [895]:
from scipy.spatial.distance import euclidean

def calculate_euclidean_distance(movie_id_1, movie_id_2, train_data):
    """
    Calculate the Euclidean distance between two movies based on their feature vectors.

    Args:
        movie_id_1 (int): The ID of the first movie.
        movie_id_2 (int): The ID of the second movie.
        train_data (DataFrame): The DataFrame containing movie data with a 'feature_vector' column.

    Returns:
        float: The Euclidean distance between the two movies' feature vectors.
    """

    # Retrieve the feature vectors for the two movies
    movie_1_vector = train_data.loc[train_data['movieId'] == movie_id_1, 'feature_vector'].values[0]
    movie_2_vector = train_data.loc[train_data['movieId'] == movie_id_2, 'feature_vector'].values[0]

    # Calculate Euclidean distance
    distance = euclidean(movie_1_vector, movie_2_vector)

    return distance


In [896]:
def my_train_test_split(df, filt_data, n=7):
    # Create empty lists to store train and test data
    train_list = []
    test_list = []

    # Group by 'userID' to process each user individually
    
    merged_df = pd.merge(df, filt_data, on='movieId')
    grouped = merged_df.groupby('userId')

    for user, group in grouped:
        # Sort movies by rating in descending order
        sorted_group = group.sort_values(by='rating', ascending=False)
        
        # Select the top n rated movies for testing
        test = sorted_group.head(n)
        
        # Use the rest for training
        train = sorted_group.iloc[n:]
        
        # Append to respective lists
        test_list.append(test)
        train_list.append(train)

    # Combine all train and test splits into DataFrames
    train_data = pd.concat(train_list).reset_index(drop=True)
    test_data = pd.concat(test_list).reset_index(drop=True)

    print("Training Data:")
    print(train_data)
    print("\nTesting Data:")
    print(test_data)
    print(type(train_data))

    return train_data, test_data

In [897]:
train_data, test_data = my_train_test_split(rate, filtered_data,n=0)
train_data1, test_data1 = my_train_test_split(rate, filtered_data1,n=0)

print(f"traing_data query{len(train_data['feature_vector'][0])}")

Training Data:
       userId  movieId  rating   timestamp  \
0           1     1240     5.0   964983723   
1           1     2139     5.0   964982791   
2           1     2115     5.0   964982529   
3           1     2078     5.0   964982838   
4           1     2058     5.0   964982400   
...       ...      ...     ...         ...   
48282     610    96861     2.0  1493850474   
48283     610    69526     2.0  1493846153   
48284     610     6541     1.5  1493845480   
48285     610   120635     1.0  1493850489   
48286     610    68319     1.0  1493845505   

                                                   title  average_rating  \
0                                 Terminator, The (1984)             3.9   
1                             Secret of NIMH, The (1982)             3.5   
2            Indiana Jones and the Temple of Doom (1984)             3.6   
3                                Jungle Book, The (1967)             3.8   
4                                 Negotiator, The (1

In [898]:
# Filter the dataframe for that specific user
# print(type(train_data))
# print(train_data)
user_id = 10
user_data = train_data[train_data['userId'] == user_id]
user_data1 = train_data1[train_data1['userId'] == user_id]
# user_data1 = test_data[test_data['userId'] == 2]
# print(user_data)
# print(user_data1)

# test_knn()

In [899]:
user_watched_movies = user_data['movieId'].to_numpy()
# print(user_watched_movies)
# you can use either filtered_data1(new vect) or filtered_data(old vect)
movieIDs1 = test_knn(user_data1, filtered_data1, user_watched_movies, k=5)

print("\n ---------- Next old Filter --------- \n")

movieIDs = test_knn(user_data, filtered_data, user_watched_movies, k=5)

print("\n ---------- Test with Eucl --------- \n")
movieIDs2 = test_knn_eucl(user_data1, filtered_data1, user_watched_movies, k=5)

Maximum value: 7 with Key: Lady Jane (1986) and Movie ID: 6201
Lady Jane (1986): Frequency = 7, Movie ID = 6201
Wedding Banquet, The (Xi yan) (1993): Frequency = 7, Movie ID = 7023
Philadelphia Story, The (1940): Frequency = 7, Movie ID = 898
Harold and Maude (1971): Frequency = 7, Movie ID = 1235
Roman Holiday (1953): Frequency = 7, Movie ID = 916
Manhattan (1979): Frequency = 5, Movie ID = 1244

 ---------- Next old Filter --------- 

Maximum value: 7 with Key: Lady Jane (1986) and Movie ID: 6201
Lady Jane (1986): Frequency = 7, Movie ID = 6201
Wedding Banquet, The (Xi yan) (1993): Frequency = 7, Movie ID = 7023
Philadelphia Story, The (1940): Frequency = 7, Movie ID = 898
Harold and Maude (1971): Frequency = 7, Movie ID = 1235
Cyrano de Bergerac (1990): Frequency = 7, Movie ID = 1277
84 Charing Cross Road (1987): Frequency = 5, Movie ID = 5404

 ---------- Test with Eucl --------- 

Maximum value: 47 with Key: Begotten (1990) and Movie ID: 26717
Begotten (1990): Frequency = 47, Movi

In [None]:
# Gets the mean of the cosine_similarity between 
# the user movies and each recommended movies

user_test_movies = user_data1['movieId'].to_numpy()
print(len(user_test_movies))
arr = []

for x in movieIDs1:
    arr = []
    for y in user_test_movies:
        z = calculate_cosine_similarity(x, y, filtered_data1)
        arr.append(z)

    print(f"Recommended movie: {x} Similarity:{np.mean(arr)}")

47
Recommended movie: 6201 Similarity:0.39839186836495644
Recommended movie: 7023 Similarity:0.4806223190589474
Recommended movie: 898 Similarity:0.480669132888907
Recommended movie: 1235 Similarity:0.48013425250717184
Recommended movie: 916 Similarity:0.4813185625001533
Recommended movie: 1244 Similarity:0.48060386340138744


In [917]:
user_test_movies = user_data['movieId'].to_numpy()
print(len(user_test_movies))
arr = []

for x in movieIDs:
    arr = []
    for y in user_test_movies:
        z = calculate_cosine_similarity(x, y, filtered_data)
        arr.append(z)

    print(f"Recommended movie: {x} Similarity:{np.mean(arr)}")

47
Recommended movie: 6201 Similarity:0.3923577293777858
Recommended movie: 7023 Similarity:0.47776737176586076
Recommended movie: 898 Similarity:0.4762643815790968
Recommended movie: 1235 Similarity:0.475092014034493
Recommended movie: 1277 Similarity:0.47776737176586076
Recommended movie: 5404 Similarity:0.3929433887821629


In [915]:
user_test_movies = user_data1['movieId'].to_numpy()
print(len(user_test_movies))
arr = []

for x in movieIDs2:
    arr = []
    for y in user_test_movies:
        z = calculate_euclidean_distance(x, y, filtered_data1)
        arr.append(z)

    print(f"Recommended movie: {x} Distance:{np.mean(arr)}")

47
Recommended movie: 26717 Distance:1.9399781169431087
Recommended movie: 4138 Distance:2.0053436033735355
Recommended movie: 4204 Distance:1.7207408841173932
Recommended movie: 1389 Distance:2.1163123216192528
Recommended movie: 1995 Distance:2.1718821824424297
Recommended movie: 5213 Distance:1.7200561993144474


In [903]:
def evaluate_recommendations(user_data, recommended_movie_ids, filtered_data, similarity_threshold=0.8):
    """
    Evaluates recommendation accuracy using cosine similarity and user ratings.
    For movies that have a rating of 4 or 5 by the user we will want our 
    recommendation to have a cosine_similarity between recommended and that 
    movie, to be above the cosine_similarity threshold we defined. if we have a 
    user rated movie below 2, but we have a the cosine_similarity above the 
    threshold that will be our false positive.

    Args:
        user_data (DataFrame): Data for a single user, with columns 'movieId' and 'rating'.
        recommended_movie_ids (list): List of movieIds recommended to the user.
        filtered_data (DataFrame): Full dataset with feature vectors for movies.
        similarity_threshold (float): The cosine similarity threshold.

    Returns:
        dict: Accuracy, precision, recall, and other metrics.
    """
    # User's watched movies and their ratings
    user_test_movies = user_data['movieId'].to_numpy()
    user_ratings = user_data.set_index('movieId')['rating'].to_dict()

    # Initialize metrics
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0

    for recommended_movie in recommended_movie_ids:
        # Calculate similarities for all user test movies
        similarities = [
            calculate_cosine_similarity(recommended_movie, test_movie, filtered_data)
            for test_movie in user_test_movies
        ]

        # Use the mean similarity for this recommendation
        mean_similarity = np.mean(similarities)

        # Check if the recommended movie is similar above the threshold
        is_similar = mean_similarity >= similarity_threshold

        # Determine if this is TP, FP, TN, or FN
        for test_movie in user_test_movies:
            user_rating = user_ratings[test_movie]

            if user_rating >= 3:  # User likes the movie
                if is_similar:
                    true_positives += 1
                else:
                    false_negatives += 1
            elif user_rating <= 2:  # User dislikes the movie
                if is_similar:
                    false_positives += 1
                else:
                    true_negatives += 1

    # Calculate metrics
    total = true_positives + false_positives + true_negatives + false_negatives
    accuracy = (true_positives + true_negatives) / total if total > 0 else 0
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "true_negatives": true_negatives,
        "false_negatives": false_negatives,
    }


# Example Usage
user_test_movies = user_data1['movieId'].to_numpy()
recommended_movie_ids = movieIDs1  # Replace with your list of recommended movies
similarity_threshold = 0.35

metrics = evaluate_recommendations(user_data1, recommended_movie_ids, filtered_data1, similarity_threshold)

print(f"Accuracy: {metrics['accuracy']:.2f}")
print(f"Precision: {metrics['precision']:.2f}")
print(f"Recall: {metrics['recall']:.2f}")
print(f"True Positives: {metrics['true_positives']}")
print(f"False Positives: {metrics['false_positives']}")
print(f"True Negatives: {metrics['true_negatives']}")
print(f"False Negatives: {metrics['false_negatives']}")


Accuracy: 0.76
Precision: 0.76
Recall: 1.00
True Positives: 204
False Positives: 66
True Negatives: 0
False Negatives: 0


In [910]:
calculate_euclidean_distance(29, 32, filtered_data1)

2.003223528191637