### I used the repository linked below as a reference for implementing item-based collaborative filtering. The Python code is largely unchanged apart from a few minor tweaks. For this research paper, I additionally derived ground-truth labels directly from the original dataset, based on the conditions discussed in the paper (here: movies rated 3 or higher by the target user's most similar users), so that ranking metrics such as MAP and NDCG can be evaluated.

[Collaborative filtering based recommender system](https://github.com/yjeong5126/movie_recommender/tree/master/item_based_collaborative_filtering)

In [None]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import math

In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV files read below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read only the columns that are needed from each CSV.
ratings = pd.read_csv(
    '/content/drive/MyDrive/ml-latest-small/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
movies = pd.read_csv(
    '/content/drive/MyDrive/ml-latest-small/movies.csv',
    usecols=['movieId', 'title'],
)

# NOTE: no sampling is applied here. When running locally against the 25M-record
# dataset, sample `ratings` before the merge below.
# Attach the movie title to every rating (inner join on movieId).
ratings = ratings.merge(movies, how='inner', on='movieId')

# Item-by-user matrix: one row per title, one column per userId.
# Missing (title, user) pairs are filled with 0, which the rest of the notebook
# treats as "not rated".
df = pd.pivot_table(ratings, index='title', columns='userId', values='rating').fillna(0)

# Independent copy of the matrix; movie_recommender writes predicted ratings into it
# while `df` keeps the observed ratings.
df1 = df.copy()


In [None]:
def recommend_movies(user, num_recommended_movies, ratings_matrix=None, predictions_matrix=None):
  """Return the titles of the top-N unrated movies for `user`, best prediction first.

  Args:
    user: Column label (userId) to recommend for.
    num_recommended_movies: Maximum number of titles to return.
    ratings_matrix: Title-by-user frame of observed ratings where 0 means
      "not rated". Defaults to the notebook-level `df`.
    predictions_matrix: Frame with the same row order as `ratings_matrix`
      holding predicted ratings. Defaults to the notebook-level `df1`.

  Returns:
    A list of titles (row labels of `ratings_matrix`) sorted by predicted rating
    in descending order; ties keep their original row order.

  Raises:
    KeyError: if `user` is not a column of `ratings_matrix`.
    ValueError: if `user` is not a column of `predictions_matrix`.
  """
  if ratings_matrix is None:
    ratings_matrix = df
  if predictions_matrix is None:
    predictions_matrix = df1

  # Rows the user has not rated yet (0 is the "not rated" marker).
  unrated_mask = (ratings_matrix[user] == 0).to_numpy()
  unrated_titles = ratings_matrix.index[unrated_mask]

  # Resolve the user's column position once instead of once per movie.
  user_position = list(predictions_matrix.columns).index(user)
  # Rows are matched by position, so both frames must share the same row order.
  predicted_ratings = predictions_matrix.iloc[:, user_position].to_numpy()[unrated_mask]

  # sorted() is stable, so equal predictions keep their row order.
  ranked = sorted(zip(unrated_titles, predicted_ratings), key=lambda pair: pair[1], reverse=True)

  return [title for title, _ in ranked[:num_recommended_movies]]

In [None]:
def movie_recommender(user, num_neighbors, num_recommendation):
  """Predict ratings for every movie `user` has not rated and return the top titles.

  For each unrated movie, the prediction is the similarity-weighted average of the
  user's ratings on that movie's nearest neighbours (cosine similarity between
  movie rows of `df`), using only neighbours the user has actually rated.
  Predictions are written into the notebook-level `df1`; the final ranking is
  delegated to `recommend_movies`.

  Args:
    user: Column label (userId) in `df`.
    num_neighbors: Number of neighbours requested from the KNN model per movie
      (the movie itself is dropped from its own neighbour list).
    num_recommendation: Number of titles to return.

  Returns:
    The list produced by `recommend_movies(user, num_recommendation)`.
  """
  model = NearestNeighbors(metric='cosine', algorithm='brute')
  model.fit(df.values)
  distances, indices = model.kneighbors(df.values, n_neighbors=num_neighbors)

  user_col = df.columns.tolist().index(user)

  for row in range(len(df.index)):
    # Only movies the user has not rated (0) get a prediction.
    if df.iloc[row, user_col] != 0:
      continue

    neighbor_rows = indices[row].tolist()
    neighbor_dists = distances[row].tolist()

    if row in neighbor_rows:
      # Drop the movie itself from its own neighbour list.
      self_pos = neighbor_rows.index(row)
      del neighbor_rows[self_pos]
      del neighbor_dists[self_pos]
    else:
      # The movie is not among its own neighbours; trim to keep the list length consistent.
      neighbor_rows = neighbor_rows[:num_neighbors - 1]
      neighbor_dists = neighbor_dists[:num_neighbors - 1]

    weighted_sum = 0
    rated_similarities = []
    for neighbor_row, dist in zip(neighbor_rows, neighbor_dists):
      similarity = 1 - dist
      neighbor_rating = df.iloc[neighbor_row, user_col]
      if neighbor_rating == 0:
        # Neighbours the user has not rated contribute to neither sum.
        continue
      rated_similarities.append(similarity)
      weighted_sum = weighted_sum + similarity * neighbor_rating

    similarity_total = sum(rated_similarities)
    if similarity_total > 0:
      predicted_r = weighted_sum / similarity_total
    else:
      # No rated neighbours (or zero total similarity): no basis for a prediction.
      predicted_r = 0

    df1.iloc[row, user_col] = predicted_r

  return recommend_movies(user, num_recommendation)

In [None]:
# Top-200 recommendations for user 1 with a neighbourhood size of 200.
recommended_movies = movie_recommender(user=1, num_neighbors=200, num_recommendation=200)

In [None]:
# Recommended titles, highest predicted rating first.
print(recommended_movies)

In [None]:
# Build the title -> movieId lookup once instead of scanning `movies` for every title.
# keep='first' matches the earlier behaviour of taking the first movieId when a
# title occurs more than once.
title_to_movie_id = movies.drop_duplicates(subset='title', keep='first').set_index('title')['movieId']

# movie_ids keeps the order of recommended_movies (best recommendation first).
# int() turns the NumPy integers into plain Python ints for cleaner printing.
movie_ids = [int(title_to_movie_id[movie_title]) for movie_title in recommended_movies]

# ID-sorted copy for display only. NOTE: sorting by movieId discards the ranking,
# so rank-sensitive metrics should be computed on `movie_ids` instead.
sorted_movie_ids_recommended = sorted(movie_ids)

# Print the sorted list of movie IDs
print(sorted_movie_ids_recommended)

[13, 61, 125, 154, 228, 229, 422, 436, 932, 955, 971, 1126, 1140, 1216, 1251, 1277, 1284, 1483, 1519, 1574, 1649, 1701, 2531, 2551, 2563, 2607, 2624, 2696, 2726, 2730, 2731, 2765, 2772, 2896, 2940, 2971, 3019, 3022, 3045, 3106, 3198, 3323, 3429, 3615, 3707, 3724, 3761, 3783, 3852, 3858, 3862, 3899, 3901, 3914, 3946, 4032, 4254, 4255, 4256, 4307, 4322, 4349, 4350, 4381, 4477, 4577, 4677, 4959, 5011, 5062, 5155, 5267, 5293, 5347, 5417, 5423, 5425, 5477, 5538, 5539, 5548, 5608, 5680, 5723, 5745, 5818, 5884, 6335, 6551, 6558, 6572, 6588, 6688, 6731, 6853, 6953, 6969, 6971, 7065, 7264, 7371, 7380, 7448, 7541, 7841, 8781, 8981, 25782, 26680, 26717, 27317, 27611, 27830, 30812, 31410, 33201, 33660, 33779, 34323, 34530, 36477, 37741, 42015, 44197, 44225, 44633, 44665, 44840, 45728, 46337, 48142, 50802, 54997, 55830, 56174, 57502, 62718, 63239, 63540, 63853, 64957, 67534, 68358, 68959, 70286, 72226, 72919, 73488, 74688, 74789, 77846, 78264, 78349, 81591, 82152, 84847, 84952, 88129, 88140, 89190,

In [None]:

# Reload the raw ratings (all columns); this replaces the merged `ratings` frame built earlier.
ratings = pd.read_csv('/content/drive/MyDrive/ml-latest-small/ratings.csv')

# Keep only "liked" interactions: ratings of 3 or higher.
positive_ratings = ratings.loc[ratings['rating'] >= 3]

# User-by-movie matrix of positive ratings; 0 where the user has no positive rating.
user_item_matrix = pd.pivot_table(
    positive_ratings,
    index='userId',
    columns='movieId',
    values='rating',
    fill_value=0,
)

# Brute-force cosine nearest-neighbour model over the user rows (fit returns the model itself).
knn = NearestNeighbors(metric='cosine', algorithm='brute').fit(user_item_matrix)

# One row per user: userId plus the list of movieIds that user rated positively.
ground_truth = (
    positive_ratings
    .groupby('userId')['movieId']
    .apply(list)
    .reset_index()
    .rename(columns={'movieId': 'liked_movies'})
)

# Function to find similar users and their liked movies
def find_similar_users(user_id, num_neighbors=5):
    """Return the liked-movie lists of the users most similar to `user_id`.

    Similarity is cosine distance between rows of `user_item_matrix`, queried
    through the fitted notebook-level `knn` model.

    Args:
        user_id: userId to look up; must have at least one positive rating.
        num_neighbors: Number of similar users to use (fewer are returned if the
            matrix does not contain that many other users).

    Returns:
        A list of lists of movieIds, one inner list per similar user, in the row
        order of `ground_truth` (not in order of similarity).

    Raises:
        IndexError: if `user_id` has no row in `user_item_matrix`.
    """
    # Locate the row in the matrix that is actually queried, rather than relying on
    # `ground_truth` happening to share the same row order.
    positions = np.flatnonzero(user_item_matrix.index == user_id)
    if positions.size == 0:
        raise IndexError("user {} has no positive ratings".format(user_id))
    user_vector = user_item_matrix.iloc[positions[0]].values.reshape(1, -1)

    # Ask for one extra neighbour because the user is normally their own nearest
    # neighbour; never ask for more rows than the model was fitted on.
    n_query = min(num_neighbors + 1, len(user_item_matrix))
    _, indices = knn.kneighbors(user_vector, n_neighbors=n_query)

    # Exclude the user by id (not by assuming position 0), then cap the count.
    neighbor_ids = user_item_matrix.index[indices.flatten()]
    similar_users_ids = [uid for uid in neighbor_ids if uid != user_id][:num_neighbors]

    # Get liked movies of similar users
    similar_users_liked_movies = ground_truth[ground_truth['userId'].isin(similar_users_ids)]['liked_movies'].tolist()

    return similar_users_liked_movies

# Example usage: ground truth for user 1 is taken from that user's most similar users.
user_id = 1
similar_users_liked_movies = find_similar_users(user_id)

# Union of all movies liked by the similar users (duplicates removed).
liked_union = {movie for user_movies in similar_users_liked_movies for movie in user_movies}

# Construct ground truth table (a single row for `user_id`)
ground_truth_table = {'userId': [user_id], 'liked_movies': [list(liked_union)]}
ground_truth_df = pd.DataFrame(ground_truth_table)

# The table has exactly one row, so sorting that row's list gives the full sorted ID list.
all_movie_ids_sorted = sorted(ground_truth_df['liked_movies'].iloc[0])

# Print the sorted list of movie IDs
print(all_movie_ids_sorted)


[1, 2, 3, 5, 6, 7, 10, 11, 16, 19, 21, 22, 29, 32, 36, 39, 41, 47, 50, 52, 62, 65, 69, 70, 81, 88, 95, 104, 107, 110, 111, 112, 141, 145, 150, 151, 161, 162, 163, 164, 165, 171, 172, 173, 180, 185, 186, 196, 198, 208, 216, 223, 224, 225, 230, 231, 234, 235, 252, 253, 257, 260, 261, 266, 272, 288, 292, 293, 296, 305, 315, 316, 317, 329, 333, 338, 344, 349, 350, 353, 356, 357, 362, 364, 367, 368, 370, 376, 377, 380, 384, 410, 413, 421, 426, 428, 431, 434, 435, 440, 441, 442, 454, 457, 459, 464, 466, 471, 474, 480, 481, 482, 485, 490, 493, 494, 497, 500, 514, 515, 517, 520, 521, 527, 533, 537, 539, 541, 543, 544, 551, 552, 553, 555, 586, 587, 588, 589, 590, 592, 593, 594, 595, 596, 597, 606, 608, 610, 628, 647, 648, 653, 671, 674, 678, 720, 728, 733, 736, 741, 745, 750, 765, 778, 780, 781, 783, 784, 785, 786, 799, 802, 832, 838, 849, 852, 858, 861, 866, 880, 891, 892, 897, 899, 903, 904, 908, 911, 912, 913, 915, 919, 920, 922, 923, 924, 933, 934, 940, 942, 951, 955, 969, 991, 994, 999, 10

In [None]:
# Alias (same list object, not a copy) used as the ground-truth argument of the metric functions.
all_movie_ids_sorted_ground_truth = all_movie_ids_sorted

In [None]:
def calculate_map(recommended_movies, ground_truth_df):
    """Average precision of one ranked recommendation list against a set of relevant items.

    Precision is accumulated at every rank where a relevant item appears, and the
    sum is divided by min(len(recommended_movies), len(ground_truth_df)).
    The result depends on the order of `recommended_movies`.

    Args:
        recommended_movies: Recommended item ids, best first.
        ground_truth_df: Collection of relevant item ids (despite the name, a plain
            list of ids is what the membership test expects).

    Returns:
        The score as a float; 0.0 when either input is empty.
    """
    # Normaliser used by the original formula; guard it so empty input yields 0.0
    # instead of a ZeroDivisionError.
    normaliser = min(len(recommended_movies), len(ground_truth_df))
    if normaliser == 0:
        return 0.0

    # Set membership is O(1) per lookup instead of scanning the list each time.
    relevant = set(ground_truth_df)

    num_correct = 0
    precision_sum = 0.0
    for rank, movie in enumerate(recommended_movies, start=1):
        if movie in relevant:
            num_correct += 1
            precision_sum += num_correct / rank  # precision at this hit's rank

    return precision_sum / normaliser

# Example usage:
# MAP is rank-sensitive, so score the IDs in recommendation order (`movie_ids`).
# The ID-sorted list only reflects movieId order and carries no ranking information.
map_score = calculate_map(movie_ids, all_movie_ids_sorted_ground_truth)
# The value is printed as a percentage (score * 100).
print("MAP Score: {:.4f}".format(map_score*100))


MAP Score: 0.8628


In [None]:

def calculate_ndcg(recommended_movies, ground_truth_df, top_k=None):
    """Binary-relevance NDCG@k of a ranked recommendation list.

    Each recommended item found in `ground_truth_df` has relevance 1 and contributes
    1 / log2(position + 1) (positions counted from 1) to the DCG. The ideal DCG
    assumes min(len(ground_truth_df), top_k) relevant items at the top of the list.

    Args:
        recommended_movies: Recommended item ids, best first.
        ground_truth_df: Collection of relevant item ids (a plain list of ids is
            what the membership test expects).
        top_k: Cut-off rank; defaults to the full length of `recommended_movies`.

    Returns:
        DCG / IDCG as a float, or 0.0 when the ideal DCG is 0.
    """
    # If top_k is not specified, use the length of recommended movies
    if top_k is None:
        top_k = len(recommended_movies)

    # Set membership is O(1) per lookup instead of scanning the list each time.
    relevant = set(ground_truth_df)

    # Discounted Cumulative Gain; with relevance 1 the gain 2**1 - 1 is simply 1.
    dcg = 0.0
    for i, movie in enumerate(recommended_movies[:top_k]):
        if movie in relevant:
            dcg += 1 / math.log2(i + 2)

    # Ideal DCG for normalization
    idcg = sum(1 / (math.log2(i + 2)) for i in range(min(len(ground_truth_df), top_k)))

    if idcg == 0:
        return 0.0
    return dcg / idcg

# Example usage:
# NDCG@10 is rank-sensitive, so score the IDs in recommendation order (`movie_ids`).
# The ID-sorted list would make "top 10" mean the ten smallest movieIds.
ndcg_score = calculate_ndcg(movie_ids, all_movie_ids_sorted_ground_truth, top_k=10)
print("NDCG Score: {:.4f}".format(ndcg_score))


NDCG Score: 0.0636
