In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

Some tests of the item-based collaboration filtering algorithm to help debugging and optimising.

In [2]:
metadata = pd.read_json(r'F:\DS_Dataset\genome_2021\movie_dataset_public_final\raw\metadata.json', lines=True)

In [3]:
ratings = pd.read_csv(r'F:\DS_Dataset\final\user-based\ratings.csv')
ratings

Unnamed: 0,item_id,user_id,rating
0,1,0,3.5
1,1,19,5.0
2,1,24,3.0
3,1,25,3.0
4,1,42,5.0
...,...,...,...
28249186,237107,672530,4.5
28249187,237127,619061,3.0
28249188,237129,594170,4.0
28249189,237131,395134,2.5


In [4]:
user_avg_rating = pd.read_csv(r'F:\DS_Dataset\final\user-based\user_avg_rating.csv')
user_avg_rating

Unnamed: 0,user_id,rating
0,0,3.738636
1,7,3.066667
2,11,4.200000
3,19,3.592727
4,24,3.397177
...,...,...
247378,999984,2.250000
247379,999985,3.519685
247380,999991,2.946078
247381,999994,3.166667


In [5]:
def recommend_movies(input_df, ratings_df, user_avg_rating, n=20):
    """
    Recommend movies to a user by combining nearest neighbor finding, rating prediction, 
    and sorting by predicted ratings, with a limit of top 10 movies per neighbor.

    Parameters:
    input_df (DataFrame): Ratings of movies by the user.
    ratings_df (DataFrame): DataFrame containing movie ratings in the database.
    top10_movies_by_user (DataFrame): DataFrame containing top 10 rating movies by users.
    user_avg_rating (DataFrame): DataFrame containing average rating of each user.
    n (int): Number of top nearest neighbours to consider.

    Returns:
    DataFrame: Movies recommended to the user, sorted by predicted ratings.
    """
    
    # Find Nearest Neighbours
    watched_movies = input_df['item_id'].unique()
    relevant_ratings = ratings_df[ratings_df['item_id'].isin(watched_movies)]
    user_item_matrix = relevant_ratings.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)
    user_profile = input_df.set_index('item_id').reindex(user_item_matrix.columns).fillna(0).T
    similarity = cosine_similarity(user_profile, user_item_matrix)
    similarity_df = pd.DataFrame(similarity.T, index=user_item_matrix.index, columns=['cos_sim'])
    top_neighbours = similarity_df.sort_values(by='cos_sim', ascending=False).head(n).reset_index()

    # Limit to Top N Movies for Each Neighbor
    top_movies_by_neighbour = {}
    for neighbor_id in top_neighbours['user_id']:
        top_movies = ratings_df[ratings_df['user_id'] == neighbor_id].sort_values(by='rating', ascending=False).head(10)['item_id'].tolist()
        top_movies_by_neighbour[neighbor_id] = top_movies

    # Aggregate Movies to Predict
    movies_to_predict = set()
    for movies in top_movies_by_neighbour.values():
        movies_to_predict.update(movies)
    movies_to_predict -= set(input_df['item_id'])

    # Directly Predict Ratings
    user_avg_ratings_dict = user_avg_rating.set_index('user_id')['rating'].to_dict()
    
    predictions = []
    user_avg_rating = input_df['rating'].mean()  
    for movie_id in movies_to_predict:
        numerator, denominator = 0, 0
        for _, row in top_neighbours.iterrows():
            neighbor_id = row['user_id']
            if movie_id in top_movies_by_neighbour[neighbor_id]:
                cos_sim = row['cos_sim']
                neighbor_ratings = ratings_df.loc[(ratings_df['user_id'] == neighbor_id) & (ratings_df['item_id'] == movie_id), 'rating']
                if not neighbor_ratings.empty:
                    neighbor_rating = neighbor_ratings.iloc[0]
                    neighbor_avg_rating = user_avg_ratings_dict.get(neighbor_id, 0) 
                    numerator += cos_sim * (neighbor_rating - neighbor_avg_rating)
                    denominator += abs(cos_sim)

        # Calculate Predicted Rating
        pred_rating = user_avg_rating if denominator == 0 else user_avg_rating + (numerator / denominator)
        predictions.append((movie_id, pred_rating))

    # Sort and Return Recommendations
    recommendations_df = pd.DataFrame(predictions, columns=['item_id', 'pred_rating'])
    return recommendations_df.sort_values(by='pred_rating', ascending=False)

## Example one: super hero fan

In [6]:
metadata[metadata['item_id']==58559]

Unnamed: 0,title,directedBy,starring,dateAdded,avgRating,imdbId,item_id
12563,"Dark Knight, The (2008)",Christopher Nolan,"Christian Bale, Aaron Eckhart, Heath Ledger, M...",2008-07-03T21:01:22,4.18379,468569,58559


In [7]:
metadata[metadata['item_id']==5349]

Unnamed: 0,title,directedBy,starring,dateAdded,avgRating,imdbId,item_id
5253,Spider-Man (2002),Sam Raimi,"Tobey Maguire, Kirsten Dunst, James Franco, Wi...",,3.47373,145487,5349


In [8]:
metadata[metadata['item_id']==46530]

Unnamed: 0,title,directedBy,starring,dateAdded,avgRating,imdbId,item_id
11139,Superman Returns (2006),Bryan Singer,"Brandon Routh, Kate Bosworth, Kevin Spacey, Ja...",2006-06-27T07:09:05,2.89454,348150,46530


In [9]:
metadata[metadata['item_id']==59315]

Unnamed: 0,title,directedBy,starring,dateAdded,avgRating,imdbId,item_id
12671,Iron Man (2008),Jon Favreau,"Robert Downey Jr., Terrence Howard, Jeff Bridg...",2008-05-02T13:39:43,3.83428,371746,59315


In [10]:
metadata[metadata['item_id']==122912]

Unnamed: 0,title,directedBy,starring,dateAdded,avgRating,imdbId,item_id
27907,Avengers: Infinity War - Part I (2018),,"Robert Downey Jr.,Chris Evans,Josh Brolin,Samu...",2015-01-13T09:40:37,3.93974,4154756,122912


In [11]:
data = [
  {'item_id': 58559, 'rating': 4},
  {'item_id': 122912, 'rating': 5},
  {'item_id': 5349, 'rating': 5},
  {'item_id': 46530, 'rating': 3},
  {'item_id': 59315, 'rating': 5}
]
input_example = pd.DataFrame(data)
input_example

Unnamed: 0,item_id,rating
0,58559,4
1,122912,5
2,5349,5
3,46530,3
4,59315,5


In [12]:
import time

In [13]:
start_time = time.time()
movie_recommended_example1 = recommend_movies(input_example, ratings, user_avg_rating, n=20)
end_time = time.time()
end_time - start_time

9.894152641296387

In [14]:
movie_recommended_example1 = movie_recommended_example1.merge(metadata[['item_id', 'title']], on='item_id', how='left')
movie_recommended_example1

Unnamed: 0,item_id,pred_rating,title
0,1278,6.769853,Young Frankenstein (1974)
1,33493,6.769853,Star Wars: Episode III - Revenge of the Sith (...
2,143347,6.769853,Aquaman (2018)
3,106918,6.769853,"Secret Life of Walter Mitty, The (2013)"
4,6377,6.769853,Finding Nemo (2003)
...,...,...,...
175,54503,5.455118,Superbad (2007)
176,53996,5.455118,Transformers (2007)
177,69526,5.455118,Transformers: Revenge of the Fallen (2009)
178,72998,5.455118,Avatar (2009)
