In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import re

# User ratings from letterboxd csv 
ratings = pd.read_csv("ratings.csv", usecols=['Name', 'Rating', 'Year'])
#ratings = ratings.drop(columns=['Date', 'Letterboxd URI'])

# Collection of movie titles with their genres
movies = pd.read_csv("movies.csv")
# Remove (year) from titles
movies[['title', 'year']] = movies['title'].str.extract(r'^(.*)\s\((\d{4})\)$')
movies['year'] = pd.to_numeric(movies['year'], errors='coerce')
movies['year'] = movies['year'].fillna(0).astype(int)
# Remove Nan titles
movies = movies.dropna(subset=['title']).reset_index(drop=True)
movies = movies.drop_duplicates(subset=['title', 'year']).reset_index(drop=True)

movies['title_clean'] = movies['title'].str.replace(r'\s*\(.*\)\s*$', '', regex=True)
mask = movies['title_clean'].str.endswith(', The')
movies.loc[mask, 'title_clean'] = 'The ' + movies.loc[mask, 'title_clean'].str[:-5]
movies['title'] = movies['title_clean']
movies = movies.drop(columns=['title_clean'])

# Colletion of ratings to be used in the correlation matrix.
all_ratings = pd.read_csv("allratings.csv")
all_ratings = all_ratings.drop(columns=['timestamp'])
all_ratings = all_ratings.dropna(subset=['userId', 'movieId', 'rating'])
# Merge movies into all_ratings to have title, genres, and year
all_ratings = pd.merge(all_ratings, movies)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# List of genres
genres = set()

for genre in all_ratings['genres']:
    gs = genre.split('|')
    for g in gs:
        genres.add(g)

print(sorted(genres))

['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [5]:
# Year Range of dataset
low = 2026
high = 0

for year in all_ratings['year']:
    low = min(low, year)
    high = max(high, year)

print(f"Low: {low}, High: {high}")

Low: 1874, High: 2023


In [2]:
# Limit the included movies to movies with a minimum number of ratings
ratings_per_movie = all_ratings.groupby('movieId').size()
accepted_movies = ratings_per_movie[ratings_per_movie >= 100].index
filtered_ratings = all_ratings[all_ratings['movieId'].isin(accepted_movies)]
num_movies = filtered_ratings['movieId'].nunique()
print(f"Movies: {num_movies}")

Movies: 12145


In [33]:
from scipy.sparse import coo_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

user_ids = filtered_ratings['userId'].astype('category').cat.codes
movie_ids = filtered_ratings['movieId'].astype('category').cat.codes
rating_values = filtered_ratings['rating']

sparse_matrix = coo_matrix((rating_values, (user_ids, movie_ids))).tocsr()
movie_index_to_id = dict(enumerate(filtered_ratings['movieId'].astype('category').cat.categories))
movie_id_to_index = {v: k for k, v in movie_index_to_id.items()}

def kMostSimilar(sparse_matrix, k):
    norm_mat = normalize(sparse_matrix.T, norm='l2', axis=1)
    model = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='ball_tree', n_jobs=-1)
    model.fit(norm_mat)
    dists, inds = model.kneighbors(norm_mat)
    sims = 1 - (dists**2 / 2)
    return inds, sims

topk_inds, topk_sims = kMostSimilar(sparse_matrix, k=50)
global_mean = filtered_ratings['rating'].mean()
movie_means = filtered_ratings.groupby('movieId')['rating'].mean().to_dict()



In [38]:
from scipy.sparse import coo_matrix
movie_to_info = None

def calculate_similarity(sparse_matrix):
    # Calculate the cosine similarity between movies. Transpose the matrix to find item-item similarity instead of user-user
    similarity = cosine_similarity(sparse_matrix.T, dense_output=False)
    
    # Calculate the signifance of the correlation based on the number of ratings. i.e more ratings should mean more impact on the correlation calculation
    n_ratings = (sparse_matrix != 0).sum(axis=0)
    min_ratings = 50
    significance_matrix = n_ratings.T.dot(n_ratings) / (n_ratings.T.dot(n_ratings) + min_ratings)
    
    return similarity.multiply(significance_matrix)

# Make a matrix of all movie correlations
def make_matrix(filtered_ratings):    
    global movie_to_info
    user_ids = filtered_ratings['userId'].astype('category').cat.codes
    movie_ids = filtered_ratings['movieId'].astype('category').cat.codes
    rating_values = filtered_ratings['rating']
    
    # Sparse matrix made with coo_matrix to include only non 0 values, therefore saving space and time
    sparse_matrix = coo_matrix((rating_values, (user_ids, movie_ids))).tocsr()
    
    movie_similarity = calculate_similarity(sparse_matrix)
    
    movies = filtered_ratings['movieId'].astype('category').cat.categories
    movie_similarity_df = pd.DataFrame(
        movie_similarity.toarray(),
        index=movies,
        columns=movies
    )

    movie_to_info = filtered_ratings[['movieId', 'title', 'genres', 'year']].drop_duplicates().set_index('movieId')
    
    return movie_similarity_df

sparse_matrix = make_matrix(filtered_ratings)
global_mean = filtered_ratings['rating'].mean()
movie_means = filtered_ratings.groupby('movieId')['rating'].mean().to_dict()
print(sparse_matrix.shape)

(15960, 15960)


In [34]:
def map_user_ratings_to_movieids(user_df, filtered_ratings):
    # Create a lookup of title+year to movieId for exact matches
    title_year_to_id = filtered_ratings.groupby(['title', 'year'])['movieId'].first().to_dict()
    
    mapped_ratings = []
    not_found = []
    
    for _, row in user_df.iterrows():
        user_title = row['Name']
        user_year = row.get('Year', None)
        
        # Try exact match first with title+year
        if user_year and (user_title, user_year) in title_year_to_id:
            movie_id = title_year_to_id[(user_title, user_year)]
            mapped_ratings.append({
                'movieId': movie_id,
                'Rating': row['Rating'],
                'original_title': user_title
            })
            continue
        
        else:
            not_found.append((user_title, user_year))

    mapped_df = pd.DataFrame(mapped_ratings)
    
    return mapped_df, not_found

# Map user ratings to movieIds
ratings_with_ids, not_found = map_user_ratings_to_movieids(ratings, filtered_ratings)


In [45]:
print(not_found)
print(f"Percentage missing: {len(not_found) / len(ratings)}")

[('Glass Onion', 2022), ('Avengers: Infinity War', 2018), ('Black Panther', 2018), ('Spider-Man: Homecoming', 2017), ("Harry Potter and the Philosopher's Stone", 2001), ('Guardians of the Galaxy Vol. 2', 2017), ('A Silent Voice: The Movie', 2016), ('Captain Marvel', 2019), ('Pokémon Detective Pikachu', 2019), ('Entergalactic', 2022), ('Star Wars: Episode I – The Phantom Menace', 1999), ('Devilman Crybaby', 2018), ('Demon Slayer -Kimetsu no Yaiba- The Movie: Mugen Train', 2020), ('Cyberpunk: Edgerunners', 2022), ('The Last: Naruto the Movie', 2014), ('Your Lie in April', 2014), ('Hotarubi no Mori e', 2011), ('AnoHana: The Flower We Saw That Day', 2011), ('Words Bubble Up Like Soda Pop', 2020), ('Violet Evergarden: Eternity and the Auto Memory Doll', 2019), ('Bubble', 2022), ('Violet Evergarden: The Movie', 2020), ('Demon Slayer: Kimetsu no Yaiba', 2019), ('Fireworks', 2017), ('Tokyo Ghoul', 2014), ('Bakemonogatari', 2009), ('Kotaro Lives Alone', 2022), ('Flowers of Evil', 2013), ('Great

In [37]:
from rapidfuzz import fuzz, process
titles = filtered_ratings["title"]

bestMatch = process.extract("Harry Potter and the sorcerer's stone", titles, scorer=fuzz.ratio)
print(bestMatch)

[('Harry Potter and the Prisoner of Azkaban', 72.72727272727273, 898), ('Harry Potter and the Prisoner of Azkaban', 72.72727272727273, 2112), ('Harry Potter and the Prisoner of Azkaban', 72.72727272727273, 2499), ('Harry Potter and the Prisoner of Azkaban', 72.72727272727273, 4549), ('Harry Potter and the Prisoner of Azkaban', 72.72727272727273, 7486)]


In [35]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
def find_expected_rating(movie_id, user_ratings, sparse_matrix, topk_inds, topk_sims, movie_means, global_mean, movie_id_to_index, threshold=0.2):
    # if the movie isn't in the dataframe it has no correlations and is therefore useless and should be ignored
    if movie_id not in movie_id_to_index:
        return None
    
    user_mean = user_ratings['Rating'].mean()
    user_bias = user_mean - global_mean

    target_ind = movie_id_to_index[movie_id]
    neigh_inds = topk_inds[target_ind]
    neigh_sims = topk_sims[target_ind]

    valid = [movie_index_to_id[idx] in user_ratings['movieId'].values for idx in neigh_inds]
    if sum(valid) == 0:
        return movie_means.get(movie_id, global_mean) + user_bias
    
    valid_inds = neigh_inds[valid]
    valid_sims = neigh_sims[valid]
    valid_movie_ids = [movie_index_to_id[ind] for ind in valid_inds]

    valid_sims = np.array(valid_sims)
    valid_movie_ids = np.array(valid_movie_ids)
    mask_thresh = valid_sims > threshold
    if mask_thresh.sum() == 0:
        return movie_means.get(movie_id, global_mean) + user_bias
    
    valid_sims = valid_sims[mask_thresh]
    valid_movie_ids = valid_movie_ids[mask_thresh]

    user_ratings_dict = dict(zip(user_ratings['movieId'], user_ratings['Rating']))
    numerator = sum((sim**2) * (user_ratings_dict[m] - movie_means.get(m, global_mean)) for sim, m in zip(valid_sims, valid_movie_ids))
    denominator = sum(sim**2 for sim in valid_sims)
    prediction = movie_means.get(movie_id, global_mean) + numerator/denominator + user_bias
    return max(0.0, min(5.0, prediction))

In [36]:
# Compare each actual rating with the expected rating to evaluate the accuracy of the model
def evaluate_predictions(train_ratings, test_ratings, sparse_matrix, topk_inds, topk_sims, movie_means, global_mean, movie_id_to_index):    
    results = {}
    for thresh in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]:
        predictions = []
        actuals = []
        movie_ids = []
        outliers = 0
        Nones = 0
        for _, row in test_ratings.iterrows():
            movie_id = row['movieId']
            actual_rating = row['Rating']
            
            expected_rating = find_expected_rating(movie_id, train_ratings, sparse_matrix, topk_inds, topk_sims, movie_means, global_mean, movie_id_to_index, threshold=thresh)
        
            if expected_rating is not None:
                predictions.append(expected_rating)
                actuals.append(actual_rating)
                movie_ids.append(movie_id)

                # Number of predictions that are drastically different from the actual rating
                if abs(expected_rating - actual_rating) > 1.5:
                    outliers += 1
            else:
                Nones += 1
    
        mae = mean_absolute_error(actuals, predictions) if predictions else None
        rmse = np.sqrt(mean_squared_error(actuals, predictions)) if predictions else None
        outlier_percent = (outliers / len(test_ratings)) * 100
        results[thresh] = (predictions, movie_ids, mae, rmse, outlier_percent, Nones)

    return results

train_ratings, test_ratings = train_test_split(ratings_with_ids, test_size=0.20, random_state=1)
results = evaluate_predictions(train_ratings, test_ratings, sparse_matrix, topk_inds, topk_sims, movie_means, global_mean, movie_id_to_index)

for thresh, (predictions, titles, mae, rmse, outlier_percent, Nones) in results.items():
    print(f"Threshold: {thresh}:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Outlier Percent: {outlier_percent:.2f}%")
    print(f"Number of Nones: {Nones}")
    print(f"Number of test ratings: {len(test_ratings)}")
    print(f"Number of train ratings: {len(train_ratings)}")
    print(f"Number of total ratings: {len(ratings_with_ids)}")
    print("-----------------------------------")

Threshold: 0.0:
Mean Absolute Error (MAE): 0.5026168567321936
Root Mean Squared Error (RMSE): 0.6149298400470549
Outlier Percent: 2.04%
Number of Nones: 0
Number of test ratings: 98
Number of train ratings: 388
Number of total ratings: 486
-----------------------------------
Threshold: 0.1:
Mean Absolute Error (MAE): 0.5026168567321936
Root Mean Squared Error (RMSE): 0.6149298400470549
Outlier Percent: 2.04%
Number of Nones: 0
Number of test ratings: 98
Number of train ratings: 388
Number of total ratings: 486
-----------------------------------
Threshold: 0.2:
Mean Absolute Error (MAE): 0.5203554502804412
Root Mean Squared Error (RMSE): 0.6886340325670325
Outlier Percent: 2.04%
Number of Nones: 0
Number of test ratings: 98
Number of train ratings: 388
Number of total ratings: 486
-----------------------------------
Threshold: 0.3:
Mean Absolute Error (MAE): 0.4965124933116551
Root Mean Squared Error (RMSE): 0.6325226145046963
Outlier Percent: 4.08%
Number of Nones: 0
Number of test ra

In [132]:
# Increasing the minimum number of ratings required increases accuracy but reduces the number of movies
# Find the optimal threshold to balance accuracy and number of movies
def evaluate_for_threshold(threshold):
    ratings_per_movie = all_ratings.groupby('movieId').size()
    accepted_movies = ratings_per_movie[ratings_per_movie >= threshold].index
    filtered_ratings = all_ratings[all_ratings['movieId'].isin(accepted_movies)]
    num_movies = filtered_ratings['movieId'].nunique()
    
    user_ids = filtered_ratings['userId'].astype('category').cat.codes
    title_ids = filtered_ratings['title'].astype('category').cat.codes
    sparse_matrix = coo_matrix((filtered_ratings['rating'], (user_ids, title_ids))).tocsr()
    similarity_matrix = cosine_similarity(sparse_matrix.T, dense_output=False)
    titles = filtered_ratings['title'].astype('category').cat.categories
    movie_similarity_df = pd.DataFrame(similarity_matrix.toarray(), index=titles, columns=titles)

    # Evaluate predictions
    predictions, actuals, titles, mae, rmse, outlier_percent, Nones = evaluate_predictions(train_ratings, test_ratings, movie_similarity_df, global_mean, movie_means)
    return num_movies, mae

thresholds_0_100 = range(40, 60, 10)
thresholds_100_1000 = range(100, 4501, 100)

# Combine the two ranges
thresholds = list(thresholds_0_100) + list(thresholds_100_1000)
results = []

for threshold in thresholds:
    num_movies, mae = evaluate_for_threshold(threshold)
    if mae is not None:
        results.append((threshold, num_movies, mae))

#Analyze results to select modes
results_df = pd.DataFrame(results, columns=['Threshold', 'NumMovies', 'MAE'])
results_df['AccuracyImprovement'] = results_df['MAE'].iloc[0] - results_df['MAE']
results_df['MovieReduction'] = results_df['NumMovies'].iloc[0] - results_df['NumMovies']
results_df['Tradeoff'] = results_df['AccuracyImprovement'] / results_df['MovieReduction']

# Select top 5 thresholds based on tradeoff
top_thresholds = results_df.sort_values('Tradeoff', ascending=False).head(5)
print("Optimal Modes:")
print(top_thresholds.to_string(index=False))

# Display all results
print("All Results:")
print(results_df.to_string(index=False))

Optimal Modes:
 Threshold  NumMovies      MAE  AccuracyImprovement  MovieReduction  Tradeoff
      4000       1784 0.467770             0.087019           15655  0.000006
      3900       1814 0.468110             0.086679           15625  0.000006
      4100       1745 0.468997             0.085792           15694  0.000005
      4400       1653 0.469383             0.085406           15786  0.000005
      4300       1683 0.470331             0.084458           15756  0.000005
All Results:
 Threshold  NumMovies      MAE  AccuracyImprovement  MovieReduction      Tradeoff
        40      17439 0.554789             0.000000               0           NaN
        50      15975 0.548445             0.006344            1464  4.333266e-06
       100      12154 0.550034             0.004755            5285  8.997556e-07
       200       9292 0.541092             0.013697            8147  1.681228e-06
       300       7782 0.524712             0.030077            9657  3.114485e-06
       400  

In [32]:
def recommend_movies(user_ratings, sparse_matrix, startYear, endYear, topk_inds, topk_sims, movie_means, global_mean, movie_index_to_id):
    ratings_count = all_ratings.groupby('movieId').size()
    already_rated = set(user_ratings['movieId'])
    
    # Pre-compute user statistics
    user_mean = user_ratings['Rating'].mean()
    user_bias = user_mean - global_mean
    user_ratings_dict = dict(zip(user_ratings['movieId'], user_ratings['Rating']))
    
    # Get all candidate movies
    all_movie_ids = list(movie_index_to_id.values())
    candidate_ids = [mid for mid in all_movie_ids if mid not in already_rated]
    
    # Vectorized prediction
    predictions = []
    for movie_id in candidate_ids:
        if movie_id not in movie_id_to_index:
            continue
            
        target_ind = movie_id_to_index[movie_id]
        neigh_inds = topk_inds[target_ind]
        neigh_sims = topk_sims[target_ind]
        
        # Find which neighbors the user has rated
        neighbor_movie_ids = [movie_index_to_id[idx] for idx in neigh_inds]
        rated_mask = np.array([mid in user_ratings_dict for mid in neighbor_movie_ids])
        
        if not rated_mask.any():
            pred = movie_means.get(movie_id, global_mean) + user_bias
        else:
            valid_sims = neigh_sims[rated_mask]
            valid_movie_ids = np.array(neighbor_movie_ids)[rated_mask]
            
            # Apply threshold
            thresh_mask = valid_sims > 0.2
            if not thresh_mask.any():
                pred = movie_means.get(movie_id, global_mean) + user_bias
            else:
                valid_sims = valid_sims[thresh_mask]
                valid_movie_ids = valid_movie_ids[thresh_mask]
                
                # Vectorized calculation
                user_ratings_arr = np.array([user_ratings_dict[m] for m in valid_movie_ids])
                movie_means_arr = np.array([movie_means.get(m, global_mean) for m in valid_movie_ids])
                
                numerator = np.sum((valid_sims**2) * (user_ratings_arr - movie_means_arr))
                denominator = np.sum(valid_sims**2)
                pred = movie_means.get(movie_id, global_mean) + numerator/denominator + user_bias
                pred = max(0.0, min(5.0, pred))
        
        predictions.append((movie_id, pred, ratings_count.get(movie_id, 0)))
    
    # Sort and create DataFrame
    predictions.sort(key=lambda x: (x[1], x[2]), reverse=True)
    recommendations = pd.DataFrame(predictions, columns=['movieId', 'Expected Rating', 'Popularity'])
    
    recommendations = recommendations.merge(
        filtered_ratings[['movieId', 'title', 'genres', 'year']].drop_duplicates(), 
        on='movieId', how='left'
    )
    recommendations = recommendations[recommendations['year'].between(startYear, endYear, inclusive='both')]
    recommendations['Expected Rating'] = (recommendations['Expected Rating'] * 2).round() / 2
    
    return recommendations

recommendation_list = recommend_movies(ratings_with_ids, sparse_matrix, 1900, 2024, topk_inds, topk_sims, movie_means, global_mean, movie_index_to_id)

print(recommendation_list.head(40).to_string(index=False))

bins = [0, 1, 2, 3, 4, 5]
labels = ['0-1', '1-2', '2-3', '3-4', '4-5']
recommendation_list['Rating Interval'] = pd.cut(recommendation_list['Expected Rating'], bins=bins, labels=labels, right=False)

rating_distribution = recommendation_list['Rating Interval'].value_counts().sort_index()
print(rating_distribution)

 movieId  Expected Rating  Popularity                        title                         genres  year
    2300              5.0        7088                The Producers                         Comedy  1968
     123              5.0        3534            Chungking Express          Drama|Mystery|Romance  1994
    7486              5.0         658               Happy Together                  Drama|Romance  1997
    1757              5.0         955                Fallen Angels                  Drama|Romance  1995
   71438              5.0         287                Still Walking                          Drama  2008
   89759              5.0        2476                Separation, A                          Drama  2011
  103984              5.0        1296             The Great Beauty                   Comedy|Drama  2013
  163809              4.5        1397         Over the Garden Wall      Adventure|Animation|Drama  2013
    4237              4.5         367             The Gleaners &

In [31]:
def recommend_movies(user_ratings, sparse_matrix, startYear, endYear, topk_inds, topk_sims, movie_means, global_mean, movie_index_to_id):
    recommendations = []
    ratings_count = all_ratings.groupby('movieId').size()
    already_rated = set(user_ratings['movieId'])


    # For every movie in the similarity matrix, calculate the expected rating for the user if they haven't seen it
    for movie_id in movie_index_to_id.values():
        if movie_id not in already_rated:
            expected_rating = find_expected_rating(movie_id, user_ratings, sparse_matrix, topk_inds, topk_sims, movie_means, global_mean, {v: k for k, v in movie_index_to_id.items()}, threshold=0.2)
            if expected_rating is not None:
                num_ratings = ratings_count.get(movie_id, 0)

                recommendations.append((movie_id, expected_rating, num_ratings))

    # Sort the recommendations by expected rating and then by popularity
    recommendations.sort(key=lambda x: (x[1], x[2]), reverse=True)
    recommendations = pd.DataFrame(recommendations, columns=['movieId', 'Expected Rating', 'Popularity'])

    recommendations = recommendations.merge(filtered_ratings[['movieId', 'title', 'genres', 'year']].drop_duplicates(), on='movieId', how='left')
    recommendations = recommendations[recommendations['year'].between(startYear, endYear, inclusive='both')]
    recommendations['Expected Rating'] = (recommendations['Expected Rating'] * 2).round() / 2
    

    return recommendations

recommendation_list = recommend_movies(ratings_with_ids, sparse_matrix, 1900, 2024, topk_inds, topk_sims, movie_means, global_mean, movie_index_to_id)

print(recommendation_list.head(40).to_string(index=False))

bins = [0, 1, 2, 3, 4, 5]
labels = ['0-1', '1-2', '2-3', '3-4', '4-5']
recommendation_list['Rating Interval'] = pd.cut(recommendation_list['Expected Rating'], bins=bins, labels=labels, right=False)

rating_distribution = recommendation_list['Rating Interval'].value_counts().sort_index()
print(rating_distribution)

 movieId  Expected Rating  Popularity                        title                         genres  year
    2300              5.0        7088                The Producers                         Comedy  1968
     123              5.0        3534            Chungking Express          Drama|Mystery|Romance  1994
    7486              5.0         658               Happy Together                  Drama|Romance  1997
    1757              5.0         955                Fallen Angels                  Drama|Romance  1995
   71438              5.0         287                Still Walking                          Drama  2008
   89759              5.0        2476                Separation, A                          Drama  2011
  103984              5.0        1296             The Great Beauty                   Comedy|Drama  2013
  163809              4.5        1397         Over the Garden Wall      Adventure|Animation|Drama  2013
    4237              4.5         367             The Gleaners &