In [120]:
import pandas as pd
from sklearn.decomposition import NMF
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
import joblib
from joblib import load
from joblib import dump
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

In [121]:
# Load the raw ratings and movie metadata tables from the local data folder.
ratings = pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')

# Lookup table: movieId -> title (built from the two columns of `movies`).
movie_id_to_title = {
    movie_id: title
    for movie_id, title in zip(movies['movieId'], movies['title'])
}



In [122]:
def filter_by_average_rating(movies_df, ratings_df, min_rating=0):
    """
    Join movies with their ratings and keep only movies whose mean rating
    is at least `min_rating`.

    Parameters:
    -----------
    movies_df : DataFrame
        Movie metadata; must contain a 'movieId' column.
    ratings_df : DataFrame
        Ratings; must contain 'movieId' and 'rating' columns.
    min_rating : float
        Inclusive lower bound on a movie's mean rating.

    Returns:
    --------
    DataFrame
        Rows of the inner join of `movies_df` and `ratings_df` on 'movieId'
        that belong to the retained movies (so the movie metadata columns
        are part of the result).
    """
    print(f"Filtering movies with average rating >= {min_rating}...")

    # Inner join: movies without ratings (and ratings without a movie row) drop out.
    merged = movies_df.merge(ratings_df, on='movieId', how='inner')

    # Mean rating per movie, then the ids that clear the threshold.
    mean_by_movie = merged.groupby('movieId')['rating'].mean()
    kept_movie_ids = mean_by_movie.index[mean_by_movie >= min_rating]

    keep_row = merged['movieId'].isin(kept_movie_ids)

    print(f"Kept {len(kept_movie_ids)} movies with average rating >= {min_rating}")

    return merged[keep_row]

In [123]:
def filter_users_by_activity(ratings_df, min_percentile, max_percentile):
    """
    Keep the ratings of users whose number of ratings lies between two
    quantiles (both inclusive) of the per-user rating-count distribution.

    Parameters:
    -----------
    ratings_df : DataFrame
        Ratings; must contain 'userId' and 'rating' columns.
    min_percentile, max_percentile : float
        Quantiles in [0, 1] passed to `Series.quantile`.

    Returns:
    --------
    DataFrame
        Subset of `ratings_df` restricted to the retained users.
    """
    print(f"Filtering users with activity between {min_percentile*100:.0f}th and {max_percentile*100:.0f}th percentiles...")

    # Number of (non-null) ratings each user has given.
    counts_before = ratings_df.groupby('userId')['rating'].count()
    print(f"Before filtering: {len(counts_before)} users with rating statistics:")
    print(counts_before.describe())

    # Translate the quantiles into absolute rating-count bounds.
    lower_bound = counts_before.quantile(min_percentile)
    upper_bound = counts_before.quantile(max_percentile)

    # Users inside the inclusive [lower_bound, upper_bound] band.
    at_least_lower = counts_before >= lower_bound
    at_most_upper = counts_before <= upper_bound
    kept_user_ids = counts_before[at_least_lower & at_most_upper].index

    kept_ratings = ratings_df[ratings_df['userId'].isin(kept_user_ids)]

    # Same statistics again, now on the surviving users.
    counts_after = kept_ratings.groupby('userId')['rating'].count()
    print(f"After filtering: {len(counts_after)} users with rating statistics:")
    print(counts_after.describe())

    return kept_ratings

In [124]:
def filter_movies_by_popularity(ratings_df, min_ratings):
    """
    Keep only the ratings of movies that have at least `min_ratings` rows.

    Parameters:
    -----------
    ratings_df : DataFrame
        Ratings; must contain a 'movieId' column.
    min_ratings : int
        Inclusive minimum number of rating rows a movie needs to be kept.

    Returns:
    --------
    DataFrame
        Subset of `ratings_df` restricted to the retained movies.
    """
    print(f"Filtering movies with at least {min_ratings} ratings...")

    # Number of rating rows per movie.
    rows_per_movie = ratings_df.groupby('movieId').size()

    print(f"Before filtering: {len(rows_per_movie)} movies")
    print(f"Movies with <{min_ratings} ratings: {(rows_per_movie < min_ratings).sum()}")

    # Ids of the movies that meet the minimum.
    has_enough = rows_per_movie >= min_ratings
    kept_movie_ids = rows_per_movie.index[has_enough]

    kept_ratings = ratings_df[ratings_df['movieId'].isin(kept_movie_ids)]

    print(f"After filtering: {len(kept_movie_ids)} movies kept")

    return kept_ratings

In [125]:
def select_diverse_users(ratings_df, similarity_threshold=0.3, max_users_per_movie=500):
    """
    Greedily select a set of mutually dissimilar users and return their ratings.

    Users are visited from the most to the least active (number of distinct
    rated movies; ties keep the order of first appearance in `ratings_df`).
    A user is accepted when
      * none of their movies already has `max_users_per_movie` accepted raters, and
      * their Jaccard similarity (on rated-movie sets) with every previously
        accepted user is not greater than `similarity_threshold`.

    Parameters:
    -----------
    ratings_df : DataFrame
        Ratings; must contain 'userId' and 'movieId' columns.
    similarity_threshold : float
        Maximum allowed Jaccard similarity with any already-selected user.
    max_users_per_movie : int
        Maximum number of selected users that may have rated the same movie.

    Returns:
    --------
    DataFrame
        Subset of `ratings_df` restricted to the selected users.

    Notes:
    ------
    The similarity check compares each candidate against all accepted users,
    so the selection is quadratic in the number of accepted users.
    """
    # Build user -> set(movieId) with one grouped pass instead of iterating
    # row by row (iterrows is orders of magnitude slower on large frames).
    # sort=False keeps groups in first-appearance order, which is the tie
    # order the stable sort below relies on.
    user_to_movies = {
        user_id: set(movie_ids)
        for user_id, movie_ids in ratings_df.groupby('userId', sort=False)['movieId']
    }

    # Number of selected users per movie, starting at zero for every movie.
    movie_user_counts = {
        movie_id: 0
        for rated_movies in user_to_movies.values()
        for movie_id in rated_movies
    }

    # Most active users first (sorted() is stable, so ties keep dict order).
    sorted_users = sorted(
        user_to_movies,
        key=lambda u: len(user_to_movies[u]),
        reverse=True,
    )

    selected_users = []

    for user_id in sorted_users:
        user_movies = user_to_movies[user_id]

        # Reject the user if any of their movies is already "full".
        if any(movie_user_counts[movie_id] >= max_users_per_movie
               for movie_id in user_movies):
            continue

        # Reject the user if they are too similar to someone already selected.
        too_similar = False
        for selected_id in selected_users:
            selected_movies = user_to_movies[selected_id]

            # Jaccard similarity; |A ∪ B| = |A| + |B| - |A ∩ B| avoids
            # materialising the union set. Both sets are non-empty, so the
            # denominator is never zero.
            intersection = len(user_movies & selected_movies)
            union = len(user_movies) + len(selected_movies) - intersection

            if intersection / union > similarity_threshold:
                too_similar = True
                break

        if too_similar:
            continue

        selected_users.append(user_id)
        for movie_id in user_movies:
            movie_user_counts[movie_id] += 1

    return ratings_df[ratings_df['userId'].isin(selected_users)]

""" def select_diverse_users(ratings_df, max_users_per_movie=1000, similarity_threshold=0.3):

    print(f"Selecting diverse users with max {max_users_per_movie} users per movie...")
    
    # Get initial statistics
    total_users = ratings_df['userId'].nunique()
    total_movies = ratings_df['movieId'].nunique()
    total_ratings = len(ratings_df)
    
    print(f"Initial dataset: {total_ratings} ratings from {total_users} users on {total_movies} movies")
    
    # Create a movie-to-users dictionary to track how many users rated each movie
    movie_to_users = {}
    for movie_id, group in ratings_df.groupby('movieId'):
        movie_to_users[movie_id] = set(group['userId'])
    
    # Create a user-to-movies dictionary
    user_to_movies = {}
    for user_id, group in ratings_df.groupby('userId'):
        user_to_movies[user_id] = set(group['movieId'])
    
    # Sort users by the number of ratings (CHANGED: now in descending order)
    user_rating_counts = ratings_df.groupby('userId').size()
    sorted_users = user_rating_counts.sort_values(ascending=False).index.tolist()
    
    # Add some randomness within rating count groups to prevent bias
    # Group users by rating count
    rating_count_groups = {}
    for user_id, count in user_rating_counts.items():
        if count not in rating_count_groups:
            rating_count_groups[count] = []
        rating_count_groups[count].append(user_id)
    
    # Shuffle each group and rebuild the sorted users list
    shuffled_sorted_users = []
    for count in sorted(rating_count_groups.keys(), reverse=True):
        group = rating_count_groups[count]
        np.random.shuffle(group)
        shuffled_sorted_users.extend(group)
    
    selected_users = set()
    movie_user_counts = {movie_id: 0 for movie_id in movie_to_users.keys()}
    
    # Process users in the sorted order (now more ratings first)
    for user_id in shuffled_sorted_users:
        user_movies = user_to_movies[user_id]
        
        # Skip users with too few ratings (optional)
        if len(user_movies) < 0:  # Minimum threshold can be changed
            continue
            
        # Check if this user would exceed the max users for any movie
        exceeds_limit = False
        for movie_id in user_movies:
            if movie_user_counts[movie_id] >= max_users_per_movie:
                exceeds_limit = True
                break
        
        # Check similarity with already selected users
        too_similar = False
        if not exceeds_limit and len(selected_users) > 0:
            # Check similarity with ALL existing users (no sampling)
            for selected_user in selected_users:
                selected_user_movies = user_to_movies[selected_user]
                
                # Only calculate similarity if there's a minimum overlap
                if len(user_movies) > 0 and len(selected_user_movies) > 0:
                    # Calculate Jaccard similarity between users
                    intersection = len(user_movies.intersection(selected_user_movies))
                    union = len(user_movies.union(selected_user_movies))
                    
                    similarity = intersection / union
                    if similarity > similarity_threshold:
                        too_similar = True
                        break
        
        # Add user if they don't exceed limits and aren't too similar
        if not exceeds_limit and not too_similar:
            selected_users.add(user_id)
            
            # Update movie user counts
            for movie_id in user_movies:
                movie_user_counts[movie_id] += 1
    
    # Filter the ratings DataFrame to only include selected users
    filtered_ratings = ratings_df[ratings_df['userId'].isin(selected_users)]
    
    # Calculate and print statistics
    avg_ratings_per_user = filtered_ratings.groupby('userId').size().mean()
    
    print(f"After filtering: {len(filtered_ratings)} ratings from {len(selected_users)} users")
    print(f"Retained {len(filtered_ratings)/total_ratings:.1%} of original ratings")
    print(f"Average ratings per selected user: {avg_ratings_per_user:.1f}")
    
    return filtered_ratings """

' def select_diverse_users(ratings_df, max_users_per_movie=1000, similarity_threshold=0.3):\n\n    print(f"Selecting diverse users with max {max_users_per_movie} users per movie...")\n\n    # Get initial statistics\n    total_users = ratings_df[\'userId\'].nunique()\n    total_movies = ratings_df[\'movieId\'].nunique()\n    total_ratings = len(ratings_df)\n\n    print(f"Initial dataset: {total_ratings} ratings from {total_users} users on {total_movies} movies")\n\n    # Create a movie-to-users dictionary to track how many users rated each movie\n    movie_to_users = {}\n    for movie_id, group in ratings_df.groupby(\'movieId\'):\n        movie_to_users[movie_id] = set(group[\'userId\'])\n\n    # Create a user-to-movies dictionary\n    user_to_movies = {}\n    for user_id, group in ratings_df.groupby(\'userId\'):\n        user_to_movies[user_id] = set(group[\'movieId\'])\n\n    # Sort users by the number of ratings (CHANGED: now in descending order)\n    user_rating_counts = ratings_df.

In [126]:
def _exceeds_tier_limits(user_movies, movie_to_tier, movie_user_counts, tier_limits):
    """Return True if any of the user's movies already has as many selected raters as its tier allows."""
    return any(
        movie_user_counts[movie_id] >= tier_limits[movie_to_tier[movie_id]]
        for movie_id in user_movies
    )


def _too_similar_to_recent(user_movies, recent_users, users_data, similarity_threshold):
    """Return True if the Jaccard similarity with any user in `recent_users` exceeds the threshold."""
    for existing_user in recent_users:
        existing_movies = users_data[existing_user]['movies']
        intersection = len(user_movies & existing_movies)
        if intersection == 0:
            continue
        # |A ∪ B| = |A| + |B| - |A ∩ B|
        union = len(user_movies) + len(existing_movies) - intersection
        if intersection / union > similarity_threshold:
            return True
    return False


def _greedy_select_users(candidates, tier_limits, quota, total_cap, users_data, movie_to_tier,
                         movie_user_counts, selected_users, selected_set, similarity_threshold,
                         recent_window=50, similarity_user_cap=None):
    """
    Walk `candidates` in order and accept users that respect the per-tier movie
    limits and are not too similar to the `recent_window` most recently selected
    users. Mutates `selected_users`, `selected_set` and `movie_user_counts`.

    Stops after an accepted user brings the number added to `quota` or the
    overall selection to `total_cap`; does nothing if the cap is already met.
    When `similarity_user_cap` is given, the similarity check is skipped once
    that many users have been selected. Returns the number of users added.
    """
    added = 0
    if len(selected_users) >= total_cap:
        return added

    for user_id in candidates:
        if user_id in selected_set:
            continue

        user_movies = users_data[user_id]['movies']
        if _exceeds_tier_limits(user_movies, movie_to_tier, movie_user_counts, tier_limits):
            continue

        check_similarity = similarity_user_cap is None or len(selected_users) < similarity_user_cap
        if check_similarity and _too_similar_to_recent(
                user_movies, selected_users[-recent_window:], users_data, similarity_threshold):
            continue

        selected_users.append(user_id)
        selected_set.add(user_id)
        added += 1
        for movie_id in user_movies:
            movie_user_counts[movie_id] += 1

        if added >= quota or len(selected_users) >= total_cap:
            break

    return added


def create_balanced_matrix_v3(ratings_df, min_ratings_per_movie=3, target_user_count=4000, 
                             target_tier_coverage=0.8, similarity_threshold=0.15, verbose=True):
    """
    Creates a matrix with enhanced representation of less popular movies.

    Movies with at least `min_ratings_per_movie` ratings are split into four
    equally sized popularity tiers (tier 0 = most rated, tier 3 = least rated;
    the last tier absorbs the remainder). Users are then selected greedily:

    1. "Tier specialists" (>= 10 ratings, >= 30% of them in tiers 2/3), tier 3
       leaning first, then tier 2 leaning, each capped at 25% of the target.
    2. Remaining users by score in three rounds with progressively looser
       per-tier limits on how many selected users may share a movie.
    3. If the target is still not met, the remaining users with the most
       tier 2/3 ratings are added without any constraint.

    Parameters:
    -----------
    ratings_df : DataFrame
        Original ratings dataframe ('userId' and 'movieId' columns are used)
    min_ratings_per_movie : int
        Minimum ratings for a movie to be included
    target_user_count : int
        Target number of users to include; the selection never exceeds it
    target_tier_coverage : float
        Target coverage for less popular tiers (0-1). NOTE: this value is only
        echoed in the verbose log; it does not influence the selection.
    similarity_threshold : float
        Threshold for Jaccard similarity (checked against the 50 most
        recently selected users only)
    verbose : bool
        Whether to print progress information

    Returns:
    --------
    DataFrame
        Filtered ratings dataframe with balanced representation
    """
    tier_names = ["most popular", "popular", "less popular", "least popular"]

    if verbose:
        print(f"Creating optimized matrix with enhanced lower tier coverage...")
        print(f"Target: {target_user_count} users with {target_tier_coverage:.0%} coverage of lower tiers")
        print(f"Original dataset: {ratings_df['userId'].nunique()} users, {ratings_df['movieId'].nunique()} movies")

    # Filter movies with minimum ratings
    movie_counts = ratings_df.groupby('movieId').size()
    valid_movies = movie_counts[movie_counts >= min_ratings_per_movie].index
    filtered_df = ratings_df[ratings_df['movieId'].isin(valid_movies)]

    # Create equal-sized popularity tiers (most rated movies first)
    movie_counts = filtered_df.groupby('movieId').size().sort_values(ascending=False)
    movies_list = list(movie_counts.index)
    tiers = 4
    tier_size = len(movies_list) // tiers

    movie_tiers = {}
    for i in range(tiers):
        start_idx = i * tier_size
        # The last tier takes whatever is left after integer division.
        end_idx = start_idx + tier_size if i < tiers - 1 else None
        movie_tiers[i] = set(movies_list[start_idx:end_idx])

    # Tiers are disjoint, so a direct movie -> tier lookup replaces scanning
    # every tier set for every movie.
    movie_to_tier = {
        movie_id: tier
        for tier, tier_movies in movie_tiers.items()
        for movie_id in tier_movies
    }

    if verbose:
        for tier, tier_movies in movie_tiers.items():
            print(f"Tier {tier} ({tier_names[tier]}): {len(tier_movies)} movies")

    # Per-user movie set and number of rated movies in each tier
    users_data = {}
    for user_id, group in filtered_df.groupby('userId'):
        user_movies = set(group['movieId'])
        tier_counts = {tier: len(user_movies.intersection(tier_movies))
                       for tier, tier_movies in movie_tiers.items()}
        users_data[user_id] = {
            'movies': user_movies,
            'tier_counts': tier_counts,
            'total_ratings': len(user_movies)
        }

    # Identify tier-specialist users who rate more in less popular tiers
    tier_specialists = {2: [], 3: []}
    for user_id, data in users_data.items():
        tier_counts = data['tier_counts']
        total = sum(tier_counts.values())

        if total < 10:  # Skip users with too few ratings
            continue

        # Users with at least 30% of their ratings in tiers 2/3 are specialists,
        # categorised by whichever of the two tiers they lean toward.
        lower_tier_proportion = (tier_counts[2] + tier_counts[3]) / total
        if lower_tier_proportion >= 0.3:
            if tier_counts[3] > tier_counts[2]:
                tier_specialists[3].append(user_id)
            else:
                tier_specialists[2].append(user_id)

    if verbose:
        print(f"Identified {len(tier_specialists[2])} tier 2 specialists and {len(tier_specialists[3])} tier 3 specialists")

    # User scores: heavily favour ratings in less popular tiers
    tier_weights = [1, 8, 64, 512]
    user_scores = {}
    for user_id, data in users_data.items():
        tier_counts = data['tier_counts']
        total = data['total_ratings']

        if total == 0:
            user_scores[user_id] = 0
            continue

        weighted_sum = sum(tier_weights[tier] * count for tier, count in tier_counts.items())

        # Bonus for rating movies in many different tiers
        tiers_with_ratings = sum(1 for count in tier_counts.values() if count > 0)
        coverage_bonus = (tiers_with_ratings / 4) ** 2

        user_scores[user_id] = (weighted_sum / total) * (1 + coverage_bonus)

    # Selection state shared by all phases. The list keeps selection order
    # (needed for the "recent users" window); the set gives O(1) membership.
    selected_users = []
    selected_set = set()
    movie_user_counts = {movie: 0 for movie in valid_movies}

    shared = dict(
        total_cap=target_user_count,
        users_data=users_data,
        movie_to_tier=movie_to_tier,
        movie_user_counts=movie_user_counts,
        selected_users=selected_users,
        selected_set=selected_set,
        similarity_threshold=similarity_threshold,
    )

    # Phase 1: tier specialists
    if verbose:
        print("\nPhase 1: Adding tier specialists...")

    # Tier 3 specialists first (least popular), with very tight limits on
    # popular tiers so that tier 3 coverage is prioritised.
    tier3_target = min(int(target_user_count * 0.25), len(tier_specialists[3]))
    sorted_tier3 = sorted(tier_specialists[3], key=lambda u: user_scores[u], reverse=True)
    tier3_added = _greedy_select_users(
        sorted_tier3, tier_limits=[2, 5, 30, 10000], quota=tier3_target, **shared)

    if verbose:
        print(f"Added {tier3_added} tier 3 specialists")

    # Then tier 2 specialists with slightly relaxed limits
    tier2_target = min(int(target_user_count * 0.25), len(tier_specialists[2]))
    sorted_tier2 = sorted(tier_specialists[2], key=lambda u: user_scores[u], reverse=True)
    tier2_added = _greedy_select_users(
        sorted_tier2, tier_limits=[3, 10, 50, 10000], quota=tier2_target, **shared)

    if verbose:
        print(f"Added {tier2_added} tier 2 specialists")
        specialists_added = tier3_added + tier2_added
        print(f"Total specialists added: {specialists_added} ({len(selected_users)} total users so far)")

    # Phase 2: Add remaining users based on overall score
    remaining_target = target_user_count - len(selected_users)

    if verbose:
        print(f"\nPhase 2: Adding {remaining_target} more users to reach target...")

    # Gradually increasing limits for remaining slots
    remaining_phases = [
        {"limits": [5, 20, 100, 10000], "target": remaining_target * 0.3},
        {"limits": [10, 40, 10000, 10000], "target": remaining_target * 0.3},
        {"limits": [20, 10000, 10000, 10000], "target": remaining_target * 0.4}
    ]

    sorted_users = sorted(users_data.keys(), key=lambda u: user_scores[u], reverse=True)

    for phase_idx, phase in enumerate(remaining_phases):
        phase_target = int(phase["target"])
        tier_limits = phase["limits"]

        if verbose:
            print(f"  Remaining phase {phase_idx+1}: target={phase_target}, limits={tier_limits}")

        # The similarity check is only applied while fewer than 2000 users
        # have been selected.
        added_in_phase = _greedy_select_users(
            sorted_users, tier_limits=tier_limits, quota=phase_target,
            similarity_user_cap=2000, **shared)

        if verbose:
            print(f"    Added {added_in_phase} users in phase {phase_idx+1}, total: {len(selected_users)}")

        if len(selected_users) >= target_user_count:
            break

    # Final pass if needed: top up without any constraints
    if len(selected_users) < target_user_count:
        remaining = target_user_count - len(selected_users)
        if verbose:
            print(f"\nFinal pass: Adding {remaining} more users without constraints")

        # Remaining users ordered by number of ratings in lower tiers (descending)
        remaining_users = [u for u in users_data.keys() if u not in selected_set]
        sorted_remaining = sorted(
            remaining_users,
            key=lambda u: users_data[u]['tier_counts'][2] + users_data[u]['tier_counts'][3],
            reverse=True
        )

        top_up = sorted_remaining[:remaining]
        selected_users.extend(top_up)
        selected_set.update(top_up)

    # Create final dataset
    final_df = filtered_df[filtered_df['userId'].isin(selected_users)]

    # Evaluate coverage statistics (an empty tier counts as 0% covered
    # instead of raising ZeroDivisionError)
    final_movies = set(final_df['movieId'].unique())
    tier_representation = {}
    for tier, tier_movies in movie_tiers.items():
        movies_in_tier = len(tier_movies)
        movies_kept = len(tier_movies.intersection(final_movies))
        tier_representation[tier] = movies_kept / movies_in_tier if movies_in_tier else 0.0

    if verbose:
        user_share = len(selected_users) / len(users_data) if users_data else 0.0
        movie_share = len(final_movies) / len(valid_movies) if len(valid_movies) else 0.0

        print(f"\nFinal matrix statistics:")
        print(f"Selected {len(selected_users)} users ({user_share:.1%} of valid users)")
        print(f"Covered {len(final_movies)} movies ({movie_share:.1%} of valid movies)")
        print(f"Total ratings in matrix: {len(final_df)}")

        print("\nTier representation (% of tier movies included):")
        for tier, percentage in tier_representation.items():
            print(f"  Tier {tier} ({tier_names[tier]}): {percentage:.1%}")

        # Ratings distribution
        ratings_per_user = final_df.groupby('userId').size()
        print(f"\nRatings per user: min={ratings_per_user.min()}, max={ratings_per_user.max()}, " +
              f"mean={ratings_per_user.mean():.1f}, median={ratings_per_user.median():.1f}")

        # Ratings per tier: one grouped count instead of filtering the whole
        # frame once per movie.
        tier_ratings = {tier: 0 for tier in range(tiers)}
        for movie_id, movie_ratings in final_df.groupby('movieId').size().items():
            tier_ratings[movie_to_tier[movie_id]] += int(movie_ratings)

        total_ratings = sum(tier_ratings.values())
        print("\nRating distribution across tiers:")
        for tier, count in tier_ratings.items():
            percentage = count / total_ratings if total_ratings > 0 else 0
            print(f"  Tier {tier} ({tier_names[tier]}): {count} ratings ({percentage:.1%})")

    return final_df

In [127]:
def remove_columns(filtered_df):
    """
    Return a copy of `filtered_df` without the 'timestamp', 'genres' and
    'title' columns (columns that are absent are silently ignored) and print
    a one-line summary of what is left.

    The input frame is NOT modified: dropping in place would also alter the
    frame the caller still holds (possibly a slice of a larger frame), which
    makes re-running notebook cells order-dependent.

    Parameters:
    -----------
    filtered_df : DataFrame
        Ratings frame; must contain 'movieId' and 'userId' columns.

    Returns:
    --------
    DataFrame
        New frame with the unused columns removed.
    """
    trimmed_df = filtered_df.drop(columns=["timestamp", "genres", "title"], errors="ignore")

    print(f"Final dataset: {len(trimmed_df)} ratings across {trimmed_df['movieId'].nunique()} movies from {trimmed_df['userId'].nunique()} users")

    return trimmed_df



In [128]:
# Strip the columns the pipeline does not use (timestamp / genres / title) from the raw ratings.
ratings = remove_columns(ratings)

Final dataset: 33832162 ratings across 83239 movies from 330975 users


In [129]:

# Tunable parameters for the filtering cells below.
min_avg_rating=0  # passed to filter_by_average_rating (0 keeps every rated movie)
user_min_percentile=0.50 # these seem to fit the average user best
user_max_percentile=0.97
min_ratings_per_movie=2  # passed to filter_movies_by_popularity
# NOTE(review): the two values below are not referenced by any cell visible in this part of the notebook.
max_users_per_movie=80 # previously tried: 250
user_similarity_threshold=1 # previously tried: 0.15



In [130]:
# Filter users by activity level: keep users whose rating count lies between
# the configured lower and upper percentiles (both inclusive).
filtered_df = filter_users_by_activity(
    ratings,
    min_percentile=user_min_percentile,
    max_percentile=user_max_percentile
)

Filtering users with activity between 50th and 97th percentiles...
Before filtering: 330975 users with rating statistics:
count    330975.00000
mean        102.21969
std         232.15453
min           1.00000
25%          15.00000
50%          31.00000
75%          98.00000
max       33332.00000
Name: rating, dtype: float64
After filtering: 156971 users with rating statistics:
count    156971.000000
mean        134.058023
std         116.407592
min          31.000000
25%          51.000000
50%          90.000000
75%         173.000000
max         579.000000
Name: rating, dtype: float64


In [131]:
# Build the balanced user subset from the activity-filtered ratings.
# NOTE(review): min_ratings_per_movie is hard-coded to 3 here, while the
# parameter cell sets min_ratings_per_movie=2 — confirm which one is intended.
ratings_df = create_balanced_matrix_v3(
    filtered_df,
    min_ratings_per_movie=3,
    target_user_count=1000,
    target_tier_coverage=0.98,  # 98% target; the function only echoes this value in its log
    similarity_threshold=0.5,
    verbose=True
)

Creating optimized matrix with enhanced lower tier coverage...
Target: 1000 users with 98% coverage of lower tiers
Original dataset: 156971 users, 49671 movies
Tier 0 (most popular): 7126 movies
Tier 1 (popular): 7126 movies
Tier 2 (less popular): 7126 movies
Tier 3 (least popular): 7128 movies
Identified 50 tier 2 specialists and 11 tier 3 specialists

Phase 1: Adding tier specialists...
Added 11 tier 3 specialists
Added 24 tier 2 specialists
Total specialists added: 35 (35 total users so far)

Phase 2: Adding 965 more users to reach target...
  Remaining phase 1: target=289, limits=[5, 20, 100, 10000]
    Added 196 users in phase 1, total: 231
  Remaining phase 2: target=289, limits=[10, 40, 10000, 10000]
    Added 190 users in phase 2, total: 421
  Remaining phase 3: target=386, limits=[20, 10000, 10000, 10000]
    Added 285 users in phase 3, total: 706

Final pass: Adding 294 more users without constraints

Final matrix statistics:
Selected 1000 users (0.6% of valid users)
Covered 

In [132]:
# Print the size of the balanced subset.
# NOTE(review): this `filtered_df` is overwritten by the average-rating filter
# cell further down, which starts again from `ratings_df`.
filtered_df = remove_columns(ratings_df)


Final dataset: 164878 ratings across 23133 movies from 1000 users


In [133]:
# Sanity check: (distinct users, distinct movies) currently in filtered_df.
filtered_df["userId"].nunique(),filtered_df["movieId"].nunique()

(1000, 23133)

In [134]:
# Filter by average rating. This joins `movies` onto `ratings_df`, so the
# result again carries the movie metadata columns from `movies`.
filtered_df = filter_by_average_rating(
    movies, 
    ratings_df, 
    min_rating=min_avg_rating
)

Filtering movies with average rating >= 0...
Kept 23133 movies with average rating >= 0


In [135]:
# Sanity check: (distinct users, distinct movies) after the average-rating filter.
filtered_df["userId"].nunique(),filtered_df["movieId"].nunique()

(1000, 23133)

In [136]:
# Drop movies that ended up with fewer than `min_ratings_per_movie` ratings
# inside the selected user subset.
filtered_df = filter_movies_by_popularity(
    filtered_df,
    min_ratings=min_ratings_per_movie
)

Filtering movies with at least 2 ratings...
Before filtering: 23133 movies
Movies with <2 ratings: 6009
After filtering: 17124 movies kept


In [137]:
# Sanity check: (distinct users, distinct movies) after the popularity filter.
filtered_df["userId"].nunique(),filtered_df["movieId"].nunique()

(1000, 17124)

In [138]:
def scale_ratings(matrix):
    """
    Per-user Z-score normalization with clipping, mapped into [0, 1].

    For every row (user), the entries greater than 0 are treated as ratings:
      * if the ratings have non-zero standard deviation they are z-scored
        against the user's own mean/std, clipped to [-2.5, 2.5] and mapped
        linearly to [0, 1];
      * if they are all identical (including a user with a single rating)
        they are set to the neutral value 0.5, so every rated entry of the
        result lies in [0, 1].
    Entries that are not greater than 0 (unrated) are left unchanged.

    Parameters:
    -----------
    matrix : DataFrame
        User x movie matrix where 0 means "not rated".

    Returns:
    --------
    DataFrame
        Float matrix with the same index and columns as `matrix`.

    Notes:
    ------
    A rating whose z-score is <= -2.5 maps to exactly 0.0, which is the same
    value used for "not rated".
    """
    # Work on a float copy: writing scaled values into an integer array
    # would silently truncate them to 0.
    data = matrix.values.astype(float)

    for i in range(data.shape[0]):
        # Rated movies for this user
        rated_mask = data[i, :] > 0
        user_ratings = data[i, rated_mask]

        if user_ratings.size == 0:
            continue

        user_std = np.std(user_ratings)
        if user_std > 0:
            # Z-score relative to the user's own mean, clipped to avoid
            # extreme values, then shifted/scaled from [-2.5, 2.5] to [0, 1].
            normalized_ratings = (user_ratings - np.mean(user_ratings)) / user_std
            normalized_ratings = np.clip(normalized_ratings, -2.5, 2.5)
            data[i, rated_mask] = (normalized_ratings + 2.5) / 5.0
        else:
            # No variance (all ratings equal, or only one rating): neutral 0.5
            data[i, rated_mask] = 0.5

    return pd.DataFrame(data, index=matrix.index, columns=matrix.columns)

In [139]:
def prepare_user_ratings(ratings_file_path, user_movie_matrix_columns):
    """
    Load one user's ratings file and build a scaled 1 x n_movies rating row.

    The file is read with ``pd.read_csv`` and must contain the columns
    ``movieId`` and ``Rating``. Rows without a ``movieId`` are dropped. Ratings
    for movies that are not in ``user_movie_matrix_columns`` and rows whose
    ``Rating`` is missing are ignored when filling the row. If a movie occurs
    more than once, the last occurrence wins.

    The rated cells (value > 0) are scaled with the same per-user z-score
    approach as the training data: standardize with the user's own mean and
    population standard deviation, clip to [-2.5, 2.5], map to [0, 1] via
    ``(z + 2.5) / 5``. If the ratings have no variance (all equal, or a single
    rating) every rated cell is set to the neutral value 0.5.

    Parameters
    ----------
    ratings_file_path : str or file-like
        Anything accepted by ``pd.read_csv``.
    user_movie_matrix_columns : sequence or pandas.Index
        Movie ids, in the column order of the training matrix.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``user_data``: the file contents with missing ``movieId`` rows removed
        (ratings unscaled). ``user_ratings``: a single-row float DataFrame
        (index ``[0]``) with one column per movie id; 0 means "not rated".
    """
    user_data = pd.read_csv(ratings_file_path)
    user_data = user_data.dropna(subset=['movieId'])

    # Label -> column position lookup; avoids one .loc write per rating.
    position_of = {movie_id: pos for pos, movie_id in enumerate(user_movie_matrix_columns)}
    values = np.zeros(len(position_of), dtype=np.float64)

    for movie_id, rating in zip(user_data['movieId'], user_data['Rating']):
        if pd.isna(rating):
            # A NaN here would end up in the model input; skip unrated rows.
            continue
        pos = position_of.get(movie_id)
        if pos is not None:
            values[pos] = float(rating)

    rated = values > 0
    if rated.any():
        user_rating_values = values[rated]
        user_std = np.std(user_rating_values)
        if user_std > 0:
            z_scores = (user_rating_values - np.mean(user_rating_values)) / user_std
            z_scores = np.clip(z_scores, -2.5, 2.5)
            values[rated] = (z_scores + 2.5) / 5.0
        else:
            # No variance (includes a single rating): neutral 0.5 so the row
            # stays on the same [0, 1] scale as the training data.
            values[rated] = 0.5

    user_ratings = pd.DataFrame([values], index=[0], columns=user_movie_matrix_columns)
    return user_data, user_ratings

In [140]:
def fit_model(
    user_movie_matrix,
    n_components=50,
    max_iter=100,
    init='nndsvd',
    solver='cd',
    tol=0.0001,

    sample_size=1.0,
    random_state=None,
):
    """
    Fit an sklearn NMF model on (a random subset of) the user x movie matrix.

    Parameters
    ----------
    user_movie_matrix : pandas.DataFrame
        Non-negative user x movie matrix; rows are users.
    n_components, max_iter, solver, tol
        Passed straight through to ``sklearn.decomposition.NMF``.
    init : str
        NMF initialisation method. Previously this argument was ignored and
        ``"nndsvd"`` was always used; it is now honoured, and the default is
        ``'nndsvd'`` so calls that do not pass ``init`` behave as before.
    sample_size : float
        Fraction of users to train on. Values >= 1 use every user; smaller
        values draw ``int(n_users * sample_size)`` users without replacement.
    random_state : int or None
        Seed used for the user sampling and passed to NMF. ``None`` (default)
        keeps the sampling unseeded.

    Returns
    -------
    (NMF, pandas.DataFrame, float)
        The fitted model, the matrix it was fitted on, and the fit time in
        seconds.

    Raises
    ------
    ValueError
        If ``sample_size`` selects no users at all.
    """
    print(f"\n--- Fitting model with {sample_size*100:.0f}% of users ---")

    # Sample users
    if sample_size < 1.0:
        n_users = int(user_movie_matrix.shape[0] * sample_size)
        if n_users < 1:
            raise ValueError(
                f"sample_size={sample_size} selects no users from "
                f"{user_movie_matrix.shape[0]} available"
            )
        rng = np.random.default_rng(random_state)
        sampled_users = rng.choice(user_movie_matrix.index, size=n_users, replace=False)
        training_matrix = user_movie_matrix.loc[sampled_users, :]
    else:
        training_matrix = user_movie_matrix

    print(f"Training matrix shape: {training_matrix.shape}")

    # Train the model
    start_time = time.time()
    nmf = NMF(
        n_components=n_components,
        max_iter=max_iter,
        verbose=0,
        init=init,
        solver=solver,
        tol=tol,
        random_state=random_state,
    )
    # The factor matrices are not needed here; callers read them off the model.
    nmf.fit(training_matrix)
    training_time = time.time() - start_time

    print(f"Training time: {training_time:.2f} seconds")
    print(f"Iterations completed: {nmf.n_iter_}")
    print(f"Final error: {nmf.reconstruction_err_}")

    return nmf, training_matrix, training_time



In [141]:
def get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings,
    my_data,
    n_recommendations=20,
    movie_id_to_title_map=None
):
    """
    Rank the movies a user has not rated by their reconstructed NMF score.

    ``my_ratings`` is projected into factor space with ``nmf_model.transform``
    and multiplied by ``nmf_model.components_`` to get one score per column of
    ``training_matrix``. Every movie id listed in ``my_data['movieId']`` is
    excluded. The remaining (movie_id, score) pairs are sorted by descending
    score and the first ``n_recommendations`` are returned. When a title map is
    supplied the selection is also printed.
    """
    # Project the user into factor space and reconstruct a score per movie.
    latent_profile = nmf_model.transform(my_ratings)
    score_table = pd.DataFrame(
        np.dot(latent_profile, nmf_model.components_),
        columns=training_matrix.columns,
    )

    # Everything present in the user's file counts as already rated.
    already_rated = {int(movie_id) for movie_id in my_data['movieId'].values}
    print(f"You've rated {len(already_rated)} movies")

    candidates = [col for col in training_matrix.columns if int(col) not in already_rated]
    print(f"Found {len(candidates)} movies you haven't rated")

    # Highest score first; sorted() is stable, so ties keep column order.
    scored = [(col, score_table.loc[0, col]) for col in candidates]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    top_recommendations = ranked[:n_recommendations]

    if movie_id_to_title_map:
        print("\nTop Recommendations:")
        for movie_id, predicted_rating in top_recommendations:
            movie_title = movie_id_to_title_map.get(movie_id, f"Unknown Movie (ID: {movie_id})")
            print(f"Movie: {movie_title}, Similarity Rating: {predicted_rating:.2f}/1.00")

    return top_recommendations

In [142]:
def evaluate_nmf_predictions(nmf_model, training_matrix, test_data, rating_transform=None):
    """
    Compare a user's ratings with the model's reconstruction of those ratings.

    The ratings in ``test_data`` are placed into a zero vector laid out like
    the columns of ``training_matrix``, projected with ``nmf_model.transform``
    and reconstructed with ``nmf_model.components_``. The reconstructed values
    at the rated positions are then compared with the ratings that were fed
    in, so this is an in-sample reconstruction check, not a hold-out test.

    Parameters
    ----------
    nmf_model
        Fitted model exposing ``transform`` and ``components_``.
    training_matrix : pandas.DataFrame or array-like
        Only its column labels / width are used to map movie ids to positions.
    test_data : pandas.DataFrame
        Must contain the columns ``movieId`` and ``Rating``. Rows with a
        missing id or rating, and movies unknown to the model, are skipped.
    rating_transform : callable, optional
        Maps the 1-D array of ratings onto the scale the model was trained on
        (must return an array of the same length). With the default ``None``
        the ratings are used as-is; if the model was trained on scaled data
        that makes the error metrics compare two different scales.

    Returns
    -------
    dict
        ``rmse``, ``mae``, ``mean_error`` (predicted - actual), ``correlation``
        (NaN for fewer than two predictions) and ``num_predictions``. All
        metrics are NaN when no movie in ``test_data`` is known to the model.
    """
    n_movies = training_matrix.shape[1]

    # Map movie IDs to their column positions in the model.
    if hasattr(training_matrix, 'columns'):
        movie_indices = {col: i for i, col in enumerate(training_matrix.columns)}
    else:
        # NOTE(review): legacy fallback kept as-is -- assumes consecutive
        # integer ids, 1-based only when the matrix has exactly 19915 columns.
        offset = 1 if n_movies == 19915 else 0
        movie_indices = {i + offset: i for i in range(n_movies)}

    movie_positions = []
    actual_ratings = []
    for movie_id, rating in zip(test_data['movieId'], test_data['Rating']):
        if pd.isna(movie_id) or pd.isna(rating):
            # A NaN rating would put NaN into the model input.
            continue
        position = movie_indices.get(movie_id)
        if position is not None:
            movie_positions.append(position)
            actual_ratings.append(float(rating))

    if not actual_ratings:
        return {
            'rmse': np.nan,
            'mae': np.nan,
            'mean_error': np.nan,
            'correlation': np.nan,
            'num_predictions': 0
        }

    positions = np.asarray(movie_positions, dtype=int)
    actual = np.asarray(actual_ratings, dtype=np.float64)
    if rating_transform is not None:
        actual = np.asarray(rating_transform(actual), dtype=np.float64)

    # Wide-format user vector; unrated movies stay 0.
    user_vector = np.zeros(n_movies)
    user_vector[positions] = actual

    user_factors = nmf_model.transform(user_vector.reshape(1, -1))
    predictions_all = user_factors @ nmf_model.components_
    predicted = np.asarray(predictions_all)[0, positions]

    errors = predicted - actual
    rmse = np.sqrt(np.mean(errors ** 2))
    mae = np.mean(np.abs(errors))
    me = np.mean(errors)
    corr = np.corrcoef(actual, predicted)[0, 1] if actual.size > 1 else np.nan

    return {
        'rmse': rmse,
        'mae': mae,
        'mean_error': me,
        'correlation': corr,
        'num_predictions': len(actual_ratings)
    }

In [143]:
# Wide user x movie matrix: one row per user, one column per movie,
# 0 wherever the user has not rated the movie.
user_movie_matrix = (
    filtered_df
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Per-user z-score scaling of the rated cells into [0, 1].
user_movie_matrix_scaled = scale_ratings(user_movie_matrix)


In [None]:
# Fit on every user (sample_size=1 skips the sub-sampling branch);
# the returned training time is discarded.
nmf_model, training_matrix, _ = fit_model(
    user_movie_matrix_scaled,
    n_components=150,  # 300 was the original setting
    max_iter=800,
    sample_size=1,
)




--- Fitting model with 100% of users ---
Training matrix shape: (1000, 17124)


In [None]:
import joblib
import pandas as pd
import numpy as np
import os
from sklearn.decomposition import NMF

# Persist the fitted model, the movie-id column order it was trained on,
# and a small metadata record under model_files/.
os.makedirs('model_files', exist_ok=True)

movie_ids = training_matrix.columns.tolist()
model_metadata = dict(
    n_components=nmf_model.n_components,
    trained_on=pd.Timestamp.now().strftime('%Y-%m-%d %H:%M'),
    matrix_shape=training_matrix.shape,
    scaling_method='z_score_normalization',
)

joblib.dump(nmf_model, 'model_files/nmf_model.joblib')
joblib.dump(movie_ids, 'model_files/movie_ids.joblib')
joblib.dump(model_metadata, 'model_files/model_metadata.joblib')

print(f"Model saved with {len(movie_ids)} movies and {nmf_model.n_components} components")

Model saved with 10593 movies and 300 components


In [None]:
# One ratings export per person; every file is loaded and scaled against the
# same movie columns as the training matrix, in the order listed here.
USER_RATING_FILES = {
    "elliott": "data/loelliot_ratings_with_ids.csv",
    "ludde": "data/ludde_ratings_with_ids.csv",
    "charlie": "data/chaarll_ratings_with_ids.csv",
    "tilda": "data/tilda_h_ratings_with_ids.csv",
    "zorrodor": "data/zorrodor_ratings_with_ids.csv",
    "lukas": "data/lukas_ratings_with_ids.csv",
    "clara": "data/clar_ratings_with_ids.csv",
    "voided": "data/voided_ratings_with_ids.csv",
    "skellic": "data/skellic_ratings_with_ids.csv",
    "liv": "data/liv_ratings_with_ids.csv",
}

prepared_users = {
    person: prepare_user_ratings(csv_path, user_movie_matrix_scaled.columns)
    for person, csv_path in USER_RATING_FILES.items()
}

# Keep the per-person names that the cells below rely on.
elliott_data, elliott_ratings = prepared_users["elliott"]
ludde_data, ludde_ratings = prepared_users["ludde"]
charlie_data, charlie_ratings = prepared_users["charlie"]
tilda_data, tilda_ratings = prepared_users["tilda"]
zorrodor_data, zorrodor_ratings = prepared_users["zorrodor"]
lukas_data, lukas_ratings = prepared_users["lukas"]
clara_data, clara_ratings = prepared_users["clara"]
voided_data, voided_ratings = prepared_users["voided"]
skellic_data, skellic_ratings = prepared_users["skellic"]
liv_data, liv_ratings = prepared_users["liv"]

In [None]:
# Summary (columns, dtypes, non-null counts) of ludde's ratings file as returned
# by prepare_user_ratings; the ratings in this frame are unscaled.
ludde_data.info()

In [None]:
# Summary of ludde's single-row scaled rating frame (one column per movie id).
ludde_ratings.info()

In [None]:
# Reconstruction metrics for ludde. test_data must provide the columns
# 'movieId' and 'Rating'.
# NOTE(review): ludde_data holds the unscaled ratings from the CSV, while the
# model was fitted on user_movie_matrix_scaled ([0, 1] values), so the error
# metrics below compare two different scales.
metrics = evaluate_nmf_predictions(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    test_data=ludde_data,
)

# Display metrics
for label, key in (
    ("RMSE", "rmse"),
    ("MAE", "mae"),
    ("Mean Error", "mean_error"),
    ("Correlation", "correlation"),
):
    print(f"{label}: {metrics[key]:.4f}")
print(f"Number of predictions: {metrics['num_predictions']}")

RMSE: 2.2123
MAE: 1.9729
Mean Error: -1.9364
Correlation: 0.3753
Number of predictions: 410


In [None]:
# Top 200 unrated movies for ludde, printed with their titles.
recommendations = get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings=ludde_ratings,
    my_data=ludde_data,
    movie_id_to_title_map=movie_id_to_title,
    n_recommendations=200,
)

You've rated 426 movies
Found 10183 movies you haven't rated

Top Recommendations:
Movie: American Beauty (1999), Similarity Rating: 0.51/1.00
Movie: Beautiful Mind, A (2001), Similarity Rating: 0.44/1.00
Movie: Saving Private Ryan (1998), Similarity Rating: 0.44/1.00
Movie: Godfather, The (1972), Similarity Rating: 0.39/1.00
Movie: Sixth Sense, The (1999), Similarity Rating: 0.39/1.00
Movie: Nightcrawler (2014), Similarity Rating: 0.38/1.00
Movie: Ratatouille (2007), Similarity Rating: 0.37/1.00
Movie: Cast Away (2000), Similarity Rating: 0.37/1.00
Movie: Logan (2017), Similarity Rating: 0.37/1.00
Movie: What We Do in the Shadows (2014), Similarity Rating: 0.35/1.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Similarity Rating: 0.35/1.00
Movie: Departed, The (2006), Similarity Rating: 0.35/1.00
Movie: Zootopia (2016), Similarity Rating: 0.34/1.00
Movie: Kill Bill: Vol. 2 (2004), Similarity Rating: 0.34/1.00
Movie: 12 Angry Men (1957), Similarity Rating: 0.34/1.00
Movie: P

In [None]:
# Top 200 unrated movies for tilda, printed with their titles.
recommendations = get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings=tilda_ratings,
    my_data=tilda_data,
    movie_id_to_title_map=movie_id_to_title,
    n_recommendations=200,
)

You've rated 285 movies
Found 10321 movies you haven't rated

Top Recommendations:
Movie: Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001), Similarity Rating: 0.35/1.00
Movie: The Imitation Game (2014), Similarity Rating: 0.34/1.00
Movie: Harry Potter and the Deathly Hallows: Part 2 (2011), Similarity Rating: 0.32/1.00
Movie: American Beauty (1999), Similarity Rating: 0.32/1.00
Movie: Prestige, The (2006), Similarity Rating: 0.31/1.00
Movie: The Martian (2015), Similarity Rating: 0.31/1.00
Movie: Call Me by Your Name (2017), Similarity Rating: 0.30/1.00
Movie: Intouchables (2011), Similarity Rating: 0.30/1.00
Movie: Little Women (2019), Similarity Rating: 0.30/1.00
Movie: Inside Out (2015), Similarity Rating: 0.29/1.00
Movie: Django Unchained (2012), Similarity Rating: 0.28/1.00
Movie: Manchester by the Sea (2016), Similarity Rating: 0.28/1.00
Movie: The Handmaiden (2016), Similarity Rating: 0.27/1.00
Movie: Seven (a.k.a. Se7en) (1995), Similarity Rating: 0.26/1.00
Movie: Beautiful

In [None]:

# Top 200 unrated movies for elliott, printed with their titles.
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=elliott_ratings,
    my_data=elliott_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)
# Parameters recorded for this run. The bare string below is the cell's last
# expression, so it is also what the cell displays as its value.
# NOTE(review): the fit cell in this notebook passes n_components=150, so the
# values recorded here may be stale -- confirm before relying on them.
"""
min_avg_rating=2.5
user_min_percentile=0.25
user_max_percentile=0.999
min_ratings_per_movie=3
max_users_per_movie=250
user_similarity_threshold=0.15
fit: 
n_components=200,
sample_size=1,
max_iter=800
"""
# Feedback from elliott (translated from Swedish): "I've seen maybe about a
# third of these movies, but not since I started using Letterboxd."

You've rated 177 movies
Found 10457 movies you haven't rated

Top Recommendations:
Movie: Juno (2007), Similarity Rating: 0.27/1.00
Movie: Bo Burnham: Inside (2021), Similarity Rating: 0.27/1.00
Movie: Spider-Man: Into the Spider-Verse (2018), Similarity Rating: 0.23/1.00
Movie: Eighth Grade (2018), Similarity Rating: 0.22/1.00
Movie: Eternal Sunshine of the Spotless Mind (2004), Similarity Rating: 0.22/1.00
Movie: Marriage Story (2019), Similarity Rating: 0.22/1.00
Movie: Scott Pilgrim vs. the World (2010), Similarity Rating: 0.21/1.00
Movie: Portrait of a Lady on Fire (2019), Similarity Rating: 0.20/1.00
Movie: Arrival (2016), Similarity Rating: 0.19/1.00
Movie: Manchester by the Sea (2016), Similarity Rating: 0.19/1.00
Movie: Inside Out (2015), Similarity Rating: 0.19/1.00
Movie: Nomadland (2020), Similarity Rating: 0.18/1.00
Movie: Hunt for the Wilderpeople (2016), Similarity Rating: 0.18/1.00
Movie: Encanto (2021), Similarity Rating: 0.18/1.00
Movie: Wolf of Wall Street, The (2013

'\nmin_avg_rating=2.5\nuser_min_percentile=0.25\nuser_max_percentile=0.999\nmin_ratings_per_movie=3\nmax_users_per_movie=250\nuser_similarity_threshold=0.15\nfit: \nn_components=200,\nsample_size=1,\nmax_iter=800\n'

In [None]:
# Top 200 unrated movies for voided, printed with their titles.
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=voided_ratings,
    my_data=voided_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)
# Parameters recorded for this run (kept as a bare string expression).
# NOTE(review): the fit cell in this notebook passes n_components=150, so the
# values recorded here may be stale -- confirm before relying on them.
"""
min_avg_rating=2.5
user_min_percentile=0.25
user_max_percentile=0.999
min_ratings_per_movie=3
max_users_per_movie=250
user_similarity_threshold=0.15
fit: 
n_components=200,
sample_size=1,
max_iter=800
"""
# Feedback from voided, kept verbatim in the string below because it is the
# cell's last expression and therefore its displayed value. Translation from
# Swedish: "Yes, they do -- I have pretty much all of them except Guardians of
# the Galaxy on my watchlist, haha, and it's true that the ones I had already
# seen, like Totoro, I hadn't rated."
"""jo men det gör dom, har typ alla förrutom guardians of the galaxy på min watchlist haha
och det stämmer att de jag redan hade sett som totoro hade jag inte rateat"""

You've rated 271 movies
Found 10340 movies you haven't rated

Top Recommendations:
Movie: Prestige, The (2006), Similarity Rating: 0.46/1.00
Movie: Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001), Similarity Rating: 0.40/1.00
Movie: Reservoir Dogs (1992), Similarity Rating: 0.40/1.00
Movie: Interstellar (2014), Similarity Rating: 0.39/1.00
Movie: My Neighbor Totoro (Tonari no Totoro) (1988), Similarity Rating: 0.37/1.00
Movie: Fargo (1996), Similarity Rating: 0.37/1.00
Movie: Inception (2010), Similarity Rating: 0.36/1.00
Movie: American Beauty (1999), Similarity Rating: 0.36/1.00
Movie: Godfather, The (1972), Similarity Rating: 0.34/1.00
Movie: Kiki's Delivery Service (Majo no takkyûbin) (1989), Similarity Rating: 0.33/1.00
Movie: Cabin in the Woods, The (2012), Similarity Rating: 0.33/1.00
Movie: Perks of Being a Wallflower, The (2012), Similarity Rating: 0.32/1.00
Movie: Dark Knight, The (2008), Similarity Rating: 0.32/1.00
Movie: Mad Max: Fury Road (2015), Similarity Rating: 0

'jo men det gör dom, har typ alla förrutom guardians of the galaxy på min watchlist haha\noch det stämmer att de jag redan hade sett som totoro hade jag inte rateat'

In [None]:
# Top 200 unrated movies for zorrodor, printed with their titles.
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=zorrodor_ratings,
    my_data=zorrodor_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

# Parameters recorded for this run. The bare string below is the cell's last
# expression, so it is also what the cell displays as its value.
# NOTE(review): the fit cell in this notebook passes n_components=150, so the
# values recorded here may be stale -- confirm before relying on them.
"""
min_avg_rating=2.5
user_min_percentile=0.25
user_max_percentile=0.999
min_ratings_per_movie=3
max_users_per_movie=250
user_similarity_threshold=0.15
fit: 
n_components=200,
sample_size=1,
max_iter=800
"""
# Feedback from zorrodor (translated from Swedish):
# "The first 20 are all good movies"
# "Or rather, it's almost only things I like"
# "A few odd ones though, but most of it is great"
# "I've also rated fairly few movies"
# "So it doesn't have that much to go on"

You've rated 45 movies
Found 10558 movies you haven't rated

Top Recommendations:
Movie: In the Mood For Love (Fa yeung nin wa) (2000), Similarity Rating: 0.13/1.00
Movie: Parasite (2019), Similarity Rating: 0.11/1.00
Movie: Eternal Sunshine of the Spotless Mind (2004), Similarity Rating: 0.11/1.00
Movie: Get Out (2017), Similarity Rating: 0.09/1.00
Movie: Rear Window (1954), Similarity Rating: 0.09/1.00
Movie: Shoplifters (2018), Similarity Rating: 0.08/1.00
Movie: Separation, A (Jodaeiye Nader az Simin) (2011), Similarity Rating: 0.08/1.00
Movie: Spirited Away (Sen to Chihiro no kamikakushi) (2001), Similarity Rating: 0.08/1.00
Movie: In Bruges (2008), Similarity Rating: 0.07/1.00
Movie: Her (2013), Similarity Rating: 0.07/1.00
Movie: Portrait of a Lady on Fire (2019), Similarity Rating: 0.07/1.00
Movie: Grand Budapest Hotel, The (2014), Similarity Rating: 0.06/1.00
Movie: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964), Similarity Rating: 0.06/1.00
Movie:

'\nmin_avg_rating=2.5\nuser_min_percentile=0.25\nuser_max_percentile=0.999\nmin_ratings_per_movie=3\nmax_users_per_movie=250\nuser_similarity_threshold=0.15\nfit: \nn_components=200,\nsample_size=1,\nmax_iter=800\n'

In [None]:
# Top 100 unrated movies for liv, printed with their titles.
recommendations = get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings=liv_ratings,
    my_data=liv_data,
    movie_id_to_title_map=movie_id_to_title,
    n_recommendations=100,
)

You've rated 106 movies
Found 10501 movies you haven't rated

Top Recommendations:
Movie: Spirited Away (Sen to Chihiro no kamikakushi) (2001), Similarity Rating: 0.28/1.00
Movie: Eternal Sunshine of the Spotless Mind (2004), Similarity Rating: 0.24/1.00
Movie: Parasite (2019), Similarity Rating: 0.21/1.00
Movie: Mean Girls (2004), Similarity Rating: 0.20/1.00
Movie: Bo Burnham: Inside (2021), Similarity Rating: 0.18/1.00
Movie: Princess Mononoke (Mononoke-hime) (1997), Similarity Rating: 0.18/1.00
Movie: Midsommar (2019), Similarity Rating: 0.17/1.00
Movie: Grand Budapest Hotel, The (2014), Similarity Rating: 0.17/1.00
Movie: Fantastic Mr. Fox (2009), Similarity Rating: 0.17/1.00
Movie: Her (2013), Similarity Rating: 0.16/1.00
Movie: Donnie Darko (2001), Similarity Rating: 0.16/1.00
Movie: Carol (2015), Similarity Rating: 0.16/1.00
Movie: Inside Out (2015), Similarity Rating: 0.16/1.00
Movie: Moonrise Kingdom (2012), Similarity Rating: 0.16/1.00
Movie: Corpse Bride (2005), Similarity 

In [None]:
# Top 200 unrated movies for skellic, printed with their titles.
recommendations = get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings=skellic_ratings,
    my_data=skellic_data,
    movie_id_to_title_map=movie_id_to_title,
    n_recommendations=200,
)

You've rated 338 movies
Found 10280 movies you haven't rated

Top Recommendations:
Movie: Reservoir Dogs (1992), Similarity Rating: 0.46/1.00
Movie: Fight Club (1999), Similarity Rating: 0.46/1.00
Movie: Prestige, The (2006), Similarity Rating: 0.44/1.00
Movie: Interstellar (2014), Similarity Rating: 0.43/1.00
Movie: Donnie Darko (2001), Similarity Rating: 0.43/1.00
Movie: Cabin in the Woods, The (2012), Similarity Rating: 0.41/1.00
Movie: Fargo (1996), Similarity Rating: 0.41/1.00
Movie: Wolf of Wall Street, The (2013), Similarity Rating: 0.40/1.00
Movie: What We Do in the Shadows (2014), Similarity Rating: 0.38/1.00
Movie: American Beauty (1999), Similarity Rating: 0.38/1.00
Movie: Inception (2010), Similarity Rating: 0.37/1.00
Movie: Shining, The (1980), Similarity Rating: 0.36/1.00
Movie: Kiki's Delivery Service (Majo no takkyûbin) (1989), Similarity Rating: 0.36/1.00
Movie: Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001), Similarity Rating: 0.36/1.00
Movie: Guardians of the G