In [1]:
import pandas as pd
from sklearn.decomposition import NMF
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
import joblib
from joblib import load
from joblib import dump
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
ratings= pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')
movie_id_to_title = dict(zip(movies['movieId'], movies['title']))



In [3]:
def filter_by_average_rating(movies_df, ratings_df, min_rating=0):
    print(f"Filtering movies with average rating >= {min_rating}...")
    
    movies_with_ratings = pd.merge(movies_df, ratings_df, on='movieId', how='inner')
    
    average_ratings = movies_with_ratings.groupby('movieId')['rating'].mean()
    
    popular_movies = average_ratings[average_ratings >= min_rating].index
    
    filtered_movies_with_ratings = movies_with_ratings[movies_with_ratings['movieId'].isin(popular_movies)]
    
    print(f"Kept {len(popular_movies)} movies with average rating >= {min_rating}")
    
    return filtered_movies_with_ratings

In [4]:
def filter_users_by_activity(ratings_df, min_percentile, max_percentile):

    print(f"Filtering users with activity between {min_percentile*100:.0f}th and {max_percentile*100:.0f}th percentiles...")
    
    # Count ratings per user
    user_rating_counts = ratings_df.groupby('userId')['rating'].count()
    print(f"Before filtering: {len(user_rating_counts)} users with rating statistics:")
    print(user_rating_counts.describe())
    
    # Get percentile thresholds
    min_threshold = user_rating_counts.quantile(min_percentile)
    max_threshold = user_rating_counts.quantile(max_percentile)
    
    # Filter users
    filtered_users = user_rating_counts[(user_rating_counts >= min_threshold) & 
                                       (user_rating_counts <= max_threshold)].index
    
    filtered_ratings = ratings_df[ratings_df['userId'].isin(filtered_users)]
    
    # Get statistics after filtering
    user_rating_counts_after = filtered_ratings.groupby('userId')['rating'].count()
    print(f"After filtering: {len(user_rating_counts_after)} users with rating statistics:")
    print(user_rating_counts_after.describe())
    
    return filtered_ratings

In [5]:
def filter_movies_by_popularity(ratings_df, min_ratings):

    print(f"Filtering movies with at least {min_ratings} ratings...")
    
    # Count ratings per movie
    movie_rating_counts = ratings_df.groupby('movieId').size()
    
    print(f"Before filtering: {len(movie_rating_counts)} movies")
    print(f"Movies with <{min_ratings} ratings: {(movie_rating_counts < min_ratings).sum()}")
    
    # Filter movies with sufficient ratings
    popular_movies = movie_rating_counts[movie_rating_counts >= min_ratings].index
    
    filtered_ratings = ratings_df[ratings_df['movieId'].isin(popular_movies)]
    
    print(f"After filtering: {len(popular_movies)} movies kept")
    
    return filtered_ratings

In [None]:
def select_diverse_users_with_experts_first(ratings_df, n_users=30000, similarity_threshold=0.3, 
                                           max_users_per_movie=500, expert_percentage=0.3, random_seed=42):
    """
    Select a diverse set of users with experts prioritized, while preventing popular movie bias.
    
    Args:
        ratings_df: DataFrame with userId, movieId, rating columns
        n_users: Total number of users to select
        similarity_threshold: Maximum allowed Jaccard similarity between users
        max_users_per_movie: Maximum number of users allowed per movie
        expert_percentage: Percentage of users to select based on expertise (# of ratings)
        random_seed: Random seed for reproducibility
        
    Returns:
        List of selected user IDs
    """
    np.random.seed(random_seed)
    
    # Build user-movie and movie-user dictionaries in a single pass
    user_to_movies = {}
    movie_to_users = {}
    
    print("Building dictionaries...")
    for _, row in ratings_df.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        
        if user_id not in user_to_movies:
            user_to_movies[user_id] = set()
        user_to_movies[user_id].add(movie_id)
        
        if movie_id not in movie_to_users:
            movie_to_users[movie_id] = set()
        movie_to_users[movie_id].add(user_id)
    
    # Create a list of (user_id, rating_count) tuples and sort by count (descending)
    user_rating_counts = [(user_id, len(movies)) for user_id, movies in user_to_movies.items()]
    user_rating_counts.sort(key=lambda x: x[1], reverse=True)
    
    # Get ordered list of users, experts first
    expert_users = [u[0] for u in user_rating_counts]
    
    # Calculate number of expert users to prioritize
    n_expert_users = int(n_users * expert_percentage)
    
    # Initialize selection variables
    selected_users = []
    movie_user_counts = {movie_id: 0 for movie_id in movie_to_users}
    
    print(f"Selecting diverse users (first {n_expert_users} prioritizing experts)...")
    
    # Process users in order (expert first)
    for user_id in expert_users:
        # Stop when we have enough users
        if len(selected_users) >= n_users:
            break
            
        user_movies = user_to_movies[user_id]
        
        # Check if adding this user would exceed max_users_per_movie for any movie
        exceeds_limit = False
        for movie_id in user_movies:
            if movie_user_counts[movie_id] >= max_users_per_movie:
                exceeds_limit = True
                break
                
        if exceeds_limit:
            continue
            
        # Check similarity with existing users
        too_similar = False
        
        # Only check against a sample when the selected set gets large
        users_to_check = selected_users
        if len(selected_users) > 1000:
            users_to_check = np.random.choice(selected_users, 1000, replace=False)
            
        for selected_user in users_to_check:
            selected_user_movies = user_to_movies[selected_user]
            
            # Quick size comparison check
            len_user = len(user_movies)
            len_selected = len(selected_user_movies)
            
            # Skip dissimilar size pairs
            if min(len_user, len_selected) / max(len_user, len_selected) < similarity_threshold / 2:
                continue
                
            # Efficient intersection calculation
            if len_user <= len_selected:
                intersection = sum(1 for m in user_movies if m in selected_user_movies)
            else:
                intersection = sum(1 for m in selected_user_movies if m in user_movies)
                
            # Calculate Jaccard similarity
            union = len_user + len_selected - intersection
            similarity = intersection / union if union > 0 else 0
            
            if similarity > similarity_threshold:
                too_similar = True
                break
                
        if not too_similar:
            # Add user to selected users
            selected_users.append(user_id)
            
            # Update movie user counts
            for movie_id in user_movies:
                movie_user_counts[movie_id] += 1
        
        # Display progress
        if len(selected_users) % 500 == 0:
            print(f"Selected {len(selected_users)}/{n_users} users")
                
    print(f"Selected {len(selected_users)} users")
    return selected_users

""" def select_diverse_users(ratings_df, max_users_per_movie=1000, similarity_threshold=0.3):

    print(f"Selecting diverse users with max {max_users_per_movie} users per movie...")
    
    # Get initial statistics
    total_users = ratings_df['userId'].nunique()
    total_movies = ratings_df['movieId'].nunique()
    total_ratings = len(ratings_df)
    
    print(f"Initial dataset: {total_ratings} ratings from {total_users} users on {total_movies} movies")
    
    # Create a movie-to-users dictionary to track how many users rated each movie
    movie_to_users = {}
    for movie_id, group in ratings_df.groupby('movieId'):
        movie_to_users[movie_id] = set(group['userId'])
    
    # Create a user-to-movies dictionary
    user_to_movies = {}
    for user_id, group in ratings_df.groupby('userId'):
        user_to_movies[user_id] = set(group['movieId'])
    
    # Sort users by the number of ratings (CHANGED: now in descending order)
    user_rating_counts = ratings_df.groupby('userId').size()
    sorted_users = user_rating_counts.sort_values(ascending=False).index.tolist()
    
    # Add some randomness within rating count groups to prevent bias
    # Group users by rating count
    rating_count_groups = {}
    for user_id, count in user_rating_counts.items():
        if count not in rating_count_groups:
            rating_count_groups[count] = []
        rating_count_groups[count].append(user_id)
    
    # Shuffle each group and rebuild the sorted users list
    shuffled_sorted_users = []
    for count in sorted(rating_count_groups.keys(), reverse=True):
        group = rating_count_groups[count]
        np.random.shuffle(group)
        shuffled_sorted_users.extend(group)
    
    selected_users = set()
    movie_user_counts = {movie_id: 0 for movie_id in movie_to_users.keys()}
    
    # Process users in the sorted order (now more ratings first)
    for user_id in shuffled_sorted_users:
        user_movies = user_to_movies[user_id]
        
        # Skip users with too few ratings (optional)
        if len(user_movies) < 0:  # Minimum threshold can be changed
            continue
            
        # Check if this user would exceed the max users for any movie
        exceeds_limit = False
        for movie_id in user_movies:
            if movie_user_counts[movie_id] >= max_users_per_movie:
                exceeds_limit = True
                break
        
        # Check similarity with already selected users
        too_similar = False
        if not exceeds_limit and len(selected_users) > 0:
            # Check similarity with ALL existing users (no sampling)
            for selected_user in selected_users:
                selected_user_movies = user_to_movies[selected_user]
                
                # Only calculate similarity if there's a minimum overlap
                if len(user_movies) > 0 and len(selected_user_movies) > 0:
                    # Calculate Jaccard similarity between users
                    intersection = len(user_movies.intersection(selected_user_movies))
                    union = len(user_movies.union(selected_user_movies))
                    
                    similarity = intersection / union
                    if similarity > similarity_threshold:
                        too_similar = True
                        break
        
        # Add user if they don't exceed limits and aren't too similar
        if not exceeds_limit and not too_similar:
            selected_users.add(user_id)
            
            # Update movie user counts
            for movie_id in user_movies:
                movie_user_counts[movie_id] += 1
    
    # Filter the ratings DataFrame to only include selected users
    filtered_ratings = ratings_df[ratings_df['userId'].isin(selected_users)]
    
    # Calculate and print statistics
    avg_ratings_per_user = filtered_ratings.groupby('userId').size().mean()
    
    print(f"After filtering: {len(filtered_ratings)} ratings from {len(selected_users)} users")
    print(f"Retained {len(filtered_ratings)/total_ratings:.1%} of original ratings")
    print(f"Average ratings per selected user: {avg_ratings_per_user:.1f}")
    
    return filtered_ratings """

In [7]:
def remove_columns(filtered_df):
    if 'timestamp' in filtered_df.columns:
        filtered_df.drop(columns=["timestamp"], inplace=True)
    if 'genres' in filtered_df.columns:
        filtered_df.drop(columns=["genres"], inplace=True)
    if 'title' in filtered_df.columns:
        filtered_df.drop(columns=["title"], inplace=True)
    
    print(f"Final dataset: {len(filtered_df)} ratings across {filtered_df['movieId'].nunique()} movies from {filtered_df['userId'].nunique()} users")

    return filtered_df



In [8]:
ratings = remove_columns(ratings)

Final dataset: 33832162 ratings across 83239 movies from 330975 users


In [9]:

min_avg_rating=0
user_min_percentile=0.20 # these seem to fit the average user best
user_max_percentile=0.97
min_ratings_per_movie=3
max_users_per_movie=200 # 250
user_similarity_threshold=0.15 # 0.15



In [10]:
# Step 3: Filter users by activity level
filtered_df = filter_users_by_activity(
    ratings,
    min_percentile=user_min_percentile,
    max_percentile=user_max_percentile
)

Filtering users with activity between 20th and 97th percentiles...
Before filtering: 330975 users with rating statistics:
count    330975.00000
mean        102.21969
std         232.15453
min           1.00000
25%          15.00000
50%          31.00000
75%          98.00000
max       33332.00000
Name: rating, dtype: float64
After filtering: 256848 users with rating statistics:
count    256848.000000
mean         89.284974
std         106.963896
min          12.000000
25%          20.000000
50%          44.000000
75%         111.000000
max         579.000000
Name: rating, dtype: float64


In [11]:
ratings_df = select_diverse_users(
    filtered_df, 
    max_users_per_movie=max_users_per_movie,
    similarity_threshold=user_similarity_threshold,
)

Selecting diverse users with max 200 users per movie...
Initial dataset: 22932667 ratings from 256848 users on 50710 movies
After filtering: 411738 ratings from 8896 users
Retained 1.8% of original ratings
Average ratings per selected user: 46.3


In [12]:
ratings_df["userId"].nunique(),ratings_df["movieId"].nunique()

(8896, 28277)

In [13]:
filtered_df = remove_columns(ratings_df)


Final dataset: 411738 ratings across 28277 movies from 8896 users


In [14]:
filtered_df["userId"].nunique(),filtered_df["movieId"].nunique()

(8896, 28277)

In [15]:
# Step 2: Filter by average rating
filtered_df = filter_by_average_rating(
    movies, 
    ratings_df, 
    min_rating=min_avg_rating
)

Filtering movies with average rating >= 0...


Kept 28277 movies with average rating >= 0


In [16]:
filtered_df["userId"].nunique(),filtered_df["movieId"].nunique()

(8896, 28277)

In [17]:
filtered_df = filter_movies_by_popularity(
    filtered_df,
    min_ratings=min_ratings_per_movie
)

Filtering movies with at least 3 ratings...
Before filtering: 28277 movies
Movies with <3 ratings: 14709
After filtering: 13568 movies kept


In [18]:
filtered_df["userId"].nunique(),filtered_df["movieId"].nunique()

(8896, 13568)

In [19]:
def scale_ratings(matrix):
    """
    Z-score normalization with clipping to avoid extreme values.
    Accounts for user rating tendencies while preserving relative importance.
    """
    data = matrix.copy().values
    rows = data.shape[0]
    
    # Calculate global mean (we'll only use this for documentation in comments)
    global_mean = np.mean(data[data > 0])
    
    for i in range(rows):
        # Find rated movies for this user
        nonzero_mask = data[i, :] > 0
        rated_indices = np.where(nonzero_mask)[0]
        
        # Get user's ratings
        user_ratings = data[i, rated_indices]
        
        if len(user_ratings) > 1:
            # Calculate user's own mean
            user_mean = np.mean(user_ratings)
            
            # Mean center relative to user's own mean
            centered_ratings = user_ratings - user_mean
            
            # Apply standardization if there's variance
            user_std = np.std(user_ratings)
            if user_std > 0:
                # Z-score normalize
                normalized_ratings = centered_ratings / user_std
                # Clip to avoid extreme values (-2.5 to 2.5 is ~99% of normal distribution)
                normalized_ratings = np.clip(normalized_ratings, -2.5, 2.5)
                # Scale to [0,1] range
                scaled_ratings = (normalized_ratings + 2.5) / 5.0
            else:
                # If all ratings are the same, set to neutral 0.5
                scaled_ratings = np.ones_like(user_ratings) * 0.5
                
            # Update the matrix with scaled ratings
            data[i, rated_indices] = scaled_ratings
    
    # Convert back to DataFrame
    scaled_matrix = pd.DataFrame(data, index=matrix.index, columns=matrix.columns)
    return scaled_matrix

In [20]:
def prepare_user_ratings(ratings_file_path, user_movie_matrix_columns):
    """
    Prepares and scales a single user's ratings using the same Z-score
    normalization approach as the training data.
    """
    user_data = pd.read_csv(ratings_file_path)
    user_data = user_data.dropna(subset=['movieId'])
    
    # Create user-movie ratings matrix with zero values
    user_ratings = pd.DataFrame(0.0, index=[0], columns=user_movie_matrix_columns, dtype=np.float64)
    
    # Fill with original ratings
    for _, row in user_data.iterrows():
        movie_id = row['movieId']
        if movie_id in user_ratings.columns:
            user_ratings.loc[0, movie_id] = float(row['Rating'])
    
    # Get indices of rated movies
    nonzero_mask = user_ratings.values > 0
    rated_indices = np.where(nonzero_mask[0])[0]
    
    # Get this user's ratings
    user_rating_values = user_ratings.iloc[0, rated_indices].values
    
    if len(user_rating_values) > 1:
        # Calculate user's mean and std
        user_mean = np.mean(user_rating_values)
        user_std = np.std(user_rating_values)
        
        # Apply Z-score normalization if there's variance
        if user_std > 0:
            # Center around mean and normalize by standard deviation
            normalized_ratings = (user_rating_values - user_mean) / user_std
            # Clip to avoid extreme values
            normalized_ratings = np.clip(normalized_ratings, -2.5, 2.5)
            # Scale to [0,1] range
            scaled_ratings = (normalized_ratings + 2.5) / 5.0
        else:
            # If all ratings are the same, set to neutral 0.5
            scaled_ratings = np.ones_like(user_rating_values) * 0.5
        
        # Update the user ratings with scaled values
        user_ratings.iloc[0, rated_indices] = scaled_ratings
    
    return user_data, user_ratings

In [21]:
def fit_model(
    user_movie_matrix,
    n_components=50,
    max_iter=200,
    init='random',
    solver='cd',
    tol=0.0001,

    sample_size=1.0
):

    print(f"\n--- Fitting model with {sample_size*100:.0f}% of users ---")
    
    # Sample users 
    if sample_size < 1.0:
        n_users = int(user_movie_matrix.shape[0] * sample_size)
        sampled_users = np.random.choice(user_movie_matrix.index, size=n_users, replace=False)
        training_matrix = user_movie_matrix.loc[sampled_users, :]
    else:
        training_matrix = user_movie_matrix
    
    print(f"Training matrix shape: {training_matrix.shape}")
    
    # Train the model
    start_time = time.time()
    nmf = NMF(
        n_components=n_components,
        max_iter=max_iter,
        verbose=0,
        init="nndsvd",
        solver=solver,
        tol=tol,
    )
    
    user_factors = nmf.fit_transform(training_matrix)
    item_factors = nmf.components_
    training_time = time.time() - start_time
    
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Iterations completed: {nmf.n_iter_}")
    print(f"Final error: {nmf.reconstruction_err_}")
    
    return nmf, training_matrix, training_time



In [22]:
def get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings,
    my_data,
    n_recommendations=20,
    movie_id_to_title_map=None
):
    # Transform personal ratings into factor space
    my_user_factors = nmf_model.transform(my_ratings)
    
    # Generate predictions
    predicted_ratings = np.dot(my_user_factors, nmf_model.components_)
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=training_matrix.columns)
    
    # Get movies already rated
    rated_movies = set(int(movie_id) for movie_id in my_data['movieId'].values)
    print(f"You've rated {len(rated_movies)} movies")
    
    # Filter for unrated movies
    unrated_movies = [m for m in training_matrix.columns if int(m) not in rated_movies]
    print(f"Found {len(unrated_movies)} movies you haven't rated")
    
    # Get recommendations
    recommendations = []
    for movie_id in unrated_movies:
        pred_rating = predicted_ratings_df.loc[0, movie_id]
        recommendations.append((movie_id, pred_rating))
    
    recommendations.sort(key=lambda x: x[1], reverse=True)
    
    top_recommendations = recommendations[:n_recommendations]
    
    if movie_id_to_title_map:
        print("\nTop Recommendations:")
        for movie_id, predicted_rating in top_recommendations:
            # Convert from 0-1 scale back to 1-5 scale
            original_scale_rating = predicted_rating * 4 + 1 # this didnt work that well tbh
            
            movie_title = movie_id_to_title_map.get(movie_id, f"Unknown Movie (ID: {movie_id})")
            print(f"Movie: {movie_title}, Similarity Rating: {predicted_rating:.2f}/1.00")
    
    return top_recommendations 

In [23]:
def evaluate_nmf_predictions(nmf_model, training_matrix, test_data):


    # Create a wide-format user vector (like ludde_ratings) filled with zeros
    user_vector = np.zeros(training_matrix.shape[1])
    
    # Map movie IDs to their indices in the model
    movie_indices = {}
    if hasattr(training_matrix, 'columns'):
        # If training_matrix is a DataFrame
        for i, col in enumerate(training_matrix.columns):
            movie_indices[col] = i
    else:
        # Assume movie IDs are consecutive integers starting from 1 or 0
        offset = 1 if training_matrix.shape[1] == 19915 else 0
        for i in range(training_matrix.shape[1]):
            movie_indices[i + offset] = i
    
    # Collect actual ratings and movie positions for comparison
    actual_ratings = []
    movie_positions = []
    
    # Fill the user vector with ratings from test_data
    for _, row in test_data.iterrows():
        movie_id = row['movieId']
        rating = row['Rating']
        
        if movie_id in movie_indices:
            position = movie_indices[movie_id]
            user_vector[position] = rating
            actual_ratings.append(rating)
            movie_positions.append(position)
    
    # Transform the user vector to get user factors
    user_factors = nmf_model.transform([user_vector])
    
    # Generate predictions for all movies
    predictions_all = user_factors @ nmf_model.components_
    
    # Extract predictions for only the movies we rated in test_data
    predicted_ratings = [predictions_all[0, pos] for pos in movie_positions]
    
    # Calculate metrics
    if len(actual_ratings) > 0:
        rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
        mae = mean_absolute_error(actual_ratings, predicted_ratings)
        me = np.mean(np.array(predicted_ratings) - np.array(actual_ratings))
        corr = np.corrcoef(actual_ratings, predicted_ratings)[0, 1] if len(actual_ratings) > 1 else np.nan
    else:
        rmse, mae, me, corr = np.nan, np.nan, np.nan, np.nan
    
    return {
        'rmse': rmse,
        'mae': mae,
        'mean_error': me,
        'correlation': corr,
        'num_predictions': len(actual_ratings)
    }

In [24]:
user_movie_matrix = filtered_df.pivot(index='userId', columns='movieId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)
user_movie_matrix_scaled = scale_ratings(user_movie_matrix)


In [None]:
nmf_model, training_matrix, _ = fit_model(
    user_movie_matrix_scaled,
    n_components=200,
    sample_size=1,
    max_iter=800,
)




--- Fitting model with 100% of users ---
Training matrix shape: (8896, 13568)


In [None]:
""" # Save the model
dump(nmf_model, 'nmf_model_100comp.joblib')

# Save the training matrix
dump(training_matrix, 'training_matrix_100comp.joblib')

# Alternatively, save everything in one file
model_data = {
    'model': nmf_model,
    'training_matrix': training_matrix,
    'n_components': 100,
    'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')
}
dump(model_data, 'recommender_data_100comp.joblib')

print("Model and training matrix saved successfully!") """

' # Save the model\ndump(nmf_model, \'nmf_model_100comp.joblib\')\n\n# Save the training matrix\ndump(training_matrix, \'training_matrix_100comp.joblib\')\n\n# Alternatively, save everything in one file\nmodel_data = {\n    \'model\': nmf_model,\n    \'training_matrix\': training_matrix,\n    \'n_components\': 100,\n    \'timestamp\': pd.Timestamp.now().strftime(\'%Y-%m-%d %H:%M\')\n}\ndump(model_data, \'recommender_data_100comp.joblib\')\n\nprint("Model and training matrix saved successfully!") '

In [None]:
elliott_data, elliott_ratings = prepare_user_ratings(
    "data/loelliot_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
ludde_data, ludde_ratings = prepare_user_ratings(
    "data/ludde_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
charlie_data, charlie_ratings = prepare_user_ratings(
    "data/chaarll_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
tilda_data, tilda_ratings = prepare_user_ratings(
    "data/tilda_h_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)
zorrodor_data, zorrodor_ratings = prepare_user_ratings(
    "data/zorrodor_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)
lukas_data, lukas_ratings = prepare_user_ratings(
    "data/lukas_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)
clara_data, clara_ratings = prepare_user_ratings(
    "data/clar_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)
voided_data, voided_ratings = prepare_user_ratings(
    "data/voided_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)
skellic_data, skellic_ratings = prepare_user_ratings(
    "data/skellic_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)
liv_data, liv_ratings = prepare_user_ratings(
    "data/liv_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)

In [None]:
training_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5679 entries, 59 to 330943
Columns: 11659 entries, 1 to 286897
dtypes: float64(11659)
memory usage: 505.2 MB


In [None]:
ludde_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rating   426 non-null    float64
 1   movieId  426 non-null    int64  
dtypes: float64(1), int64(1)
memory usage: 6.8 KB


In [None]:
ludde_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 0 to 0
Columns: 11659 entries, 1 to 286897
dtypes: float64(11659)
memory usage: 91.2 KB


In [None]:
# recommendations = get_recommendations(
#     nmf_model=nmf_model,
#     training_matrix=training_matrix,
#     my_ratings=ludde_ratings,
#     my_data=ludde_data,
#     n_recommendations=200,
#     movie_id_to_title_map=movie_id_to_title,
# )
# Evaluate the model with your test data
metrics = evaluate_nmf_predictions(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    test_data=ludde_data  # Your test data with  movie_id, rating columns
)

# Display metrics
print(f"RMSE: {metrics['rmse']:.4f}")
print(f"MAE: {metrics['mae']:.4f}")
print(f"Mean Error: {metrics['mean_error']:.4f}")
print(f"Correlation: {metrics['correlation']:.4f}")
print(f"Number of predictions: {metrics['num_predictions']}")

RMSE: 2.2984
MAE: 2.1159
Mean Error: -2.1153
Correlation: 0.3247
Number of predictions: 410


In [None]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=ludde_ratings,
    my_data=ludde_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

You've rated 426 movies
Found 11249 movies you haven't rated

Top Recommendations:
Movie: American Beauty (1999), Similarity Rating: 0.41/1.00
Movie: Spirited Away (Sen to Chihiro no kamikakushi) (2001), Similarity Rating: 0.38/1.00
Movie: Donnie Darko (2001), Similarity Rating: 0.36/1.00
Movie: Old Boy (2003), Similarity Rating: 0.36/1.00
Movie: Wonder Woman (2017), Similarity Rating: 0.35/1.00
Movie: Godfather, The (1972), Similarity Rating: 0.34/1.00
Movie: Nightcrawler (2014), Similarity Rating: 0.34/1.00
Movie: Coco (2017), Similarity Rating: 0.34/1.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Similarity Rating: 0.33/1.00
Movie: Ratatouille (2007), Similarity Rating: 0.32/1.00
Movie: Sixth Sense, The (1999), Similarity Rating: 0.32/1.00
Movie: Beautiful Mind, A (2001), Similarity Rating: 0.32/1.00
Movie: Gran Torino (2008), Similarity Rating: 0.31/1.00
Movie: The Butterfly Effect (2004), Similarity Rating: 0.30/1.00
Movie: One Flew Over the Cuckoo's Nest (1975), Sim

In [None]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=tilda_ratings,
    my_data=tilda_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

You've rated 285 movies
Found 11387 movies you haven't rated

Top Recommendations:
Movie: Inside Out (2015), Similarity Rating: 0.32/1.00
Movie: Ex Machina (2015), Similarity Rating: 0.30/1.00
Movie: Zootopia (2016), Similarity Rating: 0.27/1.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Similarity Rating: 0.27/1.00
Movie: The Imitation Game (2014), Similarity Rating: 0.27/1.00
Movie: Seven (a.k.a. Se7en) (1995), Similarity Rating: 0.27/1.00
Movie: The Martian (2015), Similarity Rating: 0.26/1.00
Movie: Pulp Fiction (1994), Similarity Rating: 0.25/1.00
Movie: Room (2015), Similarity Rating: 0.25/1.00
Movie: Incredibles, The (2004), Similarity Rating: 0.25/1.00
Movie: Finding Nemo (2003), Similarity Rating: 0.24/1.00
Movie: Inglourious Basterds (2009), Similarity Rating: 0.24/1.00
Movie: Ratatouille (2007), Similarity Rating: 0.24/1.00
Movie: Matrix, The (1999), Similarity Rating: 0.24/1.00
Movie: Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001), Similarity Rating: 0.2

In [None]:

recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=elliott_ratings,
    my_data=elliott_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)
# params:    
"""
min_avg_rating=2.5
user_min_percentile=0.25
user_max_percentile=0.999
min_ratings_per_movie=3
max_users_per_movie=250
user_similarity_threshold=0.15
fit: 
n_components=200,
sample_size=1,
max_iter=800
"""
# response "Jag har sett typ en tredjedel av dessa filmer kanske, men inte sen jag började med letterboxd"

You've rated 177 movies
Found 11512 movies you haven't rated

Top Recommendations:
Movie: Arrival (2016), Similarity Rating: 0.21/1.00
Movie: Interstellar (2014), Similarity Rating: 0.20/1.00
Movie: The Favourite, Similarity Rating: 0.20/1.00
Movie: Inside Out (2015), Similarity Rating: 0.16/1.00
Movie: Coco (2017), Similarity Rating: 0.16/1.00
Movie: Gone Girl (2014), Similarity Rating: 0.16/1.00
Movie: Zootopia (2016), Similarity Rating: 0.16/1.00
Movie: Up (2009), Similarity Rating: 0.15/1.00
Movie: Inception (2010), Similarity Rating: 0.15/1.00
Movie: Spider-Man: Into the Spider-Verse (2018), Similarity Rating: 0.15/1.00
Movie: Black Panther (2017), Similarity Rating: 0.15/1.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Similarity Rating: 0.14/1.00
Movie: The Martian (2015), Similarity Rating: 0.14/1.00
Movie: Deadpool (2016), Similarity Rating: 0.14/1.00
Movie: Upgrade (2018), Similarity Rating: 0.14/1.00
Movie: Black Swan (2010), Similarity Rating: 0.14/1.00
Movie: 

'\nmin_avg_rating=2.5\nuser_min_percentile=0.25\nuser_max_percentile=0.999\nmin_ratings_per_movie=3\nmax_users_per_movie=250\nuser_similarity_threshold=0.15\nfit: \nn_components=200,\nsample_size=1,\nmax_iter=800\n'

In [None]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=voided_ratings,
    my_data=voided_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)
# params:    
"""
min_avg_rating=2.5
user_min_percentile=0.25
user_max_percentile=0.999
min_ratings_per_movie=3
max_users_per_movie=250
user_similarity_threshold=0.15
fit: 
n_components=200,
sample_size=1,
max_iter=800
"""
"""jo men det gör dom, har typ alla förrutom guardians of the galaxy på min watchlist haha
och det stämmer att de jag redan hade sett som totoro hade jag inte rateat"""

You've rated 271 movies
Found 11404 movies you haven't rated

Top Recommendations:
Movie: Reservoir Dogs (1992), Similarity Rating: 0.33/1.00
Movie: Godfather, The (1972), Similarity Rating: 0.33/1.00
Movie: Matrix, The (1999), Similarity Rating: 0.33/1.00
Movie: One Flew Over the Cuckoo's Nest (1975), Similarity Rating: 0.32/1.00
Movie: WALL·E (2008), Similarity Rating: 0.32/1.00
Movie: Star Wars: Episode IV - A New Hope (1977), Similarity Rating: 0.31/1.00
Movie: Dark Knight, The (2008), Similarity Rating: 0.30/1.00
Movie: American Beauty (1999), Similarity Rating: 0.30/1.00
Movie: Usual Suspects, The (1995), Similarity Rating: 0.29/1.00
Movie: 2001: A Space Odyssey (1968), Similarity Rating: 0.29/1.00
Movie: Shawshank Redemption, The (1994), Similarity Rating: 0.28/1.00
Movie: Guardians of the Galaxy (2014), Similarity Rating: 0.28/1.00
Movie: Coco (2017), Similarity Rating: 0.28/1.00
Movie: Gone Girl (2014), Similarity Rating: 0.27/1.00
Movie: Amelie (Fabuleux destin d'Amélie Poula

'jo men det gör dom, har typ alla förrutom guardians of the galaxy på min watchlist haha\noch det stämmer att de jag redan hade sett som totoro hade jag inte rateat'

In [None]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=zorrodor_ratings,
    my_data=zorrodor_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

# params:    
"""
min_avg_rating=2.5
user_min_percentile=0.25
user_max_percentile=0.999
min_ratings_per_movie=3
max_users_per_movie=250
user_similarity_threshold=0.15
fit: 
n_components=200,
sample_size=1,
max_iter=800
"""
# zorro:
# "Dom första 20 är alla bra filmer"
# "Eller de e nästan bara grejer jag gillar"
# "Några luriga dock men de mesta e gött"
# "Jag har rateat ganska lite också"
# "Så den har inte sååå mkt att gå på"

You've rated 45 movies
Found 11624 movies you haven't rated

Top Recommendations:
Movie: In the Mood For Love (Fa yeung nin wa) (2000), Similarity Rating: 0.06/1.00
Movie: Whiplash (2014), Similarity Rating: 0.06/1.00
Movie: Before Sunrise (1995), Similarity Rating: 0.06/1.00
Movie: The Handmaiden (2016), Similarity Rating: 0.06/1.00
Movie: Boyhood (2014), Similarity Rating: 0.05/1.00
Movie: Three Colors: Blue (Trois couleurs: Bleu) (1993), Similarity Rating: 0.05/1.00
Movie: Her (2013), Similarity Rating: 0.05/1.00
Movie: Old Boy (2003), Similarity Rating: 0.05/1.00
Movie: Mad Max: Fury Road (2015), Similarity Rating: 0.05/1.00
Movie: 2001: A Space Odyssey (1968), Similarity Rating: 0.05/1.00
Movie: Parasite (2019), Similarity Rating: 0.05/1.00
Movie: Hunt, The (Jagten) (2012), Similarity Rating: 0.05/1.00
Movie: Clockwork Orange, A (1971), Similarity Rating: 0.05/1.00
Movie: Blade Runner (1982), Similarity Rating: 0.05/1.00
Movie: 8 1/2 (8½) (1963), Similarity Rating: 0.05/1.00
Movie

'\nmin_avg_rating=2.5\nuser_min_percentile=0.25\nuser_max_percentile=0.999\nmin_ratings_per_movie=3\nmax_users_per_movie=250\nuser_similarity_threshold=0.15\nfit: \nn_components=200,\nsample_size=1,\nmax_iter=800\n'

In [None]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=liv_ratings,
    my_data=liv_data,
    n_recommendations=100,
    movie_id_to_title_map=movie_id_to_title,
)

You've rated 106 movies
Found 11568 movies you haven't rated

Top Recommendations:
Movie: Spirited Away (Sen to Chihiro no kamikakushi) (2001), Similarity Rating: 0.13/1.00
Movie: Fight Club (1999), Similarity Rating: 0.12/1.00
Movie: Up (2009), Similarity Rating: 0.11/1.00
Movie: Parasite (2019), Similarity Rating: 0.11/1.00
Movie: Finding Nemo (2003), Similarity Rating: 0.11/1.00
Movie: Donnie Darko (2001), Similarity Rating: 0.10/1.00
Movie: Inception (2010), Similarity Rating: 0.10/1.00
Movie: We Need to Talk About Kevin (2011), Similarity Rating: 0.10/1.00
Movie: Mean Girls (2004), Similarity Rating: 0.10/1.00
Movie: Zootopia (2016), Similarity Rating: 0.10/1.00
Movie: Inside Out (2015), Similarity Rating: 0.10/1.00
Movie: Eternal Sunshine of the Spotless Mind (2004), Similarity Rating: 0.10/1.00
Movie: Grand Budapest Hotel, The (2014), Similarity Rating: 0.10/1.00
Movie: Moonrise Kingdom (2012), Similarity Rating: 0.10/1.00
Movie: Knives Out (2019), Similarity Rating: 0.10/1.00
M

In [None]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=skellic_ratings,
    my_data=skellic_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

You've rated 338 movies
Found 11342 movies you haven't rated

Top Recommendations:
Movie: Shining, The (1980), Similarity Rating: 0.39/1.00
Movie: Coco (2017), Similarity Rating: 0.38/1.00
Movie: Trainspotting (1996), Similarity Rating: 0.36/1.00
Movie: Godfather, The (1972), Similarity Rating: 0.35/1.00
Movie: Blade Runner (1982), Similarity Rating: 0.35/1.00
Movie: Fight Club (1999), Similarity Rating: 0.34/1.00
Movie: Inception (2010), Similarity Rating: 0.34/1.00
Movie: Memento (2000), Similarity Rating: 0.33/1.00
Movie: Clockwork Orange, A (1971), Similarity Rating: 0.33/1.00
Movie: Reservoir Dogs (1992), Similarity Rating: 0.32/1.00
Movie: Star Wars: Episode IV - A New Hope (1977), Similarity Rating: 0.31/1.00
Movie: Guardians of the Galaxy (2014), Similarity Rating: 0.31/1.00
Movie: Finding Nemo (2003), Similarity Rating: 0.31/1.00
Movie: Shawshank Redemption, The (1994), Similarity Rating: 0.30/1.00
Movie: Interstellar (2014), Similarity Rating: 0.30/1.00
Movie: Moana (2016), S