In [111]:
import pandas as pd
from sklearn.decomposition import NMF
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
ratings= pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')


In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [5]:
movies_with_ratings = pd.merge(movies, ratings, on='movieId', how='inner')

In [7]:

average_ratings = movies_with_ratings.groupby('movieId')['rating'].mean()

popular_movies = average_ratings[average_ratings >= 3.0].index

filtered_movies_with_ratings = movies_with_ratings[movies_with_ratings['movieId'].isin(popular_movies)]

# remove non expert users, ie, users with less than 75% of the quantile of ratings, keeping users with more than 89 ratings
user_rating_counts = filtered_movies_with_ratings.groupby('userId')['rating'].count()
print(user_rating_counts.describe())
filtered_users = user_rating_counts[(user_rating_counts >= user_rating_counts.quantile(0.75)) & 
                                    (user_rating_counts <= user_rating_counts.quantile(0.99))].index

filtered_movies_with_ratings = filtered_movies_with_ratings[filtered_movies_with_ratings['userId'].isin(filtered_users)]

user_rating_counts_after_filter = filtered_movies_with_ratings.groupby('userId')['rating'].count()

user_rating_counts_after_filter.describe()


count    330143.000000
mean         89.922140
std         190.271904
min           1.000000
25%          12.000000
50%          28.000000
75%          89.000000
max       23659.000000
Name: rating, dtype: float64


count    79691.000000
mean       237.325557
std        159.363827
min         89.000000
25%        122.000000
50%        180.000000
75%        296.000000
max        869.000000
Name: rating, dtype: float64

In [8]:

filtered_movies_with_ratings.drop(columns=["timestamp","genres","title"], inplace=True)
filtered_movies_with_ratings





Unnamed: 0,movieId,userId,rating
2,1,7,4.0
3,1,10,3.0
5,1,14,2.5
6,1,21,3.0
7,1,24,4.5
...,...,...,...
33832143,288939,16020,3.5
33832144,288941,7644,5.0
33832145,288941,16020,4.5
33832157,288967,47791,3.5


In [33]:
# filter out movies with less than x amount of ratings to clean up further
# after looking up some of the movies around the 70-75th quantile range, these movies are super niche and not very popular, so I will filter out movies with less than 30 ish ratings
movie_rating_counts = filtered_movies_with_ratings.groupby('movieId').size()
movie_rating_counts.quantile(0.70)




np.float64(27.0)

In [None]:

min_ratings_threshold = 30
popular_movies = movie_rating_counts[movie_rating_counts >= min_ratings_threshold].index


filtered_movies_with_ratings = filtered_movies_with_ratings[
    filtered_movies_with_ratings['movieId'].isin(popular_movies)
]

In [35]:


user_movie_matrix = filtered_movies_with_ratings.pivot(index='userId', columns='movieId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)


In [74]:
#drop movies that are missing values in movieId
my_data = pd.read_csv("data/lddec_ratings_with_ids.csv")
my_data = my_data.dropna(subset=['movieId'])

# drop movies i have rated under 3
# my_data = my_data[my_data['rating_num'] >= 3.0]

# testing another approach, as having movies i have rated lower than 3 might be useful for the model to learn what i dont like
my_data['scaled_rating'] = (my_data['Rating'] - 1) / 4.0


my_data

Unnamed: 0,Rating,movieId,scaled_rating
0,5.0,109487,1.000
1,4.0,205156,0.750
2,5.0,202439,1.000
3,5.0,58559,1.000
4,3.5,171763,0.625
...,...,...,...
420,3.5,35836,0.625
421,3.5,1172,0.625
422,4.0,117176,0.750
423,4.5,106920,0.875


In [None]:
user_movie_matrix_filled = user_movie_matrix.fillna(0)


def scale_sparse_ratings(matrix):

    data = matrix.values.copy()
    rows, cols = data.shape
    
    nonzero_mask = data > 0
    
    # Scale each user's ratings to be between 0 and 1,
    scaler = MinMaxScaler()
    for i in range(rows):
        # Get indices of rated movies for this user
        rated_indices = np.where(nonzero_mask[i])[0]
        if len(rated_indices) > 1:  # Only scale if user has rated multiple movies
            # Extract, reshape for scaler, scale, and put back
            user_ratings = data[i, rated_indices].reshape(-1, 1)
            scaled_ratings = scaler.fit_transform(user_ratings).flatten()
            data[i, rated_indices] = scaled_ratings
    
    # Convert back to DataFrame
    return pd.DataFrame(data, index=matrix.index, columns=matrix.columns)

# Scale the full user-movie matrix once
user_movie_matrix_scaled = scale_sparse_ratings(user_movie_matrix_filled)

# Prepare and scale your personal ratings once
my_ratings = pd.DataFrame(0, index=[0], columns=user_movie_matrix_scaled.columns)
for _, row in my_data.iterrows():
    movie_id = row['movieId']
    if movie_id in my_ratings.columns:
        # Use the original rating (1-5 scale)
        my_ratings.loc[0, movie_id] = float(row['Rating'])

# Instead of using MinMaxScaler which adapts to your min/max ratings,
# use a fixed scale to preserve the meaning of your ratings:
if my_ratings.iloc[0].any():  # If there are any non-zero ratings
    nonzero_mask = my_ratings.iloc[0] > 0
    if nonzero_mask.sum() > 1:
        rated_indices = np.where(nonzero_mask)[0]
        # Apply fixed scaling: (rating - 1) / 4 maps 1→0, 3→0.5, 5→1
        my_ratings.iloc[0, rated_indices] = (my_ratings.iloc[0, rated_indices] - 1) / 4.0


In [None]:
user_movie_matrix_scaled

In [99]:

movie_id_to_title = dict(zip(movies['movieId'], movies['title']))
movies_with_genres = movies.copy()

In [104]:
def prepare_user_ratings(ratings_file_path, user_movie_matrix_columns):
    """
    Load and prepare user ratings for recommendation
    
    Parameters:
    -----------
    ratings_file_path : str
        Path to CSV file with user ratings
    user_movie_matrix_columns : pandas.Index
        Column names from the user-movie matrix, used to create compatible ratings
        
    Returns:
    --------
    tuple
        (user_data_df, ratings_matrix_df)
    """
    # Load user data
    user_data = pd.read_csv(ratings_file_path)
    user_data = user_data.dropna(subset=['movieId'])
    
    # Add scaled ratings column
    user_data['scaled_rating'] = (user_data['Rating'] - 1) / 4.0
    
    # Create user-movie ratings matrix
    user_ratings = pd.DataFrame(0, index=[0], columns=user_movie_matrix_columns)
    
    # Fill with scaled ratings
    for _, row in user_data.iterrows():
        movie_id = row['movieId']
        if movie_id in user_ratings.columns:
            user_ratings.loc[0, movie_id] = row['scaled_rating']
    
    return user_data, user_ratings





  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_rat

In [None]:
def split_data(user_movie_matrix, test_size=0.2, random_state=42):
    """
    Split the user-movie matrix into training and test sets in a memory-efficient way.
    
    Parameters:
    -----------
    user_movie_matrix : pandas.DataFrame
        Matrix of user ratings for movies
    test_size : float, default=0.2
        Proportion of ratings to use for testing
    random_state : int, default=42
        Random seed for reproducibility
        
    Returns:
    --------
    tuple
        (train_matrix, test_matrix, train_mask, test_mask)
    """
    # Create copies of the matrix for train and test
    train_matrix = user_movie_matrix.copy()
    test_matrix = user_movie_matrix.copy()
    
    # Create masks of the same shape as user_movie_matrix
    train_mask = np.zeros(user_movie_matrix.shape, dtype=bool)
    test_mask = np.zeros(user_movie_matrix.shape, dtype=bool)
    
    # Set random seed
    np.random.seed(random_state)
    
    # Process each user (row) separately to avoid memory issues
    total_ratings = 0
    train_count = 0
    test_count = 0
    
    for i, user_id in enumerate(user_movie_matrix.index):
        # Get indices of non-NaN values for this user
        user_ratings = user_movie_matrix.loc[user_id]
        rated_items = user_ratings.dropna().index
        
        if len(rated_items) == 0:
            continue
            
        # Randomly select items for test set
        n_test = max(1, int(len(rated_items) * test_size))
        test_items = np.random.choice(rated_items, size=n_test, replace=False)
        
        # Update matrices and masks
        for item in rated_items:
            if item in test_items:
                # Add to test set
                train_matrix.loc[user_id, item] = np.nan
                test_mask[i, user_movie_matrix.columns.get_loc(item)] = True
                test_count += 1
            else:
                # Add to train set
                test_matrix.loc[user_id, item] = np.nan
                train_mask[i, user_movie_matrix.columns.get_loc(item)] = True
                train_count += 1
                
        total_ratings += len(rated_items)
        
        # Print progress occasionally
        if (i+1) % 1000 == 0 or i+1 == len(user_movie_matrix):
            print(f"Processed {i+1}/{len(user_movie_matrix)} users")
    
    print(f"Total ratings: {total_ratings}")
    print(f"Training set: {train_count} ratings ({train_count/total_ratings:.1%})")
    print(f"Test set: {test_count} ratings ({test_count/total_ratings:.1%})")
    
    return train_matrix, test_matrix, train_mask, test_mask

def train_model(
    train_matrix,
    n_components=50,
    max_iter=200,
    init='nndsvd',
    solver='cd',
    tol=0.0001,
    l1_ratio=0.5,
    sample_size=0.1
):
    """
    Train an NMF model on the training matrix.
    
    Parameters:
    -----------
    train_matrix : pandas.DataFrame
        Matrix of user ratings for movies (training set)
    n_components : int, default=50
        Number of latent factors
    max_iter : int, default=200
        Maximum number of iterations
    init : str, default='nndsvd'
        Initialization method
    solver : str, default='cd'
        Solver to use
    tol : float, default=0.0001
        Tolerance for stopping criterion
    l1_ratio : float, default=0.5
        L1 ratio for regularization
    sample_size : float, default=1.0
        Proportion of users to sample for training
        
    Returns:
    --------
    tuple
        (nmf_model, training_matrix, training_time)
    """
    print(f"\n--- Training model with {sample_size*100:.0f}% of users ---")
    
    # Sample users 
    if sample_size < 1.0:
        n_users = int(train_matrix.shape[0] * sample_size)
        sampled_users = np.random.choice(train_matrix.index, size=n_users, replace=False)
        training_matrix = train_matrix.loc[sampled_users, :]
    else:
        training_matrix = train_matrix
    
    print(f"Training matrix shape: {training_matrix.shape}")
    
    # Train the model
    start_time = time.time()
    nmf = NMF(
        n_components=n_components,
        max_iter=max_iter,
        init=init,
        solver=solver,
        tol=tol,
        l1_ratio=l1_ratio
    )
    
    # Fill NaN values with zeros for training
    filled_matrix = training_matrix.fillna(0)
    
    user_factors = nmf.fit_transform(filled_matrix)
    item_factors = nmf.components_
    training_time = time.time() - start_time
    
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Iterations completed: {nmf.n_iter_}")
    print(f"Final error: {nmf.reconstruction_err_}")
    
    return nmf, training_matrix, training_time

def evaluate_model(nmf_model, test_matrix, test_mask):

    print("\n--- Evaluating model on test set ---")
    
    # Get original values from test_matrix
    original_values = []
    predicted_values = []
    
    # Fill NaN values with zeros for prediction
    filled_test_matrix = test_matrix.fillna(0)
    user_factors = nmf_model.transform(filled_test_matrix)
    predicted_ratings = np.dot(user_factors, nmf_model.components_)
    
    # Collect original and predicted values where test_mask is True
    for i in range(test_matrix.shape[0]):
        for j in range(test_matrix.shape[1]):
            if test_mask[i, j]:
                original = test_matrix.iloc[i, j]
                if not np.isnan(original):
                    predicted = predicted_ratings[i, j]
                    original_values.append(original)
                    predicted_values.append(predicted)
    
    # Calculate metrics
    mae = mean_absolute_error(original_values, predicted_values)
    rmse = np.sqrt(mean_squared_error(original_values, predicted_values))
    
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    
    return {
        'mae': mae,
        'rmse': rmse,
        'original_values': original_values,
        'predicted_values': predicted_values
    }

def get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings,
    my_data,
    n_recommendations=20,
    movie_id_to_title_map=None
):
    """
    Get simple movie recommendations based on NMF model
    
    Parameters:
    -----------
    nmf_model : sklearn.decomposition.NMF
        Trained NMF model
    training_matrix : pandas.DataFrame
        Matrix used to train the model
    my_ratings : pandas.DataFrame
        User ratings in matrix format
    my_data : pandas.DataFrame
        User ratings data with movieId column
    n_recommendations : int, default=20
        Number of recommendations to return
    movie_id_to_title_map : dict, optional
        Dictionary mapping movie IDs to titles for display
        
    Returns:
    --------
    list
        List of (movie_id, predicted_rating) tuples
    """
    # Fill NaN values with zeros for transformation
    my_ratings_filled = my_ratings.fillna(0)
    
    # Transform personal ratings into factor space
    my_user_factors = nmf_model.transform(my_ratings_filled)
    
    # Generate predictions
    predicted_ratings = np.dot(my_user_factors, nmf_model.components_)
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=training_matrix.columns)
    
    # Get movies already rated
    rated_movies = set(int(movie_id) for movie_id in my_data['movieId'].values)
    print(f"You've rated {len(rated_movies)} movies")
    
    # Filter for unrated movies
    unrated_movies = [m for m in training_matrix.columns if int(m) not in rated_movies]
    print(f"Found {len(unrated_movies)} movies you haven't rated")
    
    # Get recommendations
    recommendations = []
    for movie_id in unrated_movies:
        pred_rating = predicted_ratings_df.loc[0, movie_id]
        recommendations.append((movie_id, pred_rating))
    
    # Sort by predicted rating
    recommendations.sort(key=lambda x: x[1], reverse=True)
    
    # Get top recommendations
    top_recommendations = recommendations[:n_recommendations]
    
    # Display recommendations
    if movie_id_to_title_map:
        print("\nTop Recommendations:")
        for movie_id, predicted_rating in top_recommendations:
            # Convert back to original rating scale
            original_scale_rating = predicted_rating * 4 + 1
            
            movie_title = movie_id_to_title_map.get(movie_id, f"Unknown Movie (ID: {movie_id})")
            print(f"Movie: {movie_title}, Predicted Rating: {original_scale_rating:.2f}/5.00")
    
    return top_recommendations



train_matrix, test_matrix, train_mask, test_mask = split_data(user_movie_matrix_scaled, test_size=0.2)

nmf_model, training_matrix, training_time = train_model(train_matrix, n_components=50)

my_data, my_ratings = prepare_user_ratings(
    "data/lddec_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
evaluation_results = evaluate_model(nmf_model, test_matrix, test_mask)


movie_id_to_title_map = dict(zip(movies['movieId'], movies['title']))

recommendations = get_recommendations(nmf_model, training_matrix, my_ratings, my_data, 
                                      movie_id_to_title_map=movie_id_to_title_map)



Processed 1000/79691 users


KeyboardInterrupt: 

In [None]:
def fit_model(
    user_movie_matrix,
    n_components=50,
    max_iter=200,
    init='nndsvd',
    solver='cd',
    tol=0.0001,
    l1_ratio=0.5,
    sample_size=1.0
):

    print(f"\n--- Fitting model with {sample_size*100:.0f}% of users ---")
    
    # Sample users 
    if sample_size < 1.0:
        n_users = int(user_movie_matrix.shape[0] * sample_size)
        sampled_users = np.random.choice(user_movie_matrix.index, size=n_users, replace=False)
        training_matrix = user_movie_matrix.loc[sampled_users, :]
    else:
        training_matrix = user_movie_matrix
    
    print(f"Training matrix shape: {training_matrix.shape}")
    
    # Train the model
    start_time = time.time()
    nmf = NMF(
        n_components=n_components,
        max_iter=max_iter,
        init=init,
        solver=solver,
        tol=tol,
        l1_ratio=l1_ratio
    )
    
    user_factors = nmf.fit_transform(training_matrix)
    item_factors = nmf.components_
    training_time = time.time() - start_time
    
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Iterations completed: {nmf.n_iter_}")
    print(f"Final error: {nmf.reconstruction_err_}")
    
    return nmf, training_matrix, training_time

def get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings,
    my_data,
    n_recommendations=20,
    movie_id_to_title_map=None
):
    # Transform personal ratings into factor space
    my_user_factors = nmf_model.transform(my_ratings)
    
    # Generate predictions
    predicted_ratings = np.dot(my_user_factors, nmf_model.components_)
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=training_matrix.columns)
    
    # Get movies already rated
    rated_movies = set(int(movie_id) for movie_id in my_data['movieId'].values)
    print(f"You've rated {len(rated_movies)} movies")
    
    # Filter for unrated movies
    unrated_movies = [m for m in training_matrix.columns if int(m) not in rated_movies]
    print(f"Found {len(unrated_movies)} movies you haven't rated")
    
    # Get recommendations
    recommendations = []
    for movie_id in unrated_movies:
        pred_rating = predicted_ratings_df.loc[0, movie_id]
        recommendations.append((movie_id, pred_rating))
    
    recommendations.sort(key=lambda x: x[1], reverse=True)
    
    top_recommendations = recommendations[:n_recommendations]
    
    if movie_id_to_title_map:
        print("\nTop Recommendations:")
        for movie_id, predicted_rating in top_recommendations:
            original_scale_rating = predicted_rating * 4 + 1
            
            movie_title = movie_id_to_title_map.get(movie_id, f"Unknown Movie (ID: {movie_id})")
            print(f"Movie: {movie_title}, Predicted Rating: {original_scale_rating:.2f}/5.00")
    
    return top_recommendations

In [130]:
def rank_calculation(df,sample_size=0.5):
    """
    
    Calculate the optimal rank of the specified dataframe.
    """
    # Read the data
    df = user_movie_matrix_scaled
    
    if sample_size < 1.0:
        n_users = int(user_movie_matrix.shape[0] * sample_size)
        sampled_users = np.random.choice(user_movie_matrix.index, size=n_users, replace=False)
        df = user_movie_matrix.loc[sampled_users, :]
    else:
        df = user_movie_matrix
    
    # Calculate benchmark value
    benchmark = np.linalg.norm(df, ord='fro') * 0.0001
    
    # Iterate through various values of rank to find optimal
    rank = 3
    while True:
        print(f"Trying rank {rank}...")
        # initialize the model
        model = NMF(n_components=rank, init='random', random_state=0, max_iter=500)
        W = model.fit_transform(df)
        H = model.components_
        V = W @ H
        
        # Calculate RMSE of original df and new V
        RMSE = np.sqrt(mean_squared_error(df, V))
        
        if RMSE < benchmark:
            return rank, V
        
        # Increment rank if RMSE isn't smaller than the benchmark
        rank += 1

    return rank

rank = rank_calculation(user_movie_matrix_scaled)
print(f"Optimal rank: {rank}")

Trying rank 3...
Optimal rank: (3, array([[3.86363336e+00, 2.14151894e+00, 8.06521072e-01, ...,
        2.94212173e-03, 2.53751001e-03, 1.96803623e-03],
       [1.16767316e+00, 6.02351711e-01, 2.07853628e-01, ...,
        1.39538037e-03, 1.18747429e-03, 9.20978599e-04],
       [1.48275134e+00, 7.92413207e-01, 3.00099851e-01, ...,
        1.15144167e-03, 8.97353532e-04, 6.95967406e-04],
       ...,
       [1.08810131e+00, 6.61290612e-01, 2.75967594e-01, ...,
        1.22143294e-04, 1.12376159e-04, 8.71564448e-05],
       [1.09859695e+00, 3.92121301e-01, 0.00000000e+00, ...,
        4.50739146e-03, 4.14695986e-03, 3.21628967e-03],
       [4.07261267e+00, 2.36636887e+00, 9.69738004e-01, ...,
        1.16165639e-03, 8.45650100e-04, 6.55867376e-04]],
      shape=(39845, 10542)))


In [None]:
# Fit the model
nmf_model, training_matrix, _ = fit_model(
    user_movie_matrix_scaled,
    n_components=20,
    l1_ratio=0.5,
    sample_size=1,
    max_iter=500
)



--- Fitting model with 100% of users ---
Training matrix shape: (79691, 10542)


In [119]:
elliott_data, elliott_ratings = prepare_user_ratings(
    "data/loelliot_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
ludde_data, ludde_ratings = prepare_user_ratings(
    "data/lddec_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
charlie_data, charlie_ratings = prepare_user_ratings(
    "data/chaarll_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
tilda_data, tilda_ratings = prepare_user_ratings(
    "data/tilda_h_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)

  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_ratings.loc[0, movie_id] = row['scaled_rating']
  user_rat

In [132]:

recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=my_ratings,
    my_data=my_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)


You've rated 425 movies
Found 10166 movies you haven't rated

Top Recommendations:
Movie: Ratatouille (2007), Predicted Rating: 3.29/5.00
Movie: Ocean's Eleven (2001), Predicted Rating: 3.13/5.00
Movie: How to Train Your Dragon (2010), Predicted Rating: 3.00/5.00
Movie: Departed, The (2006), Predicted Rating: 2.96/5.00
Movie: Star Trek (2009), Predicted Rating: 2.95/5.00
Movie: Saving Private Ryan (1998), Predicted Rating: 2.86/5.00
Movie: Minority Report (2002), Predicted Rating: 2.84/5.00
Movie: Casino Royale (2006), Predicted Rating: 2.81/5.00
Movie: Bourne Supremacy, The (2004), Predicted Rating: 2.77/5.00
Movie: Kill Bill: Vol. 2 (2004), Predicted Rating: 2.77/5.00
Movie: Spirited Away (Sen to Chihiro no kamikakushi) (2001), Predicted Rating: 2.77/5.00
Movie: Beautiful Mind, A (2001), Predicted Rating: 2.76/5.00
Movie: 300 (2007), Predicted Rating: 2.68/5.00
Movie: Matrix Reloaded, The (2003), Predicted Rating: 2.68/5.00
Movie: Kung Fu Panda (2008), Predicted Rating: 2.65/5.00
Mov

In [123]:


recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=elliott_ratings,
    my_data=elliott_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

You've rated 177 movies
Found 10411 movies you haven't rated

Top Recommendations:
Movie: Marriage Story (2019), Predicted Rating: 2.27/5.00
Movie: Arrival (2016), Predicted Rating: 2.26/5.00
Movie: Spider-Man: Into the Spider-Verse (2018), Predicted Rating: 2.20/5.00
Movie: Interstellar (2014), Predicted Rating: 2.17/5.00
Movie: Moonlight, Predicted Rating: 2.15/5.00
Movie: Ex Machina (2015), Predicted Rating: 2.11/5.00
Movie: Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan (2006), Predicted Rating: 2.08/5.00
Movie: Whiplash (2014), Predicted Rating: 2.07/5.00
Movie: Uncut Gems (2019), Predicted Rating: 1.98/5.00
Movie: The Lighthouse (2019), Predicted Rating: 1.96/5.00
Movie: Big Short, The (2015), Predicted Rating: 1.91/5.00
Movie: Eighth Grade (2018), Predicted Rating: 1.90/5.00
Movie: Inception (2010), Predicted Rating: 1.88/5.00
Movie: Juno (2007), Predicted Rating: 1.86/5.00
Movie: Fantastic Mr. Fox (2009), Predicted Rating: 1.86/5.00
Movie: O

In [46]:

movie_id_to_title = dict(zip(movies['movieId'], movies['title']))

sample_sizes = [1.0] # amount of users to sample

for sample_size in sample_sizes:
    print(f"\n--- Testing with {sample_size*100:.0f}% of users (all movies) ---")
    

    n_users = int(user_movie_matrix_scaled.shape[0] * sample_size)
    sampled_users = np.random.choice(user_movie_matrix_scaled.index, size=n_users, replace=False)
    

    small_matrix = user_movie_matrix_scaled.loc[sampled_users, :]
    
    print(f"Sampled matrix shape: {small_matrix.shape} (vs. original {user_movie_matrix_scaled.shape})")
    

    start_time = time.time()
    nmf = NMF(
        n_components=50,
        max_iter=200,
        init='nndsvd',
        solver='cd',
        tol=0.0001
    )
    user_factors = nmf.fit_transform(small_matrix)
    item_factors = nmf.components_
    training_time = time.time() - start_time
    
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Iterations completed: {nmf.n_iter_}")
    print(f"Final error: {nmf.reconstruction_err_}")

    my_user_factors = nmf.transform(my_ratings)


    predicted_ratings = np.dot(my_user_factors, item_factors)
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=small_matrix.columns)

    rated_movies = set(int(movie_id) for movie_id in my_data['movieId'].values)
    print(f"You've rated {len(rated_movies)} movies")


    unrated_movies = [m for m in small_matrix.columns if int(m) not in rated_movies]
    print(f"Found {len(unrated_movies)} movies you haven't rated")

    recommendations = []
    for movie_id in unrated_movies:
        pred_rating = predicted_ratings_df.loc[0, movie_id]
        recommendations.append((movie_id, pred_rating))

    recommendations.sort(key=lambda x: x[1], reverse=True)
    movies_with_genres = pd.read_csv('./data/movies.csv')

    # Create a larger pool of candidate recommendations for diversity
    top_candidates = recommendations[:500]  # Use a larger pool than you'll display

    # Add diversity boosting
    def add_diversity(recommendations_list, n_recommendations=200):
        # Load movie genres if needed
        diverse_recommendations = []
        already_selected_genres = set()
        
        # First pass: Calculate genre diversity scores
        for movie_id, score in recommendations_list:
            # Get the movie's genres
            movie_info = movies_with_genres[movies_with_genres['movieId'] == int(movie_id)]
            if movie_info.empty:
                continue
                
            genres = movie_info.iloc[0]['genres'].split('|')
            
            # Calculate novelty bonus for introducing new genres
            new_genres = [g for g in genres if g not in already_selected_genres]
            diversity_bonus = len(new_genres) * 0.05  # Adjust weight as needed
            novelty_score = score + diversity_bonus
            
            diverse_recommendations.append((movie_id, score, novelty_score, genres))
            
            # Mark these genres as seen
            already_selected_genres.update(genres)
        
        # Sort by diversity-adjusted score
        diverse_recommendations.sort(key=lambda x: x[2], reverse=True)
        
        # Return the top N recommendations
        return [(movie_id, original_score) for movie_id, original_score, _, _ in diverse_recommendations[:n_recommendations]]

    # Get diverse recommendations
    diverse_recommendations = add_diversity(top_candidates)

    # Replace the original recommendations with diverse ones
    recommendations = diverse_recommendations
# Add this after defining diverse_recommendations

# Calculate movie popularity
    movie_pop_counts = filtered_movies_with_ratings.groupby('movieId').size()

    # Add popularity diversity (favoring some less popular movies)
    def add_popularity_diversity(recommendations_list, n_recommendations=200):
        # Get top candidates
        candidates = recommendations_list.copy()
        
        # Calculate popularity bonus/penalty
        popularity_adjusted = []
        for movie_id, score in candidates:
            # Get movie popularity (number of ratings)
            pop_count = movie_pop_counts.get(int(movie_id), 0)
            
            # Slight boost for less-rated movies (adjust constants as needed)
            popularity_factor = 1.0 if pop_count > 500 else 1.1
            adjusted_score = score * popularity_factor
            
            popularity_adjusted.append((movie_id, score, adjusted_score))
        
        # Sort by adjusted score
        popularity_adjusted.sort(key=lambda x: x[2], reverse=True)
        
        # Return original scores but in new order
        return [(movie_id, original_score) for movie_id, original_score, _ in popularity_adjusted[:n_recommendations]]

    # Apply popularity adjustment after genre diversity
    recommendations = add_popularity_diversity(diverse_recommendations)
    # Now display the recommendations
    print("\nTop Recommendations with Genre Diversity:")
    for movie_id, predicted_rating in recommendations[:20]:
        # Convert 0-1 scale back to 1-5 scale
        original_scale_rating = predicted_rating * 4 + 1
        
        movie_title = movie_id_to_title.get(movie_id, f"Unknown Movie (ID: {movie_id})")
        print(f"Movie: {movie_title}, Predicted Rating: {original_scale_rating:.2f}/5.00")


--- Testing with 100% of users (all movies) ---
Sampled matrix shape: (79691, 10542) (vs. original (79691, 10542))




Training time: 597.63 seconds
Iterations completed: 200
Final error: 2332.042324479858
You've rated 361 movies
Found 10202 movies you haven't rated

Top 20 Recommendations (movies you haven't seen):
Movie: Saving Private Ryan (1998), Predicted Rating: 2.79/5.00
Movie: American Beauty (1999), Predicted Rating: 2.64/5.00
Movie: Nightcrawler (2014), Predicted Rating: 2.59/5.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Predicted Rating: 2.56/5.00
Movie: Sixth Sense, The (1999), Predicted Rating: 2.50/5.00
Movie: Braveheart (1995), Predicted Rating: 2.46/5.00
Movie: The Hateful Eight (2015), Predicted Rating: 2.32/5.00
Movie: Usual Suspects, The (1995), Predicted Rating: 2.30/5.00
Movie: Birdman: Or (The Unexpected Virtue of Ignorance) (2014), Predicted Rating: 2.27/5.00
Movie: Departed, The (2006), Predicted Rating: 2.23/5.00
Movie: Prisoners (2013), Predicted Rating: 2.17/5.00
Movie: Drive (2011), Predicted Rating: 2.13/5.00
Movie: Terminator 2: Judgment Day (1991), Predict