In [1]:
import pandas as pd
from sklearn.decomposition import NMF
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
ratings= pd.read_csv('./data/ratings.csv')
movies = pd.read_csv('./data/movies.csv')
movie_id_to_title = dict(zip(movies['movieId'], movies['title']))


In [3]:
movies_with_ratings = pd.merge(movies, ratings, on='movieId', how='inner')

In [4]:

average_ratings = movies_with_ratings.groupby('movieId')['rating'].mean()

popular_movies = average_ratings[average_ratings >= 2.5].index # was 3

filtered_movies_with_ratings = movies_with_ratings[movies_with_ratings['movieId'].isin(popular_movies)]

# remove non expert users, ie, users with less than 75% of the quantile of ratings, keeping users with more than 89 ratings
user_rating_counts = filtered_movies_with_ratings.groupby('userId')['rating'].count()
print(user_rating_counts.describe())
filtered_users = user_rating_counts[(user_rating_counts >= user_rating_counts.quantile(0.85)) & 
                                    (user_rating_counts <= user_rating_counts.quantile(0.97))].index

filtered_movies_with_ratings = filtered_movies_with_ratings[filtered_movies_with_ratings['userId'].isin(filtered_users)]

user_rating_counts_after_filter = filtered_movies_with_ratings.groupby('userId')['rating'].count()

user_rating_counts_after_filter.describe()


count    330846.000000
mean         99.657965
std         221.569474
min           1.000000
25%          14.000000
50%          31.000000
75%          97.000000
max       30314.000000
Name: rating, dtype: float64


count    39828.000000
mean       295.678267
std        103.761686
min        169.000000
25%        209.000000
50%        267.000000
75%        363.000000
max        563.000000
Name: rating, dtype: float64

In [5]:
filtered_movies_with_ratings.drop(columns=["timestamp","genres","title"], inplace=True)

In [6]:
# filter out movies with less than x amount of ratings to clean up further
# after looking up some of the movies around the 70-75th quantile range, these movies are super niche and not very popular, so I will filter out movies with less than 30 ish ratings
movie_rating_counts = filtered_movies_with_ratings.groupby('movieId').size()
movie_rating_counts.quantile(0.70)

np.float64(20.0)

In [7]:
def limit_ratings_per_movie(ratings_df, max_ratings_per_movie=1000, random_seed=42):
    """
    Limit the number of ratings per movie to improve diversity.
    
    Parameters:
    -----------
    ratings_df : pandas.DataFrame
        DataFrame containing user ratings with columns 'userId', 'movieId', 'rating'
    max_ratings_per_movie : int
        Maximum number of ratings to keep per movie
    random_seed : int
        Random seed for reproducibility
        
    Returns:
    --------
    pandas.DataFrame
        DataFrame with limited ratings per movie
    """
    print(f"Limiting movies to a maximum of {max_ratings_per_movie} ratings each...")
    
    # Get counts of ratings per movie
    movie_rating_counts = ratings_df.groupby('movieId').size()
    print(f"Before limiting: {len(ratings_df)} total ratings across {len(movie_rating_counts)} movies")
    print(f"Movies with >{max_ratings_per_movie} ratings: {(movie_rating_counts > max_ratings_per_movie).sum()}")
    
    # Initialize a list to store the filtered ratings
    limited_ratings = []
    
    # Process each movie's ratings
    for movie_id, movie_df in ratings_df.groupby('movieId'):
        if len(movie_df) > max_ratings_per_movie:
            # Randomly sample max_ratings_per_movie ratings for this movie
            np.random.seed(random_seed + int(movie_id))  # Different seed for each movie
            limited_ratings.append(movie_df.sample(max_ratings_per_movie))
        else:
            # Keep all ratings for movies with fewer than max_ratings_per_movie
            limited_ratings.append(movie_df)
    
    # Combine all movies' ratings back into a single DataFrame
    result_df = pd.concat(limited_ratings, ignore_index=True)
    
    print(f"After limiting: {len(result_df)} total ratings across {len(movie_rating_counts)} movies")
    
    return result_df

In [8]:

min_ratings_threshold = 50
popular_movies = movie_rating_counts[movie_rating_counts >= min_ratings_threshold].index


filtered_movies_with_ratings = filtered_movies_with_ratings[
    filtered_movies_with_ratings['movieId'].isin(popular_movies)
]
filtered_movies_with_ratings.describe()

filtered_movies_with_ratings = limit_ratings_per_movie(
    filtered_movies_with_ratings, 
    max_ratings_per_movie=1000
)

# pca för att hitta så olika användare som möjligt?, detta bör också minska "generalisingen" som händer

# endast fit_transform på träningsdatan
# endast transform på test data
filtered_movies_with_ratings.describe()

Limiting movies to a maximum of 1000 ratings each...
Before limiting: 11569829 total ratings across 8625 movies
Movies with >1000 ratings: 2307
After limiting: 4005070 total ratings across 8625 movies


Unnamed: 0,movieId,userId,rating
count,4005070.0,4005070.0,4005070.0
mean,37248.15,165002.8,3.505087
std,56951.56,95582.75,1.035603
min,1.0,21.0,0.5
25%,1997.0,81821.0,3.0
50%,4705.0,165646.0,3.5
75%,60040.0,247843.0,4.0
max,286897.0,330975.0,5.0


In [9]:

user_movie_matrix = filtered_movies_with_ratings.pivot(index='userId', columns='movieId', values='rating')
user_movie_matrix = user_movie_matrix.fillna(0)


In [10]:
# #drop movies that are missing values in movieId
# my_data = pd.read_csv("data/lddec_ratings_with_ids.csv")
# my_data = my_data.dropna(subset=['movieId'])

# # drop movies i have rated under 3
# # my_data = my_data[my_data['rating_num'] >= 3.0]

# # testing another approach, as having movies i have rated lower than 3 might be useful for the model to learn what i dont like
# my_data['scaled_rating'] = (my_data['Rating'] - 1) / 4.0


# my_data

In [11]:
def scale_ratings(matrix):
    """
    Scale each user's ratings using MinMaxScaler to range [0,1].
    Assumes each user has rated multiple movies.
    
    Parameters:
    -----------
    matrix : pandas.DataFrame
        User-movie matrix with users as rows and movies as columns
        
    Returns:
    --------
    pandas.DataFrame
        Scaled user-movie matrix
    """
    data = matrix.copy().values
    rows = data.shape[0]
    
    # Scale each user's ratings (each row)
    for i in range(rows):
        # Find rated movies for this user
        nonzero_mask = data[i, :] > 0
        rated_indices = np.where(nonzero_mask)[0]
        
        # Get user's ratings and reshape for scaler
        user_ratings = data[i, rated_indices].reshape(-1, 1)
        
        # Apply MinMaxScaler
        scaler = MinMaxScaler()
        scaled_ratings = scaler.fit_transform(user_ratings).flatten()
        
        # Update the matrix with scaled ratings
        data[i, rated_indices] = scaled_ratings
    
    # Convert back to DataFrame
    scaled_matrix = pd.DataFrame(data, index=matrix.index, columns=matrix.columns)
    return scaled_matrix

In [12]:
def prepare_user_ratings(ratings_file_path, user_movie_matrix_columns):
    """
    Load and prepare user ratings for recommendation using MinMaxScaler.
    Assumes the user has rated multiple movies.
    
    Parameters:
    -----------
    ratings_file_path : str
        Path to CSV file with user ratings
    user_movie_matrix_columns : pandas.Index
        Column names from the user-movie matrix
        
    Returns:
    --------
    tuple
        (user_data_df, ratings_matrix_df)
    """
    # Load user data
    user_data = pd.read_csv(ratings_file_path)
    user_data = user_data.dropna(subset=['movieId'])
    
    # Create user-movie ratings matrix with zero values - using float dtype
    user_ratings = pd.DataFrame(0.0, index=[0], columns=user_movie_matrix_columns, dtype=np.float64)
    
    # Fill with original ratings
    for _, row in user_data.iterrows():
        movie_id = row['movieId']
        if movie_id in user_ratings.columns:
            user_ratings.loc[0, movie_id] = float(row['Rating'])
    
    # Get indices of rated movies
    nonzero_mask = user_ratings.values > 0
    rated_indices = np.where(nonzero_mask[0])[0]
    
    # Get this user's ratings
    user_rating_values = user_ratings.iloc[0, rated_indices].values.reshape(-1, 1)
    
    # Apply MinMaxScaler
    scaler = MinMaxScaler()
    scaled_ratings = scaler.fit_transform(user_rating_values).flatten()
    
    # Update the user ratings with scaled values
    user_ratings.iloc[0, rated_indices] = scaled_ratings
    
    return user_data, user_ratings

In [None]:
def fit_model(
    user_movie_matrix_scaled,
    n_components=50,
    max_iter=200,
    init='random',
    solver='cd',
    tol=0.0001,

    sample_size=1.0
):

    print(f"\n--- Fitting model with {sample_size*100:.0f}% of users ---")
    
    # Sample users 
    if sample_size < 1.0:
        n_users = int(user_movie_matrix.shape[0] * sample_size)
        sampled_users = np.random.choice(user_movie_matrix.index, size=n_users, replace=False)
        training_matrix = user_movie_matrix.loc[sampled_users, :]
    else:
        training_matrix = user_movie_matrix
    
    print(f"Training matrix shape: {training_matrix.shape}")
    
    # Train the model
    start_time = time.time()
    nmf = NMF(
        n_components=n_components,
        max_iter=max_iter,
        verbose=0,
        init="nndsvd",
        #solver=solver,
        #tol=tol,
        #l1_ratio=l1_ratio
    )
    
    user_factors = nmf.fit_transform(training_matrix)
    item_factors = nmf.components_
    training_time = time.time() - start_time
    
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Iterations completed: {nmf.n_iter_}")
    print(f"Final error: {nmf.reconstruction_err_}")
    
    return nmf, training_matrix, training_time



In [14]:
def get_recommendations(
    nmf_model,
    training_matrix,
    my_ratings,
    my_data,
    n_recommendations=20,
    movie_id_to_title_map=None
):
    # Transform personal ratings into factor space
    my_user_factors = nmf_model.transform(my_ratings)
    
    # Generate predictions
    predicted_ratings = np.dot(my_user_factors, nmf_model.components_)
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=training_matrix.columns)
    
    # Get movies already rated
    rated_movies = set(int(movie_id) for movie_id in my_data['movieId'].values)
    print(f"You've rated {len(rated_movies)} movies")
    
    # Filter for unrated movies
    unrated_movies = [m for m in training_matrix.columns if int(m) not in rated_movies]
    print(f"Found {len(unrated_movies)} movies you haven't rated")
    
    # Get recommendations
    recommendations = []
    for movie_id in unrated_movies:
        pred_rating = predicted_ratings_df.loc[0, movie_id]
        recommendations.append((movie_id, pred_rating))
    
    recommendations.sort(key=lambda x: x[1], reverse=True)
    
    top_recommendations = recommendations[:n_recommendations]
    
    if movie_id_to_title_map:
        print("\nTop Recommendations:")
        for movie_id, predicted_rating in top_recommendations:
            # Convert from 0-1 scale back to 1-5 scale
            original_scale_rating = predicted_rating * 4 + 1
            
            movie_title = movie_id_to_title_map.get(movie_id, f"Unknown Movie (ID: {movie_id})")
            print(f"Movie: {movie_title}, Predicted Rating: {original_scale_rating:.2f}/5.00")
    
    return top_recommendations

In [15]:
def get_recommendations_by_genre(
    nmf_model,
    training_matrix,
    my_ratings,
    my_data,
    movie_metadata,  # DataFrame containing movie genres
    n_per_genre=5,
    movie_id_to_title_map=None
):
    """
    Get movie recommendations sorted by genre, with top n movies per genre.
    
    Parameters:
    -----------
    nmf_model : sklearn.decomposition.NMF
        Trained NMF model
    training_matrix : pandas.DataFrame
        User-movie matrix used for training
    my_ratings : pandas.DataFrame
        User's ratings
    my_data : pandas.DataFrame
        User's ratings data
    movie_metadata : pandas.DataFrame
        DataFrame containing movieId and genres
    n_per_genre : int
        Number of recommendations per genre
    movie_id_to_title_map : dict
        Mapping from movie ID to movie title
        
    Returns:
    --------
    dict
        Dictionary mapping genres to lists of recommended movies
    """
    # Transform personal ratings into factor space
    my_user_factors = nmf_model.transform(my_ratings)
    
    # Generate predictions
    predicted_ratings = np.dot(my_user_factors, nmf_model.components_)
    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=training_matrix.columns)
    
    # Get movies already rated
    rated_movies = set(my_data['movieId'].astype(int).values)
    print(f"You've rated {len(rated_movies)} movies")
    
    # Filter for unrated movies
    unrated_movies = [m for m in training_matrix.columns if int(m) not in rated_movies]
    print(f"Found {len(unrated_movies)} movies you haven't rated")
    
    # Get all recommendations with predicted ratings
    recommendations = []
    for movie_id in unrated_movies:
        movie_id_int = int(movie_id)
        pred_rating = predicted_ratings_df.loc[0, movie_id]
        
        # Get movie genres
        movie_info = movie_metadata[movie_metadata['movieId'] == movie_id_int]
        if len(movie_info) > 0:
            genres = movie_info['genres'].iloc[0].split('|')
            
            # Add recommendation with genres
            for genre in genres:
                if genre != '(no genres listed)':
                    recommendations.append((movie_id_int, pred_rating, genre))
    
    # Group recommendations by genre
    genre_recommendations = {}
    for movie_id, rating, genre in recommendations:
        if genre not in genre_recommendations:
            genre_recommendations[genre] = []
        genre_recommendations[genre].append((movie_id, rating))
    
    # Sort recommendations within each genre and get top n
    for genre in genre_recommendations:
        genre_recommendations[genre].sort(key=lambda x: x[1], reverse=True)
        genre_recommendations[genre] = genre_recommendations[genre][:n_per_genre]
    
    # Print recommendations by genre
    if movie_id_to_title_map:
        print("\n===== TOP RECOMMENDATIONS BY GENRE =====")
        
        # Sort genres alphabetically for nicer display
        for genre in sorted(genre_recommendations.keys()):
            print(f"\n----- {genre.upper()} -----")
            
            for i, (movie_id, predicted_rating) in enumerate(genre_recommendations[genre], 1):
                # Convert from 0-1 scale back to 1-5 scale
                original_scale_rating = predicted_rating * 4 + 1
                
                movie_title = movie_id_to_title_map.get(movie_id, f"Unknown Movie (ID: {movie_id})")
                print(f"{i}. {movie_title} - {original_scale_rating:.2f}/5.00")
    
    return genre_recommendations

In [16]:
user_movie_matrix_scaled = scale_ratings(user_movie_matrix)


In [25]:
# Try different n_components values
component_options = [50, 100, 200, 300, 500]
results = {}

for n_comp in component_options:
    print(f"\nTesting with n_components={n_comp}")
    
    start_time = time.time()
    nmf_model, training_matrix, _ = fit_model(
        user_movie_matrix_scaled,
        n_components=n_comp,
        sample_size=0.1,
        max_iter=300
    )
    train_time = time.time() - start_time
    
    # Calculate quality metrics
    results[n_comp] = {
        'training_time': train_time,
        'iterations': nmf_model.n_iter_,
        'error': nmf_model.reconstruction_err_
    }

# Print results comparison
print("\nResults Summary:")
for n_comp, metrics in results.items():
    print(f"\nn_components = {n_comp}:")
    print(f"  Training time: {metrics['training_time']:.2f} seconds")
    print(f"  Iterations: {metrics['iterations']}")
    print(f"  Reconstruction error: {metrics['error']:.2f}")


Testing with n_components=50

--- Fitting model with 10% of users ---
Training matrix shape: (3982, 8625)




Training time: 24.09 seconds
Iterations completed: 300
Final error: 2162.344768137473

Testing with n_components=100

--- Fitting model with 10% of users ---
Training matrix shape: (3982, 8625)




Training time: 57.97 seconds
Iterations completed: 300
Final error: 2108.134907447252

Testing with n_components=200

--- Fitting model with 10% of users ---
Training matrix shape: (3982, 8625)




Training time: 189.07 seconds
Iterations completed: 300
Final error: 2058.3686222400506

Testing with n_components=300

--- Fitting model with 10% of users ---
Training matrix shape: (3982, 8625)


KeyboardInterrupt: 

In [26]:
nmf_model, training_matrix, _ = fit_model(
    user_movie_matrix_scaled,
    n_components=30,
    sample_size=0.5,
    max_iter=400
)


--- Fitting model with 50% of users ---
Training matrix shape: (19914, 8625)




Training time: 146.80 seconds
Iterations completed: 400
Final error: 4897.447702899946


In [18]:
elliott_data, elliott_ratings = prepare_user_ratings(
    "data/loelliot_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
ludde_data, ludde_ratings = prepare_user_ratings(
    "data/lddec_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
charlie_data, charlie_ratings = prepare_user_ratings(
    "data/chaarll_ratings_with_ids.csv", 
    user_movie_matrix_scaled.columns
)
tilda_data, tilda_ratings = prepare_user_ratings(
    "data/tilda_h_ratings_with_ids.csv",
    user_movie_matrix_scaled.columns
)

In [28]:
recommendations1 = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=ludde_ratings,
    my_data=ludde_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title
)


You've rated 425 movies
Found 8224 movies you haven't rated

Top Recommendations:
Movie: Band of Brothers (2001), Predicted Rating: 3.25/5.00
Movie: Ford v. Ferrari (2019), Predicted Rating: 3.13/5.00
Movie: Palm Springs (2020), Predicted Rating: 3.06/5.00
Movie: The Gentlemen (2020), Predicted Rating: 2.97/5.00
Movie: The Founder (2016), Predicted Rating: 2.93/5.00
Movie: Soul (2020), Predicted Rating: 2.91/5.00
Movie: Whiplash (2013), Predicted Rating: 2.90/5.00
Movie: The Trial of the Chicago 7 (2020), Predicted Rating: 2.85/5.00
Movie: Green Book (2018), Predicted Rating: 2.83/5.00
Movie: Molly's Game (2017), Predicted Rating: 2.64/5.00
Movie: Lion (2016), Predicted Rating: 2.58/5.00
Movie: Hacksaw Ridge (2016), Predicted Rating: 2.55/5.00
Movie: The Irishman (2019), Predicted Rating: 2.54/5.00
Movie: Spotlight (2015), Predicted Rating: 2.54/5.00
Movie: Hell or High Water (2016), Predicted Rating: 2.52/5.00
Movie: Sully (2016), Predicted Rating: 2.52/5.00
Movie: Darkest Hour (2017)

In [27]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=tilda_ratings,
    my_data=tilda_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

You've rated 285 movies
Found 8369 movies you haven't rated

Top Recommendations:
Movie: The Batman (2022), Predicted Rating: 2.60/5.00
Movie: Spider-Man: No Way Home (2021), Predicted Rating: 2.53/5.00
Movie: Palm Springs (2020), Predicted Rating: 2.46/5.00
Movie: Whiplash (2013), Predicted Rating: 2.35/5.00
Movie: Band of Brothers (2001), Predicted Rating: 2.35/5.00
Movie: Uncut Gems (2019), Predicted Rating: 2.29/5.00
Movie: 1917 (2019), Predicted Rating: 2.24/5.00
Movie: The Gentlemen (2020), Predicted Rating: 2.22/5.00
Movie: The Trial of the Chicago 7 (2020), Predicted Rating: 2.21/5.00
Movie: Planet Earth (2006), Predicted Rating: 2.19/5.00
Movie: Tenet (2020), Predicted Rating: 2.17/5.00
Movie: Little Women (2019), Predicted Rating: 2.16/5.00
Movie: Manchester by the Sea (2016), Predicted Rating: 2.16/5.00
Movie: I, Tonya (2017), Predicted Rating: 2.15/5.00
Movie: Shang-Chi and the Legend of the Ten Rings (2021), Predicted Rating: 2.14/5.00
Movie: The Founder (2016), Predicted 

In [21]:

recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=elliott_ratings,
    my_data=elliott_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

violation: 1.0
violation: 0.466268049533562
violation: 0.01157317202051252
violation: 0.00035236039643320876
violation: 1.4711407076715088e-05
Converged at iteration 6
You've rated 177 movies
Found 8497 movies you haven't rated

Top Recommendations:
Movie: Marriage Story (2019), Predicted Rating: 1.82/5.00
Movie: Moonlight, Predicted Rating: 1.77/5.00
Movie: Whiplash (2013), Predicted Rating: 1.73/5.00
Movie: Uncut Gems (2019), Predicted Rating: 1.73/5.00
Movie: The Favourite, Predicted Rating: 1.72/5.00
Movie: Hunt for the Wilderpeople (2016), Predicted Rating: 1.71/5.00
Movie: Wind River (2017), Predicted Rating: 1.69/5.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Predicted Rating: 1.69/5.00
Movie: The Irishman (2019), Predicted Rating: 1.69/5.00
Movie: Molly's Game (2017), Predicted Rating: 1.69/5.00
Movie: The Ballad of Buster Scruggs (2018), Predicted Rating: 1.68/5.00
Movie: Nocturnal Animals, Predicted Rating: 1.67/5.00
Movie: The Disaster Artist (2017), Predicted

In [22]:
recommendations = get_recommendations(
    nmf_model=nmf_model,
    training_matrix=training_matrix,
    my_ratings=charlie_ratings,
    my_data=charlie_data,
    n_recommendations=200,
    movie_id_to_title_map=movie_id_to_title,
)

violation: 1.0
violation: 0.38589988327361535
violation: 0.048884345555042674
violation: 0.011580789607341267
violation: 0.001911020399634589
violation: 0.0002708757685699208
violation: 2.6625589847774434e-05
Converged at iteration 8
You've rated 81 movies
Found 8546 movies you haven't rated

Top Recommendations:
Movie: Marriage Story (2019), Predicted Rating: 1.60/5.00
Movie: Call Me by Your Name (2017), Predicted Rating: 1.58/5.00
Movie: Moonlight, Predicted Rating: 1.56/5.00
Movie: I, Tonya (2017), Predicted Rating: 1.54/5.00
Movie: Whiplash (2013), Predicted Rating: 1.54/5.00
Movie: Wind River (2017), Predicted Rating: 1.53/5.00
Movie: Uncut Gems (2019), Predicted Rating: 1.53/5.00
Movie: Palm Springs (2020), Predicted Rating: 1.53/5.00
Movie: The Favourite, Predicted Rating: 1.52/5.00
Movie: Hunt for the Wilderpeople (2016), Predicted Rating: 1.52/5.00
Movie: Three Billboards Outside Ebbing, Missouri (2017), Predicted Rating: 1.50/5.00
Movie: The Irishman (2019), Predicted Rating: