In [1]:
# Install the Surprise library
!pip uninstall -y numpy
!pip install numpy==1.26.4
!pip install surprise

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4


In [2]:
# Import required modules
import numpy as np
from surprise import Dataset  # For loading and handling datasets
from surprise import Reader   # For parsing custom datasets
from surprise import SVD      # Singular Value Decomposition algorithm
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore  # K-Nearest Neighbors algorithms
from surprise import NMF      # Non-negative Matrix Factorization algorithm
from surprise import BaselineOnly  # Basic algorithm using baselines
from surprise.model_selection import train_test_split  # For splitting data
from surprise.model_selection import cross_validate    # For cross-validation
from surprise.model_selection import GridSearchCV      # For hyperparameter tuning
from surprise import accuracy  # For computing prediction accuracy metrics

In [3]:
# Load the built-in MovieLens dataset
data = Dataset.load_builtin('ml-100k')

In [4]:
# Split the dataset into train and test sets (75% training, 25% testing)
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [5]:
# Define a list of algorithms to compare
algorithms = [
    SVD(),
    KNNBasic(sim_options={'user_based': True}),  # User-based collaborative filtering
    KNNBasic(sim_options={'user_based': False}), # Item-based collaborative filtering
    KNNWithMeans(sim_options={'user_based': True}),
    NMF(),
    BaselineOnly()
]

# Evaluate each algorithm using cross-validation
results = {}
for algo in algorithms:
    algo_name = algo.__class__.__name__
    sim_option = ''
    
    # Add user/item based info for KNN algorithms
    if algo_name.startswith('KNN'):
        user_based = algo.sim_options.get('user_based', True)
        sim_option = 'User-based' if user_based else 'Item-based'
        algo_name = f"{algo_name} ({sim_option})"
    
    # Run 5-fold cross-validation
    cv_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], 
                               cv=5, verbose=False)
    
    # Store results
    results[algo_name] = {
        'RMSE': cv_results['test_rmse'].mean(),
        'MAE': cv_results['test_mae'].mean()
    }

# Print comparison table
print("\nAlgorithm Comparison:")
print("-" * 60)
print(f"{'Algorithm':<30} {'RMSE':<15} {'MAE':<15}")
print("-" * 60)
for algo_name, metrics in results.items():
    print(f"{algo_name:<30} {metrics['RMSE']:<15.4f} {metrics['MAE']:<15.4f}")

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [6]:
# Define parameter grid for SVD
param_grid = {
   'n_factors': [50, 100, 150],
   'n_epochs': [10, 20, 30],
   'lr_all': [0.002, 0.005, 0.01],
   'reg_all': [0.02, 0.1, 0.5],
   'random_state': [42]
}


# Perform grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)


# Print best parameters
print("\nBest RMSE Parameters:")
print(gs.best_params['rmse'])
print(f"Best RMSE Score: {gs.best_score['rmse']:.4f}")


print("\nBest MAE Parameters:")
print(gs.best_params['mae'])
print(f"Best MAE Score: {gs.best_score['mae']:.4f}")


# Train the model with the best parameters
best_algo = SVD(**gs.best_params['mae'])
best_algo.fit(trainset)


Best RMSE Parameters:
{'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42}
Best RMSE Score: 0.9119

Best MAE Parameters:
{'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42}
Best MAE Score: 0.7212


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1082be330>

In [7]:
# Make predictions on the testset
predictions = best_algo.test(testset)

# Look at the first few predictions
for pred in predictions[:3]:
    print(f"User: {pred.uid}, Item: {pred.iid}, "
          f"Actual Rating: {pred.r_ui:.2f}, Predicted Rating: {pred.est:.2f}, "
          f"Error: {pred.r_ui - pred.est:.2f}")

User: 391, Item: 591, Actual Rating: 4.00, Predicted Rating: 3.43, Error: 0.57
User: 181, Item: 1291, Actual Rating: 1.00, Predicted Rating: 1.52, Error: -0.52
User: 637, Item: 268, Actual Rating: 2.00, Predicted Rating: 2.77, Error: -0.77


In [8]:
# Create a full training set that includes all users
full_trainset = data.build_full_trainset()

# Fit model to all users and data
best_algo.fit(full_trainset)

def get_top_n_recommendations(algo, data, user_id, n=10):
    """
    Generate top-N recommendations for a specific user
    
    Parameters:
    -----------
    algo : surprise.prediction_algorithms
        Trained algorithm
    data : surprise.Trainset
        The full training dataset
    user_id : str
        The user ID for whom to generate recommendations
    n : int, default=10
        Number of recommendations to generate
        
    Returns:
    --------
    list of tuples
        (item_id, predicted_rating) sorted by predicted rating in descending order
    """
    # Get a list of all items
    all_item_ids = data.all_items()
    
    # Convert raw user ID to inner ID used by the trainset
    try:
        inner_user_id = data.to_inner_uid(user_id)
    except ValueError:
        print(f"User {user_id} doesn't exist in the training set")
        return []
    
    # Get items rated by this user
    user_items = [j for (j, _) in data.ur[inner_user_id]]
    
    # Items not rated by the user
    unrated_items = [item_id for item_id in all_item_ids if item_id not in user_items]
    
    # Predict ratings for unrated items
    predictions = []
    for item_id in unrated_items:
        # Convert inner item ID back to raw ID for prediction
        raw_item_id = data.to_raw_iid(item_id)
        # Get prediction
        pred = algo.predict(user_id, raw_item_id)
        predictions.append((raw_item_id, pred.est))
    
    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Return top n recommendations
    return predictions[:n]

# Example usage
user_id = '196'  # Choose a user ID from the dataset
top_recommendations = get_top_n_recommendations(algo, trainset, user_id, n=5)

print(f"Top 5 movie recommendations for user {user_id}:")
for movie_id, predicted_rating in top_recommendations:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating:.2f}")

Top 5 movie recommendations for user 196:
Movie ID: 408, Predicted Rating: 4.47
Movie ID: 64, Predicted Rating: 4.43
Movie ID: 318, Predicted Rating: 4.43
Movie ID: 483, Predicted Rating: 4.42
Movie ID: 169, Predicted Rating: 4.42
