In [5]:
# Prerequisite (run once in a terminal, not in this notebook): conda install -c conda-forge scikit-surprise

In [27]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

##### Load and Prepare Data

In [29]:
# Locations of the MovieLens "latest-small" CSV files
file_path = 'data_files/ml-latest-small/movies.csv'      # movie IDs with titles and genres
ratings_path = 'data_files/ml-latest-small/ratings.csv'  # one row per user/movie rating

# Read both tables into DataFrames (movies first, then ratings)
movies_df = pd.read_csv(file_path)
ratings_df = pd.read_csv(ratings_path)

In [31]:
# Preview the first five rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [33]:
# Preview the first five rows of the movies table
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
# Drop the timestamp column - we don't need it for SVD.
# errors='ignore' keeps this cell safe to re-run: ratings_df is reassigned here,
# so on a second execution 'timestamp' is already gone and a plain drop would
# raise KeyError.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')

ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


##### Prepare Data for Surprise

In [37]:
# Surprise has to be told the bounds of the rating scale up front
# (MovieLens ratings run from 0.5 to 5).
reader = Reader(rating_scale=(0.5, 5.0))

# Build the Surprise dataset straight from the DataFrame.
# Column order matters: user, then item, then rating.
data = Dataset.load_from_df(
    ratings_df.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)

print("\nData loaded into surprise format.")


Data loaded into surprise format.


##### Train-Test Split and Model Training

In [42]:
# Hold out 20% of the ratings for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable between runs.
trainset, testset = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [44]:
# --- Train the SVD Model ---
# SVD is a matrix-factorization method, i.e. a collaborative-filtering approach.
print("\nTraining the SVD model...")
model = SVD(
    n_factors=100,    # size of the factor vectors
    n_epochs=20,      # number of training passes
    random_state=42,  # fixed seed for repeatable training
)
model.fit(trainset)

print("Training complete.")


Training the SVD model...
Training complete.


In [46]:
# --- Evaluate the Model on the Test Set ---
print("\nEvaluating model on the test set...")
# Score every held-out rating; verbose=False is the default and is spelled out
# here only to make clear that per-prediction details are not printed.
predictions = model.test(testset, verbose=False)


Evaluating model on the test set...


In [48]:
# Calculate RMSE (Root Mean Squared Error)
# This measures the typical magnitude of the error in our rating predictions,
# with large errors weighted more heavily than small ones.
# accuracy.rmse prints its own "RMSE: ..." line by default; verbose=False
# suppresses it so the value is reported only once, by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Test Set RMSE: {rmse}")

RMSE: 0.8807
Test Set RMSE: 0.8807462819979623


##### An RMSE of ~0.88 means that our model's rating predictions are typically off by about 0.88 stars on a 5-star scale (RMSE penalizes large errors more heavily than a simple average error would). This is quite good!

##### Get Recommendations for a Specific User

In [54]:
import heapq
from collections import defaultdict

def get_top_n_recommendations(user_id, n=10):
    """
    Get the top-N movie recommendations for a specific user.

    Every movie that appears in the notebook-level ``ratings_df`` but has not
    been rated by ``user_id`` is scored with the notebook-level trained
    ``model``; the ``n`` highest predicted ratings are returned.

    Args:
        user_id (int): The user ID to get recommendations for.
        n (int): The number of recommendations to return. A value of 0 or
            less yields an empty list.

    Returns:
        list: Up to ``n`` (movie_id, predicted_rating) tuples, ordered from
        the highest to the lowest predicted rating. Movies with equal
        predicted ratings keep the order in which they first appear in
        ``ratings_df``.
    """

    # 1. All movie IDs, in order of first appearance in the ratings table
    all_movie_ids = ratings_df['movieId'].unique()

    # 2. Movie IDs the user has already rated. A set gives constant-time
    #    membership checks instead of scanning an array for every movie.
    rated_movie_ids = set(ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'])

    # 3. Movie IDs the user has NOT rated (the "anti-testset")
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]

    # 4. Predict ratings for all unrated movies
    print(f"\nPredicting ratings for {len(unrated_movie_ids)} unrated movies for user {user_id}...")
    # model.test expects (user, item, rating) triples; the rating slot is only
    # a placeholder here because the true rating is unknown.
    dummy_rating = 4.0
    testset_for_user = [[user_id, movie_id, dummy_rating] for movie_id in unrated_movie_ids]

    predictions = model.test(testset_for_user)

    # 5. Keep only (movieId, predicted_rating) from each prediction
    scored_movies = [(iid, est) for _uid, iid, _true_r, est, _details in predictions]

    # heapq.nlargest is equivalent to sorted(..., reverse=True)[:n] (same tie
    # order) but avoids sorting the full list, and returns [] for n <= 0
    # rather than the surprising "all but the last k" that a negative slice gives.
    return heapq.nlargest(n, scored_movies, key=lambda rec: rec[1])

# --- Get and Display Recommendations ---
USER_TO_RECOMMEND = 1 # Example user
top_10_recs = get_top_n_recommendations(user_id=USER_TO_RECOMMEND, n=10)

# Build the movieId -> title lookup once instead of filtering movies_df for
# every recommendation. drop_duplicates keeps the first title per movieId,
# which is the row the per-movie filter would have picked.
title_by_movie_id = (
    movies_df.drop_duplicates('movieId')
    .set_index('movieId')['title']
    .to_dict()
)

# Report the number of recommendations actually returned rather than a
# hard-coded 10, so the header stays correct if fewer are available.
print(f"\n--- Top {len(top_10_recs)} Movie Recommendations for User {USER_TO_RECOMMEND} ---")

for movie_id, pred_rating in top_10_recs:
    # A movie that has ratings but no entry in movies_df gets a placeholder
    # title instead of raising IndexError.
    movie_title = title_by_movie_id.get(movie_id, "<unknown title>")
    print(f"  - Movie: {movie_title} (ID: {movie_id})")
    print(f"    Predicted Rating: {pred_rating:.2f}")


Predicting ratings for 9492 unrated movies for user 1...

--- Top 10 Movie Recommendations for User 1 ---
  - Movie: Departed, The (2006) (ID: 48516)
    Predicted Rating: 5.00
  - Movie: North by Northwest (1959) (ID: 908)
    Predicted Rating: 5.00
  - Movie: Casablanca (1942) (ID: 912)
    Predicted Rating: 5.00
  - Movie: Seven Samurai (Shichinin no samurai) (1954) (ID: 2019)
    Predicted Rating: 5.00
  - Movie: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) (ID: 750)
    Predicted Rating: 5.00
  - Movie: Lord of the Rings: The Return of the King, The (2003) (ID: 7153)
    Predicted Rating: 5.00
  - Movie: Blade Runner (1982) (ID: 541)
    Predicted Rating: 5.00
  - Movie: One Flew Over the Cuckoo's Nest (1975) (ID: 1193)
    Predicted Rating: 5.00
  - Movie: Grand Day Out with Wallace and Gromit, A (1989) (ID: 1223)
    Predicted Rating: 5.00
  - Movie: Lost in Translation (2003) (ID: 6711)
    Predicted Rating: 5.00
