In [6]:
# Import libraries
import pandas as pd
import numpy as np  # Single version of numpy
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Reading and preparing the data
ratings = pd.read_csv('/content/rating.csv', nrows=1000000)
movies = pd.read_csv('/content/movie.csv', nrows=1000000)

# Extract and clean the year from the movie title
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

# Number of unique users and movies
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
print(f'Number of users = {n_users} | Number of movies = {n_movies}')

# Define the reader with rating scale
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Load data into Surprise's Dataset class
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Initialize the SVD algorithm
svd = SVD(n_factors=50)

# Train the model
svd.fit(trainset)

# Test the model
predictions = svd.test(testset)
print(f"Model accuracy (RMSE): {accuracy.rmse(predictions)}")

# Function to recommend movies
def recommend_movies(user_id, movies_df, original_ratings, model, num_recommendations=10):
    """
    Recommend top N movies for a given user using the trained SVD model.
    :param user_id: The ID of the user for whom recommendations are to be made
    :param movies_df: DataFrame with movie details
    :param original_ratings: DataFrame with user ratings
    :param model: Trained SVD model
    :param num_recommendations: Number of movies to recommend
    :return: DataFrames of already rated and recommended movies
    """

    # Filter out movies the user has already rated
    rated_movies = original_ratings[original_ratings['userId'] == user_id]
    rated_movie_ids = set(rated_movies['movieId'])

    # Predict ratings for unrated movies
    all_movie_ids = set(movies_df['movieId'])
    unrated_movie_ids = all_movie_ids - rated_movie_ids

    recommendations = []
    for movie_id in unrated_movie_ids:
        est_rating = model.predict(user_id, movie_id).est
        recommendations.append((movie_id, est_rating))

    # Sort by estimated rating
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:num_recommendations]

    # Convert to DataFrame for easier viewing
    rec_movie_ids = [movie_id for movie_id, _ in recommendations]
    recommended_movies = movies_df[movies_df['movieId'].isin(rec_movie_ids)]
    recommended_movies = recommended_movies.assign(Predicted_Rating=[est for _, est in recommendations])

    # Display already rated movies and recommended movies
    print(f'User {user_id} has already rated {len(rated_movie_ids)} movies.')
    print(f'Recommending top {num_recommendations} movies not yet rated by User {user_id}.')

    return rated_movies.merge(movies_df, on='movieId', how='left'), recommended_movies

# Get recommendations for a user
already_rated, recommendations = recommend_movies(152, movies, ratings, svd, num_recommendations=20)

# Display top rated movies and recommended movies
print("Top 20 movies that User 152 has rated:")
print(already_rated.head(20))
print("\nTop 20 movies that User 152 may enjoy:")
print(recommendations)


Number of users = 3245 | Number of movies = 12083
RMSE: 1.8139
Model accuracy (RMSE): 1.8139392897164632
User 152 has already rated 154 movies.
Recommending top 20 movies not yet rated by User 152.
Top 20 movies that User 152 has rated:
    userId  movieId  rating            timestamp  \
0      152      2.0     3.0  2006-10-18 23:47:31   
1      152     16.0     4.5  2006-10-18 23:48:15   
2      152     19.0     2.5  2006-10-18 23:47:46   
3      152     39.0     5.0  2006-10-18 23:57:43   
4      152     47.0     4.0  2006-10-18 23:52:27   
5      152     50.0     4.0  2006-10-18 23:56:03   
6      152     72.0     2.0  2006-10-18 23:49:42   
7      152    104.0     3.0  2006-10-18 23:48:21   
8      152    141.0     3.5  2006-10-18 23:56:24   
9      152    150.0     3.5  2006-10-18 23:53:38   
10     152    153.0     4.0  2006-10-18 23:56:00   
11     152    185.0     2.5  2006-10-18 23:56:33   
12     152    231.0     5.0  2006-10-18 23:55:47   
13     152    235.0     3.5  2006-1