In [1]:
!pip install surprise

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
folder_path = '../data/raw/ml-100k'


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Load and preprocess the data
ratings_df = pd.read_csv(folder_path+'/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
users_df = pd.read_csv(folder_path+'/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
movies_df = pd.read_csv(folder_path+'/u.item', sep='|', encoding='latin-1', names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB_URL'] + ['genre'+str(i) for i in range(19)])
movies_df['release_year'] = movies_df['title'].str.extract(r'\((\d{4})\)')


In [4]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

# Load the dataset
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], Reader(rating_scale=(1, 5)))

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Using SVD (Singular Value Decomposition)
model = SVD()

# Train the model
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate RMSE
accuracy.rmse(predictions)


RMSE: 0.9390


0.9390242505561608

In [5]:
user_id = 1

In [6]:

all_movie_ids = movies_df['movie_id'].unique()

# Predict ratings for all movies the user hasn't rated yet
user_unrated_movies = ratings_df[~ratings_df['movie_id'].isin(ratings_df[ratings_df['user_id'] == user_id]['movie_id'])]
user_unrated_movies = user_unrated_movies['movie_id'].unique()

predicted_ratings = []
for movie_id in user_unrated_movies:
    predicted = model.predict(uid=user_id, iid=movie_id)
    predicted_ratings.append((movie_id, predicted.est))

# Identify the movie with the highest predicted rating
best_movie_id = max(predicted_ratings, key=lambda x: x[1])[0]

# Find the movie title
best_movie_title = movies_df[movies_df['movie_id'] == best_movie_id]['title'].iloc[0]

print(f"Recommended Movie for User {user_id}: {best_movie_title}")

Recommended Movie for User 1: One Flew Over the Cuckoo's Nest (1975)


In [7]:
# Predict ratings for these movies
predicted_ratings = []
for movie_id in user_unrated_movies:
    predicted = model.predict(uid=user_id, iid=movie_id)
    predicted_ratings.append((movie_id, predicted.est))

# Sort the predictions by estimated ratings in descending order
predicted_ratings.sort(key=lambda x: x[1], reverse=True)

# Select top N movies
top_n = 10  # Number of top recommendations to retrieve
top_movies = predicted_ratings[:top_n]

# Fetching movie titles and ratings
recommended_movies = [(movies_df[movies_df['movie_id'] == m_id]['title'].iloc[0], rating) for m_id, rating in top_movies]

# Display the recommendations with estimated ratings
for movie, rating in recommended_movies:
    print(f"Movie: {movie}, Estimated Rating: {rating}")

Movie: One Flew Over the Cuckoo's Nest (1975), Estimated Rating: 4.966745206305402
Movie: Casablanca (1942), Estimated Rating: 4.859361082687392
Movie: Secrets & Lies (1996), Estimated Rating: 4.7907693201282475
Movie: Jackie Brown (1997), Estimated Rating: 4.704498527538829
Movie: Maltese Falcon, The (1941), Estimated Rating: 4.641128407297116
Movie: Annie Hall (1977), Estimated Rating: 4.624287627710041
Movie: Lawrence of Arabia (1962), Estimated Rating: 4.6194202709832135
Movie: When We Were Kings (1996), Estimated Rating: 4.566762884961987
Movie: Chinatown (1974), Estimated Rating: 4.558289380869701
Movie: Close Shave, A (1995), Estimated Rating: 4.514808147709102
