# Movie Recommendation System using Matrix Factorization with Singular Value Decomposition

With Matrix Factorization, we would be able to look at a user's historical rating to see what movies to recommend them.

In [7]:
import pandas as pd 
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import Reader
import os
import boto3
from dotenv import load_dotenv
import pickle
import re

In [2]:
load_dotenv()

bucket_name = os.getenv("AWS_BUCKET_NAME")
ratings_file = os.getenv("AWS_RATINGS_FILE")
models_file = os.getenv("AWS_MODEL_FILE")

s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("AWS_SECRET"),
    region_name=os.getenv("AWS_REGION")
)

s3.download_file(bucket_name, ratings_file, "ratings.csv")
s3.download_file(bucket_name, models_file, models_file)
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("../BigMovieData/ml-32m/movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [None]:
reader = Reader(rating_scale=(0.5,5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

In [7]:
model = SVD()
model.fit(trainset)
from surprise import accuracy
predictions = model.test(testset)
print("RMSE:", accuracy.rmse(predictions))

RMSE: 0.8741
RMSE: 0.874079876653131


In [22]:
with open("recommender_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [24]:
personal_ratings = pd.read_csv("../personal_letterboxd/ratings.csv")
personal_ratings["Year"] = personal_ratings["Year"].astype(str)
personal_ratings['title'] = personal_ratings["Name"] + " (" + personal_ratings['Year'] + ")"
personal_ratings.head()

user_rating_merged = personal_ratings.merge(
    movies,
    left_on=["title"],
    right_on=["title"],
    how="inner"
)

user_rating_merged['userId'] = 200949
user_rating_merged.head()
final_user_rating = user_rating_merged[['userId', 'movieId', 'Rating']]
final_user_rating.rename(columns={"Rating": "rating"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_user_rating.rename(columns={"Rating": "rating"}, inplace=True)


In [20]:
ratings = pd.concat([ratings, final_user_rating], ignore_index= True)

In [25]:
all_movies = ratings['movieId'].unique()
def recommend_movies(user_id, n_recommendations=10):
    watched_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    movie_predictions = [
        (movie, loaded_model.predict(user_id, movie).est) for movie in all_movies if movie not in watched_movies
    ]
    movie_predictions.sort(key=lambda x: x[1], reverse=True)
    top_movies = movie_predictions[:n_recommendations]
    movies_df = pd.read_csv('../BigMovieData/ml-32m/movies.csv')
    recommended_movies = [(movies_df[movies_df['movieId'] == movie_id]['title'].values[0], rating) 
                          for movie_id, rating in top_movies]
    
    return recommended_movies

user_id = 200949
recommendations = recommend_movies(user_id)
for movie in recommendations:
    print(movie)


('Planet Earth II (2016)', 4.455477720171993)
('The Work of Director Chris Cunningham (2003)', 4.417926060626628)
('I Am So Proud of You (2008)', 4.416756684529041)
('Twelve Angry Men (1954)', 4.4016952499585384)
('Planet Earth (2006)', 4.389601579900872)
('Cosmos', 4.382901230440704)
('Band of Brothers (2001)', 4.373974033992065)
('The Roosevelts: An Intimate History (2014)', 4.373105950965507)
('Dominion (2018)', 4.360389776641251)
('Shawshank Redemption, The (1994)', 4.339912271934893)
