In [1]:
import pandas as pd
import numpy as np
import matrix_factorization_utilities

In [2]:
# Read the training and testing ratings files; both go through the same
# pd.read_csv call with default options, in this order.
raw_training_dataset_df, raw_testing_dataset_df = map(
    pd.read_csv,
    ('movie_ratings_data_set_training.csv', 'movie_ratings_data_set_testing.csv'),
)

In [3]:
# Preview the first five raw training rows (one row per user/movie rating).
raw_training_dataset_df.head(n=5)

Unnamed: 0,user_id,movie_id,value
0,97,30,5
1,31,9,5
2,73,32,3
3,31,3,5
4,73,20,2


In [4]:
# Movie metadata (title and genre), indexed by movie_id.
movies = pd.read_csv(
    "movies.csv",
    index_col="movie_id",
)

In [5]:
# Preview the first five movies.
movies.head(n=5)

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Sheriff 1,"crime drama, western"
2,The Big City Judge 1,legal drama
3,The Sheriff 2,"crime drama, western"
4,Just a Regular Family,reality
5,The Big City Judge 2,legal drama


In [6]:
# Reshape each long-format ratings table into a user x movie matrix:
# one row per user_id, one column per movie_id, NaN where there is no rating.
#
# * values=['value'] pins the pivot to the rating column, so an extra numeric
#   column in the CSV can never turn into additional matrix columns. The list
#   form keeps the ('value', movie_id) column layout the bare call produced.
# * aggfunc='max' is the string alias for the same reduction as np.max; recent
#   pandas releases deprecate passing the NumPy callable. If a user rated the
#   same movie more than once, the highest rating is kept.
ratings_training_df = pd.pivot_table(
    raw_training_dataset_df,
    index='user_id',
    columns='movie_id',
    values=['value'],
    aggfunc='max',
)
ratings_testing_df = pd.pivot_table(
    raw_testing_dataset_df,
    index='user_id',
    columns='movie_id',
    values=['value'],
    aggfunc='max',
)

In [7]:
# Preview the first five users of the pivoted user x movie training matrix.
ratings_training_df.head(n=5)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,4.0,,...,,,,4.0,,,,,,
2,5.0,,,,,,,,,,...,,,,,,,,,,4.0
3,4.0,,5.0,,,,,,,,...,,,,,,,,,,
4,,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,,2.0,5.0,5.0


## Train the model and test its accuracy

In [8]:
# Factor the training ratings matrix into two low-rank matrices, U and M,
# whose product (computed in the next cell) gives the predicted ratings.
#
# `.values` replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; it returns the same NumPy array and works on both
# old and current pandas.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_training_df.values,
    num_features=11,
    regularization_amount=1.1,
)

Optimization terminated successfully.
         Current function value: 315.538580
         Iterations: 1062
         Function evaluations: 1594
         Gradient evaluations: 1594


In [9]:
# Multiply the two factor matrices from the previous cell to get a predicted
# rating for every (user, movie) cell, including the ones with no known rating.
predicted_ratings = np.matmul(U, M)

In [10]:
# Compare the predictions with the known ratings of each split.
#
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
#
# predicted_ratings has the shape of the training matrix, so the testing matrix
# is first re-labelled onto the training matrix's user rows and movie columns.
# This is a no-op when both splits already cover the same users and movies;
# otherwise it stops ratings being compared with the wrong (user, movie) cell.
# Cells missing from the testing split become NaN.
# NOTE(review): presumably RMSE ignores NaN cells, since the training matrix
# already contains them - confirm against matrix_factorization_utilities.
ratings_testing_aligned_df = ratings_testing_df.reindex(
    index=ratings_training_df.index,
    columns=ratings_training_df.columns,
)

rmse_training = matrix_factorization_utilities.RMSE(
    ratings_training_df.values,
    predicted_ratings,
)
rmse_testing = matrix_factorization_utilities.RMSE(
    ratings_testing_aligned_df.values,
    predicted_ratings,
)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))

Training RMSE: 0.24952555662048573
Testing RMSE: 1.2096517096071573


## Train on full data and recommend

In [11]:
# Load every known user rating (one row per user/movie rating)
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles and genres, indexed by movie_id
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a user x movie matrix.
# values=['value'] pins the pivot to the rating column while keeping the
# ('value', movie_id) column layout; aggfunc='max' is the string alias for
# np.max, whose callable form is deprecated in recent pandas releases.
ratings_df = pd.pivot_table(
    raw_dataset_df,
    index='user_id',
    columns='movie_id',
    values=['value'],
    aggfunc='max',
)

# Apply matrix factorization to find the latent features.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# NOTE(review): the saved output for this cell shows 3000 iterations and no
# "Optimization terminated successfully" line, unlike the training-split run,
# so the optimizer appears to have stopped at an iteration cap - confirm
# whether these settings actually converge.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_df.values,
    num_features=15,
    regularization_amount=0.1,
)

# Find all predicted ratings by multiplying U and M matrices
predicted_ratings = np.matmul(U, M)

         Current function value: 32.504358
         Iterations: 3000
         Function evaluations: 4484
         Gradient evaluations: 4484


#### Find similar movies

In [12]:
# Swap the rows and columns of M so that each row holds one movie's features,
# which is easier to work with below.
#
# The guard makes this cell safe to re-run. np.matmul(U, M) in the previous cell
# requires M to have U.shape[1] rows, so that is the layout M has before this
# cell runs; once M has been flipped the condition is false and M is left alone
# instead of being flipped back. If the number of movies ever equals the number
# of features the two layouts look the same and the cell transposes every time,
# exactly as it did before.
if np.shape(M)[0] == np.shape(U)[1]:
    M = np.transpose(M)

In [13]:
# Pick the reference movie whose closest matches we want to find (movie #5 here).
movie_id = 5

# Fetch that movie's metadata row (title and genre) by its id.
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
for line_template, field in (("Movie title: {}", "title"), ("Genre: {}", "genre")):
    print(line_template.format(movie_information[field]))

We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama


In [14]:
# Get the features found via matrix factorization for the chosen movie.
#
# After the transpose above, the rows of M follow the movie_id columns of
# ratings_df (the matrix that was factorized). Look the row up by label rather
# than with `movie_id - 1`, which only works when ids are contiguous and start
# at 1; an id with no column in the matrix now raises KeyError.
movie_row = ratings_df.columns.get_level_values('movie_id').get_loc(movie_id)
current_movie_features = M[movie_row]

print("The attributes for this movie are:")
print(current_movie_features)

The attributes for this movie are:
[ 1.01265846 -0.84600746 -0.75000191  0.63657191 -0.97930488 -1.67208911
 -0.95850372  0.28101059 -0.30078403 -0.05226875 -0.10147958 -0.23882663
 -0.03060674 -0.83667771  1.07977951]


In [15]:
# Rank every movie by how far its features are from the chosen movie's features
# (sum of absolute per-feature differences).

# Per-feature gap between each movie (one row of M) and the chosen movie
difference = np.subtract(M, current_movie_features)

# Drop the sign so gaps in opposite directions cannot cancel out
absolute_difference = np.absolute(difference)

# Collapse each movie's row of gaps into a single 'difference score'
total_difference = absolute_difference.sum(axis=1)

# Store the score next to each movie's title and genre (adds a column to `movies`)
movies['difference_score'] = total_difference

# Lowest score first: least different to most different
sorted_movie_list = movies.sort_values(by='difference_score')

# Show the five closest movies; the chosen movie itself comes first with score 0
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']].iloc[:5])

The five most similar movies are:
                             title  difference_score
movie_id                                            
5             The Big City Judge 2          0.000000
10         Surrounded by Zombies 1          2.774825
8         Sci-Fi Murder Detectives          3.496417
9                      Biker Gangs          3.523295
3                    The Sheriff 2          4.038590


#### Recommend for a given user

In [16]:
# Ask which user to build recommendations for.
# NOTE(review): input() blocks "Restart & Run All"; consider a constant in a
# config cell if this notebook needs to run unattended.
print("Enter a user_id to get recommendations (Between 1 and 100):")
user_id_to_search = int(input())

# Reject ids that have no row in the ratings matrix. Later cells index the
# predictions with `user_id_to_search - 1`, so 0 or a negative number would wrap
# around and silently return a different user's recommendations.
if user_id_to_search not in ratings_df.index:
    raise ValueError(
        "Unknown user_id {}: no ratings found for this user".format(user_id_to_search)
    )

Enter a user_id to get recommendations (Between 1 and 100):
23


In [17]:
print("Movies previously reviewed by user_id {}:".format(user_id_to_search))

# Keep only this user's rating rows, then attach each movie's title and genre
# by matching the row's movie_id against the index of movies_df.
is_selected_user = raw_dataset_df['user_id'].eq(user_id_to_search)
reviewed_movies_df = raw_dataset_df.loc[is_selected_user].join(movies_df, on='movie_id')

print(reviewed_movies_df[['title', 'genre', 'value']])

Movies previously reviewed by user_id 23:
                     title                      genre  value
154  The Serious Detective            detective drama      5
155           Sports Nerds                     comedy      3
156      Behind the Scenes               comedy-drama      4
157     Post-Apocalyptia 1  sci-fi, thriller, mystery      4
158         The Spy Family                  spy drama      5
159  My Complicated Family               comedy-drama      5
160   The Big City Judge 1                legal drama      4
161           Drugs & Guns                crime drama      4


In [18]:
print("Movies we will recommend:")

# The rows of predicted_ratings follow the user_id index of ratings_df (the
# matrix that was factorized). Look the row up by label rather than with
# `user_id_to_search - 1`, which only works for contiguous ids starting at 1 and
# wraps around to the last user for 0; an unknown id now raises KeyError.
user_row = ratings_df.index.get_loc(user_id_to_search)
user_ratings = predicted_ratings[user_row]

# The columns of predicted_ratings follow the movie_id columns of ratings_df, so
# label the predictions with those ids and let pandas align them with movies_df.
# A movie listed in movies.csv that nobody has rated gets NaN (sorted last)
# instead of shifting every later rating onto the wrong movie.
rated_movie_ids = ratings_df.columns.get_level_values('movie_id')
movies_df['rating'] = pd.Series(user_ratings, index=rated_movie_ids)

# Drop the movies this user has already reviewed, then rank the rest by
# predicted rating, best first.
already_reviewed = reviewed_movies_df['movie_id']
recommended_df = movies_df[~movies_df.index.isin(already_reviewed)]
recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

print(recommended_df[['title', 'genre', 'rating']].head(5))

Movies we will recommend:
                              title                     genre    rating
movie_id                                                               
13                    The Sheriff 3      crime drama, western  4.847849
21                  Political Gaffs  comedy, political satire  4.463688
19        Fake News about Fake News            satire, comedy  4.334878
20                       Buy My App                    comedy  4.239815
6                 Attack on Earth 1            sci-fi, action  4.152353
