#### Let's collect some recommendations for a new user who loves Disney movies!

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
import pickle

In [8]:
# Ratings supplied by the new user (movieId -> rating);
# this is the input for calculating recommendations.
query = dict.fromkeys(
    [4470, 48, 594, 27619, 152081, 595, 616, 1029],
    5,
)

# movieIds treated as relevant for this user; used for testing the
# recommender after some recommendations have been produced
relevant_items = [
    596,
    4016,
    1033,
    134853,
    2018,
    588,
    364,
    26999,
    75395,
    2085,
    1907,
    2078,
    1032,
    177765,
]

# Non Negative Matrix Factorization for Recommender Systems
---



In [9]:
# Load the ratings table and the movie table from the 'ml-latest-small'
# directory; both paths are relative to the current working directory.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [10]:
# show the movie rows (indexed by movieId) for every movieId the query contains
(
    movies
    .set_index('movieId')
    .loc[list(query)]
)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4470,Ariel (1988),Drama
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical


---
## 1. Model Development

### Preprocessing

- filter out movies rated by fewer than 20 / 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [11]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
# calculate the number of ratings per movie:
# group the rating rows by movieId and count the userId entries in each group
ratings_per_movie = ratings.groupby('movieId')['userId'].count()
ratings_per_movie

movieId
1         215
2         110
3          52
4           7
5          49
         ... 
193581      1
193583      1
193585      1
193587      1
193609      1
Name: userId, Length: 9724, dtype: int64

In [13]:
# keep only the movies that received strictly more than 20 ratings;
# the index of the result holds the movieIds of these "popular" movies
is_popular = ratings_per_movie > 20
popular_movies = ratings_per_movie[is_popular]
popular_movies

movieId
1         215
2         110
3          52
5          49
6         102
         ... 
148626     26
152081     32
164179     26
166528     27
168252     25
Name: userId, Length: 1235, dtype: int64

In [14]:
# restrict the ratings table to the popular movies:
# index by movieId, select the popular movieIds, then turn movieId back into a column
ratings = (
    ratings
    .set_index('movieId')
    .loc[popular_movies.index]
    .reset_index()
)
ratings

Unnamed: 0,movieId,userId,rating,timestamp
0,1,1,4.0,964982703
1,1,5,4.0,847434962
2,1,7,4.5,1106635946
3,1,15,2.5,1510577970
4,1,17,4.5,1305696483
...,...,...,...,...
66653,168252,567,4.0,1525283936
66654,168252,586,5.0,1529899336
66655,168252,596,5.0,1535627159
66656,168252,599,3.5,1498529615


In [15]:
# user_item matrix (dense pivot-table variant)
# NOTE: the code below is wrapped in a string literal, so it is not executed;
# the cell only displays the string. The sparse matrix R is built from
# `ratings` directly in the next code cell.
'''user_item = pd.pivot_table(ratings, 
                           values='rating', 
                           index='userId', 
                           columns='movieId'
)
user_item'''

"user_item = pd.pivot_table(ratings, \n                           values='rating', \n                           index='userId', \n                           columns='movieId'\n)\nuser_item"

In [16]:
# Build the sparse user-item rating matrix from the long-format ratings table.
# csr_matrix accepts (data, (row_ind, col_ind)); the raw userId / movieId values
# are used as row / column positions, so column j holds the ratings of movieId j.
user_rows = ratings['userId']
movie_cols = ratings['movieId']
R = csr_matrix((ratings['rating'], (user_rows, movie_cols)))
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

### Training

- initialize the model
- fit it on the user item matrix


In [17]:
# initialize the unsupervised model
# 55 hidden features, F=55
# verbose=2 makes the solver print its progress while fitting / transforming
model = NMF(n_components=55, init='nndsvd', max_iter=10000, tol=0.01, verbose=2)

# fit it to the user-item rating matrix
model.fit(R)

# fitting factorizes R into two non-negative matrices P and Q:
# P and Q are initialized with the 'nndsvd' scheme chosen above (not with random values),
# then the values stored in P and Q are optimized iteratively

violation: 1.0
violation: 0.3198407444776186
violation: 0.18560236151196122
violation: 0.13793176709705945
violation: 0.11262875034790074
violation: 0.09506722095218485
violation: 0.08272177587280928
violation: 0.07281883187700439
violation: 0.06385144916043073
violation: 0.05660484722466009
violation: 0.05112193173010419
violation: 0.04681211487182119
violation: 0.04385023840768013
violation: 0.0421868583254791
violation: 0.04133260886277145
violation: 0.04064494044902701
violation: 0.03961155980527065
violation: 0.037992501545404705
violation: 0.03583699426795149
violation: 0.033581890989523557
violation: 0.03149230602099127
violation: 0.029415048278976307
violation: 0.02777149885112388
violation: 0.02632959750591218
violation: 0.02512444539423292
violation: 0.02406433475541025
violation: 0.023014277912057048
violation: 0.02191683843588806
violation: 0.02073828130249116
violation: 0.01974226233722642
violation: 0.018716395628841313
violation: 0.017653014497686137
violation: 0.0168639

NMF(init='nndsvd', max_iter=10000, n_components=55, tol=0.01, verbose=2)

### Model inspection

In [18]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

### Create Rhat 

In [19]:
#####this is the one liner for calculating Rhat

# R -> encoding -> P -> decoding -> Rhat
R_hat = model.inverse_transform(model.transform(R))

violation: 1.0
violation: 0.7615559462392816
violation: 0.27503185222575577
violation: 0.11173440074793993
violation: 0.05386357992307483
violation: 0.029376751626224676
violation: 0.017376665463083963
violation: 0.01163498181070659
violation: 0.007998169482135225
Converged at iteration 10


In [20]:
R_hat

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.50574693, 0.01504283, ..., 0.        , 0.        ,
        0.02628296],
       [0.        , 0.05102188, 0.0547873 , ..., 0.        , 0.        ,
        0.18777417],
       ...,
       [0.        , 2.46575991, 1.66882392, ..., 0.        , 0.        ,
        0.07564565],
       [0.        , 0.90262613, 0.52295751, ..., 0.        , 0.        ,
        0.04024933],
       [0.        , 4.23815562, 0.44559897, ..., 0.        , 0.        ,
        5.81622743]])

#### the reconstruction error

$$
L(R, \hat{R}) = \sqrt{\sum_i\sum_j(R_{ij}-\hat{R}_{ij})^2} = \sqrt{\sum_i\sum_j\left(R_{ij}-(PQ)_{ij}\right)^2}
$$

In [21]:
R.shape, R_hat.shape

((611, 168253), (611, 168253))

In [22]:
# reconstruction error: square root of the sum of the squared element-wise
# differences between R and R_hat (the formula shown above)
np.sqrt(np.sum(np.square(R - R_hat)))

629.5886239395618

In [23]:
model.reconstruction_err_

629.4625703869821

---
## 2. Model deployment: Make recommendations for a new user

### Save the trained model on your hard drive

In [24]:
# serialize the fitted model to 'nmf_recommender.pkl' in the current directory;
# the `with` block closes the file once the dump is done
with open('./nmf_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [25]:
!ls

matrix_factorization_filled.ipynb  ml-latest-small  nmf_recommender.pkl


### Read the model from hard drive

In [26]:
# read the model back from 'nmf_recommender.pkl' (this rebinds `model`)
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load pickle files that come from a trusted source.
with open('./nmf_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [27]:
model.reconstruction_err_

629.4625703869821

### Receive a user query

In [28]:
query

{4470: 5, 48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

In [29]:
#R[1,:]

### Construct a user vector

The new user's vector must have the same format as the training data: a single row with one column per movieId, exactly as in `R`.

In [30]:
#list(query.values())

In [31]:
# ingredients for the sparse vector of the new user
col_ind = list(query)                               # the columns (=movieId) of the ratings
data = [query[movie_id] for movie_id in col_ind]    # the ratings of the new user
row_ind = [0 for _ in data]                         # everything goes into the single row 0
data, row_ind, col_ind

([5, 5, 5, 5, 5, 5, 5, 5],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [4470, 48, 594, 27619, 152081, 595, 616, 1029])

In [32]:
# new user vector: one row with as many columns as the training matrix R,
# so that it has the same format as the training data
n_items = R.shape[1]
user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, n_items))
user_vec

<1x168253 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [33]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

### Calculate the score

1. transform the user vector to its dense representation (encoding) 
2. inverse transform the dense vector into the sparse representation (decoding)

$$
\hat{r}_{ij} = p_i' \cdot q_j 
$$

In [34]:
# encoding: user_vec -> p_user_vec
p_user_vec = model.transform(user_vec)

# decoding: p_user_vec -> user_vec_hat
user_vec_hat = model.inverse_transform(p_user_vec)

# the single row of user_vec_hat as a pandas series (position = movieId)
scores = pd.Series(user_vec_hat[0])
scores

violation: 1.0
violation: 0.9987903118950021
violation: 0.018533304108427837
violation: 0.0017041118715157334
Converged at iteration 5


0         0.000000
1         0.409770
2         0.301097
3         0.023311
4         0.000000
            ...   
168248    0.000000
168249    0.000000
168250    0.000000
168251    0.000000
168252    0.019530
Length: 168253, dtype: float64

### Give recommendations

In [35]:
query.keys()

dict_keys([4470, 48, 594, 27619, 152081, 595, 616, 1029])

In [36]:
# movies the user has already rated must not be recommended again:
# set their score to zero
seen_movie_ids = list(query)
scores.loc[seen_movie_ids] = 0

In [37]:
# sort the scores from high to low;
# the index of the sorted series still carries the movieIds
scores = scores.sort_values(ascending=False)
scores

364      0.973785
588      0.817745
2081     0.705287
34       0.676333
596      0.609434
           ...   
56557    0.000000
56558    0.000000
56559    0.000000
56560    0.000000
84126    0.000000
Length: 168253, dtype: float64

In [38]:
# the first 10 index labels of the sorted scores are the recommended movieIds
recommendations = scores.index[:10]
recommendations

Int64Index([364, 588, 2081, 34, 596, 1907, 2078, 1022, 1028, 3114], dtype='int64')

In [39]:
# look up title and genres of the recommended movieIds
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
34,Babe (1995),Children|Drama
596,Pinocchio (1940),Animation|Children|Fantasy|Musical
1907,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...
2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
1022,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance
1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy


In [40]:
# recall@10: fraction of all relevant items that appear in the top 10 recommendations


In [41]:
# precision@10: fraction of recommendations that are relevant


---
## 3. Project Task: NMF recommender function

1. Collect different user queries for "typical" users (e.g. a horror movie buff) and evaluate the algorithm
2. Set the number of components to a very low number (e.g. 2). What happens to the recommendations?
3. Implement a recommender function that recommends movies to a new user based on the NMF model!

Note: Training of the model happens outside of the function! Don't retrain the model every time you want to calculate recommendations for a user.


In [55]:
# collaborative filtering = look at ratings only!
def recommend_nmf(query, model, ratings, k=10):
    """
    Recommend the top k movies for a new user based on a trained NMF model.

    The model is only applied here (transform / inverse_transform); it is
    never retrained.

    Parameters
    ----------
    query : dict
        Ratings of the new user as {movieId: rating}. Each movieId is used as a
        column position, so it must be smaller than the number of columns the
        model was fitted on (csr_matrix raises a ValueError otherwise).
    model : fitted NMF-like estimator
        Must provide `components_`, `transform` and `inverse_transform`.
    ratings : object
        Not used by the calculation; kept so that existing calls keep working.
    k : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Index
        The movieIds with the k highest scores, best first.
    """
    # 1. candidate generation

    # construct a user vector with one column per item known to the model;
    # the width is taken from the model itself instead of a notebook global
    n_items = model.components_.shape[1]
    seen_movie_ids = list(query.keys())      # the columns (=movieId) of the ratings
    user_ratings = list(query.values())      # the ratings of the new user
    row_ind = [0] * len(user_ratings)        # a single row 0 for this user
    user_vec = csr_matrix(
        (user_ratings, (row_ind, seen_movie_ids)),
        shape=(1, n_items),
    )

    # 2. scoring

    # user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat
    user_vec_hat = model.inverse_transform(model.transform(user_vec))
    # convert to a pandas series (position = movieId)
    scores = pd.Series(user_vec_hat[0])

    # 3. ranking

    # give a zero score to movies the user has already seen
    scores.loc[seen_movie_ids] = 0
    # sort from high to low so that head(k) really picks the best-scored movies
    ranked = scores.sort_values(ascending=False)

    # return the top-k highest rated movie ids
    return ranked.head(k).index
# NOTE(review): this line does not call recommend_nmf(); it looks up the
# module-level `recommendations` that was computed in an earlier cell.
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
34,Babe (1995),Children|Drama
596,Pinocchio (1940),Animation|Children|Fantasy|Musical
1907,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...
2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
1022,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance
1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy


In [None]:
# recommender.py
# from recommender import recommend_nmf