In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle
from sklearn.decomposition import NMF

## Let's collect some recommendations for a new user who loves Disney movies!

In [2]:
# Ratings of the new user: the input for calculating recommendations.
# Titles in the comments are the ones movies.csv lists for each movieId.
query = {
    # movieId: rating
    4470:5,   # "Ariel (1988)", genre Drama -- NOTE(review): confirm this is the intended movie
    48:5,     # Pocahontas (1995)
    594:5,    # Snow White and the Seven Dwarfs (1937)
    27619:5,  # Lion King 1½, The (2004)
    152081:5, # Zootopia (2016)
    595:5,    # Beauty and the Beast (1991)
    616:5,    # Aristocats, The (1970)
    1029:5    # Dumbo (1941)
}


# movieIds used for testing the recommender after getting some recommendations
relevant_items = [
    596, 4016, 1033, 134853, 
    2018, 588, 364, 26999, 75395,2085, 
    1907, 2078, 1032, 177765   
]

In [3]:
# Load the MovieLens "ml-latest-small" ratings and movie metadata.
# The data folder sits next to this notebook, so a relative path keeps the
# notebook runnable on any machine (no user-specific absolute path).
DATA_DIR = 'ml-latest-small'
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# which movies are in the query?
movies.set_index('movieId').loc[query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4470,Ariel (1988),Drama
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical


## 1. Model Development

In [5]:
# calculate the number of ratings per movie
ratings_per_movie = ratings.groupby('movieId')['userId'].count()
ratings_per_movie

movieId
1         215
2         110
3          52
4           7
5          49
         ... 
193581      1
193583      1
193585      1
193587      1
193609      1
Name: userId, Length: 9724, dtype: int64

In [6]:
## keep only movies rated by more than 20 users (the filter is a strict `> 20`)
popular_movies = ratings_per_movie.loc[ratings_per_movie > 20]
popular_movies

movieId
1         215
2         110
3          52
5          49
6         102
         ... 
148626     26
152081     32
164179     26
166528     27
168252     25
Name: userId, Length: 1235, dtype: int64

In [7]:
def recommend_popular(query, ratings, k=10):
    """
    Recommend the k most-rated movies that the user has not rated yet.

    Parameters
    ----------
    query : dict
        Maps movieId -> rating for the movies the user has already rated.
    ratings : pandas.DataFrame
        Long-format ratings with at least the columns 'movieId' and 'userId'.
    k : int, default 10
        Maximum number of movie ids to return.

    Returns
    -------
    list
        Up to k movieIds, ordered from the most to the fewest ratings.
    """
    ratings_per_movie = ratings.groupby('movieId')['userId'].count()
    # same popularity filter as used elsewhere in this notebook: more than 20 ratings
    popular_movies = ratings_per_movie.loc[ratings_per_movie > 20]

    # never recommend something the user has already rated;
    # errors='ignore' because a rated movie may not be popular at all
    unseen = popular_movies.drop(index=list(query), errors='ignore')

    # a stable sort keeps the ordering of ties deterministic
    ranked = unseen.sort_values(ascending=False, kind='mergesort')
    return ranked.head(k).index.tolist()

In [8]:
# restrict the ratings to the popular movies only; selecting by the movieId
# index and resetting it afterwards turns movieId back into a regular column
ratings = (
    ratings
    .set_index('movieId')
    .loc[popular_movies.index]
    .reset_index()
)
ratings

Unnamed: 0,movieId,userId,rating,timestamp
0,1,1,4.0,964982703
1,1,5,4.0,847434962
2,1,7,4.5,1106635946
3,1,15,2.5,1510577970
4,1,17,4.5,1305696483
...,...,...,...,...
66653,168252,567,4.0,1525283936
66654,168252,586,5.0,1529899336
66655,168252,596,5.0,1535627159
66656,168252,599,3.5,1498529615


In [9]:
# sparse user-item rating matrix: the raw userId is used as the row position and
# the raw movieId as the column position, with the rating as the stored value
R = csr_matrix(
    (ratings['rating'], (ratings['userId'], ratings['movieId']))
)
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

## Training
* initialize the model
* fit it on the user item matrix
* optionally, tune the number of components (hidden features): what happens if you set the number of components to a  really low number?
* decrease the tol to train for a longer time

In [10]:
# initialize the unsupervised model
# 55 hidden features, F=55
model = NMF(n_components=55, init='nndsvd', max_iter=10000, tol=0.01, verbose=2)

In [11]:
# fit it to the user-item rating matrix
model.fit(R)

# P and Q are initialized with NNDSVD (the model was created with init='nndsvd'), not with random values
# fitting then iterates and optimizes the values stored in P and Q

violation: 1.0
violation: 0.312688159751157
violation: 0.18594517446677739
violation: 0.1359703680290255
violation: 0.10912099268744488
violation: 0.09253770969101086
violation: 0.08192192262335489
violation: 0.07439645545108457
violation: 0.0681743652603671
violation: 0.06317237522637863
violation: 0.05822162355908504
violation: 0.054244956201236205
violation: 0.05094453899605004
violation: 0.04758270155268873
violation: 0.044400052680260205
violation: 0.041058344404514446
violation: 0.03833746489223955
violation: 0.035884135061409285
violation: 0.03379007632612116
violation: 0.031663441267238225
violation: 0.029596151503196517
violation: 0.027872135215161832
violation: 0.026519686584410798
violation: 0.025499170417113432
violation: 0.024429681871017506
violation: 0.02341154517401604
violation: 0.02238346991032432
violation: 0.021341784759208175
violation: 0.020617190974163323
violation: 0.019956716153993366
violation: 0.019317617216910773
violation: 0.018731963102511802
violation: 0.

NMF(init='nndsvd', max_iter=10000, n_components=55, tol=0.01, verbose=2)

### Model inspection

In [12]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

In [13]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

#### the hidden features

In [14]:
model.components_.shape

(55, 168253)

In [15]:
# user-'genre' matrix [611x55]
P = model.transform(R)

# movie-'genre' matrix [55x168253]
Q = model.components_

P.shape, Q.shape

violation: 1.0
violation: 0.746415428484573
violation: 0.27618465611007953
violation: 0.11186081809192554
violation: 0.05364920429907
violation: 0.029684225769965
violation: 0.01766177658215403
violation: 0.011064545367471533
violation: 0.007412201725745626
Converged at iteration 10


((611, 55), (55, 168253))

In [16]:
# user with id 1: sparse format
R[1,:]

<1x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 180 stored elements in Compressed Sparse Row format>

In [17]:
# user with id 1: dense embedding
P[1, :]

array([0.        , 0.40796061, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.11501499, 0.53934925, 0.        , 0.        ,
       0.        , 0.        , 1.56947415, 0.        , 0.        ,
       0.35082989, 0.        , 0.        , 0.        , 0.43392417,
       0.        , 0.65230331, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.68878068, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.72333418, 0.        ,
       0.        , 1.0936821 , 0.        , 0.        , 0.59283708,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [18]:
# dense embedding for movie with id 1
Q[:, 1]

array([0.31036501, 1.22737335, 0.73113405, 0.        , 0.34440536,
       0.36545458, 0.        , 0.36707707, 0.        , 0.37988131,
       1.09694117, 3.37449219, 0.64708839, 0.62159093, 0.48272676,
       0.03273423, 0.07252813, 0.85048476, 0.        , 0.03453774,
       0.63974333, 0.35613006, 0.51867604, 0.        , 0.        ,
       0.64452738, 0.        , 0.25221844, 1.55345229, 0.        ,
       0.30336938, 0.37910218, 0.10264947, 0.        , 0.39816343,
       0.        , 0.18503625, 0.        , 0.13437929, 0.        ,
       0.        , 0.16161771, 0.15465435, 0.32895284, 0.25427941,
       0.29150529, 0.00696596, 0.2601212 , 0.36870314, 0.01817937,
       0.        , 0.37738219, 0.14364054, 0.50923273, 0.        ])

In [19]:
# R -> encoding -> P -> decoding -> Rhat
R_hat = model.inverse_transform(model.transform(R))

violation: 1.0
violation: 0.746415428484573
violation: 0.27618465611007953
violation: 0.11186081809192554
violation: 0.05364920429907
violation: 0.029684225769965
violation: 0.01766177658215403
violation: 0.011064545367471533
violation: 0.007412201725745626
Converged at iteration 10


In [20]:
R_hat

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.05343751, 1.11304938, ..., 0.        , 0.        ,
        0.03003791],
       [0.        , 0.27881919, 0.01425878, ..., 0.        , 0.        ,
        0.24490577],
       ...,
       [0.        , 2.35311775, 0.93905417, ..., 0.        , 0.        ,
        0.14926988],
       [0.        , 0.7881344 , 0.75212741, ..., 0.        , 0.        ,
        0.02828104],
       [0.        , 5.01541203, 1.22867506, ..., 0.        , 0.        ,
        5.76422399]])

#### the reconstruction error

$$
L(R, \hat{R}) = \sqrt{\sum_i\sum_j(R_{ij}-\hat{R}_{ij})^2} = \sqrt{\sum_i\sum_j\left(R_{ij}-(PQ)_{ij}\right)^2}
$$

In [21]:
R.shape, R_hat.shape

((611, 168253), (611, 168253))

In [22]:
# reconstruction error
np.sqrt(np.sum(np.square(R - R_hat)))

628.8392748632737

In [23]:
model.reconstruction_err_

628.6986813163818

## Model deployment: Make recommendations for a new user

In [24]:
with open('./nmf_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [25]:
!ls

[34m10_2_PCA[m[m
[34m10_3_CollaborativeFilteringwithMatrixFactorization[m[m
[34m10_4_ NeighborhoodbasedCollaborativeFiltering[m[m
[34m10_5_Web_App[m[m
[34m10_6_Clustering[m[m
Complete Sentences.ipynb
[34mProject[m[m
Project-RecommenderSystem-try2.ipynb
Project-RecommenderSystem-try3.ipynb
Unsupervised learning.odp
Week_10.pdf
[34mdeploy_webapp_arjun[m[m
exploratory_analysis_worksheet.ipynb
exploratory_analysis_worksheet_solved.ipynb
[34mml-latest-small[m[m
nmf_recommender.pkl
recommender_systems_intro.ipynb
recommender_systems_intro_filled.ipynb


### Read the model from hard drive

In [26]:
with open('./nmf_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [27]:
model.reconstruction_err_

628.6986813163818

In [28]:
query

{4470: 5, 48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

In [29]:
# build a one-row sparse vector for the new user

col_ind = list(query.keys())      # column positions = movieIds the user rated
data = list(query.values())       # the ratings given by the new user
row_ind = [0] * len(data)         # every rating goes into the single row 0

# same number of columns as the training matrix R
user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
user_vec

<1x168253 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

### Calculate the score

1. transform the user vector to its dense representation (encoding) 
2. inverse transform the dense vector into the sparse representation (decoding)

$$
\hat{r}_{ij} = p_i' \cdot q_j 
$$

In [30]:
# user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat


scores = model.inverse_transform(model.transform(user_vec))

# convert to a pandas series
scores = pd.Series(scores[0])
scores

violation: 1.0
violation: 1.3521251263037153
violation: 0.023571104993659144
violation: 0.0037322356319788677
Converged at iteration 5


0         0.000000
1         0.404354
2         0.320013
3         0.093926
4         0.000000
            ...   
168248    0.000000
168249    0.000000
168250    0.000000
168251    0.000000
168252    0.027551
Length: 168253, dtype: float64

## Ranking

In [31]:
# give a zero score to movies the user has already seen
scores[query.keys()] = 0

In [32]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

364      0.809879
588      0.660973
2081     0.610272
596      0.586351
1073     0.582943
           ...   
56634    0.000000
56635    0.000000
56636    0.000000
56637    0.000000
84126    0.000000
Length: 168253, dtype: float64

In [33]:
# get the movieIds of the top 3 entries
recommendations = scores.head(3).index
recommendations

Int64Index([364, 588, 2081], dtype='int64')

In [34]:
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance


In [35]:
## Short way: Use function for all the steps of deploying model on new user

In [36]:
# collaborative filtering = look at ratings only!
def recommend_nmf(query, model, ratings, k=10):
    """
    Recommend the top k unseen movies for a new user with a trained NMF model.

    Parameters
    ----------
    query : dict
        Maps movieId -> rating for the movies the new user has rated.
    model : fitted NMF-like estimator
        Must expose `components_`, `transform` and `inverse_transform`.
        Column j of the model is treated as movieId j.
    ratings : any
        Not used by the computation; kept so existing calls keep working.
    k : int, default 10
        Maximum number of movie ids to return.

    Returns
    -------
    pandas.Index
        Up to k movieIds, ordered from the highest to the lowest score.
        Movies contained in `query` are never returned.

    Raises
    ------
    ValueError
        If `query` contains a movieId outside the columns known to the model.
    """
    # 1. candidate generation

    # take the number of columns from the model itself instead of a global matrix
    n_items = model.components_.shape[1]

    seen = list(query.keys())        # the columns (=movieId) of the ratings
    unknown = [movie_id for movie_id in seen if not 0 <= movie_id < n_items]
    if unknown:
        raise ValueError(f"movieIds not known to the model: {unknown}")

    # construct a user vector
    data = list(query.values())      # the ratings of the new user
    row_ind = [0] * len(data)        # we use just a single row 0 for this user
    user_vec = csr_matrix((data, (row_ind, seen)), shape=(1, n_items))

    # 2. scoring
    # user_vec -> encoding -> dense embedding -> decoding -> predicted scores
    scores = model.inverse_transform(model.transform(user_vec))
    # convert to a pandas series indexed by column position (= movieId)
    scores = pd.Series(scores[0])

    # 3. ranking

    # remove movies already seen by the user; merely zeroing them would let
    # them tie with other zero-score movies and re-enter the result for large k
    scores = scores.drop(index=seen)
    scores = scores.sort_values(ascending=False)

    # return the top-k highest scored movie ids
    return scores.head(k).index

In [37]:
top_recommendations = recommend_nmf(query, model, ratings, k=3)
movies.set_index('movieId').loc[top_recommendations]

violation: 1.0
violation: 1.3521251263037153
violation: 0.023571104993659144
violation: 0.0037322356319788677
Converged at iteration 5


Unnamed: 0,title,genres
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance


In [None]:
def recommend_test_nmf(movies, user_rating, model, k=5):
    """
    Recommend the titles of the top k unseen movies for a new user, based on a
    trained NMF model.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie metadata with a 'title' column and the movieId either as a
        'movieId' column or as the index.
    user_rating : dict
        Maps movieId -> rating for the movies the new user has rated.
    model : fitted NMF-like estimator
        Must expose `components_`, `transform` and `inverse_transform`.
        Column j of the model is treated as movieId j.
    k : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to k movie titles, ordered from the highest to the lowest score.
    """
    # 1. candidate generation
    # construct a user vector; the width comes from the model, not a constant
    n_items = model.components_.shape[1]
    seen = list(user_rating.keys())     # the columns (=movieId) of the ratings
    data = list(user_rating.values())   # the ratings of the new user
    row_ind = [0] * len(data)           # we use just a single row 0 for this user
    user_vec = csr_matrix((data, (row_ind, seen)), shape=(1, n_items))

    # 2. scoring
    # calculate the score with the NMF model
    scores = model.inverse_transform(model.transform(user_vec))
    # convert to a pandas series indexed by column position (= movieId)
    scores = pd.Series(scores[0])

    # title lookup keyed by movieId; a plain movies.loc[...] on a freshly read
    # csv would look the ids up as row numbers and return the wrong movies
    if 'movieId' in movies.columns:
        titles = movies.set_index('movieId')['title']
    else:
        # assumes the frame is already indexed by movieId
        titles = movies['title']
    titles = titles[~titles.index.duplicated(keep='first')]

    # 3. ranking
    # remove movies already seen by the user
    scores = scores.drop(index=seen)
    # only ids that exist in the movie table can be turned into a title
    scores = scores[scores.index.isin(titles.index)]
    # sort the scores from high to low
    scores = scores.sort_values(ascending=False)

    # return the titles of the top-k highest scored movies
    recommendations = scores.head(k).index
    return list(titles.loc[recommendations])





In [None]:
# re-load the movie metadata; the path is relative to the notebook directory
# so the cell does not depend on one user's home directory
movies = pd.read_csv('ml-latest-small/movies.csv')

In [None]:
from thefuzz import process

In [None]:
def match_movie_title(input_title, movie_titles):
    """
    Return the entry of `movie_titles` that `process.extractOne` selects as
    the best fuzzy match for `input_title`.
    """
    # extractOne returns a result whose first element is the matched choice
    best_match = process.extractOne(input_title, movie_titles)
    return best_match[0]

In [None]:
def lookup_movie_name(movies, user_movie_titles):
    """
    Convert movie titles typed by a user into movieIds.

    Each input title is first mapped to an existing title with
    `match_movie_title`; the ids of all movies carrying that title are then
    collected.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie metadata with a 'title' column and the movieId either as a
        'movieId' column or as the index.
    user_movie_titles : iterable of str
        Titles as entered by the user.

    Returns
    -------
    list
        movieIds in the order of `user_movie_titles` (not in movie-table
        order), without duplicates. Empty if no titles are given.
    """
    # match title to movieId

    movie_titles = list(movies["title"])
    intended_movies = [match_movie_title(title, movie_titles) for title in user_movie_titles]

    # brings movieId back as a column when it is the index
    movies = movies.reset_index()

    movie_id = []
    collected = set()
    for title in intended_movies:
        # a title can belong to more than one movieId; keep all of them
        for matched_id in movies.loc[movies["title"] == title, "movieId"].tolist():
            if matched_id not in collected:
                collected.add(matched_id)
                movie_id.append(matched_id)
    return movie_id