In [208]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle

In [209]:
# DRY - Don't repeat yourself!
# place a utils.py in the same folder as the notebook
# from utils import example_query, create_user_vector, create_rating_matrix

# for calculating recommendations
example_query = {
    # movieId, rating
    # 4470:5, 
    48:5,
    594:5,
    27619:5,
    152081:5,
    595:5,
    616:5,
    1029:5
}

# Neighborhood Based Filtering for Recommender Systems
---

> The key idea is that the rating of u for a new item i is likely to be similar to that of another user v,if u and v have rated other items in a similar way. Likewise,u is likely to rate two items i and j in a similar fashion, if other users have given similar ratings to these two items.

##### Use ratings of similar users (or items) to predict what you like! But: How can we measure similarity/distance? 

- Cosine Similariy/Distance (works good for sparse high dimensional data)
- Jaccard Similarity/Distance (only works on binarized vectors)
- Pearson Correlation/Distance (cosine similarity on centered vectors)
- Euclidian Distance/Similarity (not good for sparse high dimensional data)

You find many more metrics here:https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

In [210]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [211]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [212]:
#movies['genres'] = movies['genres'].str.split('|')

In [213]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [214]:
genres = []

In [215]:
for i,item in enumerate(movies['genres'].loc[:]):
    genres.extend(item.split('|'))

In [216]:
genres = set(genres)
genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [217]:
for genre in genres:
    #print(genre in movies['genres'].loc[0])
    movies[genre] = movies['genres'].apply(lambda x: 1 if x.find(genre) !=-1 else 0)

In [218]:
movies # use this to create movies distances and suggest with this, not with score!

Unnamed: 0,movieId,title,genres,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,...,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,1,...,0,0,0,1,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
9739,193585,Flint (2017),Drama,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
## 1. Model Development

### Preprocessing (same as for the NMF model!)

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [41]:
ratings['movieId'].nunique() 

9724

In [42]:
R = ratings.pivot(index='userId',columns='movieId',values='rating')

In [43]:
R.fillna(0, inplace=True)

In [44]:
R

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
#R = csr_matrix(R)



R.shape

(610, 9724)

### Training (new!)

- initialize the model: pick a distance metric
- fit it to the user item matrix: only stores the data and doesn't do further. all the calculations take place later!

In [46]:
# which metrics can we use for sparse matrics?
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [47]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')#'manhattan'

# fit it to the user-item rating matrix
model.fit(R)

NearestNeighbors(metric='cosine')

### Save the trained model on your hard drive

In [48]:
with open('./distance_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

---
## 2. Model deployment: Make recommendations for a new user

### Read the model from hard drive

In [49]:
with open('./distance_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [50]:
# if you have loaded the model inside the utils.py you can also write:
# from utils import model

### Receive a user query

In [51]:
R[1029]

userId
1      5.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    0.0
607    0.0
608    0.0
609    0.0
610    0.0
Name: 1029, Length: 610, dtype: float64

In [52]:
example_query

{48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

In [53]:
movies.set_index('movieId').loc[example_query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical


### Construct a user vector (same as before!)

we need the same input as was used during training!

In [54]:
R.shape

(610, 9724)

In [55]:
example_query.keys()

dict_keys([48, 594, 27619, 152081, 595, 616, 1029])

In [56]:
R[[48, 594, 27619, 152081, 595, 616, 1029]]

movieId,48,594,27619,152081,595,616,1029
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,3.0,0.0,0.0
5,0.0,5.0,0.0,0.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...
606,0.0,3.5,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.5,2.5,0.0,0.0,2.5,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
user_vec = np.repeat(0, R.shape[1])
user_vec = pd.DataFrame(user_vec.reshape(1,9724),columns=R.columns)
user_vec[list(example_query.keys())] = list(example_query.values())
#user_vec[[list(query.keys())] 

In [58]:
user_vec[list(example_query.keys())]# 

movieId,48,594,27619,152081,595,616,1029
0,5,5,5,5,5,5,5


### Calculate the score (new!)

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [59]:
# calculates the distances to all other users in the data!
distances, userIds = model.kneighbors(user_vec, n_neighbors=10, return_distance=True)

# sklearn returns a list of predictions - extract the first and only value of the list
distances = distances[0]
userIds = userIds[0]

In [60]:
distances, userIds #### up to here we are good

(array([0.80602854, 0.84346311, 0.8461766 , 0.84869312, 0.84959286,
        0.85022216, 0.85977854, 0.86283904, 0.87003607, 0.87447407]),
 array([475, 169,  42,   4,  57, 234, 133, 483, 116,  19]))

In [61]:
R.loc[475][R.loc[475]>=0.9]

movieId
2         4.5
19        4.0
165       4.5
260       5.0
329       5.0
         ... 
135133    4.0
139385    4.0
152081    4.5
166528    4.0
168252    3.5
Name: 475, Length: 155, dtype: float64

In [62]:
ratings.set_index('userId').loc[475].sort_values('movieId')

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
475,2,4.5,1498031744
475,19,4.0,1498031776
475,165,4.5,1498031585
475,260,5.0,1498029369
475,329,5.0,1498031717
...,...,...,...
475,135133,4.0,1498031436
475,139385,4.0,1498031081
475,152081,4.5,1498029487
475,166528,4.0,1498029848


In [63]:
# only look at ratings for users that are similar!
neighborhood = ratings.set_index('userId').loc[userIds] #
neighborhood

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
475,2,4.5,1498031744
475,19,4.0,1498031776
475,165,4.5,1498031585
475,260,5.0,1498029369
475,329,5.0,1498031717
...,...,...,...
19,3825,3.0,965702777
19,3826,1.0,965702843
19,3835,2.0,965711413
19,3837,1.0,965704999


In [64]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood
scores = neighborhood.groupby('movieId')['rating'].sum()
scores

movieId
1         22.5
2         15.5
3         15.5
5          7.5
6          3.0
          ... 
135133     4.0
139385     4.0
152081     4.5
166528     4.0
168252     3.5
Name: rating, Length: 1877, dtype: float64

### Give recommendations (same as before!)

In [65]:
example_query.keys()

dict_keys([48, 594, 27619, 152081, 595, 616, 1029])

In [66]:
# give a zero score to movies the user has allready seen
allready_seen = scores.index.isin(example_query.keys())
scores.loc[allready_seen] = 0

In [67]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

movieId
260       33.5
2571      33.5
588       33.0
457       31.5
1198      31.0
          ... 
594        0.0
1029       0.0
48         0.0
595        0.0
152081     0.0
Name: rating, Length: 1877, dtype: float64

In [68]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([260, 2571, 588, 457, 1198, 1196, 1291, 1270, 1197, 364], dtype='int64', name='movieId')

In [69]:
# let's see the recommendations!
movies.set_index('movieId').loc[recommendations]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
457,"Fugitive, The (1993)",Thriller
1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX


---
## 3. Project Task: neighborhood based recommender function

- Collect different example queries for "typical" users (e.g. a horror movie buff) and try out the algorithm
- Set the number of neighbors to a very high or low number. What happens to the recommendations?
- Implement a recommender function that recommends movies to a new user based on the NearestNeighbor model!


- ⭐ **Bonus**: Calculate the score using a weighted sum or average. Use the distances to the other users as weights
- ⭐ **Bonus**: Use the method to find and recommend similar movies! Hint: Run the model on the transposed user item rating matrix.
- ⭐ **Bonus**: First use NMF to reduce the dimensionality of the sparse user item matrix. Then run neighborhood based recommendation on the dense matrix.

In [70]:
# collaborative filtering = look at ratings only!
def recommend_neighborhood(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained nearest neighbors model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # construct a user vector
    
   
    # 2. scoring
    
    # find n neighbors
    
    # calculate their average rating
    
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [71]:
# recommender.py
# from recommender import recommend_neighborhood