In [743]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle

In [744]:
# DRY - Don't repeat yourself!
# place a utils.py in the same folder as the notebook
# from utils import example_query, create_user_vector, create_rating_matrix

# for calculating recommendations
example_query = {
    # movieId, rating
    # 4470:5, 
    48:5,
    594:5,
    27619:5,
    152081:5,
    595:5,
    616:5,
    1029:5
}

# Neighborhood Based Filtering for Recommender Systems
---

> The key idea is that the rating of u for a new item i is likely to be similar to that of another user v,if u and v have rated other items in a similar way. Likewise,u is likely to rate two items i and j in a similar fashion, if other users have given similar ratings to these two items.

##### Use ratings of similar users (or items) to predict what you like! But: How can we measure similarity/distance? 

- Cosine Similariy/Distance (works good for sparse high dimensional data)
- Jaccard Similarity/Distance (only works on binarized vectors)
- Pearson Correlation/Distance (cosine similarity on centered vectors)
- Euclidian Distance/Similarity (not good for sparse high dimensional data)

You find many more metrics here:https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

In [745]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [746]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [747]:
#movies['genres'] = movies['genres'].str.split('|')

In [748]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [749]:
genres = []

In [750]:
for i,item in enumerate(movies['genres'].loc[:]):
    genres.extend(item.split('|'))

In [751]:
genres = set(genres)
genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [752]:
for genre in genres:
    #print(genre in movies['genres'].loc[0])
    movies[genre] = movies['genres'].apply(lambda x: 1 if x.find(genre) !=-1 else 0)

In [753]:
movies # use this to create movies distances and suggest with this, not with score!

Unnamed: 0,movieId,title,genres,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,...,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,1,...,0,0,0,1,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
9739,193585,Flint (2017),Drama,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
## 1. Model Development

### Preprocessing (same as for the NMF model!)

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [754]:
ratings['movieId'].nunique() 

9724

In [755]:
R = ratings.pivot(index='userId',columns='movieId',values='rating')

In [756]:
R.fillna(0, inplace=True)

In [757]:
R

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [758]:
#R = csr_matrix(R)



R.shape

(610, 9724)

### Training (new!)

- initialize the model: pick a distance metric
- fit it to the user item matrix: only stores the data and doesn't do further. all the calculations take place later!

In [759]:
# which metrics can we use for sparse matrics?
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [760]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')#'manhattan'

# fit it to the user-item rating matrix
model.fit(R)

NearestNeighbors(metric='cosine')

### Save the trained model on your hard drive

In [761]:
with open('./distance_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

---
## 2. Model deployment: Make recommendations for a new user

### Read the model from hard drive

In [762]:
with open('./distance_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [763]:
# if you have loaded the model inside the utils.py you can also write:
# from utils import model

### Receive a user query

In [764]:
R[1029]

userId
1      5.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    0.0
607    0.0
608    0.0
609    0.0
610    0.0
Name: 1029, Length: 610, dtype: float64

In [765]:
example_query

{48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

In [766]:
movies.set_index('movieId').loc[example_query.keys()]

Unnamed: 0_level_0,title,genres,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,...,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,0
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,1,0,0,0,1,1,0,0,...,0,1,0,1,0,0,0,1,0,0
616,"Aristocats, The (1970)",Animation|Children,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1029,Dumbo (1941),Animation|Children|Drama|Musical,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0


### Construct a user vector (same as before!)

we need the same input as was used during training!

In [767]:
R.shape

(610, 9724)

In [768]:
example_query.keys()

dict_keys([48, 594, 27619, 152081, 595, 616, 1029])

In [769]:
R[[48, 594, 27619, 152081, 595, 616, 1029]]

movieId,48,594,27619,152081,595,616,1029
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,3.0,0.0,0.0
5,0.0,5.0,0.0,0.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...
606,0.0,3.5,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.5,2.5,0.0,0.0,2.5,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [770]:
user_vec = np.repeat(0, R.shape[1])
user_vec = pd.DataFrame(user_vec.reshape(1,9724),columns=R.columns)
user_vec[list(example_query.keys())] = list(example_query.values())
#user_vec[[list(query.keys())] 

In [771]:
user_vec[list(example_query.keys())]# 

movieId,48,594,27619,152081,595,616,1029
0,5,5,5,5,5,5,5


### Calculate the score (new!)

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [772]:
# calculates the distances to all other users in the data!
distances, index = model.kneighbors(user_vec, n_neighbors=5, return_distance=True)

# sklearn returns a list of predictions - extract the first and only value of the list
distances = distances[0]
index = index[0]

In [773]:
distances, index #### up to here we are good

(array([0.80602854, 0.84346311, 0.8461766 , 0.84869312, 0.84959286]),
 array([475, 169,  42,   4,  57]))

In [774]:
userIds = R.iloc[index].index

In [775]:
ratings.set_index('userId')

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
...,...,...,...
610,166534,4.0,1493848402
610,168248,5.0,1493850091
610,168250,5.0,1494273047
610,168252,5.0,1493846352


In [776]:
# only look at ratings for users that are similar!
neighborhood = ratings.set_index('userId').loc[userIds] #
neighborhood

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
476,1,4.0,835021447
476,2,4.0,835021693
476,10,3.0,835021420
476,11,3.0,835021635
476,13,3.0,835022487
...,...,...,...
58,648,5.0,847719035
58,708,5.0,847719464
58,736,3.0,847718894
58,780,5.0,847718910


In [777]:
neighborhood['movieId'].unique()

array([   1,    2,   10,   11,   13,   32,   34,   45,   48,   73,  150,
        153,  158,  161,  165,  185,  195,  215,  224,  236,  239,  252,
        261,  277,  289,  296,  300,  313,  314,  317,  318,  329,  337,
        339,  349,  350,  356,  357,  361,  362,  364,  368,  376,  377,
        378,  380,  412,  421,  434,  440,  454,  457,  468,  474,  480,
        500,  531,  539,  586,  587,  588,  589,  590,  592,  594,  595,
        596,  597,  616,    5,   95,  110,  168,  181,  203,  207,  208,
        266,  282,  292,  315,  344,  355,  410,  420,  553,  593,  733,
        736,    3,    7,    8,   23,   29,   47,   57,   60,   79,  102,
        107,  173,  174,  193,  217,  225,  231,  238,  253,  256,  259,
        262,  271,  276,  288,  316,  325,  343,  351,  367,  374,  382,
        413,  419,  432,  435,  442,  484,  502,  519,  520,  532,  542,
        552,  575,  609,  610,  631,  648,  661,  711,  783,  788,  810,
        828, 1064, 1073, 1084, 1105, 1356,   21,   

In [778]:
movies.set_index('movieId').loc[neighborhood['movieId'].unique()][genres]

Unnamed: 0_level_0,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,Action,Sci-Fi,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
11,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0
13,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0
551,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
555,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
708,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0


In [779]:
movies.set_index('movieId').loc[example_query.keys()][genres]

Unnamed: 0_level_0,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,Action,Sci-Fi,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
48,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
594,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0
27619,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
152081,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1
595,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0
616,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1029,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [780]:
movies.set_index('movieId')[genres]

Unnamed: 0_level_0,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,Action,Sci-Fi,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
193583,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
193585,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
193587,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [781]:
movie_genres = movies.set_index('movieId')[genres]

In [782]:
movie_genres.shape

(9742, 20)

In [783]:
neighborhood['movieId']

userId
476       1
476       2
476      10
476      11
476      13
       ... 
58      648
58      708
58      736
58      780
58     1073
Name: movieId, Length: 389, dtype: int64

In [784]:
movie_genres.loc[neighborhood['movieId']]

Unnamed: 0_level_0,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,Action,Sci-Fi,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
11,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0
13,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
708,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
736,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1
780,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1


In [785]:
pd.DataFrame(movie_genres.loc[example_query.keys()].mean()).transpose()

Unnamed: 0,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,Action,Sci-Fi,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
0,1.0,0.0,0.0,0.0,1.0,0.285714,0.428571,0.0,0.142857,0.0,0.0,0.142857,0.0,0.285714,0.285714,0.0,0.0,0.571429,0.0,0.285714


In [786]:
genre_model = NearestNeighbors()#metric='manhattan'

# fit it to the user-item rating matrix
genre_model.fit(movie_genres.loc[neighborhood['movieId'].unique()])
dist,ind = genre_model.kneighbors(pd.DataFrame(movie_genres.loc[example_query.keys()].mean()).transpose(), n_neighbors=50)

index = ind[0]

In [787]:
index

array([ 68,  27, 195,  66,   4,  64,   8, 140,  60, 138,  20, 116,  56,
         6, 110,  40, 136, 144,   0, 126, 119, 134,  12, 106,  91, 133,
       139,  65, 143,  58,  47, 187,  73,  99, 145, 142,  83,  96,   1,
        82,  39, 188, 190, 165,  23, 167, 158,  95, 102, 192])

In [789]:
movieIds = movie_genres.loc[neighborhood['movieId'].unique()].iloc[index]
movieIds

Unnamed: 0_level_0,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,Action,Sci-Fi,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
616,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
313,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
551,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
596,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
13,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
594,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0
48,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
783,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
588,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
661,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [791]:
movies.set_index('movieId').loc[movieIds.index]

Unnamed: 0_level_0,title,genres,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,...,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
616,"Aristocats, The (1970)",Animation|Children,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
313,"Swan Princess, The (1994)",Animation|Children,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
551,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
596,Pinocchio (1940),Animation|Children|Fantasy|Musical,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
13,Balto (1995),Adventure|Animation|Children,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,0
783,"Hunchback of Notre Dame, The (1996)",Animation|Children|Drama|Musical|Romance,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,0
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,1
661,James and the Giant Peach (1996),Adventure|Animation|Children|Fantasy|Musical,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,1


In [630]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood
scores = neighborhood.groupby('movieId')['rating'].mean()
scores

movieId
1       4.333333
2       4.000000
3       4.000000
5       4.000000
7       5.000000
          ...   
1064    5.000000
1073    5.000000
1084    5.000000
1105    5.000000
1356    5.000000
Name: rating, Length: 199, dtype: float64

### Give recommendations (same as before!)

In [631]:
example_query.keys()

dict_keys([48, 594, 27619, 152081, 595, 616, 1029])

In [632]:
# give a zero score to movies the user has allready seen
allready_seen = scores.index.isin(example_query.keys())
scores.loc[allready_seen] = 0

In [633]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

movieId
1356    5.0
362     5.0
107     5.0
527     5.0
519     5.0
       ... 
19      1.0
48      0.0
594     0.0
595     0.0
616     0.0
Name: rating, Length: 199, dtype: float64

In [634]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([1356, 362, 107, 527, 519, 508, 502, 497, 484, 475], dtype='int64', name='movieId')

In [635]:
# let's see the recommendations!
movies.set_index('movieId').loc[recommendations]

Unnamed: 0_level_0,title,genres,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,...,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
362,"Jungle Book, The (1994)",Adventure|Children|Romance,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
107,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,1
527,Schindler's List (1993),Drama|War,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
519,RoboCop 3 (1993),Action|Crime|Drama|Sci-Fi|Thriller,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
508,Philadelphia (1993),Drama,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
502,"Next Karate Kid, The (1994)",Action|Children|Romance,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
497,Much Ado About Nothing (1993),Comedy|Romance,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
484,Lassie (1994),Adventure|Children,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
475,In the Name of the Father (1993),Drama,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


---
## 3. Project Task: neighborhood based recommender function

- Collect different example queries for "typical" users (e.g. a horror movie buff) and try out the algorithm
- Set the number of neighbors to a very high or low number. What happens to the recommendations?
- Implement a recommender function that recommends movies to a new user based on the NearestNeighbor model!


- ⭐ **Bonus**: Calculate the score using a weighted sum or average. Use the distances to the other users as weights
- ⭐ **Bonus**: Use the method to find and recommend similar movies! Hint: Run the model on the transposed user item rating matrix.
- ⭐ **Bonus**: First use NMF to reduce the dimensionality of the sparse user item matrix. Then run neighborhood based recommendation on the dense matrix.

In [None]:
# collaborative filtering = look at ratings only!
def recommend_neighborhood(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained nearest neighbors model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # construct a user vector
    
   
    # 2. scoring
    
    # find n neighbors
    
    # calculate their average rating
    
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [None]:
# recommender.py
# from recommender import recommend_neighborhood