In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle

In [87]:
# DRY - Don't repeat yourself!
# place a utils.py in the same folder as the notebook
# from utils import example_query, create_user_vector, create_rating_matrix

# for calculating recommendations
example_query = {
    # movieId, rating
    # 4470:5, 
    48:5,
    594:5,
    27619:5,
    152081:5,
    595:5,
    616:5,
    1029:5
}

# Neighborhood Based Filtering for Recommender Systems
---

> The key idea is that the rating of u for a new item i is likely to be similar to that of another user v,if u and v have rated other items in a similar way. Likewise,u is likely to rate two items i and j in a similar fashion, if other users have given similar ratings to these two items.

##### Use ratings of similar users (or items) to predict what you like! But: How can we measure similarity/distance? 

- Cosine Similariy/Distance (works good for sparse high dimensional data)
- Jaccard Similarity/Distance (only works on binarized vectors)
- Pearson Correlation/Distance (cosine similarity on centered vectors)
- Euclidian Distance/Similarity (not good for sparse high dimensional data)

You find many more metrics here:https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

In [88]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [137]:
movies.head()

Unnamed: 0,movieId,title,genres,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,...,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,1,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [90]:
#movies['genres'] = movies['genres'].str.split('|')

In [91]:
#movies

In [92]:
def genres_list(movies):
   genres = [] 
   for i,item in enumerate(movies['genres'].loc[:]):
       genres.extend(item.split('|'))
   genres = set(genres)
   return list(genres)

In [93]:
genres = genres_list(movies)
for genre in genres:
    movies[genre] = movies['genres'].apply(lambda x: 1 if x.find(genre) !=-1 else 0)

---
## 1. Model Development

### Preprocessing (same as for the NMF model!)

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [94]:
def get_R(ratings):
    R = ratings.pivot(index='userId',columns='movieId',values='rating')
    R.fillna(0, inplace=True)
    #R = csr_matrix(R)
    return R

R = get_R(ratings)

### Training

- initialize the model: pick a distance metric
- fit it to the user item matrix: only stores the data and doesn't do further. all the calculations take place later!

In [95]:
# which metrics can we use for sparse matrics?
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [96]:
# initialize the unsupervised model
def model(ratings):
    model = NearestNeighbors(metric='cosine')#'manhattan'
    R = get_R(ratings)
    # fit it to the user-item rating matrix
    model.fit(R)
    with open('./distance_recommender.pkl', 'wb') as file:
        pickle.dump(model, file)
    with open('./distance_recommender.pkl', 'rb') as file:
        model = pickle.load(file)
    return model

### Save the trained model on your hard drive

---
## 2. Model deployment: Make recommendations for a new user

### Read the model from hard drive

In [97]:
# if you have loaded the model inside the utils.py you can also write:
# from utils import model

In [98]:
model = model(ratings)

### Receive a user query

In [99]:
example_query

{48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

In [100]:
movies.set_index('movieId').loc[example_query.keys()]

Unnamed: 0_level_0,title,genres,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,...,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance,0,0,0,1,0,0,1,1,...,0,1,0,0,0,0,0,1,0,0
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,1
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,1,1,0
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,1,0,1
616,"Aristocats, The (1970)",Animation|Children,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1029,Dumbo (1941),Animation|Children|Drama|Musical,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0


### Construct a user vector (same as before!)

we need the same input as was used during training!

In [101]:
example_query.keys()

dict_keys([48, 594, 27619, 152081, 595, 616, 1029])

In [102]:
R[[48, 594, 27619, 152081, 595, 616, 1029]]

movieId,48,594,27619,152081,595,616,1029
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,3.0,0.0,0.0
5,0.0,5.0,0.0,0.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...
606,0.0,3.5,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.5,2.5,0.0,0.0,2.5,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
user_vec = np.repeat(0, R.shape[1])
user_vec = pd.DataFrame(user_vec.reshape(1,9724),columns=R.columns)
user_vec[list(example_query.keys())] = list(example_query.values())
#user_vec[[list(query.keys())] 

In [104]:
user_vec[list(example_query.keys())]# 

movieId,48,594,27619,152081,595,616,1029
0,5,5,5,5,5,5,5


### Calculate the score (new!)

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [105]:
# calculates the distances to all other users in the data!
distances, index = model.kneighbors(user_vec, n_neighbors=1, return_distance=True)

# sklearn returns a list of predictions - extract the first and only value of the list
distances = distances[0]
index = index[0]

In [106]:
distances, index #### up to here we are good

(array([0.80602854]), array([475]))

In [107]:
userIds = R.iloc[index].index

In [108]:
ratings.set_index('movieId').loc[example_query.keys()].set_index('userId').loc[[474,475,476]]

Unnamed: 0_level_0,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
474,3.0,983032069
474,5.0,1100119595
474,3.5,1100292175
474,3.0,983032230
475,4.5,1498029487
476,4.0,835022192
476,4.0,835021778
476,5.0,835021384
476,3.0,835021910


In [109]:
ratings.set_index('movieId').loc[example_query.keys()].set_index('userId').loc[userIds]

Unnamed: 0_level_0,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
476,4.0,835022192
476,4.0,835021778
476,5.0,835021384
476,3.0,835021910


In [110]:
# only look at ratings for users that are similar!
neighborhood = ratings.set_index('userId').loc[userIds] #
neighborhood

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
476,1,4.0,835021447
476,2,4.0,835021693
476,10,3.0,835021420
476,11,3.0,835021635
476,13,3.0,835022487
...,...,...,...
476,594,4.0,835021778
476,595,5.0,835021384
476,596,5.0,835021800
476,597,3.0,835022173


In [111]:
neighborhood['movieId'].unique()

array([  1,   2,  10,  11,  13,  32,  34,  45,  48,  73, 150, 153, 158,
       161, 165, 185, 195, 215, 224, 236, 239, 252, 261, 277, 289, 296,
       300, 313, 314, 317, 318, 329, 337, 339, 349, 350, 356, 357, 361,
       362, 364, 368, 376, 377, 378, 380, 412, 421, 434, 440, 454, 457,
       468, 474, 480, 500, 531, 539, 586, 587, 588, 589, 590, 592, 594,
       595, 596, 597, 616])

In [112]:
movies.set_index('movieId').loc[neighborhood['movieId'].unique()][genres]

Unnamed: 0_level_0,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,Documentary,IMAX,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
10,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0
11,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0
13,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
595,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1
596,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
597,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [113]:
movies.set_index('movieId').loc[example_query.keys()][genres]

Unnamed: 0_level_0,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,Documentary,IMAX,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
48,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0
594,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
27619,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
152081,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0
595,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1
616,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1029,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0


In [114]:
movies.set_index('movieId')[genres]

Unnamed: 0_level_0,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,Documentary,IMAX,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
193583,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
193585,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
193587,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [115]:
movie_genres = movies.set_index('movieId')[genres]

In [116]:
movie_genres.shape

(9742, 20)

In [117]:
neighborhood['movieId']

userId
476      1
476      2
476     10
476     11
476     13
      ... 
476    594
476    595
476    596
476    597
476    616
Name: movieId, Length: 69, dtype: int64

In [118]:
movie_genres.loc[neighborhood['movieId']]

Unnamed: 0_level_0,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,Documentary,IMAX,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
10,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0
11,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0
13,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
595,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1
596,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
597,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [135]:
pd.DataFrame(movie_genres.loc[example_query.keys()].mean()).transpose()

Unnamed: 0,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,Documentary,IMAX,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
0,0.0,0.0,0.0,1.0,0.0,0.0,0.285714,0.428571,0.0,0.142857,0.0,0.571429,0.285714,0.0,0.0,0.0,0.285714,1.0,0.142857,0.285714


In [136]:
genre_model = NearestNeighbors()#metric='manhattan'

# fit it to the user-item rating matrix
genre_model.fit(movie_genres.loc[neighborhood['movieId'].unique()])
dist,ind = genre_model.kneighbors(pd.DataFrame(movie_genres.loc[example_query.keys()].mean()).transpose(), n_neighbors=50)

index = ind[0]

In [121]:
index

array([68, 27, 66, 64,  4,  8, 60, 20,  6, 56, 40,  0, 58, 12, 65, 47, 39,
        1, 46, 22, 23, 32, 26, 28, 17, 55, 37, 49, 33, 52, 24, 67, 44, 21,
       29, 16, 38, 51,  3, 57, 18, 50,  9, 30, 10, 19, 62, 31,  7, 53])

In [122]:
movieIds = movie_genres.loc[neighborhood['movieId'].unique()].iloc[index]
movieIds

Unnamed: 0_level_0,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,Documentary,IMAX,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
616,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
313,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
596,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1
594,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
13,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
48,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0
588,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0
239,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
34,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
531,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [126]:
my_recomendations = movies.set_index('movieId').loc[movieIds.index]
myrecomendations_movieIds = my_recomendations.index

In [125]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood

scores = neighborhood.groupby('movieId')['rating'].mean()
scores

movieId
1      4.0
2      4.0
10     3.0
11     3.0
13     3.0
      ... 
594    4.0
595    5.0
596    5.0
597    3.0
616    3.0
Name: rating, Length: 69, dtype: float64

### Give recommendations (same as before!)

In [631]:
example_query.keys()

dict_keys([48, 594, 27619, 152081, 595, 616, 1029])

In [632]:
# give a zero score to movies the user has allready seen
allready_seen = scores.index.isin(example_query.keys())
scores.loc[allready_seen] = 0

In [127]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

movieId
616    3.0
313    4.0
596    5.0
594    4.0
13     3.0
48     4.0
588    5.0
239    3.0
34     4.0
531    4.0
364    4.0
1      4.0
586    3.0
158    3.0
595    5.0
421    4.0
362    5.0
2      4.0
412    3.0
261    5.0
277    3.0
337    4.0
300    4.0
314    4.0
215    4.0
500    3.0
357    4.0
440    3.0
339    4.0
468    3.0
289    3.0
597    3.0
378    3.0
252    4.0
317    3.0
195    2.0
361    3.0
457    5.0
11     3.0
539    4.0
224    4.0
454    4.0
73     4.0
318    5.0
150    4.0
236    3.0
590    5.0
329    3.0
45     3.0
474    4.0
Name: rating, dtype: float64

In [133]:
myscores = scores.loc[myrecomendations_movieIds].sort_values()
myscores
movies.set_index('movieId').loc[myscores.index]

Unnamed: 0_level_0,title,genres,Crime,Film-Noir,Western,Animation,(no genres listed),Thriller,Romance,Drama,...,War,Musical,Adventure,Sci-Fi,Horror,Mystery,Comedy,Children,Action,Fantasy
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195,Something to Talk About (1995),Comedy|Drama|Romance,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
616,"Aristocats, The (1970)",Animation|Children,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
45,To Die For (1995),Comedy|Drama|Thriller,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
468,Englishman Who Went Up a Hill But Came Down a ...,Comedy|Romance,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
289,Only You (1994),Comedy|Romance,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
277,Miracle on 34th Street (1994),Drama,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
597,Pretty Woman (1990),Comedy|Romance,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
412,"Age of Innocence, The (1993)",Drama,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
378,Speechless (1994),Comedy|Romance,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
317,"Santa Clause, The (1994)",Comedy|Drama|Fantasy,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1


In [634]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([1356, 362, 107, 527, 519, 508, 502, 497, 484, 475], dtype='int64', name='movieId')

In [635]:
# let's see the recommendations!
movies.set_index('movieId').loc[recommendations]

Unnamed: 0_level_0,title,genres,Animation,Mystery,Film-Noir,Crime,Children,Fantasy,Drama,Western,...,War,IMAX,Thriller,Romance,Comedy,Documentary,Horror,Musical,(no genres listed),Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
362,"Jungle Book, The (1994)",Adventure|Children|Romance,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
107,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,1
527,Schindler's List (1993),Drama|War,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
519,RoboCop 3 (1993),Action|Crime|Drama|Sci-Fi|Thriller,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
508,Philadelphia (1993),Drama,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
502,"Next Karate Kid, The (1994)",Action|Children|Romance,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
497,Much Ado About Nothing (1993),Comedy|Romance,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
484,Lassie (1994),Adventure|Children,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
475,In the Name of the Father (1993),Drama,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


---
## 3. Project Task: neighborhood based recommender function

- Collect different example queries for "typical" users (e.g. a horror movie buff) and try out the algorithm
- Set the number of neighbors to a very high or low number. What happens to the recommendations?
- Implement a recommender function that recommends movies to a new user based on the NearestNeighbor model!


- ⭐ **Bonus**: Calculate the score using a weighted sum or average. Use the distances to the other users as weights
- ⭐ **Bonus**: Use the method to find and recommend similar movies! Hint: Run the model on the transposed user item rating matrix.
- ⭐ **Bonus**: First use NMF to reduce the dimensionality of the sparse user item matrix. Then run neighborhood based recommendation on the dense matrix.

In [None]:
# collaborative filtering = look at ratings only!
def recommend_neighborhood(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained nearest neighbors model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # construct a user vector
    
   
    # 2. scoring
    
    # find n neighbors
    
    # calculate their average rating
    
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [None]:
# recommender.py
# from recommender import recommend_neighborhood